Compare commits

..

34 Commits

Author SHA1 Message Date
Ean Garvey
dea405285e Revert python version change and update actions/setup-python to v5 2024-08-08 16:50:15 -05:00
Ean Garvey
b1d2cb3bad Change python version in nightly .yml to 3.11.9 2024-08-08 16:48:41 -05:00
Ean Garvey
4759e808f2 Rest API support and cleanup 2024-08-08 11:37:53 -05:00
Ean Garvey
d5f37eaf20 Bump requirements 2024-06-17 18:16:44 -05:00
Ean Garvey
84bc1437a7 Strip sdxl-turbo options 2024-06-17 17:57:40 -05:00
Ean Garvey
83f424a83e Close advanced setting by default. 2024-06-06 13:22:57 -05:00
Ean Garvey
5b3b262359 Simplify ui further, add CLI option to load a default config 2024-06-06 13:21:25 -05:00
Ean Garvey
67b438eb9f take all ireert calls out of studio flow 2024-06-04 01:46:48 -05:00
Ean Garvey
4aa2d8b2a2 Purge shark/ directory, minimal ireert api usage for dynamically loaded plugins 2024-06-04 00:53:07 -05:00
Ean Garvey
dac7a29eef Purge unused code and patch out iree runtime handling from init 2024-06-03 18:00:05 -05:00
Ean Garvey
59600456be seed fixes 2024-06-02 16:25:16 -05:00
Ean Garvey
e514910202 Remove sdxl 30step config 2024-06-02 14:28:58 -05:00
Ean Garvey
33f6c312d9 limit steps to 2 (gives best results) 2024-06-02 14:25:29 -05:00
Ean Garvey
ab06047108 set a default 2024-06-02 14:23:40 -05:00
Ean Garvey
ac48b843e7 actually reduce steps 2024-06-02 14:00:45 -05:00
Ean Garvey
5f1b5e58d6 igpu dont error on device parse fail 2024-06-02 12:56:44 -05:00
Ean Garvey
6adae49d9b igpu restrictions 2024-06-02 12:51:53 -05:00
Ean Garvey
6abd9ff5cf Reduce available step options for turbo. 2024-06-02 11:41:23 -05:00
Ean Garvey
9957c96014 More noticeable seed changes 2024-06-02 11:39:21 -05:00
Ean Garvey
36b8c2fd6d disable pndm 2024-06-02 11:30:18 -05:00
Ean Garvey
9163c1fc50 small fixes 2024-06-02 11:28:37 -05:00
Ean Garvey
349e9f70fb Progress indicators 2024-06-02 10:18:09 -05:00
Ean Garvey
64e63e7130 znver4 device handling 2024-06-02 10:08:00 -05:00
Ean Garvey
ea8738fb1a Update SRT links 2024-06-02 09:50:09 -05:00
Ean Garvey
2a5bec3c4f Fixes for seed. 2024-06-02 09:46:22 -05:00
Ean Garvey
bb58b01d75 Switch to fixed steps, tweak config loading to prevent race condition 2024-06-01 20:15:53 -05:00
Ean Garvey
02285b33a4 More fixes for demo. 2024-06-01 19:46:52 -05:00
Ean Garvey
f9a1d35b59 Hide chatbot. 2024-06-01 14:24:37 -05:00
Ean Garvey
b1ca19a6e6 Cleanup for demo. 2024-06-01 13:42:51 -05:00
Ean Garvey
b5dea85808 Reduce UI for demos. 2024-06-01 12:00:22 -05:00
Ean Garvey
e75f96f2d7 fixup conditional 2024-06-01 12:00:11 -05:00
Ean Garvey
bf67e2aa3b Formatting 2024-06-01 11:59:10 -05:00
Ean Garvey
c088247aa1 Fix default configs, config loading, and add warnings/early returns for bad configs. 2024-06-01 11:58:51 -05:00
Ean Garvey
42abc6787d Small tweaks to ckpt processing, add tool to prefix params keys 2024-06-01 11:53:40 -05:00
249 changed files with 2212 additions and 27719 deletions

View File

@@ -10,15 +10,15 @@ jobs:
runs-on: ubuntu-latest
# Don't run this in everyone's forks.
if: github.repository == 'nod-ai/AMD-SHARK-Studio'
if: github.repository == 'nod-ai/SHARK'
steps:
- name: Checking out repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@v2
with:
token: ${{ secrets.NODAI_INVOCATION_TOKEN }}
- name: Run scrape releases script
run: python ./build_tools/scrape_releases.py nod-ai AMD-SHARK-Studio > /tmp/index.html
run: python ./build_tools/scrape_releases.py nod-ai SHARK > /tmp/index.html
shell: bash
- run: git fetch --all
- run: git switch github-pages
@@ -31,7 +31,7 @@ jobs:
- run: git diff --cached --exit-code || git commit -m "Update releases."
- name: GitHub Push
uses: ad-m/github-push-action@d91a481090679876dfc4178fef17f286781251df # v0.8.0
uses: ad-m/github-push-action@v0.6.0
with:
github_token: ${{ secrets.NODAI_INVOCATION_TOKEN }}
branch: github-pages

View File

@@ -17,9 +17,9 @@ jobs:
python-version: ["3.11"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
@@ -35,32 +35,32 @@ jobs:
- name: Create Release
id: create_release
uses: ncipollo/release-action@440c8c1cb0ed28b9f43e4d1d670870f059653174 # v1.16.0
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
tag: ${{ env.tag_name }}
name: nod.ai AMDSHARK ${{ env.tag_name }}
tag_name: ${{ env.tag_name }}
release_name: nod.ai SHARK ${{ env.tag_name }}
body: |
Automatic snapshot release of nod.ai AMDSHARK.
Automatic snapshot release of nod.ai SHARK.
draft: true
prerelease: true
- name: Build Package
- name: Build Package (api only)
shell: powershell
run: |
./setup_venv.ps1
python process_skipfiles.py
$env:AMDSHARK_PACKAGE_VERSION=${{ env.package_version }}
$env:SHARK_PACKAGE_VERSION=${{ env.package_version }}
pip install -e .
pip freeze -l
pyinstaller .\apps\amdshark_studio\amdshark_studio.spec
mv ./dist/nodai_amdshark_studio.exe ./dist/nodai_amdshark_studio_${{ env.package_version_ }}.exe
signtool sign /f c:\g\amdshark_02152023.cer /fd certHash /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/nodai_amdshark_studio_${{ env.package_version_ }}.exe
pyinstaller .\apps\shark_studio\shark_studio_apionly.spec
mv ./dist/nodai_shark_studio.exe ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
signtool sign /f c:\g\shark_02152023.cer /fd certHash /csp "eToken Base Cryptographic Provider" /k "${{ secrets.CI_CERT }}" ./dist/nodai_shark_studio_${{ env.package_version_ }}.exe
- name: Upload Release Assets
id: upload-release-assets
uses: dwenegar/upload-release-assets@fe47e06814723c7b1bea3a7e46cf93d5f020d0c3 # v3
uses: dwenegar/upload-release-assets@v1
env:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:
@@ -70,7 +70,7 @@ jobs:
- name: Publish Release
id: publish_release
uses: eregon/publish-release@01df127f5e9a3c26935118e22e738d95b59d10ce # v1.0.6
uses: eregon/publish-release@v1
env:
GITHUB_TOKEN: ${{ secrets.NODAI_INVOCATION_TOKEN }}
with:

View File

@@ -1,19 +1,19 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: Validate AMDShark Studio
name: Validate Shark Studio
on:
push:
branches: [ main ]
paths-ignore:
- '**.md'
- 'amdshark/examples/**'
- 'shark/examples/**'
pull_request:
branches: [ main ]
paths-ignore:
- '**.md'
- 'amdshark/examples/**'
- 'shark/examples/**'
workflow_dispatch:
# Ensure that only a single job or workflow using the same
@@ -39,7 +39,7 @@ jobs:
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/checkout@v3
- name: Set Environment Variables
run: |
@@ -51,7 +51,7 @@ jobs:
echo ${{ matrix.python-version }} >> $GITHUB_WORKSPACE/.python-version
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
uses: actions/setup-python@v4
with:
python-version: '${{ matrix.python-version }}'
@@ -66,7 +66,7 @@ jobs:
run: |
# black format check
black --version
black --check apps/amdshark_studio
black --check apps/shark_studio
# stop the build if there are Python syntax errors or undefined names
flake8 . --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
@@ -77,9 +77,9 @@ jobs:
if: matrix.suite == 'cpu'
run: |
cd $GITHUB_WORKSPACE
python${{ matrix.python-version }} -m venv amdshark.venv
source amdshark.venv/bin/activate
python${{ matrix.python-version }} -m venv shark.venv
source shark.venv/bin/activate
pip install -r requirements.txt --no-cache-dir
pip install -e .
# Disabled due to hang when exporting test llama2
# python apps/amdshark_studio/tests/api_test.py
# python apps/shark_studio/tests/api_test.py

8
.gitignore vendored
View File

@@ -164,15 +164,15 @@ cython_debug/
# vscode related
.vscode
# AMDShark related artifacts
# Shark related artifacts
*venv/
amdshark_tmp/
shark_tmp/
*.vmfb
.use-iree
tank/dict_configs.py
*.csv
reproducers/
apps/amdshark_studio/web/configs
apps/shark_studio/web/configs
# ORT related artefacts
cache_models/
@@ -189,7 +189,7 @@ variants.json
# models folder
apps/stable_diffusion/web/models/
# model artifacts (AMDSHARK)
# model artifacts (SHARK)
*.tempfile
*.mlir
*.vmfb

4
.gitmodules vendored
View File

@@ -1,4 +1,4 @@
[submodule "inference/thirdparty/amdshark-runtime"]
path = inference/thirdparty/amdshark-runtime
[submodule "inference/thirdparty/shark-runtime"]
path = inference/thirdparty/shark-runtime
url =https://github.com/nod-ai/SRT.git
branch = shark-06032022

120
README.md
View File

@@ -1,12 +1,12 @@
# AMDSHARK
# SHARK
High Performance Machine Learning Distribution
<h2>NOTE: This project is not currently maintained.</h2>
*We are currently rebuilding SHARK to take advantage of [Turbine](https://github.com/nod-ai/SHARK-Turbine). Until that is complete make sure you use an .exe release or a checkout of the `SHARK-1.0` branch, for a working SHARK*
*The latest versions of this project are developments towards a refactor on top of IREE-Turbine. Until further notice, make sure you use an .exe release or a checkout of the `AMDSHARK-1.0` branch, for a working AMDSHARK-Studio*
[![Nightly Release](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/nightly.yml)
[![Validate torch-models on Shark Runtime](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml/badge.svg)](https://github.com/nod-ai/SHARK/actions/workflows/test-models.yml)
[![Nightly Release](https://github.com/nod-ai/AMD-SHARK-Studio/actions/workflows/nightly.yml/badge.svg)](https://github.com/nod-ai/AMD-SHARK-Studio/actions/workflows/nightly.yml)
<details>
<summary>Prerequisites - Drivers </summary>
@@ -25,11 +25,11 @@ Other users please ensure you have your latest vendor drivers and Vulkan SDK fro
### Quick Start for AMDSHARK Stable Diffusion for Windows 10/11 Users
### Quick Start for SHARK Stable Diffusion for Windows 10/11 Users
Install the Driver from [Prerequisites](https://github.com/nod-ai/AMD-SHARK-Studio#install-your-hardware-drivers) above
Install the Driver from (Prerequisites)[https://github.com/nod-ai/SHARK#install-your-hardware-drivers] above
Download the [stable release](https://github.com/nod-ai/AMD-SHARK-Studio/releases/latest) or the most recent [AMDSHARK 1.0 pre-release](https://github.com/nod-ai/AMD-SHARK-Studio/releases).
Download the [stable release](https://github.com/nod-ai/shark/releases/latest) or the most recent [SHARK 1.0 pre-release](https://github.com/nod-ai/shark/releases).
Double click the .exe, or [run from the command line](#running) (recommended), and you should have the [UI](http://localhost:8080/) in the browser.
@@ -67,16 +67,16 @@ Enjoy.
## Check out the code
```shell
git clone https://github.com/nod-ai/AMD-SHARK-Studio.git
cd AMD-SHARK-Studio
git clone https://github.com/nod-ai/SHARK.git
cd SHARK
```
## Switch to the Correct Branch (IMPORTANT!)
Currently AMDSHARK is being rebuilt for [Turbine](https://github.com/iree-org/iree-turbine) on the `main` branch. For now you are strongly discouraged from using `main` unless you are working on the rebuild effort, and should not expect the code there to produce a working application for Image Generation, So for now you'll need switch over to the `AMDSHARK-1.0` branch and use the stable code.
Currently SHARK is being rebuilt for [Turbine](https://github.com/nod-ai/SHARK-Turbine) on the `main` branch. For now you are strongly discouraged from using `main` unless you are working on the rebuild effort, and should not expect the code there to produce a working application for Image Generation, So for now you'll need switch over to the `SHARK-1.0` branch and use the stable code.
```shell
git checkout AMDSHARK-1.0
git checkout SHARK-1.0
```
The following setup instructions assume you are on this branch.
@@ -92,7 +92,7 @@ The following setup instructions assume you are on this branch.
set-executionpolicy remotesigned
```
#### Setup venv and install necessary packages (torch-mlir, nodLabs/AMDShark, ...)
#### Setup venv and install necessary packages (torch-mlir, nodLabs/Shark, ...)
```powershell
./setup_venv.ps1 #You can re-run this script to get the latest version
```
@@ -101,20 +101,20 @@ set-executionpolicy remotesigned
```shell
./setup_venv.sh
source amdshark1.venv/bin/activate
source shark1.venv/bin/activate
```
### Run Stable Diffusion on your device - WebUI
#### Windows 10/11 Users
```powershell
(amdshark1.venv) PS C:\g\amdshark> cd .\apps\stable_diffusion\web\
(amdshark1.venv) PS C:\g\amdshark\apps\stable_diffusion\web> python .\index.py
(shark1.venv) PS C:\g\shark> cd .\apps\stable_diffusion\web\
(shark1.venv) PS C:\g\shark\apps\stable_diffusion\web> python .\index.py
```
#### Linux / macOS Users
```shell
(amdshark1.venv) > cd apps/stable_diffusion/web
(amdshark1.venv) > python index.py
(shark1.venv) > cd apps/stable_diffusion/web
(shark1.venv) > python index.py
```
#### Access Stable Diffusion on http://localhost:8080/?__theme=dark
@@ -128,7 +128,7 @@ source amdshark1.venv/bin/activate
#### Windows 10/11 Users
```powershell
(amdshark1.venv) PS C:\g\amdshark> python .\apps\stable_diffusion\scripts\main.py --app="txt2img" --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
(shark1.venv) PS C:\g\shark> python .\apps\stable_diffusion\scripts\main.py --app="txt2img" --precision="fp16" --prompt="tajmahal, snow, sunflowers, oil on canvas" --device="vulkan"
```
#### Linux / macOS Users
@@ -156,7 +156,7 @@ Here are some samples generated:
![a photo of a crab playing a trumpet](https://user-images.githubusercontent.com/74956/204933258-252e7240-8548-45f7-8253-97647d38313d.jpg)
Find us on [AMDSHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware.
Find us on [SHARK Discord server](https://discord.gg/RUqY2h2s9u) if you have any trouble with running it on your hardware.
<details>
@@ -168,8 +168,8 @@ This step sets up a new VirtualEnv for Python
```shell
python --version #Check you have 3.11 on Linux, macOS or Windows Powershell
python -m venv amdshark_venv
source amdshark_venv/bin/activate # Use amdshark_venv/Scripts/activate on Windows
python -m venv shark_venv
source shark_venv/bin/activate # Use shark_venv/Scripts/activate on Windows
# If you are using conda create and activate a new conda env
@@ -179,15 +179,15 @@ python -m pip install --upgrade pip
*macOS Metal* users please install https://sdk.lunarg.com/sdk/download/latest/mac/vulkan-sdk.dmg and enable "System wide install"
### Install AMD-SHARK
### Install SHARK
This step pip installs AMD-SHARK and related packages on Linux Python 3.8, 3.10 and 3.11 and macOS / Windows Python 3.11
This step pip installs SHARK and related packages on Linux Python 3.8, 3.10 and 3.11 and macOS / Windows Python 3.11
```shell
pip install nodai-amdshark -f https://nod-ai.github.io/AMD-SHARK-Studio/package-index/ -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
pip install nodai-shark -f https://nod-ai.github.io/SHARK/package-index/ -f https://llvm.github.io/torch-mlir/package-index/ -f https://nod-ai.github.io/SRT/pip-release-links.html --extra-index-url https://download.pytorch.org/whl/nightly/cpu
```
### Run amdshark tank model tests.
### Run shark tank model tests.
```shell
pytest tank/test_models.py
```
@@ -196,7 +196,7 @@ See tank/README.md for a more detailed walkthrough of our pytest suite and CLI.
### Download and run Resnet50 sample
```shell
curl -O https://raw.githubusercontent.com/nod-ai/AMD-SHARK-Studio/main/amdshark/examples/amdshark_inference/resnet50_script.py
curl -O https://raw.githubusercontent.com/nod-ai/SHARK/main/shark/examples/shark_inference/resnet50_script.py
#Install deps for test script
pip install --pre torch torchvision torchaudio tqdm pillow gsutil --extra-index-url https://download.pytorch.org/whl/nightly/cpu
python ./resnet50_script.py --device="cpu" #use cuda or vulkan or metal
@@ -204,7 +204,7 @@ python ./resnet50_script.py --device="cpu" #use cuda or vulkan or metal
### Download and run BERT (MiniLM) sample
```shell
curl -O https://raw.githubusercontent.com/nod-ai/AMD-SHARK-Studio/main/amdshark/examples/amdshark_inference/minilm_jit.py
curl -O https://raw.githubusercontent.com/nod-ai/SHARK/main/shark/examples/shark_inference/minilm_jit.py
#Install deps for test script
pip install transformers torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu
python ./minilm_jit.py --device="cpu" #use cuda or vulkan or metal
@@ -222,34 +222,34 @@ Set `USE_IREE=1` to use upstream IREE
# PYTHON=python3.11 VENV_DIR=0617_venv IMPORTER=1 ./setup_venv.sh
```
### Run any of the hundreds of AMDSHARK tank models via the test framework
### Run any of the hundreds of SHARK tank models via the test framework
```shell
python -m amdshark.examples.amdshark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
python -m shark.examples.shark_inference.resnet50_script --device="cpu" # Use gpu | vulkan
# Or a pytest
pytest tank/test_models.py -k "MiniLM"
```
### How to use your locally built IREE / Torch-MLIR with AMDSHARK
### How to use your locally built IREE / Torch-MLIR with SHARK
If you are a *Torch-mlir developer or an IREE developer* and want to test local changes you can uninstall
the provided packages with `pip uninstall torch-mlir` and / or `pip uninstall iree-compiler iree-runtime` and build locally
with Python bindings and set your PYTHONPATH as mentioned [here](https://github.com/iree-org/iree/tree/main/docs/api_docs/python#install-iree-binaries)
for IREE and [here](https://github.com/llvm/torch-mlir/blob/main/development.md#setup-python-environment-to-export-the-built-python-packages)
for Torch-MLIR.
How to use your locally built Torch-MLIR with AMDSHARK:
How to use your locally built Torch-MLIR with SHARK:
```shell
1.) Run `./setup_venv.sh in AMDSHARK` and activate `amdshark.venv` virtual env.
1.) Run `./setup_venv.sh in SHARK` and activate `shark.venv` virtual env.
2.) Run `pip uninstall torch-mlir`.
3.) Go to your local Torch-MLIR directory.
4.) Activate mlir_venv virtual envirnoment.
5.) Run `pip uninstall -r requirements.txt`.
6.) Run `pip install -r requirements.txt`.
7.) Build Torch-MLIR.
8.) Activate amdshark.venv virtual environment from the Torch-MLIR directory.
8.) Activate shark.venv virtual environment from the Torch-MLIR directory.
8.) Run `export PYTHONPATH=`pwd`/build/tools/torch-mlir/python_packages/torch_mlir:`pwd`/examples` in the Torch-MLIR directory.
9.) Go to the AMDSHARK directory.
9.) Go to the SHARK directory.
```
Now the AMDSHARK will use your locally build Torch-MLIR repo.
Now the SHARK will use your locally build Torch-MLIR repo.
## Benchmarking Dispatches
@@ -263,10 +263,10 @@ pytest -k "MiniLM and torch and static and cuda" --benchmark_dispatches=All -s -
```
The given command will populate `<dispatch_benchmarks_dir>/<model_name>/` with an `ordered_dispatches.txt` that lists and orders the dispatches and their latencies, as well as folders for each dispatch that contain .mlir, .vmfb, and results of the benchmark for that dispatch.
if you want to instead incorporate this into a python script, you can pass the `dispatch_benchmarks` and `dispatch_benchmarks_dir` commands when initializing `AMDSharkInference`, and the benchmarks will be generated when compiled. E.G:
if you want to instead incorporate this into a python script, you can pass the `dispatch_benchmarks` and `dispatch_benchmarks_dir` commands when initializing `SharkInference`, and the benchmarks will be generated when compiled. E.G:
```
amdshark_module = AMDSharkInference(
shark_module = SharkInference(
mlir_model,
device=args.device,
mlir_dialect="tm_tensor",
@@ -285,34 +285,34 @@ Output will include:
- A .txt file containing benchmark output
See tank/README.md for further instructions on how to run model tests and benchmarks from the AMDSHARK tank.
See tank/README.md for further instructions on how to run model tests and benchmarks from the SHARK tank.
</details>
<details>
<summary>API Reference</summary>
### AMDShark Inference API
### Shark Inference API
```
from amdshark.amdshark_importer import AMDSharkImporter
from shark.shark_importer import SharkImporter
# AMDSharkImporter imports mlir file from the torch, tensorflow or tf-lite module.
# SharkImporter imports mlir file from the torch, tensorflow or tf-lite module.
mlir_importer = AMDSharkImporter(
mlir_importer = SharkImporter(
torch_module,
(input),
frontend="torch", #tf, #tf-lite
)
torch_mlir, func_name = mlir_importer.import_mlir(tracing_required=True)
# AMDSharkInference accepts mlir in linalg, mhlo, and tosa dialect.
# SharkInference accepts mlir in linalg, mhlo, and tosa dialect.
from amdshark.amdshark_inference import AMDSharkInference
amdshark_module = AMDSharkInference(torch_mlir, device="cpu", mlir_dialect="linalg")
amdshark_module.compile()
result = amdshark_module.forward((input))
from shark.shark_inference import SharkInference
shark_module = SharkInference(torch_mlir, device="cpu", mlir_dialect="linalg")
shark_module.compile()
result = shark_module.forward((input))
```
@@ -320,7 +320,7 @@ result = amdshark_module.forward((input))
### Example demonstrating running MHLO IR.
```
from amdshark.amdshark_inference import AMDSharkInference
from shark.shark_inference import SharkInference
import numpy as np
mhlo_ir = r"""builtin.module {
@@ -333,22 +333,22 @@ mhlo_ir = r"""builtin.module {
arg0 = np.ones((1, 4)).astype(np.float32)
arg1 = np.ones((4, 1)).astype(np.float32)
amdshark_module = AMDSharkInference(mhlo_ir, device="cpu", mlir_dialect="mhlo")
amdshark_module.compile()
result = amdshark_module.forward((arg0, arg1))
shark_module = SharkInference(mhlo_ir, device="cpu", mlir_dialect="mhlo")
shark_module.compile()
result = shark_module.forward((arg0, arg1))
```
</details>
## Examples Using the REST API
* [Setting up AMDSHARK for use with Blender](./docs/amdshark_sd_blender.md)
* [Setting up AMDSHARK for use with Koboldcpp](./docs/amdshark_sd_koboldcpp.md)
* [Setting up SHARK for use with Blender](./docs/shark_sd_blender.md)
* [Setting up SHARK for use with Koboldcpp](./docs/shark_sd_koboldcpp.md)
## Supported and Validated Models
AMDSHARK is maintained to support the latest innovations in ML Models:
SHARK is maintained to support the latest innovations in ML Models:
| TF HuggingFace Models | AMDSHARK-CPU | AMDSHARK-CUDA | AMDSHARK-METAL |
| TF HuggingFace Models | SHARK-CPU | SHARK-CUDA | SHARK-METAL |
|---------------------|----------|----------|-------------|
| BERT | :green_heart: | :green_heart: | :green_heart: |
| DistilBERT | :green_heart: | :green_heart: | :green_heart: |
@@ -358,12 +358,12 @@ AMDSHARK is maintained to support the latest innovations in ML Models:
| Vision Transformer | :green_heart: | :green_heart: | :green_heart: |
| ResNet50 | :green_heart: | :green_heart: | :green_heart: |
For a complete list of the models supported in AMDSHARK, please refer to [tank/README.md](https://github.com/nod-ai/AMD-SHARK-Studio/blob/main/tank/README.md).
For a complete list of the models supported in SHARK, please refer to [tank/README.md](https://github.com/nod-ai/SHARK/blob/main/tank/README.md).
## Communication Channels
* [AMDSHARK Discord server](https://discord.gg/RUqY2h2s9u): Real time discussions with the AMDSHARK team and other users
* [GitHub issues](https://github.com/nod-ai/AMD-SHARK-Studio/issues): Feature requests, bugs etc
* [SHARK Discord server](https://discord.gg/RUqY2h2s9u): Real time discussions with the SHARK team and other users
* [GitHub issues](https://github.com/nod-ai/SHARK/issues): Feature requests, bugs etc
## Related Projects
@@ -385,10 +385,10 @@ For a complete list of the models supported in AMDSHARK, please refer to [tank/R
* Torch-MLIR Github issues [here](https://github.com/llvm/torch-mlir/issues)
* [`torch-mlir` section](https://llvm.discourse.group/c/projects-that-want-to-become-official-llvm-projects/torch-mlir/41) of LLVM Discourse
* Weekly meetings on Mondays 9AM PST. See [here](https://discourse.llvm.org/t/community-meeting-developer-hour-refactoring-recurring-meetings/62575) for more information.
* [MLIR topic within LLVM Discourse](https://llvm.discourse.group/c/llvm-project/mlir/31) AMDSHARK and IREE is enabled by and heavily relies on [MLIR](https://mlir.llvm.org).
* [MLIR topic within LLVM Discourse](https://llvm.discourse.group/c/llvm-project/mlir/31) SHARK and IREE is enabled by and heavily relies on [MLIR](https://mlir.llvm.org).
</details>
## License
nod.ai AMDSHARK is licensed under the terms of the Apache 2.0 License with LLVM Exceptions.
nod.ai SHARK is licensed under the terms of the Apache 2.0 License with LLVM Exceptions.
See [LICENSE](LICENSE) for more information.

View File

@@ -1,28 +0,0 @@
import importlib
import logging
from torch._dynamo import register_backend
log = logging.getLogger(__name__)
@register_backend
def amdshark(model, inputs, *, options):
try:
from amdshark.dynamo_backend.utils import AMDSharkBackend
except ImportError:
log.exception(
"Unable to import AMDSHARK - High Performance Machine Learning Distribution"
"Please install the right version of AMDSHARK that matches the PyTorch version being used. "
"Refer to https://github.com/nod-ai/AMD-SHARK-Studio/ for details."
)
raise
return AMDSharkBackend(model, inputs, options)
def has_amdshark():
try:
importlib.import_module("amdshark")
return True
except ImportError:
return False

View File

@@ -1,501 +0,0 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from amdshark.amdshark_runner import AMDSharkRunner
from amdshark.iree_utils.compile_utils import (
export_iree_module_to_vmfb,
load_flatbuffer,
get_iree_runtime_config,
)
from amdshark.iree_utils.benchmark_utils import (
build_benchmark_args,
run_benchmark_module,
)
from amdshark.parser import amdshark_args
from datetime import datetime
import time
from typing import Optional
import csv
import os
TF_CPU_DEVICE = "/CPU:0"
TF_GPU_DEVICE = "/GPU:0"
def _bytes_to_mb_str(bytes_: Optional[int]) -> str:
return "" if bytes_ is None else f"{bytes_ / 1e6:.6f}"
class OnnxFusionOptions(object):
def __init__(self):
self.disable_gelu = False
self.disable_layer_norm = False
self.disable_attention = False
self.disable_skip_layer_norm = False
self.disable_embed_layer_norm = False
self.disable_bias_skip_layer_norm = False
self.disable_bias_gelu = False
self.enable_gelu_approximation = False
self.use_mask_index = False
self.no_attention_mask = False
def check_requirements(frontend):
import importlib
has_pkgs = False
if frontend == "torch":
tv_spec = importlib.util.find_spec("torchvision")
has_pkgs = tv_spec is not None
elif frontend in ["tensorflow", "tf"]:
keras_spec = importlib.util.find_spec("keras")
tf_spec = importlib.util.find_spec("tensorflow")
has_pkgs = keras_spec is not None and tf_spec is not None
return has_pkgs
class AMDSharkBenchmarkRunner(AMDSharkRunner):
# AMDSharkRunner derived class with Benchmarking capabilities.
def __init__(
self,
mlir_module: bytes,
device: str = "none",
mlir_dialect: str = "linalg",
extra_args: list = [],
):
self.device = amdshark_args.device if device == "none" else device
self.enable_tf32 = amdshark_args.enable_tf32
self.frontend_model = None
self.vmfb_file = None
self.mlir_dialect = mlir_dialect
self.extra_args = extra_args
self.import_args = {}
self.temp_file_to_unlink = None
if not os.path.isfile(mlir_module):
print(
"Warning: Initializing AMDSharkRunner with a mlir string/bytecode object will duplicate the model in RAM at compile time. To avoid this, initialize AMDSharkInference with a path to a MLIR module on your hard disk instead."
)
self.compile_str = True
else:
self.compile_str = False
AMDSharkRunner.__init__(
self,
mlir_module,
device,
self.mlir_dialect,
self.extra_args,
compile_vmfb=False,
)
self.vmfb_file = export_iree_module_to_vmfb(
mlir_module,
device,
".",
self.mlir_dialect,
extra_args=self.extra_args,
compile_str=self.compile_str,
)
params = load_flatbuffer(
self.vmfb_file,
device,
mmap=True,
)
self.iree_compilation_module = params["vmfb"]
self.iree_config = params["config"]
self.temp_file_to_unlink = params["temp_file_to_unlink"]
del params
def setup_cl(self, input_tensors):
self.benchmark_cl = build_benchmark_args(
self.vmfb_file,
self.device,
input_tensors,
mlir_dialect=self.mlir_dialect,
)
def benchmark_frontend(self, modelname):
if self.mlir_dialect in ["linalg", "torch"]:
return self.benchmark_torch(modelname)
elif self.mlir_dialect in ["mhlo", "tf"]:
return self.benchmark_tf(modelname)
def benchmark_torch(self, modelname, device="cpu"):
import torch
from tank.model_utils import get_torch_model
# TODO: Pass this as an arg. currently the best way is to setup with BENCHMARK=1 if we want to use torch+cuda, else use cpu.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
torch.set_default_device("cuda:0")
# if self.enable_tf32:
# torch.backends.cuda.matmul.allow_tf32 = True
else:
torch.set_default_dtype(torch.float32)
torch.set_default_device("cpu")
torch_device = torch.device("cuda:0" if device == "cuda" else "cpu")
HFmodel, input = get_torch_model(modelname, self.import_args)[:2]
frontend_model = HFmodel.model
frontend_model.to(torch_device)
if device == "cuda":
frontend_model.cuda()
input.to(torch.device("cuda:0"))
print(input)
else:
frontend_model.cpu()
input.cpu()
for i in range(amdshark_args.num_warmup_iterations):
frontend_model.forward(input)
if device == "cuda":
torch.cuda.reset_peak_memory_stats()
begin = time.time()
for i in range(amdshark_args.num_iterations):
out = frontend_model.forward(input)
end = time.time()
if device == "cuda":
stats = torch.cuda.memory_stats()
device_peak_b = stats["allocated_bytes.all.peak"]
frontend_model.to(torch.device("cpu"))
input.to(torch.device("cpu"))
torch.cuda.empty_cache()
else:
device_peak_b = None
print(
f"Torch benchmark:{amdshark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{amdshark_args.num_iterations}"
)
if device == "cuda":
# Set device to CPU so we don't run into segfaults exiting pytest subprocesses.
torch_device = torch.device("cpu")
return [
f"{amdshark_args.num_iterations/(end-begin)}",
f"{((end-begin)/amdshark_args.num_iterations)*1000}",
"", # host_peak_b (CPU usage) is not reported by PyTorch.
_bytes_to_mb_str(device_peak_b),
]
def benchmark_tf(self, modelname):
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import tensorflow as tf
visible_default = tf.config.list_physical_devices("GPU")
try:
tf.config.set_visible_devices([], "GPU")
visible_devices = tf.config.get_visible_devices()
for device in visible_devices:
assert device.device_type != "GPU"
except:
# Invalid device or cannot modify virtual devices once initialized.
pass
from tank.model_utils_tf import get_tf_model
# tf_device = TF_GPU_DEVICE if self.device == "cuda" else TF_CPU_DEVICE
tf_device = TF_CPU_DEVICE
with tf.device(tf_device):
(
model,
input,
) = get_tf_model(
modelname, self.import_args
)[:2]
frontend_model = model
for i in range(amdshark_args.num_warmup_iterations):
frontend_model.forward(*input)
if tf_device == TF_GPU_DEVICE:
tf.config.experimental.reset_memory_stats(tf_device)
begin = time.time()
for i in range(amdshark_args.num_iterations):
out = frontend_model.forward(*input)
end = time.time()
if tf_device == TF_GPU_DEVICE:
memory_info = tf.config.experimental.get_memory_info(tf_device)
device_peak_b = memory_info["peak"]
else:
# tf.config.experimental does not currently support measuring
# CPU memory usage.
device_peak_b = None
print(
f"TF benchmark:{amdshark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{amdshark_args.num_iterations}"
)
return [
f"{amdshark_args.num_iterations/(end-begin)}",
f"{((end-begin)/amdshark_args.num_iterations)*1000}",
"", # host_peak_b (CPU usage) is not reported by TensorFlow.
_bytes_to_mb_str(device_peak_b),
]
def benchmark_c(self):
iter_per_second, host_peak_b, device_peak_b = run_benchmark_module(
self.benchmark_cl
)
print(f"AMDShark-IREE-C benchmark:{iter_per_second} iter/second")
return [
f"{iter_per_second}",
f"{1000/iter_per_second}",
_bytes_to_mb_str(host_peak_b),
_bytes_to_mb_str(device_peak_b),
]
def benchmark_python(self, inputs):
input_list = [x for x in inputs]
for i in range(amdshark_args.num_warmup_iterations):
self.run("forward", input_list)
begin = time.time()
for i in range(amdshark_args.num_iterations):
out = self.run("forward", input_list)
end = time.time()
print(
f"AMDShark-IREE Python benchmark:{amdshark_args.num_iterations/(end-begin)} iter/second, Total Iterations:{amdshark_args.num_iterations}"
)
return [
f"{amdshark_args.num_iterations/(end-begin)}",
f"{((end-begin)/amdshark_args.num_iterations)*1000}",
]
def benchmark_onnx(self, modelname, inputs):
if self.device == "cuda":
print(
"Currently GPU benchmarking on ONNX is not supported in AMDSHARK."
)
return ["N/A", "N/A"]
else:
from onnxruntime.transformers.benchmark import run_onnxruntime
from onnxruntime.transformers.huggingface_models import MODELS
from onnxruntime.transformers.benchmark_helper import (
ConfigModifier,
Precision,
)
import psutil
if modelname == "microsoft/MiniLM-L12-H384-uncased":
modelname = "bert-base-uncased"
if modelname not in MODELS:
print(
f"{modelname} is currently not supported in ORT's HF. Check \
https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/huggingface_models.py \
for currently supported models. Exiting benchmark ONNX."
)
return ["N/A", "N/A"]
use_gpu = self.device == "cuda"
num_threads = psutil.cpu_count(logical=False)
batch_sizes = [1]
sequence_lengths = [128]
cache_dir = os.path.join(".", "cache_models")
onnx_dir = os.path.join(".", "onnx_models")
verbose = False
input_counts = [1]
optimize_onnx = True
validate_onnx = False
disable_ort_io_binding = False
use_raw_attention_mask = True
model_fusion_statistics = {}
overwrite = False
model_source = "pt" # Either "pt" or "tf"
provider = None
config_modifier = ConfigModifier(None)
onnx_args = OnnxFusionOptions()
result = run_onnxruntime(
use_gpu,
provider,
(modelname,),
None,
config_modifier,
Precision.FLOAT32,
num_threads,
batch_sizes,
sequence_lengths,
amdshark_args.num_iterations,
input_counts,
optimize_onnx,
validate_onnx,
cache_dir,
onnx_dir,
verbose,
overwrite,
disable_ort_io_binding,
use_raw_attention_mask,
model_fusion_statistics,
model_source,
onnx_args,
)
print(
f"ONNX ORT-benchmark:{result[0]['QPS']} iter/second, Total Iterations:{amdshark_args.num_iterations}"
)
return [
result[0]["QPS"],
result[0]["average_latency_ms"],
]
def get_metadata(self, modelname):
metadata_path = os.path.join(".", "tank", "model_metadata.csv")
with open(metadata_path, mode="r") as csvfile:
torch_reader = csv.reader(csvfile, delimiter=",")
fields = next(torch_reader)
for row in torch_reader:
torch_model_name = row[0]
if torch_model_name == modelname:
param_count = row[3]
model_tags = row[4]
model_notes = row[5]
return [param_count, model_tags, model_notes]
def compare_bench_results(self, baseline: str, result: str):
if baseline is not None:
# Takes a baseline and a result string and calculates a comparison, e.g. "1.04x baseline".
a = float(baseline)
b = float(result)
comparison = a / b
comp_str = f"{round(comparison, 2)}x baseline"
else:
comp_str = "N/A"
return comp_str
def benchmark_all_csv(
self,
inputs: tuple,
modelname,
dynamic,
device_str,
frontend,
import_args,
mode="native",
):
self.setup_cl(inputs)
self.import_args = import_args
self.mode = mode
field_names = [
"model",
"batch_size",
"engine",
"dialect",
"device",
"shape_type",
"data_type",
"iter/sec",
"ms/iter",
"vs. PyTorch/TF",
"iterations",
"param_count",
"tags",
"notes",
"datetime",
"host_memory_mb",
"device_memory_mb",
"measured_host_memory_mb",
"measured_device_memory_mb",
]
# "frontend" must be the first element.
if self.mode == "native":
engines = ["amdshark_python", "amdshark_iree_c"]
if self.mode == "baseline":
engines = ["frontend"]
if self.mode == "all":
engines = ["frontend", "amdshark_python", "amdshark_iree_c"]
if amdshark_args.onnx_bench == True:
engines.append("onnxruntime")
if not os.path.exists("bench_results.csv"):
with open("bench_results.csv", mode="w", newline="") as f:
writer = csv.writer(f)
writer.writerow(field_names)
with open("bench_results.csv", mode="a", newline="") as f:
writer = csv.DictWriter(f, fieldnames=field_names)
bench_info = {}
bench_info["model"] = modelname
bench_info["batch_size"] = str(import_args["batch_size"])
bench_info["dialect"] = self.mlir_dialect
bench_info["iterations"] = amdshark_args.num_iterations
if dynamic == True:
bench_info["shape_type"] = "dynamic"
else:
bench_info["shape_type"] = "static"
bench_info["device"] = device_str
if "fp16" in modelname:
bench_info["data_type"] = "float16"
else:
bench_info["data_type"] = inputs[0].dtype
for e in engines:
engine_result = {}
self.frontend_result = None
if e == "frontend":
engine_result["engine"] = frontend
if check_requirements(frontend):
(
engine_result["iter/sec"],
engine_result["ms/iter"],
engine_result["host_memory_mb"],
engine_result["device_memory_mb"],
) = self.benchmark_frontend(modelname)
self.frontend_result = engine_result["ms/iter"]
engine_result["vs. PyTorch/TF"] = "baseline"
(
engine_result["param_count"],
engine_result["tags"],
engine_result["notes"],
) = self.get_metadata(modelname)
else:
self.frontend_result = None
continue
elif e == "amdshark_python":
engine_result["engine"] = "amdshark_python"
(
engine_result["iter/sec"],
engine_result["ms/iter"],
) = self.benchmark_python(inputs)
engine_result[
"vs. PyTorch/TF"
] = self.compare_bench_results(
self.frontend_result, engine_result["ms/iter"]
)
elif e == "amdshark_iree_c":
engine_result["engine"] = "amdshark_iree_c"
(
engine_result["iter/sec"],
engine_result["ms/iter"],
engine_result["host_memory_mb"],
engine_result["device_memory_mb"],
) = self.benchmark_c()
engine_result[
"vs. PyTorch/TF"
] = self.compare_bench_results(
self.frontend_result, engine_result["ms/iter"]
)
elif e == "onnxruntime":
engine_result["engine"] = "onnxruntime"
(
engine_result["iter/sec"],
engine_result["ms/iter"],
) = self.benchmark_onnx(modelname, inputs)
engine_result["datetime"] = str(datetime.now())
writer.writerow(bench_info | engine_result)

View File

@@ -1,241 +0,0 @@
import os
import tempfile
from amdshark.amdshark_inference import AMDSharkInference
from amdshark.amdshark_importer import import_with_fx, save_mlir
import torch
import torch_mlir
from torch_mlir.compiler_utils import run_pipeline_with_repro_report
from typing import List, Tuple
from io import BytesIO
from brevitas_examples.common.generative.quantize import quantize_model
from brevitas_examples.llm.llm_quant.run_utils import get_model_impl
# fmt: off
def quantmatmul_rhs_group_quant〡shape(lhs: List[int], rhs: List[int], rhs_scale: List[int], rhs_zero_point: List[int], rhs_bit_width: int, rhs_group_size: int) -> List[int]:
if len(lhs) == 3 and len(rhs) == 2:
return [lhs[0], lhs[1], rhs[0]]
elif len(lhs) == 2 and len(rhs) == 2:
return [lhs[0], rhs[0]]
else:
raise ValueError("Input shapes not supported.")
def quantmatmul_rhs_group_quant〡dtype(lhs_rank_dtype: Tuple[int, int], rhs_rank_dtype: Tuple[int, int], rhs_scale_rank_dtype: Tuple[int, int], rhs_zero_point_rank_dtype: Tuple[int, int], rhs_bit_width: int, rhs_group_size: int) -> int:
# output dtype is the dtype of the lhs float input
lhs_rank, lhs_dtype = lhs_rank_dtype
return lhs_dtype
def quantmatmul_rhs_group_quant〡has_value_semantics(lhs, rhs, rhs_scale, rhs_zero_point, rhs_bit_width, rhs_group_size) -> None:
return
brevitas_matmul_rhs_group_quant_library = [
quantmatmul_rhs_group_quant〡shape,
quantmatmul_rhs_group_quant〡dtype,
quantmatmul_rhs_group_quant〡has_value_semantics]
# fmt: on
def load_vmfb(extended_model_name, device, mlir_dialect, extra_args=[]):
vmfb_path = os.path.join(os.getcwd(), extended_model_name + ".vmfb")
amdshark_module = None
if os.path.isfile(vmfb_path):
amdshark_module = AMDSharkInference(
None,
device=device,
mlir_dialect=mlir_dialect,
)
print(f"loading existing vmfb from: {vmfb_path}")
amdshark_module.load_module(vmfb_path, extra_args=extra_args)
return amdshark_module
def compile_module(
amdshark_module, extended_model_name, generate_vmfb, extra_args=[]
):
if generate_vmfb:
vmfb_path = os.path.join(os.getcwd(), extended_model_name + ".vmfb")
if os.path.isfile(vmfb_path):
print(f"loading existing vmfb from: {vmfb_path}")
amdshark_module.load_module(vmfb_path, extra_args=extra_args)
else:
print(
"No vmfb found. Compiling and saving to {}".format(vmfb_path)
)
path = amdshark_module.save_module(
os.getcwd(), extended_model_name, extra_args
)
amdshark_module.load_module(path, extra_args=extra_args)
else:
amdshark_module.compile(extra_args)
return amdshark_module
def compile_int_precision(
model, inputs, precision, device, generate_vmfb, extended_model_name
):
weight_bit_width = 4 if precision == "int4" else 8
weight_group_size = 128
quantize_model(
get_model_impl(model),
dtype=torch.float32,
weight_quant_type="asym",
weight_bit_width=weight_bit_width,
weight_param_method="stats",
weight_scale_precision="float_scale",
weight_quant_granularity="per_group",
weight_group_size=weight_group_size,
quantize_weight_zero_point=False,
input_bit_width=None,
input_scale_type="float",
input_param_method="stats",
input_quant_type="asym",
input_quant_granularity="per_tensor",
quantize_input_zero_point=False,
seqlen=2048,
)
print("Weight quantization applied.")
torchscript_module = import_with_fx(
model,
inputs,
precision=precision,
mlir_type="torchscript",
)
mlir_module = torch_mlir.compile(
torchscript_module,
inputs,
output_type="torch",
backend_legal_ops=["quant.matmul_rhs_group_quant"],
extra_library=brevitas_matmul_rhs_group_quant_library,
use_tracing=False,
verbose=False,
)
print(f"[DEBUG] converting torch to linalg")
run_pipeline_with_repro_report(
mlir_module,
"builtin.module(func.func(torch-unpack-quant-tensor),func.func(torch-convert-custom-quant-op),torch-backend-to-linalg-on-tensors-backend-pipeline)",
description="Lowering Torch Backend IR -> Linalg-on-Tensors Backend IR",
)
from contextlib import redirect_stdout
mlir_file_path = os.path.join(
os.getcwd(), f"{extended_model_name}_linalg.mlir"
)
with open(mlir_file_path, "w") as f:
with redirect_stdout(f):
print(mlir_module.operation.get_asm())
mlir_module = str(mlir_module)
mlir_module = mlir_module.encode("UTF-8")
mlir_module = BytesIO(mlir_module)
bytecode = mlir_module.read()
bytecode_path = os.path.join(
os.getcwd(), f"{extended_model_name}_linalg.mlirbc"
)
with open(bytecode_path, "wb") as f:
f.write(bytecode)
del bytecode
del mlir_module
print(f"Elided IR written for {extended_model_name}")
return bytecode_path
amdshark_module = AMDSharkInference(
mlir_module=bytecode_path, device=device, mlir_dialect="tm_tensor"
)
extra_args = [
"--iree-hal-dump-executable-sources-to=ies",
"--iree-vm-target-truncate-unsupported-floats",
"--iree-codegen-check-ir-before-llvm-conversion=false",
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
]
return (
compile_module(
amdshark_module,
extended_model_name=extended_model_name,
generate_vmfb=generate_vmfb,
extra_args=extra_args,
),
bytecode_path,
)
def amdshark_compile_through_fx(
model,
inputs,
extended_model_name,
precision,
f16_input_mask=None,
save_dir=tempfile.gettempdir(),
debug=False,
generate_or_load_vmfb=True,
extra_args=[],
device=None,
mlir_dialect="tm_tensor",
):
is_f16 = precision == "fp16"
if generate_or_load_vmfb:
amdshark_module = load_vmfb(
extended_model_name=extended_model_name,
device=device,
mlir_dialect=mlir_dialect,
extra_args=extra_args,
)
if amdshark_module:
return (
amdshark_module,
None,
)
from amdshark.parser import amdshark_args
if "cuda" in device:
amdshark_args.enable_tf32 = True
if precision in ["int4", "int8"]:
mlir_module = compile_int_precision(
model,
inputs,
precision,
device,
generate_or_load_vmfb,
extended_model_name,
)
extra_args = [
"--iree-hal-dump-executable-sources-to=ies",
"--iree-vm-target-truncate-unsupported-floats",
"--iree-codegen-check-ir-before-llvm-conversion=false",
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
]
else:
(
bytecode,
_,
) = import_with_fx(
model=model,
inputs=inputs,
is_f16=is_f16,
f16_input_mask=f16_input_mask,
debug=debug,
model_name=extended_model_name,
save_dir=save_dir,
)
mlir_module = save_mlir(
mlir_module=bytecode,
model_name=extended_model_name,
mlir_dialect=mlir_dialect,
)
amdshark_module = AMDSharkInference(
mlir_module,
device=device,
mlir_dialect=mlir_dialect,
)
return (
compile_module(
amdshark_module,
extended_model_name,
generate_vmfb=generate_or_load_vmfb,
extra_args=extra_args,
),
mlir_module,
)

View File

@@ -1,297 +0,0 @@
# Lint as: python3
"""AMDSHARK Downloader"""
# Requirements : Put amdshark_tank in AMDSHARK directory
# /AMDSHARK
# /gen_amdshark_tank
# /tflite
# /albert_lite_base
# /...model_name...
# /tf
# /pytorch
#
#
#
import numpy as np
import os
from tqdm.std import tqdm
import sys
from pathlib import Path
from amdshark.parser import amdshark_args
from google.cloud import storage
def download_public_file(
full_gs_url, destination_folder_name, single_file=False
):
"""Downloads a public blob from the bucket."""
# bucket_name = "gs://your-bucket-name/path/to/file"
# destination_file_name = "local/path/to/file"
storage_client = storage.Client.create_anonymous_client()
bucket_name = full_gs_url.split("/")[2]
source_blob_name = None
dest_filename = None
desired_file = None
if single_file:
desired_file = full_gs_url.split("/")[-1]
source_blob_name = "/".join(full_gs_url.split("/")[3:-1])
destination_folder_name, dest_filename = os.path.split(
destination_folder_name
)
else:
source_blob_name = "/".join(full_gs_url.split("/")[3:])
bucket = storage_client.bucket(bucket_name)
blobs = bucket.list_blobs(prefix=source_blob_name)
if not os.path.exists(destination_folder_name):
os.mkdir(destination_folder_name)
for blob in blobs:
blob_name = blob.name.split("/")[-1]
if single_file:
if blob_name == desired_file:
destination_filename = os.path.join(
destination_folder_name, dest_filename
)
with open(destination_filename, "wb") as f:
with tqdm.wrapattr(
f, "write", total=blob.size
) as file_obj:
storage_client.download_blob_to_file(blob, file_obj)
else:
continue
else:
destination_filename = os.path.join(
destination_folder_name, blob_name
)
if os.path.isdir(destination_filename):
continue
with open(destination_filename, "wb") as f:
with tqdm.wrapattr(f, "write", total=blob.size) as file_obj:
storage_client.download_blob_to_file(blob, file_obj)
input_type_to_np_dtype = {
"float32": np.float32,
"float64": np.float64,
"bool": np.bool_,
"int32": np.int32,
"int64": np.int64,
"uint8": np.uint8,
"int8": np.int8,
}
# Save the model in the home local so it needn't be fetched everytime in the CI.
home = str(Path.home())
alt_path = os.path.join(os.path.dirname(__file__), "../gen_amdshark_tank/")
custom_path = amdshark_args.local_tank_cache
if custom_path is not None:
if not os.path.exists(custom_path):
os.mkdir(custom_path)
WORKDIR = custom_path
print(f"Using {WORKDIR} as local amdshark_tank cache directory.")
elif os.path.exists(alt_path):
WORKDIR = alt_path
print(
f"Using {WORKDIR} as amdshark_tank directory. Delete this directory if you aren't working from locally generated amdshark_tank."
)
else:
WORKDIR = os.path.join(home, ".local/amdshark_tank/")
print(
f"amdshark_tank local cache is located at {WORKDIR} . You may change this by setting the --local_tank_cache= flag"
)
os.makedirs(WORKDIR, exist_ok=True)
# Checks whether the directory and files exists.
def check_dir_exists(model_name, frontend="torch", dynamic=""):
model_dir = os.path.join(WORKDIR, model_name)
# Remove the _tf keyword from end only for non-SD models.
if not any(model in model_name for model in ["clip", "unet", "vae"]):
if frontend in ["tf", "tensorflow"]:
model_name = model_name[:-3]
elif frontend in ["tflite"]:
model_name = model_name[:-7]
elif frontend in ["torch", "pytorch"]:
model_name = model_name[:-6]
model_mlir_file_name = f"{model_name}{dynamic}_{frontend}.mlir"
if os.path.isdir(model_dir):
if (
os.path.isfile(os.path.join(model_dir, model_mlir_file_name))
and os.path.isfile(os.path.join(model_dir, "function_name.npy"))
and os.path.isfile(os.path.join(model_dir, "inputs.npz"))
and os.path.isfile(os.path.join(model_dir, "golden_out.npz"))
and os.path.isfile(os.path.join(model_dir, "hash.npy"))
):
print(
f"""Model artifacts for {model_name} found at {WORKDIR}..."""
)
return True
return False
def _internet_connected():
import requests as req
try:
req.get("http://1.1.1.1")
return True
except:
return False
def get_git_revision_short_hash() -> str:
import subprocess
if amdshark_args.amdshark_prefix is not None:
prefix_kw = amdshark_args.amdshark_prefix
else:
import json
dir_path = os.path.dirname(os.path.realpath(__file__))
src = os.path.join(dir_path, "..", "tank_version.json")
with open(src, "r") as f:
data = json.loads(f.read())
prefix_kw = data["version"]
print(f"Checking for updates from gs://amdshark_tank/{prefix_kw}")
return prefix_kw
def get_amdsharktank_prefix():
tank_prefix = ""
if not _internet_connected():
print(
"No internet connection. Using the model already present in the tank."
)
tank_prefix = "none"
else:
desired_prefix = get_git_revision_short_hash()
storage_client_a = storage.Client.create_anonymous_client()
base_bucket_name = "amdshark_tank"
base_bucket = storage_client_a.bucket(base_bucket_name)
dir_blobs = base_bucket.list_blobs(prefix=f"{desired_prefix}")
for blob in dir_blobs:
dir_blob_name = blob.name.split("/")
if desired_prefix in dir_blob_name[0]:
tank_prefix = dir_blob_name[0]
break
else:
continue
if tank_prefix == "":
print(
f"amdshark_tank bucket not found matching ({desired_prefix}). Defaulting to nightly."
)
tank_prefix = "nightly"
return tank_prefix
# Downloads the torch model from gs://amdshark_tank dir.
def download_model(
model_name,
dynamic=False,
tank_url=None,
frontend=None,
tuned=None,
import_args={"batch_size": 1},
):
model_name = model_name.replace("/", "_")
dyn_str = "_dynamic" if dynamic else ""
os.makedirs(WORKDIR, exist_ok=True)
amdshark_args.amdshark_prefix = get_amdsharktank_prefix()
if import_args["batch_size"] and import_args["batch_size"] != 1:
model_dir_name = (
model_name
+ "_"
+ frontend
+ "_BS"
+ str(import_args["batch_size"])
)
elif any(model in model_name for model in ["clip", "unet", "vae"]):
# TODO(Ean Garvey): rework extended naming such that device is only included in model_name after .vmfb compilation.
model_dir_name = model_name
else:
model_dir_name = model_name + "_" + frontend
model_dir = os.path.join(WORKDIR, model_dir_name)
if not tank_url:
tank_url = "gs://amdshark_tank/" + amdshark_args.amdshark_prefix
full_gs_url = tank_url.rstrip("/") + "/" + model_dir_name
if not check_dir_exists(
model_dir_name, frontend=frontend, dynamic=dyn_str
):
print(
f"Downloading artifacts for model {model_name} from: {full_gs_url}"
)
download_public_file(full_gs_url, model_dir)
elif amdshark_args.force_update_tank == True:
print(
f"Force-updating artifacts for model {model_name} from: {full_gs_url}"
)
download_public_file(full_gs_url, model_dir)
else:
if not _internet_connected():
print(
"No internet connection. Using the model already present in the tank."
)
else:
local_hash = str(np.load(os.path.join(model_dir, "hash.npy")))
gs_hash_url = (
tank_url.rstrip("/") + "/" + model_dir_name + "/hash.npy"
)
download_public_file(
gs_hash_url,
os.path.join(model_dir, "upstream_hash.npy"),
single_file=True,
)
try:
upstream_hash = str(
np.load(os.path.join(model_dir, "upstream_hash.npy"))
)
except FileNotFoundError:
print(f"Model artifact hash not found at {model_dir}.")
upstream_hash = None
if local_hash != upstream_hash and amdshark_args.update_tank == True:
print(f"Updating artifacts for model {model_name}...")
download_public_file(full_gs_url, model_dir)
elif local_hash != upstream_hash:
print(
"Hash does not match upstream in gs://amdshark_tank/. If you want to use locally generated artifacts, this is working as intended. Otherwise, run with --update_tank."
)
else:
print(
"Local and upstream hashes match. Using cached model artifacts."
)
model_dir = os.path.join(WORKDIR, model_dir_name)
tuned_str = "" if tuned is None else "_" + tuned
suffix = f"{dyn_str}_{frontend}{tuned_str}.mlir"
mlir_filename = os.path.join(model_dir, model_name + suffix)
print(
f"Verifying that model artifacts were downloaded successfully to {mlir_filename}..."
)
if not os.path.exists(mlir_filename):
from tank.generate_amdsharktank import gen_amdshark_files
print(
"The model data was not found. Trying to generate artifacts locally."
)
gen_amdshark_files(model_name, frontend, WORKDIR, import_args)
assert os.path.exists(mlir_filename), f"MLIR not found at {mlir_filename}"
function_name = str(np.load(os.path.join(model_dir, "function_name.npy")))
inputs = np.load(os.path.join(model_dir, "inputs.npz"))
golden_out = np.load(os.path.join(model_dir, "golden_out.npz"))
inputs_tuple = tuple([inputs[key] for key in inputs])
golden_out_tuple = tuple([golden_out[key] for key in golden_out])
return mlir_filename, function_name, inputs_tuple, golden_out_tuple

View File

@@ -1,212 +0,0 @@
from typing import Any, Dict, List, Tuple
from collections import defaultdict
from amdshark.amdshark_importer import import_with_fx, save_mlir
import torchvision.models as models
import copy
import io
import numpy as np
import sys
import torch
import torch.fx
from torch.fx.node import Node
from typing import Dict
import torch_mlir
def amdshark_backend(fx_g: torch.fx.GraphModule, inputs, device: str = "cpu"):
mlir_module = torch_mlir.compile(
fx_g, inputs, output_type="linalg-on-tensors"
)
bytecode_stream = io.BytesIO()
mlir_module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
bytecode_path = save_mlir(
bytecode,
model_name="amdshark_eager_module",
frontend="torch",
mlir_dialect="tm_tensor",
)
from amdshark.amdshark_inference import AMDSharkInference
amdshark_module = AMDSharkInference(
mlir_module=bytecode_path,
device=device,
mlir_dialect="tm_tensor",
)
amdshark_module.compile(extra_args=[])
return amdshark_module
def _make_single_op_gm(node, captured_val, compiled_graph):
"""Make a GraphModule that just executes the given node."""
g = torch.fx.Graph()
env = {}
inputs = []
for arg in node.args:
if arg and hasattr(arg, "name"):
env[arg.name] = g.placeholder(arg.name)
if isinstance(captured_val[arg.name], (list, tuple)):
for val in captured_val[arg.name]:
inputs.append(val)
else:
inputs.append(captured_val[arg.name])
call = g.node_copy(node, lambda n: env[n.name])
g.output(call)
g.lint()
single_node = torch.fx.GraphModule(torch.nn.Module(), g)
compiled_module = amdshark_backend(single_node, inputs)
compiled_graph[node.name] = {
"module": compiled_module,
"inputs": [i for i in env],
"result": None,
}
return
def compiled_graph(gm: torch.fx.GraphModule, attr_info):
compiled_graph = {}
g = gm.graph
for node in g.nodes:
if node.op == "call_function":
if not (
node.target in [torch.ops.aten.empty]
or node.name.startswith("getitem")
):
_make_single_op_gm(node, attr_info, compiled_graph)
# Currently torch.aten.empty has an compilation issue, so running natively.
elif node.target in [torch.ops.aten.empty]:
compiled_graph[node.name] = {
"target": node.target,
"args": node.args,
"kwargs": node.kwargs,
"result": None,
}
# Get item is a simple case takes a tuple and return the tensor at a particular index.
elif node.name.startswith("getitem"):
compiled_graph[node.name] = {
"input": node.args[0].name,
"pos": node.args[1],
"result": None,
}
return compiled_graph
class ShapeProp:
"""
Shape propagation. This class takes a `GraphModule`.
Then, its `propagate` method executes the `GraphModule`
node-by-node with the given arguments. As each operation
executes, the ShapeProp class stores away the shape and
element type for the output values of each operation on
the `shape` and `dtype` attributes of the operation's
`Node`.
"""
def __init__(self, mod):
self.mod = mod
self.graph = mod.graph
self.modules = dict(self.mod.named_modules())
def propagate(self, *args):
args_iter = iter(args)
env: Dict[str, Node] = {}
def load_arg(a):
return torch.fx.graph.map_arg(a, lambda n: env[n.name])
def fetch_attr(target: str):
target_atoms = target.split(".")
attr_itr = self.mod
for i, atom in enumerate(target_atoms):
if not hasattr(attr_itr, atom):
raise RuntimeError(
f"Node referenced nonexistant target {'.'.join(target_atoms[:i])}"
)
attr_itr = getattr(attr_itr, atom)
return attr_itr
for node in self.graph.nodes:
if node.op == "placeholder":
result = next(args_iter)
elif node.op == "get_attr":
result = fetch_attr(node.target)
elif node.op == "call_function":
result = node.target(
*load_arg(node.args), **load_arg(node.kwargs)
)
elif node.op == "call_method":
self_obj, *args = load_arg(node.args)
kwargs = load_arg(node.kwargs)
result = getattr(self_obj, node.target)(*args, **kwargs)
elif node.op == "call_module":
result = self.modules[node.target](
*load_arg(node.args), **load_arg(node.kwargs)
)
# This is the only code specific to shape propagation.
# you can delete this `if` branch and this becomes
# a generic GraphModule interpreter.
if isinstance(result, torch.Tensor):
node.shape = result.shape
node.dtype = result.dtype
env[node.name] = result
return env
# return load_arg(self.graph.result)
resnet18 = models.resnet18(pretrained=True)
resnet18.train(False)
input = (torch.randn(1, 3, 224, 224),)
print(resnet18(input[0]))
fx_graph = import_with_fx(resnet18, input, mlir_type="fx")
shape_prop = ShapeProp(fx_graph)
x = shape_prop.propagate(input[0])
amdshark_graph = compiled_graph(fx_graph, x)
for key in amdshark_graph:
if key.startswith("getitem"):
input_val = amdshark_graph[key]["input"]
pos = amdshark_graph[key]["pos"]
if input_val not in amdshark_graph:
amdshark_graph[key]["result"] = x[input_val][pos].detach()
else:
amdshark_graph[key]["result"] = amdshark_graph[input_val]["result"][
pos
].detach()
elif key.startswith("empty"):
operator = amdshark_graph[key]["target"]
args = amdshark_graph[key]["args"]
kwargs = amdshark_graph[key]["kwargs"]
amdshark_graph[key]["result"] = operator(*args, **kwargs).detach()
else:
input_val = amdshark_graph[key]["inputs"]
input_tensors = []
for input in input_val:
if input not in amdshark_graph:
input_tensors.append(x[input].detach())
else:
input_tensors.append(amdshark_graph[input]["result"])
val = amdshark_graph[key]["module"]("forward", input_tensors)
if isinstance(val, (tuple, list)):
list_val = []
for v in val:
list_val.append(torch.from_numpy(v))
amdshark_graph[key]["result"] = list_val
else:
amdshark_graph[key]["result"] = torch.from_numpy(val)
print(amdshark_graph)

View File

@@ -1,153 +0,0 @@
import re
import json
import numpy as np
import torch_mlir
from iree.compiler import compile_file
from amdshark.amdshark_importer import import_with_fx, get_f16_inputs, save_mlir
class GenerateConfigFile:
def __init__(
self,
model,
num_sharding_stages: int,
sharding_stages_id: list[str],
units_in_each_stage: list[int],
model_input=None,
config_file_path="model_config.json",
):
self.model = model
self.num_sharding_stages = num_sharding_stages
self.sharding_stages_id = sharding_stages_id
assert self.num_sharding_stages == len(
self.sharding_stages_id
), "Number of sharding stages should be equal to the list of their ID"
self.model_input = model_input
self.config_file_path = config_file_path
# (Nithin) this is a quick fix - revisit and rewrite
self.units_in_each_stage = np.array(units_in_each_stage)
self.track_loop = np.zeros(len(self.sharding_stages_id)).astype(int)
def split_into_dispatches(
self,
backend,
fx_tracing_required=False,
f16_model=False,
torch_mlir_tracing=True,
):
graph_for_compilation = self.model
if fx_tracing_required:
graph_for_compilation = import_with_fx(
self.model,
self.model_input,
is_f16=f16_model,
f16_input_mask=[False, False],
mlir_type="torchscript",
)
module = torch_mlir.compile(
graph_for_compilation,
(self.model_input),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=torch_mlir_tracing,
verbose=False,
)
module = module.operation.get_asm(large_elements_limit=4)
module_file = save_mlir(
module,
model_name="module_pre_split",
frontend="torch",
mlir_dialect="linalg",
)
compiled_module_str = str(
compile_file(
module_file,
target_backends=[backend],
extra_args=[
"--compile-to=flow",
"--mlir-elide-elementsattrs-if-larger=4",
],
)
)
substring_start_idx = [
m.start()
for m in re.finditer("flow.dispatch @", compiled_module_str)
]
dispatch_list = dict()
# dispatch_no is the 'i'th index of a dispatch out of n total dispatches of a model
# dispatch_id is the unique id of a dispatch, multiple instances of the same dispatch
# can occur in a model
for dispatch_no, substring_idx in enumerate(substring_start_idx):
dispatch_idx = (
compiled_module_str[substring_idx:]
.split(":")[0]
.split("@")[-1]
)
key = "dispatch_no_" + str(dispatch_no)
dispatch_list[key] = {n: "None" for n in self.sharding_stages_id}
dispatch_list[key]["dispatch_id"] = dispatch_idx
self.generate_json(dispatch_list)
def split_into_layers(self):
model_dictionary = dict()
for name, m in self.model.named_modules():
if name == "":
continue
# Remove non-leaf nodes from the config as they aren't an operation
substring_before_final_period = name.split(".")[:-1]
substring_before_final_period = ".".join(
substring_before_final_period
)
if substring_before_final_period in model_dictionary:
del model_dictionary[substring_before_final_period]
# layer_dict = {n: "None" for n in self.sharding_stages_id}
# By default embed increasing device id's for each layer
increasing_wraparound_idx_list = (
self.track_loop % self.units_in_each_stage
)
layer_dict = {
n: int(increasing_wraparound_idx_list[idx][0][0])
for idx, n in enumerate(self.sharding_stages_id)
}
self.track_loop += 1
model_dictionary[name] = layer_dict
self.generate_json(model_dictionary)
def generate_json(self, artifacts):
with open(self.config_file_path, "w") as outfile:
json.dump(artifacts, outfile)
if __name__ == "__main__":
import torch
from transformers import AutoTokenizer
hf_model_path = "TheBloke/vicuna-7B-1.1-HF"
tokenizer = AutoTokenizer.from_pretrained(hf_model_path, use_fast=False)
compilation_prompt = "".join(["0" for _ in range(17)])
compilation_input_ids = tokenizer(
compilation_prompt,
return_tensors="pt",
).input_ids
compilation_input_ids = torch.tensor(compilation_input_ids).reshape(
[1, 19]
)
firstVicunaCompileInput = (compilation_input_ids,)
from apps.language_models.src.model_wrappers.vicuna_model import (
FirstVicuna,
SecondVicuna7B,
CombinedModel,
)
model = CombinedModel()
c = GenerateConfigFile(model, 1, ["gpu_id"], firstVicunaCompileInput)
c.split_into_layers()

View File

@@ -1,819 +0,0 @@
# Lint as: python3
"""AMDSHARK Importer"""
import sys
import tempfile
import os
import hashlib
from apps.amdshark_studio.modules.shared_cmd_opts import cmd_opts
def create_hash(file_name):
with open(file_name, "rb") as f:
file_hash = hashlib.blake2b(digest_size=64)
while chunk := f.read(2**10):
file_hash.update(chunk)
return file_hash.hexdigest()
# List of the supported frontends.
supported_frontends = {
"tensorflow",
"tf",
"pytorch",
"torch",
"tf-lite",
"tflite",
}
class AMDSharkImporter:
"""
AMDSharkImporter converts frontend modules into a
mlir_module. The supported frameworks are tensorflow,
pytorch, and tf-lite.
...
Attributes
----------
module :
torch, tensorflow or tf-lite module.
inputs :
inputs to the module, may be required for the shape
information.
frontend: str
frontend to which the module belongs.
raw_model_file: str
temp tflite model path
Methods
-------
import_mlir(is_dynamic, tracing_required, func_name):
is_dynamic: input shapes to be totally dynamic (pytorch specific).
tracing_required: whether tracing is required (pytorch specific.
func_name: The function to be traced out or imported to mlir.
import_debug(is_dynamic, tracing_required, func_name):
returns the converted (mlir_module,func_name) with inputs and golden
outputs.
The inputs and outputs are converted into np array.
"""
def __init__(
self,
module,
inputs: tuple = (),
frontend: str = "torch",
raw_model_file: str = "",
return_str: bool = False,
):
self.module = module
self.inputs = None if len(inputs) == 0 else inputs
self.frontend = frontend
if not self.frontend in supported_frontends:
print(
f"The frontend is not in the supported_frontends: {supported_frontends}"
)
sys.exit(1)
self.raw_model_file = raw_model_file
self.return_str = return_str
# NOTE: The default function for torch is "forward" and tf-lite is "main".
def _torch_mlir(self, is_dynamic, tracing_required, mlir_type):
from amdshark.torch_mlir_utils import get_torch_mlir_module
return get_torch_mlir_module(
self.module,
self.inputs,
is_dynamic,
tracing_required,
self.return_str,
mlir_type,
)
def _tf_mlir(self, func_name, save_dir="."):
from iree.compiler import tf as tfc
return tfc.compile_module(
self.module,
exported_names=[func_name],
import_only=True,
output_file=save_dir,
)
def _tflite_mlir(self, func_name, save_dir="."):
from iree.compiler import tflite as tflitec
self.mlir_model = tflitec.compile_file(
self.raw_model_file, # in tflite, it is a path to .tflite file, not a tflite interpreter
input_type="tosa",
import_only=True,
output_file=save_dir,
)
return self.mlir_model
# Adds the conversion of the frontend with the private function.
def import_mlir(
self,
is_dynamic=False,
tracing_required=False,
func_name="forward",
save_dir=cmd_opts.tmp_dir, #"./amdshark_tmp/",
mlir_type="linalg",
):
if self.frontend in ["torch", "pytorch"]:
if self.inputs == None:
print(
"Please pass in the inputs, the inputs are required to determine the shape of the mlir_module"
)
sys.exit(1)
return (
self._torch_mlir(is_dynamic, tracing_required, mlir_type),
func_name,
)
if self.frontend in ["tf", "tensorflow"]:
return self._tf_mlir(func_name, save_dir), func_name
if self.frontend in ["tflite", "tf-lite"]:
func_name = "main"
return self._tflite_mlir(func_name, save_dir), func_name
# Converts the frontend specific tensors into np array.
def convert_to_numpy(self, array_tuple: tuple):
if self.frontend in ["torch", "pytorch"]:
return [x.detach().cpu().numpy() for x in array_tuple]
if self.frontend in ["tf", "tensorflow"]:
return [x.numpy() for x in array_tuple]
# Saves `function_name.npy`, `inputs.npz`, `golden_out.npz` and `model_name.mlir` in the directory `dir`.
def save_data(
self,
dir,
model_name,
mlir_data,
func_name,
inputs,
outputs,
mlir_type="linalg",
):
import numpy as np
inputs_name = "inputs.npz"
outputs_name = "golden_out.npz"
func_file_name = "function_name"
model_name_mlir = (
model_name + "_" + self.frontend + "_" + mlir_type + ".mlir"
)
print(f"saving {model_name_mlir} to {dir}")
try:
inputs = [x.cpu().detach() for x in inputs]
except AttributeError:
try:
inputs = [x.numpy() for x in inputs]
except AttributeError:
inputs = [x for x in inputs]
np.savez(os.path.join(dir, inputs_name), *inputs)
np.savez(os.path.join(dir, outputs_name), *outputs)
np.save(os.path.join(dir, func_file_name), np.array(func_name))
if self.frontend == "torch":
with open(os.path.join(dir, model_name_mlir), "wb") as mlir_file:
mlir_file.write(mlir_data)
hash_gen_attempts = 2
for i in range(hash_gen_attempts):
try:
mlir_hash = create_hash(os.path.join(dir, model_name_mlir))
except FileNotFoundError as err:
if i < hash_gen_attempts:
continue
else:
raise err
np.save(os.path.join(dir, "hash"), np.array(mlir_hash))
return
def import_debug(
self,
is_dynamic=False,
tracing_required=False,
func_name="forward",
dir=tempfile.gettempdir(),
model_name="model",
golden_values=None,
mlir_type="linalg",
):
if self.inputs == None:
print(
f"There is no input provided: {self.inputs}, please provide inputs or simply run import_mlir."
)
sys.exit(1)
model_name_mlir = (
model_name + "_" + self.frontend + "_" + mlir_type + ".mlir"
)
artifact_path = os.path.join(dir, model_name_mlir)
imported_mlir = self.import_mlir(
is_dynamic,
tracing_required,
func_name,
save_dir=artifact_path,
mlir_type=mlir_type,
)
# TODO: Make sure that any generic function name is accepted. Currently takes in the default function names.
# TODO: Check for multiple outputs.
if self.frontend in ["torch", "pytorch"]:
import torch
golden_out = None
if golden_values is not None:
golden_out = golden_values
else:
golden_out = self.module(*self.inputs)
if torch.is_tensor(golden_out):
golden_out = tuple(
golden_out.detach().cpu().numpy(),
)
else:
golden_out = self.convert_to_numpy(golden_out)
# Save the artifacts in the directory dir.
self.save_data(
dir,
model_name,
imported_mlir[0],
imported_mlir[1],
self.inputs,
golden_out,
mlir_type,
)
return (
imported_mlir,
self.convert_to_numpy(self.inputs),
golden_out,
)
if self.frontend in ["tf", "tensorflow"]:
import tensorflow as tf
golden_out = self.module.forward(*self.inputs)
if tf.is_tensor(golden_out):
golden_out = tuple(
golden_out.numpy(),
)
elif golden_out is tuple:
golden_out = self.convert_to_numpy(golden_out)
elif hasattr(golden_out, "logits"):
# from transformers import TFSequenceClassifierOutput
golden_out = golden_out.logits
else:
golden_out = golden_out.last_hidden_state
# Save the artifacts in the directory dir.
self.save_data(
dir,
model_name,
imported_mlir[0],
imported_mlir[1],
self.inputs,
golden_out,
)
return (
imported_mlir,
self.convert_to_numpy(self.inputs),
golden_out,
)
if self.frontend in ["tflite", "tf-lite"]:
# TODO(Chi): Validate it for tflite models.
golden_out = self.module.invoke_tflite(self.inputs)
self.save_data(
dir,
model_name,
imported_mlir[0],
imported_mlir[1],
self.inputs,
golden_out,
)
return (
imported_mlir,
self.inputs,
golden_out,
)
def get_f16_inputs(inputs, is_f16, f16_input_mask):
if is_f16 == False:
return inputs
if f16_input_mask == None:
return tuple([x.half() for x in inputs])
f16_masked_inputs = []
for i in range(len(inputs)):
if f16_input_mask[i]:
f16_masked_inputs.append(inputs[i].half())
else:
f16_masked_inputs.append(inputs[i])
return tuple(f16_masked_inputs)
# Upcasts the block/list of ops.
def add_upcast(fx_g):
import torch
for node in fx_g.graph.nodes:
if node.target in [torch.ops.aten.mul]:
# This is a very strict check.
if hasattr(node.args[1], "target"):
if (
node.args[1].target in [torch.ops.aten.rsqrt]
and node.args[1].args[0].target in [torch.ops.aten.add]
and node.args[1].args[0].args[0].target
in [torch.ops.aten.mean]
and node.args[1].args[0].args[0].args[0].target
in [torch.ops.aten.pow]
):
print("found an upcasting block let's upcast it.")
pow_node = node.args[1].args[0].args[0].args[0]
mul_node = node
with fx_g.graph.inserting_before(pow_node):
lhs = pow_node.args[0]
upcast_lhs = fx_g.graph.call_function(
torch.ops.aten._to_copy,
args=(lhs,),
kwargs={"dtype": torch.float32},
)
pow_node.args = (upcast_lhs, pow_node.args[1])
with fx_g.graph.inserting_before(mul_node):
new_node = fx_g.graph.call_function(
torch.ops.aten._to_copy,
args=(mul_node,),
kwargs={"dtype": torch.float16},
)
mul_node.append(new_node)
mul_node.replace_all_uses_with(new_node)
new_node.args = (mul_node,)
new_node.kwargs = {"dtype": torch.float16}
fx_g.graph.lint()
def transform_fx(fx_g, quantized=False):
import torch
kwargs_dict = {
"dtype": torch.float16,
"device": torch.device(type="cpu"),
"pin_memory": False,
}
kwargs_dict1 = {
"dtype": torch.float16,
}
for node in fx_g.graph.nodes:
if node.op == "call_function":
# aten.empty should be filled with zeros.
if node.target in [torch.ops.aten.empty]:
with fx_g.graph.inserting_after(node):
new_node = fx_g.graph.call_function(
torch.ops.aten.zero_,
args=(node,),
)
node.append(new_node)
node.replace_all_uses_with(new_node)
new_node.args = (node,)
if quantized:
continue
if node.target in [
torch.ops.aten.arange,
torch.ops.aten.empty,
torch.ops.aten.zeros,
torch.ops.aten.zeros_like,
]:
if node.kwargs.get("dtype") == torch.float32:
node.kwargs = kwargs_dict
# Vicuna
if node.target in [
torch.ops.aten._to_copy,
]:
if node.kwargs.get("dtype") == torch.float32:
node.kwargs = kwargs_dict1
if node.target in [
torch.ops.aten.masked_fill,
]:
if node.args[2] > torch.finfo(torch.half).max:
max_val = torch.finfo(torch.half).max
node.args = (node.args[0], node.args[1], max_val)
elif node.args[2] < torch.finfo(torch.half).min:
min_val = torch.finfo(torch.half).min
node.args = (node.args[0], node.args[1], min_val)
if node.target in [
torch.ops.aten.full,
]:
if node.args[1] > torch.finfo(torch.half).max:
max_val = torch.finfo(torch.half).max
node.args = (node.args[0], max_val)
node.kwargs = kwargs_dict
elif node.args[1] < torch.finfo(torch.half).min:
min_val = torch.finfo(torch.half).min
node.args = (node.args[0], min_val)
node.kwargs = kwargs_dict
# Inputs and outputs of aten.var.mean should be upcasted to fp32.
if node.target in [torch.ops.aten.var_mean]:
with fx_g.graph.inserting_before(node):
new_node = fx_g.graph.call_function(
torch.ops.prims.convert_element_type,
args=(node.args[0], torch.float32),
kwargs={},
)
node.args = (new_node, node.args[1])
if node.name.startswith("getitem"):
with fx_g.graph.inserting_before(node):
if node.args[0].target in [torch.ops.aten.var_mean]:
new_node = fx_g.graph.call_function(
torch.ops.aten._to_copy,
args=(node,),
kwargs={"dtype": torch.float16},
)
node.append(new_node)
node.replace_all_uses_with(new_node)
new_node.args = (node,)
new_node.kwargs = {"dtype": torch.float16}
# Required for cuda debugging.
# for node in fx_g.graph.nodes:
# if node.op == "call_function":
# if node.kwargs.get("device") == torch.device(type="cpu"):
# new_kwargs = node.kwargs.copy()
# new_kwargs["device"] = torch.device(type="cuda")
# node.kwargs = new_kwargs
fx_g.graph.lint()
def gptq_transforms(fx_g):
import torch
for node in fx_g.graph.nodes:
if node.op == "call_function":
if node.target in [
torch.ops.aten.arange,
torch.ops.aten.empty,
torch.ops.aten.ones,
torch.ops.aten._to_copy,
]:
if node.kwargs.get("device") == torch.device(device="cuda:0"):
updated_kwargs = node.kwargs.copy()
updated_kwargs["device"] = torch.device(device="cpu")
node.kwargs = updated_kwargs
if node.target in [
torch.ops.aten._to_copy,
]:
if node.kwargs.get("dtype") == torch.bfloat16:
updated_kwargs = node.kwargs.copy()
updated_kwargs["dtype"] = torch.float16
node.kwargs = updated_kwargs
# Inputs of aten.native_layer_norm should be upcasted to fp32.
if node.target in [torch.ops.aten.native_layer_norm]:
with fx_g.graph.inserting_before(node):
new_node_arg0 = fx_g.graph.call_function(
torch.ops.prims.convert_element_type,
args=(node.args[0], torch.float32),
kwargs={},
)
node.args = (
new_node_arg0,
node.args[1],
node.args[2],
node.args[3],
node.args[4],
)
# Inputs of aten.mm should be upcasted to fp32.
if node.target in [torch.ops.aten.mm]:
with fx_g.graph.inserting_before(node):
new_node_arg0 = fx_g.graph.call_function(
torch.ops.prims.convert_element_type,
args=(node.args[0], torch.float32),
kwargs={},
)
new_node_arg1 = fx_g.graph.call_function(
torch.ops.prims.convert_element_type,
args=(node.args[1], torch.float32),
kwargs={},
)
node.args = (new_node_arg0, new_node_arg1)
# Outputs of aten.mm should be downcasted to fp16.
if type(node.args[0]) == torch.fx.node.Node and node.args[
0
].target in [torch.ops.aten.mm]:
with fx_g.graph.inserting_before(node):
tmp = node.args[0]
new_node = fx_g.graph.call_function(
torch.ops.aten._to_copy,
args=(node.args[0],),
kwargs={"dtype": torch.float16},
)
node.args[0].append(new_node)
node.args[0].replace_all_uses_with(new_node)
new_node.args = (tmp,)
new_node.kwargs = {"dtype": torch.float16}
# Inputs of aten._softmax should be upcasted to fp32.
if node.target in [torch.ops.aten._softmax]:
with fx_g.graph.inserting_before(node):
new_node_arg0 = fx_g.graph.call_function(
torch.ops.prims.convert_element_type,
args=(node.args[0], torch.float32),
kwargs={},
)
node.args = (new_node_arg0, node.args[1], node.args[2])
# Outputs of aten._softmax should be downcasted to fp16.
if (
type(node.args[0]) == torch.fx.node.Node
and node.args[0].target in [torch.ops.aten._softmax]
and node.target in [torch.ops.aten.expand]
):
with fx_g.graph.inserting_before(node):
tmp = node.args[0]
new_node = fx_g.graph.call_function(
torch.ops.aten._to_copy,
args=(node.args[0],),
kwargs={"dtype": torch.float16},
)
node.args[0].append(new_node)
node.args[0].replace_all_uses_with(new_node)
new_node.args = (tmp,)
new_node.kwargs = {"dtype": torch.float16}
fx_g.graph.lint()
# Doesn't replace the None type.
def change_fx_graph_return_to_tuple(fx_g):
for node in fx_g.graph.nodes:
if node.op == "output":
# output nodes always have one argument
node_arg = node.args[0]
out_nodes = []
if isinstance(node_arg, list):
# Don't return NoneType elements.
for out_node in node_arg:
if not isinstance(out_node, type(None)):
out_nodes.append(out_node)
# If there is a single tensor/element to be returned don't
# a tuple for it.
if len(out_nodes) == 1:
node.args = out_nodes
else:
node.args = (tuple(out_nodes),)
fx_g.graph.lint()
fx_g.recompile()
return fx_g
def flatten_training_input(inputs):
flattened_input = []
for i in inputs:
if isinstance(i, dict):
for value in i.values():
flattened_input.append(value.detach())
elif isinstance(i, tuple):
for value in i:
flattened_input.append(value)
else:
flattened_input.append(i)
return tuple(flattened_input)
# TODO: Remove is_f16 and fix all calls with using precision instead
# Applies fx conversion to the model and imports the mlir.
def import_with_fx(
model,
inputs,
is_f16=False,
f16_input_mask=None,
debug=False,
training=False,
return_str=False,
save_dir=tempfile.gettempdir(),
model_name="model",
mlir_type="linalg",
is_dynamic=False,
tracing_required=False,
precision="fp32",
is_gptq=False,
):
import torch
from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
from typing import List
golden_values = None
if debug:
try:
golden_values = model(*inputs)
except:
golden_values = None
def _remove_nones(fx_g: torch.fx.GraphModule) -> List[int]:
removed_indexes = []
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, (list, tuple)):
node_arg = list(node_arg)
node_args_len = len(node_arg)
for i in range(node_args_len):
curr_index = node_args_len - (i + 1)
if node_arg[curr_index] is None:
removed_indexes.append(curr_index)
node_arg.pop(curr_index)
node.args = (tuple(node_arg),)
break
if len(removed_indexes) > 0:
fx_g.graph.lint()
fx_g.graph.eliminate_dead_code()
fx_g.recompile()
removed_indexes.sort()
return removed_indexes
def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule) -> bool:
"""
Replace tuple with tuple element in functions that return one-element tuples.
Returns true if an unwrapping took place, and false otherwise.
"""
unwrapped_tuple = False
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple):
if len(node_arg) == 1:
node.args = (node_arg[0],)
unwrapped_tuple = True
break
if unwrapped_tuple:
fx_g.graph.lint()
fx_g.recompile()
return unwrapped_tuple
# TODO: Control the decompositions.
decomps_list = [
torch.ops.aten.embedding_dense_backward,
torch.ops.aten.native_layer_norm_backward,
torch.ops.aten.slice_backward,
torch.ops.aten.select_backward,
torch.ops.aten.norm.ScalarOpt_dim,
torch.ops.aten.native_group_norm,
torch.ops.aten.upsample_bilinear2d.vec,
torch.ops.aten.split.Tensor,
torch.ops.aten.split_with_sizes,
torch.ops.aten.native_layer_norm,
torch.ops.aten.masked_fill.Tensor,
torch.ops.aten.masked_fill.Scalar,
torch.ops.aten._scaled_dot_product_flash_attention.default,
torch.ops.aten.index_add,
torch.ops.aten.index_add_,
]
if precision in ["int4", "int8"] and not is_gptq:
from brevitas_examples.llm.llm_quant.export import (
block_quant_layer_level_manager,
)
from brevitas_examples.llm.llm_quant.export import (
brevitas_layer_export_mode,
)
from brevitas_examples.llm.llm_quant.sharded_mlir_group_export import (
LinearWeightBlockQuantHandlerFwd,
)
from brevitas_examples.llm.llm_quant.export import (
replace_call_fn_target,
)
from brevitas_examples.llm.llm_quant.sharded_mlir_group_export import (
matmul_rhs_group_quant_placeholder,
)
from brevitas.backport.fx.experimental.proxy_tensor import (
make_fx as brevitas_make_fx,
)
export_context_manager = brevitas_layer_export_mode
export_class = block_quant_layer_level_manager(
export_handlers=[LinearWeightBlockQuantHandlerFwd]
)
with export_context_manager(model, export_class):
fx_g = brevitas_make_fx(
model,
decomposition_table=get_decompositions(decomps_list),
)(*inputs)
transform_fx(fx_g, quantized=True)
replace_call_fn_target(
fx_g,
src=matmul_rhs_group_quant_placeholder,
target=torch.ops.quant.matmul_rhs_group_quant,
)
fx_g.recompile()
removed_none_indexes = _remove_nones(fx_g)
was_unwrapped = _unwrap_single_tuple_return(fx_g)
else:
fx_g = make_fx(
model,
decomposition_table=get_decompositions(decomps_list),
)(*inputs)
fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
fx_g.recompile()
def strip_overloads(gm):
"""
Modifies the target of graph nodes in :attr:`gm` to strip overloads.
Args:
gm(fx.GraphModule): The input Fx graph module to be modified
"""
for node in gm.graph.nodes:
if isinstance(node.target, torch._ops.OpOverload):
node.target = node.target.overloadpacket
gm.recompile()
strip_overloads(fx_g)
if is_f16:
fx_g = fx_g.half()
transform_fx(fx_g)
# TODO: Have to make it more generic.
add_upcast(fx_g)
fx_g.recompile()
if is_gptq:
gptq_transforms(fx_g)
fx_g.recompile()
if mlir_type == "fx":
return fx_g
if training:
change_fx_graph_return_to_tuple(fx_g)
inputs = flatten_training_input(inputs)
ts_graph = torch.jit.script(fx_g)
if mlir_type == "torchscript":
return ts_graph
inputs = get_f16_inputs(inputs, is_f16, f16_input_mask)
mlir_importer = AMDSharkImporter(
ts_graph,
inputs,
frontend="torch",
return_str=return_str,
)
if debug: # and not is_f16:
(mlir_module, func_name), _, _ = mlir_importer.import_debug(
dir=save_dir,
model_name=model_name,
golden_values=golden_values,
mlir_type=mlir_type,
is_dynamic=is_dynamic,
tracing_required=tracing_required,
)
return mlir_module, func_name
mlir_module, func_name = mlir_importer.import_mlir(mlir_type=mlir_type)
return mlir_module, func_name
# Saves a .mlir module python object to the directory 'dir' with 'model_name' and returns a path to the saved file.
def save_mlir(
mlir_module,
model_name,
mlir_dialect="linalg",
frontend="torch",
dir="",
):
model_name_mlir = (
model_name + "_" + frontend + "_" + mlir_dialect + ".mlir"
)
if dir == "":
dir = cmd_opts.tmp_dir, #os.path.join(".", "amdshark_tmp")
mlir_path = os.path.join(dir, model_name_mlir)
print(f"saving {model_name_mlir} to {dir}")
if not os.path.exists(dir):
os.makedirs(dir)
if frontend == "torch":
with open(mlir_path, "wb") as mlir_file:
mlir_file.write(mlir_module)
return mlir_path

View File

@@ -1,243 +0,0 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from amdshark.iree_utils.compile_utils import (
export_iree_module_to_vmfb,
load_flatbuffer,
create_dispatch_dirs,
compile_benchmark_dirs,
)
import os
from amdshark.amdshark_runner import AMDSharkRunner
from amdshark.parser import amdshark_args
import numpy as np
dtype_to_np_dtype = {
"f32": np.float32,
"f64": np.float64,
"i32": np.int32,
"i64": np.int64,
"i1": np.bool_,
}
class AMDSharkInference:
"""
Runs prediction or inference on mlir_module.
...
Attributes
----------
mlir_module : str
mlir_module or path represented in string; modules from torch-mlir are serialized in bytecode format.
device : str
device to execute the mlir_module on.
currently supports cpu, cuda, vulkan, and metal backends.
mlir_dialect: str
The dialect in which the given mlir_module is in.
Refer to {https://mlir.llvm.org/docs/Dialects/}
is_benchmark: bool
Whether this AMDSharkInference module should be benchmark-enabled.
mmap: bool
Whether to load/run vmfb using mmap. It's `True` by default.
Methods
-------
__call__(function_name, inputs=None):
Runs the function with `function_name` within the mlir_module along
with the given inputs, if the inputs are not given it autogenerates the
inputs. Also, the inputs should be a numpy array.
input_info():
Gives the information about the inputs required by the `function_name`.
This can be expensive as it does string matching to do so.
"""
def __init__(
self,
mlir_module,
device: str = "none",
mlir_dialect: str = "linalg",
is_benchmark: bool = False,
dispatch_benchmark: str = None,
dispatch_benchmark_dir: str = "temp_dispatch_benchmarks",
device_idx: int = None,
mmap: bool = True,
rt_flags: list = [],
):
self.mlir_module = mlir_module
if mlir_module is not None:
if mlir_module and not os.path.isfile(mlir_module):
print(
"Warning: Initializing AMDSharkInference with a mlir string/bytecode object will duplicate the model in RAM at compile time. To avoid this, initialize AMDSharkInference with a path to a MLIR module on your hard disk instead."
)
self.compile_str = True
else:
self.compile_str = False
self.device = amdshark_args.device if device == "none" else device
self.mlir_dialect = mlir_dialect
self.is_benchmark = is_benchmark
self.device_idx = device_idx
self.dispatch_benchmarks = (
amdshark_args.dispatch_benchmarks
if dispatch_benchmark is None
else dispatch_benchmark
)
self.dispatch_benchmarks_dir = (
amdshark_args.dispatch_benchmarks_dir
if dispatch_benchmark_dir == "temp_dispatch_benchmarks"
else dispatch_benchmark_dir
)
self.amdshark_runner = None
self.mmap = mmap
self.rt_flags = rt_flags
def compile(self, extra_args=[]):
if self.dispatch_benchmarks is not None:
extra_args.append(
f"--iree-hal-dump-executable-sources-to={self.dispatch_benchmarks_dir}"
)
extra_args.append(
f"--iree-hal-dump-executable-binaries-to={self.dispatch_benchmarks_dir}"
)
temp_dir = self.dispatch_benchmarks_dir.split("/")
temp_dir[-1] = "temp_" + temp_dir[-1]
temp_dir = "/".join(temp_dir)
self.temp_dispatch_benchmarks_dir = temp_dir
extra_args.append(
f"--iree-hal-dump-executable-benchmarks-to={self.temp_dispatch_benchmarks_dir}"
)
if self.is_benchmark == True:
from amdshark.amdshark_benchmark_runner import AMDSharkBenchmarkRunner
self.amdshark_runner = AMDSharkBenchmarkRunner(
self.mlir_module,
self.device,
self.mlir_dialect,
extra_args=extra_args,
)
else:
self.amdshark_runner = AMDSharkRunner(
self.mlir_module,
self.device,
self.mlir_dialect,
extra_args=extra_args,
device_idx=self.device_idx,
rt_flags=self.rt_flags,
)
if self.dispatch_benchmarks is not None:
create_dispatch_dirs(self.dispatch_benchmarks_dir, self.device)
compile_benchmark_dirs(
self.dispatch_benchmarks_dir,
self.device,
self.dispatch_benchmarks,
)
os.system(f"rm -rf {self.temp_dispatch_benchmarks_dir}")
# inputs are considered to be tuple of np.array.
def __call__(self, function_name: str, inputs: tuple, send_to_host=True):
return self.amdshark_runner.run(
function_name, inputs, send_to_host, device=self.device
)
# forward function.
def forward(self, inputs: tuple, send_to_host=True):
return self.amdshark_runner.run(
"forward", inputs, send_to_host, device=self.device
)
# Get all function names defined within the compiled module.
def get_functions_in_module(self):
return self.amdshark_runner.get_functions_in_module()
# Captures the static input information from the mlir_module.
# TODO(pashu123): Generate the input information for dynamic shapes.
def _input_info(self, function_name):
# func_key to get the line which contains the function.
func_key = "func.func @" + function_name
func_header = None
for line in str(self.mlir_module).splitlines():
if func_key in line:
func_header = line
break
if func_header is None:
print(f"Function: {function_name} not found")
import re
inputs = re.findall("\(.*?\)", func_header)[0].split(",")
shapes = []
dtype = []
for inp in inputs:
shape_dtype = re.findall(r"<[^>]*>", inp)[0].split("x")
shape_dtype[0], shape_dtype[-1] = (
shape_dtype[0][1:],
shape_dtype[-1][:-1],
)
shapes.append(tuple([int(x) for x in shape_dtype[:-1]]))
dtype.append(shape_dtype[-1])
return shapes, dtype
# Generates random input to be feed into the graph.
def generate_random_inputs(self, low=0, high=1):
shapes, dtype = self._input_info()
inputs = []
for i, j in zip(shapes, dtype):
inputs.append(
np.random.uniform(low, high, size=i).astype(
dtype_to_np_dtype[j]
)
)
return tuple(inputs)
# TODO: Instead of passing directory and having names decided by the module
# , user may want to save the module with manual names.
def save_module(
self, dir=os.getcwd(), module_name=None, extra_args=[], debug=False
):
return export_iree_module_to_vmfb(
self.mlir_module,
self.device,
dir,
self.mlir_dialect,
module_name=module_name,
extra_args=extra_args,
debug=debug,
compile_str=self.compile_str,
)
# load and return the module.
def load_module(self, path, extra_args=[]):
self.amdshark_runner = AMDSharkRunner(
device=self.device,
compile_vmfb=False,
extra_args=extra_args,
rt_flags=self.rt_flags,
)
params = load_flatbuffer(
path,
self.device,
self.device_idx,
mmap=self.mmap,
rt_flags=self.rt_flags,
)
self.amdshark_runner.iree_compilation_module = params["vmfb"]
self.amdshark_runner.iree_config = params["config"]
self.amdshark_runner.temp_file_to_unlink = params["temp_file_to_unlink"]
del params
return

View File

@@ -1,127 +0,0 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from amdshark.iree_utils.compile_utils import (
get_iree_compiled_module,
get_results,
export_iree_module_to_vmfb,
load_flatbuffer,
)
from amdshark.iree_utils._common import check_device_drivers, device_driver_info
from amdshark.parser import amdshark_args
import os
import sys
# supported dialects by the amdshark-runtime.
supported_dialects = {
"linalg",
"auto",
"stablehlo",
"tosa",
"tf-lite",
"tm_tensor",
}
class AMDSharkRunner:
"""
Base class for AMDSharkInference and AMDSharkTrainer
used to execute an mlir_module.
...
Attributes
----------
mlir_module : str
mlir_module path, string, or bytecode.
device : str
device to execute the mlir_module on.
currently supports cpu, cuda, vulkan, and metal backends.
mlir_dialect: str
The dialect in which the given mlir_module is in.
Refer to {https://mlir.llvm.org/docs/Dialects/}
Methods
-------
run(function_name, inputs=None):
Runs the function with `function_name` within the mlir_module along
with the given inputs, if the inputs are not given it autogenerates the
inputs. Also, the inputs should be a numpy array.
input_info():
Gives the information about the inputs required by the `function_name`.
This can be expensive as it does string matching to do so.
"""
def __init__(
self,
mlir_module: bytes = None,
device: str = "none",
mlir_dialect: str = "linalg",
extra_args: list = [],
compile_vmfb: bool = True,
device_idx: int = None,
rt_flags: list = [],
):
self.mlir_module = mlir_module
if self.mlir_module is not None:
if not os.path.isfile(mlir_module):
print(
"Warning: Initializing AMDSharkRunner with a mlir string/bytecode object will duplicate the model in RAM at compile time. To avoid this, initialize AMDSharkInference with a path to a MLIR module on your hard disk instead."
)
self.compile_str = True
else:
self.compile_str = False
self.device = amdshark_args.device if device == "none" else device
self.mlir_dialect = mlir_dialect
self.extra_args = extra_args
self.device_idx = device_idx
self.rt_flags = rt_flags
if check_device_drivers(self.device):
print(device_driver_info(self.device))
sys.exit(1)
if compile_vmfb == True:
# Compile the module to get the .vmfb.
params = get_iree_compiled_module(
self.mlir_module,
self.device,
self.mlir_dialect,
extra_args=self.extra_args,
device_idx=self.device_idx,
rt_flags=self.rt_flags,
compile_str=self.compile_str,
)
self.iree_compilation_module = params["vmfb"]
self.iree_config = params["config"]
self.temp_file_to_unlink = params["temp_file_to_unlink"]
del params
def run(
self, function_name, inputs: tuple, send_to_host=False, device=None
):
return get_results(
self.iree_compilation_module,
function_name,
inputs,
self.iree_config,
self.mlir_dialect,
send_to_host,
device=device,
)
# Get all function names defined within the compiled module.
def get_functions_in_module(self):
return self.iree_compilation_module._vm_module.function_names

View File

@@ -1,163 +0,0 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from amdshark.parser import amdshark_args
from amdshark.amdshark_runner import AMDSharkRunner
from amdshark.backward_makefx import MakeFxModule
from amdshark.amdshark_importer import import_with_fx, save_mlir
import numpy as np
from tqdm import tqdm
import sys
# Prints to stderr.
def print_err(*a):
print(*a, file=sys.stderr)
class AMDSharkTrainer:
"""Training pytorch, tensorflow module on amdshark runtime."""
def __init__(
self,
model,
input: tuple,
dynamic: bool = False,
device: str = None,
jit_trace: bool = False,
from_aot: bool = True,
):
self.model = model
# Change tuple to list.
self.input = [x for x in input]
self.dynamic = dynamic
self.from_aot = from_aot
self.jit_trace = jit_trace
self.from_aot = from_aot
# By default it's the torch frontend.
self.frontend = "pytorch"
self.device = device if device is not None else amdshark_args.device
self.amdshark_runner = None
# Sets the frontend i.e `pytorch` or `tensorflow`.
def set_frontend(self, frontend: str):
if frontend not in [
"pytorch",
"torch",
"tensorflow",
"tf",
"stablehlo",
"mhlo",
"linalg",
"tosa",
]:
print_err("frontend not supported.")
else:
self.frontend = frontend
# Training function is needed in the case of torch_fn.
def compile(self, training_fn=None, mlir_type="linalg", extra_args=[]):
if self.frontend in ["torch", "pytorch"]:
packed_inputs = (
dict(self.model.named_parameters()),
dict(self.model.named_buffers()),
tuple(self.input),
)
mlir_module, func_name = import_with_fx(
training_fn,
packed_inputs,
False,
[],
training=True,
mlir_type=mlir_type,
)
mlir_module = save_mlir(
mlir_module,
model_name="amdshark_model",
frontend="torch",
mlir_dialect=mlir_type,
)
self.amdshark_runner = AMDSharkRunner(
mlir_module,
self.device,
"tm_tensor",
extra_args=extra_args,
)
elif self.frontend in ["tensorflow", "tf", "mhlo", "stablehlo"]:
self.amdshark_runner = AMDSharkRunner(
self.model,
self.input,
self.dynamic,
self.device,
self.jit_trace,
self.from_aot,
self.frontend,
)
else:
print_err("Unknown frontend")
return
# The inputs to the mlir-graph are weights, buffers and inputs respectively.
def get_torch_params(self):
params = [i.detach() for i in self.model.parameters()]
buffers = [i.detach() for i in self.model.buffers()]
return params + buffers
# Function to train pytorch module.
def _train_torch(self, num_iters):
"""Returns the updated weights after num_iters"""
params = self.get_torch_params()
params = [x.numpy() for x in params]
print(f"Training started for {num_iters} iterations:")
for i in tqdm(range(num_iters)):
params = self.amdshark_runner.run(
"forward", params + self.input, self.frontend
)
return params
# Function to train tensorflow module.
# Output final loss.
# TODO(raikonenfnu): Save updated weight/states in AMDSHARK.
def _train_tf(self, num_iters):
input_list = []
for x in self.input:
if isinstance(x, list):
nested_list = []
for val in x:
if isinstance(val, np.ndarray):
nested_list.append(val)
else:
nested_list.append(val.numpy())
input_list.append(nested_list)
elif isinstance(x, np.ndarray):
input_list.append(x)
else:
input_list.append(x.numpy())
print(f"Training started for {num_iters} iterations:")
for i in tqdm(range(num_iters)):
outputs = self.amdshark_runner.forward(input_list, self.frontend)
return outputs
def train(self, num_iters=1):
if self.frontend in ["torch", "pytorch"]:
return self._train_torch(num_iters)
elif self.frontend in ["tf", "tensorflow", "mhlo"]:
return self._train_tf(num_iters)
else:
print_err("Unknown frontend")
return

View File

@@ -1,78 +0,0 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch._decomp import get_decompositions
from torch.fx.experimental.proxy_tensor import make_fx
from torch.nn.utils import stateless
from torch import fx
import tempfile
class MakeFxModule:
def __init__(self, model, inputs, labels=None, custom_inference_fn=None):
self.model = model
self.inputs = inputs
self.custom_inference_fn = custom_inference_fn
self.training_graph = None
# Doesn't replace the None type.
def change_fx_graph_return_to_tuple(self, fx_g: fx.GraphModule):
for node in fx_g.graph.nodes:
if node.op == "output":
# output nodes always have one argument
node_arg = node.args[0]
out_nodes = []
if isinstance(node_arg, list):
# Don't return NoneType elements.
for out_node in node_arg:
if not isinstance(out_node, type(None)):
out_nodes.append(out_node)
# If there is a single tensor/element to be returned don't
# a tuple for it.
if len(out_nodes) == 1:
node.args = out_nodes
else:
node.args = (tuple(out_nodes),)
fx_g.graph.lint()
fx_g.recompile()
return fx_g
def generate_graph(self):
fx_g = make_fx(
self.custom_inference_fn,
decomposition_table=get_decompositions(
[
torch.ops.aten.embedding_dense_backward,
torch.ops.aten.native_layer_norm_backward,
torch.ops.aten.slice_backward,
torch.ops.aten.select_backward,
]
),
)(
dict(self.model.named_parameters()),
dict(self.model.named_buffers()),
self.inputs,
)
fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
fx_g.recompile()
fx_g = self.change_fx_graph_return_to_tuple(fx_g)
ts_g = torch.jit.script(fx_g)
temp = tempfile.NamedTemporaryFile(
suffix="_amdshark_ts", prefix="temp_ts_"
)
ts_g.save(temp.name)
new_ts = torch.jit.load(temp.name)
self.training_graph = new_ts

View File

@@ -1,154 +0,0 @@
import functools
from typing import List, Optional
import torch
from torch.fx.experimental.proxy_tensor import make_fx
from torch._functorch.compile_utils import strip_overloads
from amdshark.amdshark_inference import AMDSharkInference
from torch._decomp import get_decompositions
from torch.func import functionalize
import io
import torch_mlir
# TODO: Control decompositions.
def default_decompositions():
return get_decompositions(
[
torch.ops.aten.embedding_dense_backward,
torch.ops.aten.native_layer_norm_backward,
torch.ops.aten.slice_backward,
torch.ops.aten.select_backward,
torch.ops.aten.norm.ScalarOpt_dim,
torch.ops.aten.native_group_norm,
torch.ops.aten.upsample_bilinear2d.vec,
torch.ops.aten.split.Tensor,
torch.ops.aten.split_with_sizes,
torch.ops.aten.native_layer_norm,
torch.ops.aten.masked_fill.Tensor,
torch.ops.aten.masked_fill.Scalar,
]
)
def _remove_nones(fx_g: torch.fx.GraphModule) -> List[int]:
removed_indexes = []
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, (list, tuple)):
node_arg = list(node_arg)
node_args_len = len(node_arg)
for i in range(node_args_len):
curr_index = node_args_len - (i + 1)
if node_arg[curr_index] is None:
removed_indexes.append(curr_index)
node_arg.pop(curr_index)
node.args = (tuple(node_arg),)
break
if len(removed_indexes) > 0:
fx_g.graph.lint()
fx_g.graph.eliminate_dead_code()
fx_g.recompile()
removed_indexes.sort()
return removed_indexes
def _returns_nothing(fx_g: torch.fx.GraphModule) -> bool:
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple):
return len(node_arg) == 0
return False
def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule) -> bool:
"""
Replace tuple with tuple element in functions that return one-element tuples.
Returns true if an unwrapping took place, and false otherwise.
"""
unwrapped_tuple = False
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple):
if len(node_arg) == 1:
node.args = (node_arg[0],)
unwrapped_tuple = True
break
if unwrapped_tuple:
fx_g.graph.lint()
fx_g.recompile()
return unwrapped_tuple
class AMDSharkBackend:
def __init__(
self, fx_g: torch.fx.GraphModule, inputs: tuple, options: dict
):
self.fx_g = fx_g
self.inputs = inputs
self.amdshark_module = None
self.device: str = options.get("device", "cpu")
self.was_unwrapped: bool = False
self.none_indices: list = []
self._modify_fx_g()
self.compile()
def _modify_fx_g(self):
self.none_indices = _remove_nones(self.fx_g)
self.was_unwrapped = _unwrap_single_tuple_return(self.fx_g)
def compile(self):
gm = make_fx(
functionalize(self.fx_g),
decomposition_table=default_decompositions(),
)(*self.inputs)
gm.graph.set_codegen(torch.fx.graph.CodeGen())
gm.recompile()
strip_overloads(gm)
ts_g = torch.jit.script(gm)
mlir_module = torch_mlir.compile(
ts_g, self.inputs, output_type="linalg-on-tensors"
)
bytecode_stream = io.BytesIO()
mlir_module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
from amdshark.amdshark_inference import AMDSharkInference
amdshark_module = AMDSharkInference(
mlir_module=bytecode,
device=self.device,
mlir_dialect="tm_tensor",
)
amdshark_module.compile(extra_args=[])
self.amdshark_module = amdshark_module
def __call__(self, *inputs):
np_inputs = [x.contiguous().detach().cpu().numpy() for x in inputs]
np_outs = self.amdshark_module("forward", np_inputs)
if self.was_unwrapped:
np_outs = [
np_outs,
]
if not isinstance(np_outs, list):
res = torch.from_numpy(np_outs)
return res
result = [torch.from_numpy(x) for x in np_outs]
for r_in in self.none_indices:
result.insert(r_in, None)
result = tuple(result)
return result

View File

@@ -1,25 +0,0 @@
import torch
import amdshark
def foo(x, a):
if x.shape[0] > 3:
return x + a
else:
return x + 3
amdshark_options = {"device": "cpu"}
compiled = torch.compile(foo, backend="amdshark", options=amdshark_options)
input = torch.ones(4)
x = compiled(input, input)
print(x)
input = torch.ones(3)
x = compiled(input, input)
print(x)

View File

@@ -1,309 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mlevental/miniconda3/envs/torch-mlir/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"# standard imports\n",
"import torch\n",
"from amdshark.iree_utils import get_iree_compiled_module"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"# torch dynamo related imports\n",
"try:\n",
" import torchdynamo\n",
" from torchdynamo.optimizations.backends import create_backend\n",
" from torchdynamo.optimizations.subgraph import SubGraph\n",
"except ModuleNotFoundError:\n",
" print(\n",
" \"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo\"\n",
" )\n",
" exit()\n",
"\n",
"# torch-mlir imports for compiling\n",
"from torch_mlir import compile, OutputType"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"[TorchDynamo](https://github.com/pytorch/torchdynamo) is a compiler for PyTorch programs that uses the [frame evaluation API](https://www.python.org/dev/peps/pep-0523/) in CPython to dynamically modify Python bytecode right before it is executed. It creates this FX Graph through bytecode analysis and is designed to mix Python execution with compiled backends."
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"def toy_example(*args):\n",
" a, b = args\n",
"\n",
" x = a / (torch.abs(a) + 1)\n",
" if b.sum() < 0:\n",
" b = b * -1\n",
" return x * b"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [],
"source": [
"# compiler that lowers fx_graph to through MLIR\n",
"def __torch_mlir(fx_graph, *args, **kwargs):\n",
" assert isinstance(\n",
" fx_graph, torch.fx.GraphModule\n",
" ), \"Model must be an FX GraphModule.\"\n",
"\n",
" def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule):\n",
" \"\"\"Replace tuple with tuple element in functions that return one-element tuples.\"\"\"\n",
"\n",
" for node in fx_g.graph.nodes:\n",
" if node.op == \"output\":\n",
" assert (\n",
" len(node.args) == 1\n",
" ), \"Output node must have a single argument\"\n",
" node_arg = node.args[0]\n",
" if isinstance(node_arg, tuple) and len(node_arg) == 1:\n",
" node.args = (node_arg[0],)\n",
" fx_g.graph.lint()\n",
" fx_g.recompile()\n",
" return fx_g\n",
"\n",
" fx_graph = _unwrap_single_tuple_return(fx_graph)\n",
" ts_graph = torch.jit.script(fx_graph)\n",
"\n",
" # torchdynamo does munges the args differently depending on whether you use\n",
" # the @torchdynamo.optimize decorator or the context manager\n",
" if isinstance(args, tuple):\n",
" args = list(args)\n",
" assert isinstance(args, list)\n",
" if len(args) == 1 and isinstance(args[0], list):\n",
" args = args[0]\n",
"\n",
" linalg_module = compile(\n",
" ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS\n",
" )\n",
" callable, _ = get_iree_compiled_module(\n",
" linalg_module, \"cuda\", func_name=\"forward\"\n",
" )\n",
"\n",
" def forward(*inputs):\n",
" return callable(*inputs)\n",
"\n",
" return forward"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"Simplest way to use TorchDynamo with the `torchdynamo.optimize` context manager:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 1 device(s).\n",
"Device: 0\n",
" Name: NVIDIA GeForce RTX 3080\n",
" Compute Capability: 8.6\n",
"[-0.40066046 -0.4210303 0.03225489 -0.44849953 0.10370405 -0.04422468\n",
" 0.33262825 -0.20109026 0.02102537 -0.24882983]\n",
"[-0.07824923 -0.17004533 0.06439921 -0.06163602 0.26633525 -1.1560082\n",
" -0.06660341 0.24227881 0.1462235 -0.32055548]\n",
"[-0.01464001 0.442209 -0.0607936 -0.5477967 -0.25226554 -0.08588809\n",
" -0.30497575 0.00061084 -0.50069696 0.2317973 ]\n",
"[ 0.25726247 0.39388427 -0.24093066 0.12316308 -0.01981307 0.5661146\n",
" 0.26199922 0.8123446 -0.01576749 0.30846444]\n",
"[ 0.7878203 -0.45975062 -0.29956317 -0.07032048 -0.55817443 -0.62506855\n",
" -1.6837492 -0.38442805 0.28220773 -1.5325156 ]\n",
"[ 0.07975311 0.67754704 -0.30927914 0.00347631 -0.07326564 0.01893554\n",
" -0.7518105 -0.03078967 -0.07623022 0.38865626]\n",
"[-0.7751679 -0.5841397 -0.6622711 0.18574935 -0.6049372 0.02844244\n",
" -0.20471913 0.3337415 -0.3619432 -0.35087156]\n",
"[-0.08569919 -0.10775139 -0.02338934 0.21933547 -0.46712473 0.00062137\n",
" -0.58207744 0.06457533 0.18276742 0.03866556]\n",
"[-0.2311981 -0.43036282 0.20561649 -0.10363232 -0.13248594 0.02885137\n",
" -0.31241602 -0.36907142 0.08861586 0.2331427 ]\n",
"[-0.07273526 -0.31246194 -0.24218291 -0.24145737 0.0364486 0.14382267\n",
" -0.00531162 0.15447603 -0.5220248 -0.09016377]\n"
]
}
],
"source": [
"with torchdynamo.optimize(__torch_mlir):\n",
" for _ in range(10):\n",
" print(toy_example(torch.randn(10), torch.randn(10)))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"It can also be used through a decorator:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [],
"source": [
"@create_backend\n",
"def torch_mlir(subgraph, *args, **kwargs):\n",
" assert isinstance(subgraph, SubGraph), \"Model must be a dynamo SubGraph.\"\n",
" return __torch_mlir(subgraph.model, *list(subgraph.example_inputs))\n",
"\n",
"\n",
"@torchdynamo.optimize(\"torch_mlir\")\n",
"def toy_example2(*args):\n",
" a, b = args\n",
"\n",
" x = a / (torch.abs(a) + 1)\n",
" if b.sum() < 0:\n",
" b = b * -1\n",
" return x * b"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 1 device(s).\n",
"Device: 0\n",
" Name: NVIDIA GeForce RTX 3080\n",
" Compute Capability: 8.6\n",
"[-0.35494277 0.03409214 -0.02271946 0.7335942 0.03122527 -0.41881397\n",
" -0.6609761 -0.6418614 0.29336175 -0.01973678]\n",
"[-2.7246824e-01 -3.5543957e-01 6.0087401e-01 -7.4570496e-03\n",
" -4.2481605e-02 -5.0296803e-04 7.2928613e-01 -1.4673788e-03\n",
" -2.7621329e-01 -6.0995776e-02]\n",
"[-0.03165906 0.3889693 0.24052973 0.27279532 -0.02773128 -0.12602475\n",
" -1.0124422 0.5720256 -0.35437614 -0.20992722]\n",
"[-0.41831446 0.5525326 -0.29749998 -0.17044766 0.11804754 -0.05210691\n",
" -0.46145165 -0.8776549 0.10090438 0.17463352]\n",
"[ 0.02194221 0.20959911 0.26973712 0.12551276 -0.0020404 0.1490246\n",
" -0.04456685 1.1100804 0.8105744 0.6676846 ]\n",
"[ 0.06528181 -0.13591261 0.5370964 -0.4398162 -0.03372452 0.9691372\n",
" -0.01120087 0.2947028 0.4804801 -0.3324341 ]\n",
"[ 0.33549032 -0.23001772 -0.08681437 0.16490957 -0.11223086 0.09168988\n",
" 0.02403045 0.17344482 0.46406478 -0.00129451]\n",
"[-0.27475086 0.42384806 1.9090122 -0.41147137 -0.6888369 0.08435658\n",
" -0.26628923 -0.17436793 -0.8058869 -0.02582378]\n",
"[-0.10109414 0.08681287 -0.10055986 0.6858881 0.29267687 -0.02797117\n",
" -0.01425194 0.4882803 0.3551982 -0.858935 ]\n",
"[-0.22086617 0.524994 0.17721705 -0.03813264 -0.54570735 -0.4421502\n",
" 0.11938014 -0.01122053 0.39294165 -0.61770755]\n"
]
}
],
"source": [
"for _ in range(10):\n",
" print(toy_example2(torch.randn(10), torch.randn(10)))"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@@ -1,92 +0,0 @@
import torch
from torch_mlir import compile, OutputType
from amdshark.iree_utils import get_iree_compiled_module
try:
import torchdynamo
from torchdynamo.optimizations.backends import create_backend
from torchdynamo.optimizations.subgraph import SubGraph
except ModuleNotFoundError:
print(
"Please install TorchDynamo using pip install git+https://github.com/pytorch/torchdynamo"
)
exit()
NUM_ITERS = 10
def __torch_mlir(fx_graph, *args, **kwargs):
assert isinstance(
fx_graph, torch.fx.GraphModule
), "Model must be an FX GraphModule."
def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule):
"""Replace tuple with tuple element in functions that return one-element tuples."""
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple) and len(node_arg) == 1:
node.args = (node_arg[0],)
fx_g.graph.lint()
fx_g.recompile()
return fx_g
fx_graph = _unwrap_single_tuple_return(fx_graph)
ts_graph = torch.jit.script(fx_graph)
if isinstance(args, tuple):
args = list(args)
assert isinstance(args, list)
if len(args) == 1 and isinstance(args[0], list):
args = args[0]
linalg_module = compile(
ts_graph, args, output_type=OutputType.LINALG_ON_TENSORS
)
callable, _ = get_iree_compiled_module(
linalg_module, "cuda", func_name="forward"
)
def forward(*inputs):
return callable(*inputs)
return forward
def toy_example(*args):
a, b = args
x = a / (torch.abs(a) + 1)
if b.sum() < 0:
b = b * -1
return x * b
with torchdynamo.optimize(__torch_mlir):
for _ in range(10):
print(toy_example(torch.randn(10), torch.randn(10)))
@create_backend
def torch_mlir(subgraph, *args, **kwargs):
assert isinstance(subgraph, SubGraph), "Model must be a dynamo SubGraph."
return __torch_mlir(subgraph.model, *list(subgraph.example_inputs))
@torchdynamo.optimize("torch_mlir")
def toy_example2(*args):
a, b = args
x = a / (torch.abs(a) + 1)
if b.sum() < 0:
b = b * -1
return x * b
for _ in range(10):
print(toy_example2(torch.randn(10), torch.randn(10)))

View File

@@ -1,805 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mlevental/miniconda3/envs/torch-mlir/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"# standard imports\n",
"import torch\n",
"from torch_mlir.eager_mode import torch_mlir_tensor"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"# eager mode imports\n",
"from torch_mlir.eager_mode.torch_mlir_tensor import TorchMLIRTensor\n",
"from amdshark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"The simplest way of using Eager Mode (through IREE) requires setting a \"backend\":"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [],
"source": [
"torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend(\"cpu\")"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"and wrapping all your `torch.Tensor`s:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n"
]
}
],
"source": [
"NUM_ITERS = 10\n",
"\n",
"t = torch.ones((10, 10))\n",
"u = 2 * torch.ones((10, 10))\n",
"\n",
"tt = TorchMLIRTensor(t)\n",
"print(tt)\n",
"uu = TorchMLIRTensor(u)\n",
"print(uu)"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"`TorchMLIRTensor` is a \"tensor wrapper subclass\" (more info [here](https://github.com/albanD/subclass_zoo)) that keeps the IREE `DeviceArray` in a field `elem`:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n"
]
}
],
"source": [
"for i in range(NUM_ITERS):\n",
" yy = tt + uu\n",
" print(type(yy))\n",
" print(yy.elem.to_host())\n",
" yy = tt * uu\n",
" print(type(yy))\n",
" print(yy.elem.to_host())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"If you have a GPU (and CUDA installed) that works too (you can verify by having `watch -n1 nvidia-smi` up in a terminal while running the next cell):"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"[[3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]\n",
" [3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]]\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n"
]
}
],
"source": [
"torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend(\"gpu\")\n",
"\n",
"t = torch.ones((10, 10))\n",
"u = 2 * torch.ones((10, 10))\n",
"\n",
"tt = TorchMLIRTensor(t)\n",
"print(tt)\n",
"uu = TorchMLIRTensor(u)\n",
"print(uu)\n",
"\n",
"yy = tt + uu\n",
"print(yy.elem.to_host())\n",
"yy = tt * uu\n",
"print(yy.elem.to_host())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"There is a convenience class `AMDSharkEagerMode` that will handle both the installation of the backend and the wrapping of `torch.Tensor`s:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n"
]
}
],
"source": [
"# eager mode RAII\n",
"from amdshark.amdshark_runner import AMDSharkEagerMode\n",
"\n",
"amdshark_eager_mode = AMDSharkEagerMode(\"cpu\")\n",
"\n",
"t = torch.ones((10, 10))\n",
"u = torch.ones((10, 10))\n",
"\n",
"print(t)\n",
"print(u)\n",
"\n",
"for i in range(NUM_ITERS):\n",
" yy = t + u\n",
" print(type(yy))\n",
" print(yy.elem.to_host())\n",
" yy = t * u\n",
" print(type(yy))\n",
" print(yy.elem.to_host())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
},
{
"cell_type": "markdown",
"source": [
"The `AMDSharkEagerMode` class is a hacky take on [RAII](https://en.wikipedia.org/wiki/Resource_acquisition_is_initialization) that defines a \"deleter\" that runs when an instantiation (of `AMDSharkEagerMode`) is garbage collected. Takeaway is that if you want to turn off `AMDSharkEagerMode`, or switch backends, you need to `del` the instance:"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%% md\n"
}
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"TorchMLIRTensor(<IREE DeviceArray: shape=[10, 10], dtype=float32>, backend=EagerModeIREELinalgOnTensorsBackend)\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]\n",
" [2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]]\n",
"<class 'torch_mlir.eager_mode.torch_mlir_tensor.TorchMLIRTensor'>\n",
"[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]\n",
" [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]\n"
]
}
],
"source": [
"del amdshark_eager_mode\n",
"amdshark_eager_mode = AMDSharkEagerMode(\"cuda\")\n",
"\n",
"t = torch.ones((10, 10))\n",
"u = torch.ones((10, 10))\n",
"\n",
"print(t)\n",
"print(u)\n",
"\n",
"yy = t + u\n",
"print(type(yy))\n",
"print(yy.elem.to_host())\n",
"yy = t * u\n",
"print(type(yy))\n",
"print(yy.elem.to_host())"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@@ -1,148 +0,0 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch.utils.cpp_extension import load_inline, include_paths
from torch_mlir.eager_mode import torch_mlir_tensor
from torch_mlir.eager_mode.torch_mlir_tensor import TorchMLIRTensor
from amdshark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend
from amdshark.amdshark_runner import AMDSharkEagerMode
def test_cpu():
torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend("cpu")
t = torch.ones((10, 10), device="cpu")
u = 2 * torch.ones((10, 10), device="cpu")
tt = TorchMLIRTensor(t)
print(tt)
uu = TorchMLIRTensor(u)
print(uu)
for i in range(NUM_ITERS):
yy = tt + uu
print(type(yy))
print(yy.elem.to_host())
yy = tt * uu
print(type(yy))
print(yy.elem.to_host())
def test_gpu():
source = """
#include <iostream>
#include "cuda.h"
#include "cuda_runtime_api.h"
using namespace std;
void print_free_mem() {
int num_gpus;
size_t free, total;
cudaSetDevice(0);
int id;
cudaGetDevice(&id);
cudaMemGetInfo(&free, &total);
cout << "GPU " << id << " memory: used=" << (total-free)/(1<<20) << endl;
}
"""
gpu_stats = load_inline(
name="inline_extension",
cpp_sources=[source],
extra_include_paths=include_paths(cuda=True),
functions=["print_free_mem"],
)
torch_mlir_tensor.backend = EagerModeIREELinalgOnTensorsBackend("gpu")
t = torch.ones((10, 10), device="cpu")
u = 2 * torch.ones((10, 10), device="cpu")
tt = TorchMLIRTensor(t)
print(tt)
uu = TorchMLIRTensor(u)
print(uu)
for i in range(NUM_ITERS):
yy = tt + uu
print(yy.elem.to_host())
yy = tt * uu
print(yy.elem.to_host())
gpu_stats.print_free_mem()
def test_python_mode_ref_backend():
# hide this wherever you want?
_ = AMDSharkEagerMode("refbackend")
t = torch.ones((10, 10), device="cpu")
u = torch.ones((10, 10), device="cpu")
print(t)
print(u)
for i in range(NUM_ITERS):
print(i)
yy = t + u
print(yy.elem)
yy = t * u
print(yy.elem)
def test_python_mode_iree_cpu():
# hide this wherever you want?
_ = AMDSharkEagerMode("cpu")
t = torch.ones((10, 10), device="cpu")
u = torch.ones((10, 10), device="cpu")
print(t)
print(u)
for i in range(NUM_ITERS):
yy = t + u
print(type(yy))
print(yy.elem.to_host())
yy = t * u
print(type(yy))
print(yy.elem.to_host())
def test_python_mode_iree_gpu():
_ = AMDSharkEagerMode("gpu")
t = torch.ones((10, 10), device="cpu")
u = torch.ones((10, 10), device="cpu")
print(t)
print(u)
for i in range(NUM_ITERS):
yy = t + u
print(type(yy))
print(yy.elem.to_host())
yy = t * u
print(type(yy))
print(yy.elem.to_host())
if __name__ == "__main__":
NUM_ITERS = 10
test_cpu()
if torch.cuda.is_available():
test_gpu()
test_python_mode_ref_backend()
test_python_mode_iree_cpu()
test_python_mode_iree_gpu()

View File

@@ -1,73 +0,0 @@
import torch
import numpy as np
model = torch.hub.load(
"pytorch/vision:v0.10.0", "squeezenet1_0", pretrained=True
)
model.eval()
# from PIL import Image
# from torchvision import transforms
# import urllib
#
# url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg")
# try: urllib.URLopener().retrieve(url, filename)
# except: urllib.request.urlretrieve(url, filename)
#
#
# input_image = Image.open(filename)
# preprocess = transforms.Compose([
# transforms.Resize(256),
# transforms.CenterCrop(224),
# transforms.ToTensor(),
# transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
# ])
# input_tensor = preprocess(input_image)
# input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model
# print(input_batch.shape) # size = [1, 3, 224, 224]
# The above is code for generating sample inputs from an image. We can just use
# random values for accuracy testing though
input_batch = torch.randn(1, 3, 224, 224)
# Focus on CPU for now
if False and torch.cuda.is_available():
input_batch = input_batch.to("cuda")
model.to("cuda")
with torch.no_grad():
output = model(input_batch)
# Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes
golden_confidences = output[0]
# The output has unnormalized scores. To get probabilities, you can run a softmax on it.
golden_probabilities = torch.nn.functional.softmax(
golden_confidences, dim=0
).numpy()
golden_confidences = golden_confidences.numpy()
from amdshark.torch_mlir_lockstep_tensor import TorchMLIRLockstepTensor
input_detached_clone = input_batch.clone()
eager_input_batch = TorchMLIRLockstepTensor(input_detached_clone)
print("getting torch-mlir result")
output = model(eager_input_batch)
static_output = output.elem
confidences = static_output[0]
probabilities = torch.nn.functional.softmax(
torch.from_numpy(confidences), dim=0
).numpy()
print("The obtained result via amdshark is: ", confidences)
print("The golden result is:", golden_confidences)
np.testing.assert_allclose(
golden_confidences, confidences, rtol=1e-02, atol=1e-03
)
np.testing.assert_allclose(
golden_probabilities, probabilities, rtol=1e-02, atol=1e-03
)

View File

@@ -1,65 +0,0 @@
from PIL import Image
import requests
from transformers import CLIPProcessor, TFCLIPModel
import tensorflow as tf
from amdshark.amdshark_inference import AMDSharkInference
# Create a set of inputs
clip_vit_inputs = [
tf.TensorSpec(shape=[2, 7], dtype=tf.int32),
tf.TensorSpec(shape=[2, 7], dtype=tf.int32),
tf.TensorSpec(shape=[1, 3, 224, 224], dtype=tf.float32),
]
class CLIPModule(tf.Module):
def __init__(self):
super(CLIPModule, self).__init__()
self.m = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
self.m.predict = lambda x, y, z: self.m(
input_ids=x, attention_mask=y, pixel_values=z
)
@tf.function(input_signature=clip_vit_inputs, jit_compile=True)
def forward(self, input_ids, attention_mask, pixel_values):
return self.m.predict(
input_ids, attention_mask, pixel_values
).logits_per_image
if __name__ == "__main__":
# Prepping Data
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(
text=["a photo of a cat", "a photo of a dog"],
images=image,
return_tensors="tf",
padding=True,
)
amdshark_module = AMDSharkInference(
CLIPModule(),
(
inputs["input_ids"],
inputs["attention_mask"],
inputs["pixel_values"],
),
)
amdshark_module.set_frontend("tensorflow")
amdshark_module.compile()
print(
amdshark_module.forward(
(
inputs["input_ids"],
inputs["attention_mask"],
inputs["pixel_values"],
)
)
)

View File

@@ -1,15 +0,0 @@
## Running ESRGAN
```
1. pip install numpy opencv-python
2. mkdir InputImages
(this is where all the input images will reside in)
3. mkdir OutputImages
(this is where the model will generate all the images)
4. mkdir models
(save the .pth checkpoint file here)
5. python esrgan.py
```
- Download [RRDB_ESRGAN_x4.pth](https://drive.google.com/drive/u/0/folders/17VYV_SoZZesU6mbxz2dMAIccSSlqLecY) and place it in the `models` directory as mentioned above in step 4.
- Credits : [ESRGAN](https://github.com/xinntao/ESRGAN)

View File

@@ -1,239 +0,0 @@
from ast import arg
import os.path as osp
import glob
import cv2
import numpy as np
import torch
from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
from amdshark.amdshark_inference import AMDSharkInference
import torch_mlir
import tempfile
import functools
import torch
import torch.nn as nn
import torch.nn.functional as F
def make_layer(block, n_layers):
layers = []
for _ in range(n_layers):
layers.append(block())
return nn.Sequential(*layers)
class ResidualDenseBlock_5C(nn.Module):
def __init__(self, nf=64, gc=32, bias=True):
super(ResidualDenseBlock_5C, self).__init__()
# gc: growth channel, i.e. intermediate channels
self.conv1 = nn.Conv2d(nf, gc, 3, 1, 1, bias=bias)
self.conv2 = nn.Conv2d(nf + gc, gc, 3, 1, 1, bias=bias)
self.conv3 = nn.Conv2d(nf + 2 * gc, gc, 3, 1, 1, bias=bias)
self.conv4 = nn.Conv2d(nf + 3 * gc, gc, 3, 1, 1, bias=bias)
self.conv5 = nn.Conv2d(nf + 4 * gc, nf, 3, 1, 1, bias=bias)
self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
# initialization
# mutil.initialize_weights([self.conv1, self.conv2, self.conv3, self.conv4, self.conv5], 0.1)
def forward(self, x):
x1 = self.lrelu(self.conv1(x))
x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
return x5 * 0.2 + x
class RRDB(nn.Module):
"""Residual in Residual Dense Block"""
def __init__(self, nf, gc=32):
super(RRDB, self).__init__()
self.RDB1 = ResidualDenseBlock_5C(nf, gc)
self.RDB2 = ResidualDenseBlock_5C(nf, gc)
self.RDB3 = ResidualDenseBlock_5C(nf, gc)
def forward(self, x):
out = self.RDB1(x)
out = self.RDB2(out)
out = self.RDB3(out)
return out * 0.2 + x
class RRDBNet(nn.Module):
def __init__(self, in_nc, out_nc, nf, nb, gc=32):
super(RRDBNet, self).__init__()
RRDB_block_f = functools.partial(RRDB, nf=nf, gc=gc)
self.conv_first = nn.Conv2d(in_nc, nf, 3, 1, 1, bias=True)
self.RRDB_trunk = make_layer(RRDB_block_f, nb)
self.trunk_conv = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
#### upsampling
self.upconv1 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
self.upconv2 = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
self.HRconv = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
self.conv_last = nn.Conv2d(nf, out_nc, 3, 1, 1, bias=True)
self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
def forward(self, x):
fea = self.conv_first(x)
trunk = self.trunk_conv(self.RRDB_trunk(fea))
fea = fea + trunk
fea = self.lrelu(
self.upconv1(F.interpolate(fea, scale_factor=2, mode="nearest"))
)
fea = self.lrelu(
self.upconv2(F.interpolate(fea, scale_factor=2, mode="nearest"))
)
out = self.conv_last(self.lrelu(self.HRconv(fea)))
return out
############### Parsing args #####################
import argparse
p = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
p.add_argument("--device", type=str, default="cpu", help="the device to use")
p.add_argument(
"--mlir_loc",
type=str,
default=None,
help="location of the model's mlir file",
)
args = p.parse_args()
###################################################
def inference(input_m):
return model(input_m)
def load_mlir(mlir_loc):
import os
if mlir_loc == None:
return None
print(f"Trying to load the model from {mlir_loc}.")
with open(os.path.join(mlir_loc)) as f:
mlir_module = f.read()
return mlir_module
def compile_through_fx(model, inputs, mlir_loc=None):
module = load_mlir(mlir_loc)
if module == None:
fx_g = make_fx(
model,
decomposition_table=get_decompositions(
[
torch.ops.aten.embedding_dense_backward,
torch.ops.aten.native_layer_norm_backward,
torch.ops.aten.slice_backward,
torch.ops.aten.select_backward,
torch.ops.aten.norm.ScalarOpt_dim,
torch.ops.aten.native_group_norm,
torch.ops.aten.upsample_bilinear2d.vec,
torch.ops.aten.split.Tensor,
torch.ops.aten.split_with_sizes,
]
),
)(inputs)
fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
fx_g.recompile()
def strip_overloads(gm):
"""
Modifies the target of graph nodes in :attr:`gm` to strip overloads.
Args:
gm(fx.GraphModule): The input Fx graph module to be modified
"""
for node in gm.graph.nodes:
if isinstance(node.target, torch._ops.OpOverload):
node.target = node.target.overloadpacket
gm.recompile()
strip_overloads(fx_g)
ts_g = torch.jit.script(fx_g)
print("Torchscript graph generated successfully")
module = torch_mlir.compile(
ts_g,
inputs,
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
mlir_model = str(module)
func_name = "forward"
amdshark_module = AMDSharkInference(
mlir_model, device=args.device, mlir_dialect="linalg"
)
amdshark_module.compile()
return amdshark_module
model_path = "models/RRDB_ESRGAN_x4.pth" # models/RRDB_ESRGAN_x4.pth OR models/RRDB_PSNR_x4.pth
# device = torch.device('cuda') # if you want to run on CPU, change 'cuda' -> cpu
device = torch.device("cpu")
test_img_folder = "InputImages/*"
model = RRDBNet(3, 3, 64, 23, gc=32)
model.load_state_dict(torch.load(model_path), strict=True)
model.eval()
model = model.to(device)
print("Model path {:s}. \nTesting...".format(model_path))
if __name__ == "__main__":
idx = 0
for path in glob.glob(test_img_folder):
idx += 1
base = osp.splitext(osp.basename(path))[0]
print(idx, base)
# read images
img = cv2.imread(path, cv2.IMREAD_COLOR)
img = img * 1.0 / 255
img = torch.from_numpy(
np.transpose(img[:, :, [2, 1, 0]], (2, 0, 1))
).float()
img_LR = img.unsqueeze(0)
img_LR = img_LR.to(device)
with torch.no_grad():
amdshark_module = compile_through_fx(inference, img_LR)
amdshark_output = amdshark_module.forward((img_LR,))
amdshark_output = torch.from_numpy(amdshark_output)
amdshark_output = (
amdshark_output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
)
esrgan_output = (
model(img_LR).data.squeeze().float().cpu().clamp_(0, 1).numpy()
)
# AMDSHARK OUTPUT
amdshark_output = np.transpose(amdshark_output[[2, 1, 0], :, :], (1, 2, 0))
amdshark_output = (amdshark_output * 255.0).round()
cv2.imwrite(
"OutputImages/{:s}_rlt_amdshark_output.png".format(base), amdshark_output
)
print("Generated AMDSHARK's output")
# ESRGAN OUTPUT
esrgan_output = np.transpose(esrgan_output[[2, 1, 0], :, :], (1, 2, 0))
esrgan_output = (esrgan_output * 255.0).round()
cv2.imwrite(
"OutputImages/{:s}_rlt_esrgan_output.png".format(base),
esrgan_output,
)
print("Generated ESRGAN's output")

View File

@@ -1,86 +0,0 @@
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
from amdshark.amdshark_inference import AMDSharkInference
from amdshark.amdshark_importer import AMDSharkImporter
from iree.compiler import compile_str
from iree import runtime as ireert
import os
import numpy as np
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
class AlbertModule(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = AutoModelForMaskedLM.from_pretrained("albert-base-v2")
self.model.eval()
def forward(self, input_ids, attention_mask):
return self.model(
input_ids=input_ids, attention_mask=attention_mask
).logits
if __name__ == "__main__":
# Prepping Data
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
text = "This [MASK] is very tasty."
encoded_inputs = tokenizer(
text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
return_tensors="pt",
)
inputs = (encoded_inputs["input_ids"], encoded_inputs["attention_mask"])
mlir_importer = AMDSharkImporter(
AlbertModule(),
inputs,
frontend="torch",
)
minilm_mlir, func_name = mlir_importer.import_mlir(
is_dynamic=False, tracing_required=True
)
amdshark_module = AMDSharkInference(minilm_mlir)
amdshark_module.compile()
token_logits = torch.tensor(amdshark_module.forward(inputs))
mask_id = torch.where(
encoded_inputs["input_ids"] == tokenizer.mask_token_id
)[1]
mask_token_logits = token_logits[0, mask_id, :]
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
for token in top_5_tokens:
print(
f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
)
while True:
try:
new_text = input("Give me a sentence with [MASK] to fill: ")
encoded_inputs = tokenizer(
new_text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
return_tensors="pt",
)
inputs = (
encoded_inputs["input_ids"],
encoded_inputs["attention_mask"],
)
token_logits = torch.tensor(amdshark_module.forward(inputs))
mask_id = torch.where(
encoded_inputs["input_ids"] == tokenizer.mask_token_id
)[1]
mask_token_logits = token_logits[0, mask_id, :]
top_5_tokens = (
torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
)
for token in top_5_tokens:
print(
f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
)
except KeyboardInterrupt:
print("Exiting program.")
break

View File

@@ -1,100 +0,0 @@
from PIL import Image
import requests
from transformers import TFAutoModelForMaskedLM, AutoTokenizer
import tensorflow as tf
from amdshark.amdshark_inference import AMDSharkInference
from amdshark.amdshark_importer import AMDSharkImporter
from iree.compiler import tf as tfc
from iree.compiler import compile_str
from iree import runtime as ireert
import os
import numpy as np
import sys
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of inputs
t5_inputs = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
]
class AlbertModule(tf.Module):
def __init__(self):
super(AlbertModule, self).__init__()
self.m = TFAutoModelForMaskedLM.from_pretrained("albert-base-v2")
self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)
@tf.function(input_signature=t5_inputs, jit_compile=True)
def forward(self, input_ids, attention_mask):
return self.m.predict(input_ids, attention_mask)
if __name__ == "__main__":
# Prepping Data
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
# text = "This is a great [MASK]."
text = "This [MASK] is very tasty."
encoded_inputs = tokenizer(
text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
return_tensors="tf",
)
inputs = (encoded_inputs["input_ids"], encoded_inputs["attention_mask"])
mlir_importer = AMDSharkImporter(
AlbertModule(),
inputs,
frontend="tf",
)
minilm_mlir, func_name = mlir_importer.import_mlir(
is_dynamic=False, tracing_required=False
)
amdshark_module = AMDSharkInference(minilm_mlir, mlir_dialect="mhlo")
amdshark_module.compile()
output_idx = 0
data_idx = 1
token_logits = amdshark_module.forward(inputs)[output_idx][data_idx]
mask_id = np.where(
tf.squeeze(encoded_inputs["input_ids"]) == tokenizer.mask_token_id
)
mask_token_logits = token_logits[0, mask_id, :]
top_5_tokens = np.flip(np.argsort(mask_token_logits)).squeeze()[0:5]
for token in top_5_tokens:
print(
f"'>>> Sample/Warmup output: {text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
)
while True:
try:
new_text = input("Give me a sentence with [MASK] to fill: ")
encoded_inputs = tokenizer(
new_text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
return_tensors="tf",
)
inputs = (
encoded_inputs["input_ids"],
encoded_inputs["attention_mask"],
)
token_logits = amdshark_module.forward(inputs)[output_idx][data_idx]
mask_id = np.where(
tf.squeeze(encoded_inputs["input_ids"])
== tokenizer.mask_token_id
)
mask_token_logits = token_logits[0, mask_id, :]
top_5_tokens = np.flip(np.argsort(mask_token_logits)).squeeze()[
0:5
]
for token in top_5_tokens:
print(
f"'>>> {new_text.replace(tokenizer.mask_token, tokenizer.decode(token))}'"
)
except KeyboardInterrupt:
print("Exiting program.")
sys.exit()

View File

@@ -1,14 +0,0 @@
from amdshark.amdshark_inference import AMDSharkInference
from amdshark.amdshark_downloader import download_model
mlir_model, func_name, inputs, golden_out = download_model(
"bloom", frontend="torch"
)
amdshark_module = AMDSharkInference(
mlir_model, device="cpu", mlir_dialect="tm_tensor"
)
amdshark_module.compile()
result = amdshark_module.forward(inputs)
print("The obtained result via amdshark is: ", result)
print("The golden result is:", golden_out)

View File

@@ -1,40 +0,0 @@
from PIL import Image
import requests
from transformers import GPT2Tokenizer, TFGPT2Model
import tensorflow as tf
from amdshark.amdshark_inference import AMDSharkInference
# Create a set of inputs
gpt2_inputs = [
tf.TensorSpec(shape=[1, 8], dtype=tf.int32),
tf.TensorSpec(shape=[1, 8], dtype=tf.int32),
]
class GPT2Module(tf.Module):
def __init__(self):
super(GPT2Module, self).__init__()
self.m = TFGPT2Model.from_pretrained("distilgpt2")
self.m.predict = lambda x, y: self.m(input_ids=x, attention_mask=y)
@tf.function(input_signature=gpt2_inputs, jit_compile=True)
def forward(self, input_ids, attention_mask):
return self.m.predict(input_ids, attention_mask)
if __name__ == "__main__":
# Prepping Data
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
text = "I love the distilled version of models."
inputs = tokenizer(text, return_tensors="tf")
amdshark_module = AMDSharkInference(
GPT2Module(), (inputs["input_ids"], inputs["attention_mask"])
)
amdshark_module.set_frontend("tensorflow")
amdshark_module.compile()
print(
amdshark_module.forward((inputs["input_ids"], inputs["attention_mask"]))
)

View File

@@ -1,18 +0,0 @@
# AMDSHARK LLaMA
## TORCH-MLIR Version
```
https://github.com/nod-ai/torch-mlir.git
```
Then check out the `complex` branch and `git submodule update --init` and then build with `.\build_tools\python_deploy\build_windows.ps1`
### Setup & Run
```
git clone https://github.com/nod-ai/llama.git
```
Then in this repository
```
pip install -e .
python llama/amdshark_model.py
```

View File

@@ -1,72 +0,0 @@
import torch
import torch_mlir
from amdshark.amdshark_inference import AMDSharkInference
from amdshark.amdshark_compile import amdshark_compile_through_fx
from MEGABYTE_pytorch import MEGABYTE
import os
class MegaModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = MEGABYTE(
num_tokens=16000, # number of tokens
dim=(
512,
256,
), # transformer model dimension (512 for coarsest, 256 for fine in this example)
max_seq_len=(
1024,
4,
), # sequence length for global and then local. this can be more than 2
depth=(
6,
4,
), # number of layers for global and then local. this can be more than 2, but length must match the max_seq_len's
dim_head=64, # dimension per head
heads=8, # number of attention heads
flash_attn=True, # use flash attention
)
def forward(self, input):
return self.model(input)
megaModel = MegaModel()
inputs = [torch.randint(0, 16000, (1, 1024, 4))]
# CURRENTLY IT BAILS OUT HERE BECAUSE OF MISSING OP LOWERINGS :-
# 1. aten.alias
amdshark_module, _ = amdshark_compile_through_fx(
model=megaModel,
inputs=inputs,
extended_model_name="mega_amdshark",
is_f16=False,
f16_input_mask=None,
save_dir=os.getcwd(),
debug=False,
generate_or_load_vmfb=True,
extra_args=[],
device="cuda",
mlir_dialect="tm_tensor",
)
# logits = model(x)
def print_output_info(output, msg):
print("\n", msg)
print("\n\t", output.shape)
ans = amdshark_module("forward", inputs)
print_output_info(torch.from_numpy(ans), "AMDSHARK's output")
ans = megaModel.forward(*inputs)
print_output_info(ans, "ORIGINAL Model's output")
# and sample from the logits accordingly
# or you can use the generate function
# NEED TO LOOK AT THIS LATER IF REQUIRED IN AMDSHARK.
# sampled = model.generate(temperature = 0.9, filter_thres = 0.9) # (1, 1024, 4)

View File

@@ -1,31 +0,0 @@
from amdshark.amdshark_inference import AMDSharkInference
import numpy as np
mhlo_ir = r"""builtin.module {
func.func @forward(%arg0: tensor<1x4xf32>, %arg1: tensor<4x1xf32>) -> tensor<4x4xf32> {
%0 = chlo.broadcast_add %arg0, %arg1 : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor<4x4xf32>
%1 = "mhlo.abs"(%0) : (tensor<4x4xf32>) -> tensor<4x4xf32>
return %1 : tensor<4x4xf32>
}
}"""
arg0 = np.ones((1, 4)).astype(np.float32)
arg1 = np.ones((4, 1)).astype(np.float32)
print("Running amdshark on cpu backend")
amdshark_module = AMDSharkInference(mhlo_ir, device="cpu", mlir_dialect="mhlo")
# Generate the random inputs and feed into the graph.
x = amdshark_module.generate_random_inputs()
amdshark_module.compile()
print(amdshark_module.forward(x))
print("Running amdshark on cuda backend")
amdshark_module = AMDSharkInference(mhlo_ir, device="cuda", mlir_dialect="mhlo")
amdshark_module.compile()
print(amdshark_module.forward(x))
print("Running amdshark on vulkan backend")
amdshark_module = AMDSharkInference(mhlo_ir, device="vulkan", mlir_dialect="mhlo")
amdshark_module.compile()
print(amdshark_module.forward(x))

View File

@@ -1,35 +0,0 @@
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from amdshark.amdshark_inference import AMDSharkInference
torch.manual_seed(0)
tokenizer = AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
class MiniLMSequenceClassification(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = AutoModelForSequenceClassification.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased", # The pretrained model.
num_labels=2, # The number of output labels--2 for binary classification.
output_attentions=False, # Whether the model returns attentions weights.
output_hidden_states=False, # Whether the model returns all hidden-states.
torchscript=True,
)
def forward(self, tokens):
return self.model.forward(tokens)[0]
test_input = torch.randint(2, (1, 128))
amdshark_module = AMDSharkInference(
MiniLMSequenceClassification(),
(test_input,),
jit_trace=True,
benchmark_mode=True,
)
amdshark_module.compile()
amdshark_module.forward((test_input,))
amdshark_module.benchmark_all((test_input,))

View File

@@ -1,61 +0,0 @@
import tensorflow as tf
from transformers import BertModel, BertTokenizer, TFBertModel
from amdshark.amdshark_inference import AMDSharkInference
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
# Create a BERT trainer with the created network.
self.m = TFBertModel.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased", from_pt=True
)
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m.predict = lambda x, y, z: self.m.call(
input_ids=x, attention_mask=y, token_type_ids=z, training=False
)
@tf.function(input_signature=bert_input, jit_compile=True)
def forward(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)
if __name__ == "__main__":
# Prepping Data
tokenizer = BertTokenizer.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased"
)
text = "Replace me by any text you'd like."
encoded_input = tokenizer(
text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
)
for key in encoded_input:
encoded_input[key] = tf.expand_dims(
tf.convert_to_tensor(encoded_input[key]), 0
)
test_input = (
encoded_input["input_ids"],
encoded_input["attention_mask"],
encoded_input["token_type_ids"],
)
amdshark_module = AMDSharkInference(
BertModule(), test_input, benchmark_mode=True
)
amdshark_module.set_frontend("tensorflow")
amdshark_module.compile()
amdshark_module.benchmark_all(test_input)

View File

@@ -1,73 +0,0 @@
from transformers import AutoTokenizer, FlaxAutoModel
import torch
import jax
from typing import Union, Dict, List, Any
import numpy as np
from amdshark.amdshark_inference import AMDSharkInference
import io
NumpyTree = Union[np.ndarray, Dict[str, np.ndarray], List[np.ndarray]]
def convert_torch_tensor_tree_to_numpy(
tree: Union[torch.tensor, Dict[str, torch.tensor], List[torch.tensor]]
) -> NumpyTree:
return jax.tree_util.tree_map(
lambda torch_tensor: torch_tensor.cpu().detach().numpy(), tree
)
def convert_int64_to_int32(tree: NumpyTree) -> NumpyTree:
return jax.tree_util.tree_map(
lambda tensor: np.array(tensor, dtype=np.int32)
if tensor.dtype == np.int64
else tensor,
tree,
)
def get_sample_input():
tokenizer = AutoTokenizer.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased"
)
inputs_torch = tokenizer("Hello, World!", return_tensors="pt")
return convert_int64_to_int32(
convert_torch_tensor_tree_to_numpy(inputs_torch.data)
)
def get_jax_model():
return FlaxAutoModel.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
def export_jax_to_mlir(jax_model: Any, sample_input: NumpyTree):
model_mlir = jax.jit(jax_model).lower(**sample_input).compiler_ir()
byte_stream = io.BytesIO()
model_mlir.operation.write_bytecode(file=byte_stream)
return byte_stream.getvalue()
def assert_array_list_allclose(x, y, *args, **kwargs):
assert len(x) == len(y)
for a, b in zip(x, y):
np.testing.assert_allclose(
np.asarray(a), np.asarray(b), *args, **kwargs
)
sample_input = get_sample_input()
jax_model = get_jax_model()
mlir = export_jax_to_mlir(jax_model, sample_input)
# Compile and load module.
amdshark_inference = AMDSharkInference(mlir_module=mlir, mlir_dialect="mhlo")
amdshark_inference.compile()
# Run main function.
result = amdshark_inference("main", jax.tree_util.tree_flatten(sample_input)[0])
# Run JAX model.
reference_result = jax.tree_util.tree_flatten(jax_model(**sample_input))[0]
# Verify result.
assert_array_list_allclose(result, reference_result, atol=1e-5)

View File

@@ -1,6 +0,0 @@
flax
jax[cpu]
nodai-AMDSHARK
orbax
transformers
torch

View File

@@ -1,23 +0,0 @@
from amdshark.amdshark_inference import AMDSharkInference
from amdshark.amdshark_downloader import download_model
mlir_model, func_name, inputs, golden_out = download_model(
"microsoft/MiniLM-L12-H384-uncased",
frontend="torch",
)
amdshark_module = AMDSharkInference(mlir_model, device="cpu", mlir_dialect="linalg")
amdshark_module.compile()
result = amdshark_module.forward(inputs)
print("The obtained result via amdshark is: ", result)
print("The golden result is:", golden_out)
# Let's generate random inputs, currently supported
# for static models.
rand_inputs = amdshark_module.generate_random_inputs()
rand_results = amdshark_module.forward(rand_inputs)
print("Running amdshark_module with random_inputs is: ", rand_results)

View File

@@ -1,70 +0,0 @@
import tensorflow as tf
from transformers import BertModel, BertTokenizer, TFBertModel
from amdshark.amdshark_inference import AMDSharkInference
MAX_SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, MAX_SEQUENCE_LENGTH], dtype=tf.int32),
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
# Create a BERT trainer with the created network.
self.m = TFBertModel.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased", from_pt=True
)
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m.predict = lambda x, y, z: self.m.call(
input_ids=x, attention_mask=y, token_type_ids=z, training=False
)
@tf.function(input_signature=bert_input, jit_compile=True)
def forward(self, input_ids, attention_mask, token_type_ids):
return self.m.predict(input_ids, attention_mask, token_type_ids)
if __name__ == "__main__":
# Prepping Data
tokenizer = BertTokenizer.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased"
)
text = "Replace me by any text you'd like."
encoded_input = tokenizer(
text,
padding="max_length",
truncation=True,
max_length=MAX_SEQUENCE_LENGTH,
)
for key in encoded_input:
encoded_input[key] = tf.expand_dims(
tf.convert_to_tensor(encoded_input[key]), 0
)
amdshark_module = AMDSharkInference(
BertModule(),
(
encoded_input["input_ids"],
encoded_input["attention_mask"],
encoded_input["token_type_ids"],
),
)
amdshark_module.set_frontend("tensorflow")
amdshark_module.compile()
print(
amdshark_module.forward(
(
encoded_input["input_ids"],
encoded_input["attention_mask"],
encoded_input["token_type_ids"],
)
)
)

File diff suppressed because one or more lines are too long

View File

@@ -1,39 +0,0 @@
import torch
import torchvision.models as models
from amdshark.amdshark_inference import AMDSharkInference
from amdshark.amdshark_importer import AMDSharkImporter
torch.hub.list("zhanghang1989/ResNeSt", force_reload=True)
class ResnestModule(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = torch.hub.load(
"zhanghang1989/ResNeSt", "resnest50", pretrained=True
)
self.model.eval()
def forward(self, input):
return self.model.forward(input)
input = torch.randn(1, 3, 224, 224)
mlir_importer = AMDSharkImporter(
ResnestModule(),
(input,),
frontend="torch",
)
(vision_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
tracing_required=True
)
print(golden_out)
amdshark_module = AMDSharkInference(vision_mlir, mlir_dialect="linalg")
amdshark_module.compile()
result = amdshark_module.forward((input,))
print("Obtained result", result)

View File

@@ -1,74 +0,0 @@
from amdshark.amdshark_inference import AMDSharkInference
from amdshark.parser import amdshark_args
import torch
import numpy as np
import sys
import torchvision.models as models
import torch_mlir
torch.manual_seed(0)
class VisionModule(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = models.resnet50(pretrained=True)
self.train(False)
def forward(self, input):
return self.model.forward(input)
model = VisionModule()
test_input = torch.randn(1, 3, 224, 224)
actual_out = model(test_input)
test_input_fp16 = test_input.to(device=torch.device("cuda"), dtype=torch.half)
model_fp16 = model.half()
model_fp16.eval()
model_fp16.to("cuda")
actual_out_fp16 = model_fp16(test_input_fp16)
ts_g = torch.jit.trace(model_fp16, [test_input_fp16])
module = torch_mlir.compile(
ts_g,
(test_input_fp16),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=True,
verbose=False,
)
# from contextlib import redirect_stdout
# with open('resnet50_fp16_linalg_ir.mlir', 'w') as f:
# with redirect_stdout(f):
# print(module.operation.get_asm())
mlir_model = module
func_name = "forward"
amdshark_module = AMDSharkInference(mlir_model, device="cuda", mlir_dialect="linalg")
amdshark_module.compile()
def amdshark_result(x):
x_ny = x.cpu().detach().numpy()
inputs = (x_ny,)
result = amdshark_module.forward(inputs)
return torch.from_numpy(result)
observed_out = amdshark_result(test_input_fp16)
print("Golden result:", actual_out_fp16)
print("AMDSHARK result:", observed_out)
actual_out_fp16 = actual_out_fp16.to(device=torch.device("cpu"))
print(
torch.testing.assert_allclose(
actual_out_fp16, observed_out, rtol=1e-2, atol=1e-2
)
)

View File

@@ -1,85 +0,0 @@
from PIL import Image
import requests
import torch
import torchvision.models as models
from torchvision import transforms
import sys
from amdshark.amdshark_inference import AMDSharkInference
from amdshark.amdshark_downloader import download_model
################################## Preprocessing inputs and model ############
def load_and_preprocess_image(url: str):
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
}
img = Image.open(
requests.get(url, headers=headers, stream=True).raw
).convert("RGB")
# preprocessing pipeline
preprocess = transforms.Compose(
[
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
),
]
)
img_preprocessed = preprocess(img)
return torch.unsqueeze(img_preprocessed, 0)
def load_labels():
classes_text = requests.get(
"https://raw.githubusercontent.com/cathyzhyi/ml-data/main/imagenet-classes.txt",
stream=True,
).text
labels = [line.strip() for line in classes_text.splitlines()]
return labels
def top3_possibilities(res):
_, indexes = torch.sort(res, descending=True)
percentage = torch.nn.functional.softmax(res, dim=1)[0] * 100
top3 = [(labels[idx], percentage[idx].item()) for idx in indexes[0][:3]]
return top3
class Resnet50Module(torch.nn.Module):
def __init__(self):
super().__init__()
self.resnet = models.resnet50(pretrained=True)
self.train(False)
def forward(self, img):
return self.resnet.forward(img)
image_url = "https://upload.wikimedia.org/wikipedia/commons/2/26/YellowLabradorLooking_new.jpg"
print("load image from " + image_url, file=sys.stderr)
img = load_and_preprocess_image(image_url)
labels = load_labels()
##############################################################################
## Can pass any img or input to the forward module.
mlir_model, func_name, inputs, golden_out = download_model(
"resnet50", frontend="torch"
)
amdshark_module = AMDSharkInference(mlir_model, mlir_dialect="linalg")
amdshark_module.compile()
path = amdshark_module.save_module()
amdshark_module.load_module(path)
result = amdshark_module("forward", (img.detach().numpy(),))
print("The top 3 results obtained via amdshark_runner is:")
print(top3_possibilities(torch.from_numpy(result)))
print()
print("The top 3 results obtained via torch is:")
print(top3_possibilities(Resnet50Module()(img)))

View File

@@ -1,842 +0,0 @@
####################################################################################
# Please make sure you have transformers 4.21.2 installed before running this demo
#
# -p --model_path: the directory in which you want to store the bloom files.
# -dl --device_list: the list of device indices you want to use. if you want to only use the first device, or you are running on cpu leave this blank.
# Otherwise, please give this argument in this format: "[0, 1, 2]"
# -de --device: the device you want to run bloom on. E.G. cpu, cuda
# -c, --recompile: set to true if you want to recompile to vmfb.
# -d, --download: set to true if you want to redownload the mlir files
# -cm, --create_mlirs: set to true if you want to create the mlir files from scratch. please make sure you have transformers 4.21.2 before using this option
# -t --token_count: the number of tokens you want to generate
# -pr --prompt: the prompt you want to feed to the model
# -m --model_name: the name of the model, e.g. bloom-560m
#
# If you don't specify a prompt when you run this example, you will be able to give prompts through the terminal. Run the
# example in this way if you want to run multiple examples without reinitializing the model
#####################################################################################
import os
import io
import torch
import torch.nn as nn
from collections import OrderedDict
import torch_mlir
from torch_mlir import TensorPlaceholder
import re
from transformers.models.bloom.configuration_bloom import BloomConfig
import json
import sys
import argparse
import json
import urllib.request
import subprocess
from torch.fx.experimental.proxy_tensor import make_fx
from torch._decomp import get_decompositions
from amdshark.amdshark_inference import AMDSharkInference
from amdshark.amdshark_downloader import download_public_file
from transformers import (
BloomTokenizerFast,
BloomForSequenceClassification,
BloomForCausalLM,
)
from transformers.models.bloom.modeling_bloom import (
BloomBlock,
build_alibi_tensor,
)
IS_CUDA = False
class ShardedBloom:
def __init__(self, src_folder):
f = open(f"{src_folder}/config.json")
config = json.load(f)
f.close()
self.layers_initialized = False
self.src_folder = src_folder
try:
self.n_embed = config["n_embed"]
except KeyError:
self.n_embed = config["hidden_size"]
self.vocab_size = config["vocab_size"]
self.n_layer = config["n_layer"]
try:
self.n_head = config["num_attention_heads"]
except KeyError:
self.n_head = config["n_head"]
def _init_layer(self, layer_name, device, replace, device_idx):
if replace or not os.path.exists(
f"{self.src_folder}/{layer_name}.vmfb"
):
f_ = open(f"{self.src_folder}/{layer_name}.mlir", encoding="utf-8")
module = f_.read()
f_.close()
module = bytes(module, "utf-8")
amdshark_module = AMDSharkInference(
module,
device=device,
mlir_dialect="tm_tensor",
device_idx=device_idx,
)
amdshark_module.save_module(
module_name=f"{self.src_folder}/{layer_name}",
extra_args=[
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
"--iree-stream-resource-max-allocation-size=1000000000",
"--iree-codegen-check-ir-before-llvm-conversion=false",
],
)
else:
amdshark_module = AMDSharkInference(
"",
device=device,
mlir_dialect="tm_tensor",
device_idx=device_idx,
)
return amdshark_module
def init_layers(self, device, replace=False, device_idx=[0]):
if device_idx is not None:
n_devices = len(device_idx)
self.word_embeddings_module = self._init_layer(
"word_embeddings",
device,
replace,
device_idx if device_idx is None else device_idx[0 % n_devices],
)
self.word_embeddings_layernorm_module = self._init_layer(
"word_embeddings_layernorm",
device,
replace,
device_idx if device_idx is None else device_idx[1 % n_devices],
)
self.ln_f_module = self._init_layer(
"ln_f",
device,
replace,
device_idx if device_idx is None else device_idx[2 % n_devices],
)
self.lm_head_module = self._init_layer(
"lm_head",
device,
replace,
device_idx if device_idx is None else device_idx[3 % n_devices],
)
self.block_modules = [
self._init_layer(
f"bloom_block_{i}",
device,
replace,
device_idx
if device_idx is None
else device_idx[(i + 4) % n_devices],
)
for i in range(self.n_layer)
]
self.layers_initialized = True
def load_layers(self):
assert self.layers_initialized
self.word_embeddings_module.load_module(
f"{self.src_folder}/word_embeddings.vmfb"
)
self.word_embeddings_layernorm_module.load_module(
f"{self.src_folder}/word_embeddings_layernorm.vmfb"
)
for block_module, i in zip(self.block_modules, range(self.n_layer)):
block_module.load_module(f"{self.src_folder}/bloom_block_{i}.vmfb")
self.ln_f_module.load_module(f"{self.src_folder}/ln_f.vmfb")
self.lm_head_module.load_module(f"{self.src_folder}/lm_head.vmfb")
def forward_pass(self, input_ids, device):
if IS_CUDA:
cudaSetDevice(self.word_embeddings_module.device_idx)
input_embeds = self.word_embeddings_module(
inputs=(input_ids,), function_name="forward"
)
input_embeds = torch.tensor(input_embeds).float()
if IS_CUDA:
cudaSetDevice(self.word_embeddings_layernorm_module.device_idx)
hidden_states = self.word_embeddings_layernorm_module(
inputs=(input_embeds,), function_name="forward"
)
hidden_states = torch.tensor(hidden_states).float()
attention_mask = torch.ones(
[hidden_states.shape[0], len(input_ids[0])]
)
alibi = build_alibi_tensor(
attention_mask,
self.n_head,
hidden_states.dtype,
hidden_states.device,
)
causal_mask = _prepare_attn_mask(
attention_mask, input_ids.size(), input_embeds, 0
)
causal_mask = torch.tensor(causal_mask).float()
presents = ()
all_hidden_states = tuple(hidden_states)
for block_module, i in zip(self.block_modules, range(self.n_layer)):
if IS_CUDA:
cudaSetDevice(block_module.device_idx)
output = block_module(
inputs=(
hidden_states.detach().numpy(),
alibi.detach().numpy(),
causal_mask.detach().numpy(),
),
function_name="forward",
)
hidden_states = torch.tensor(output[0]).float()
all_hidden_states = all_hidden_states + (hidden_states,)
presents = presents + (
tuple(
(
output[1],
output[2],
)
),
)
if IS_CUDA:
cudaSetDevice(self.ln_f_module.device_idx)
hidden_states = self.ln_f_module(
inputs=(hidden_states,), function_name="forward"
)
if IS_CUDA:
cudaSetDevice(self.lm_head_module.device_idx)
logits = self.lm_head_module(
inputs=(hidden_states,), function_name="forward"
)
logits = torch.tensor(logits).float()
return torch.argmax(logits[:, -1, :], dim=-1)
def _make_causal_mask(
input_ids_shape: torch.Size,
dtype: torch.dtype,
past_key_values_length: int = 0,
):
"""
Make causal mask used for bi-directional self-attention.
"""
batch_size, target_length = input_ids_shape
mask = torch.full((target_length, target_length), torch.finfo(dtype).min)
mask_cond = torch.arange(mask.size(-1))
intermediate_mask = mask_cond < (mask_cond + 1).view(mask.size(-1), 1)
mask.masked_fill_(intermediate_mask, 0)
mask = mask.to(dtype)
if past_key_values_length > 0:
mask = torch.cat(
[
torch.zeros(
target_length, past_key_values_length, dtype=dtype
),
mask,
],
dim=-1,
)
expanded_mask = mask[None, None, :, :].expand(
batch_size, 1, target_length, target_length + past_key_values_length
)
return expanded_mask
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
batch_size, source_length = mask.size()
tgt_len = tgt_len if tgt_len is not None else source_length
expanded_mask = (
mask[:, None, None, :]
.expand(batch_size, 1, tgt_len, source_length)
.to(dtype)
)
inverted_mask = 1.0 - expanded_mask
return inverted_mask.masked_fill(
inverted_mask.to(torch.bool), torch.finfo(dtype).min
)
def _prepare_attn_mask(
attention_mask, input_shape, inputs_embeds, past_key_values_length
):
# create causal mask
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
combined_attention_mask = None
if input_shape[-1] > 1:
combined_attention_mask = _make_causal_mask(
input_shape,
inputs_embeds.dtype,
past_key_values_length=past_key_values_length,
).to(attention_mask.device)
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
expanded_attn_mask = _expand_mask(
attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
)
combined_attention_mask = (
expanded_attn_mask
if combined_attention_mask is None
else expanded_attn_mask + combined_attention_mask
)
return combined_attention_mask
def download_model(destination_folder, model_name):
download_public_file(
f"gs://amdshark_tank/sharded_bloom/{model_name}/", destination_folder
)
def compile_embeddings(embeddings_layer, input_ids, path):
input_ids_placeholder = torch_mlir.TensorPlaceholder.like(
input_ids, dynamic_axes=[1]
)
module = torch_mlir.compile(
embeddings_layer,
(input_ids_placeholder),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
bytecode_stream = io.BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(path, "w+")
f_.write(str(module))
f_.close()
return
def compile_word_embeddings_layernorm(
embeddings_layer_layernorm, embeds, path
):
embeds_placeholder = torch_mlir.TensorPlaceholder.like(
embeds, dynamic_axes=[1]
)
module = torch_mlir.compile(
embeddings_layer_layernorm,
(embeds_placeholder),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
bytecode_stream = io.BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(path, "w+")
f_.write(str(module))
f_.close()
return
def strip_overloads(gm):
"""
Modifies the target of graph nodes in :attr:`gm` to strip overloads.
Args:
gm(fx.GraphModule): The input Fx graph module to be modified
"""
for node in gm.graph.nodes:
if isinstance(node.target, torch._ops.OpOverload):
node.target = node.target.overloadpacket
gm.recompile()
def compile_to_mlir(
bblock,
hidden_states,
layer_past=None,
attention_mask=None,
head_mask=None,
use_cache=None,
output_attentions=False,
alibi=None,
block_index=0,
path=".",
):
fx_g = make_fx(
bblock,
decomposition_table=get_decompositions(
[
torch.ops.aten.split.Tensor,
torch.ops.aten.split_with_sizes,
]
),
tracing_mode="real",
_allow_non_fake_inputs=False,
)(hidden_states, alibi, attention_mask)
fx_g.graph.set_codegen(torch.fx.graph.CodeGen())
fx_g.recompile()
strip_overloads(fx_g)
hidden_states_placeholder = TensorPlaceholder.like(
hidden_states, dynamic_axes=[1]
)
attention_mask_placeholder = TensorPlaceholder.like(
attention_mask, dynamic_axes=[2, 3]
)
alibi_placeholder = TensorPlaceholder.like(alibi, dynamic_axes=[2])
ts_g = torch.jit.script(fx_g)
module = torch_mlir.compile(
ts_g,
(
hidden_states_placeholder,
alibi_placeholder,
attention_mask_placeholder,
),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
module_placeholder = module
module_context = module_placeholder.context
def check_valid_line(line, line_n, mlir_file_len):
if "private" in line:
return False
if "attributes" in line:
return False
if mlir_file_len - line_n == 2:
return False
return True
mlir_file_len = len(str(module).split("\n"))
def remove_constant_dim(line):
if "17x" in line:
line = re.sub("17x", "?x", line)
line = re.sub("tensor.empty\(\)", "tensor.empty(%dim)", line)
if "tensor.empty" in line and "?x?" in line:
line = re.sub(
"tensor.empty\(%dim\)", "tensor.empty(%dim, %dim)", line
)
if "arith.cmpi eq" in line:
line = re.sub("c17", "dim", line)
if " 17," in line:
line = re.sub(" 17,", " %dim,", line)
return line
module = "\n".join(
[
remove_constant_dim(line)
for line, line_n in zip(
str(module).split("\n"), range(mlir_file_len)
)
if check_valid_line(line, line_n, mlir_file_len)
]
)
module = module_placeholder.parse(module, context=module_context)
bytecode_stream = io.BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(path, "w+")
f_.write(str(module))
f_.close()
return
def compile_ln_f(ln_f, hidden_layers, path):
hidden_layers_placeholder = torch_mlir.TensorPlaceholder.like(
hidden_layers, dynamic_axes=[1]
)
module = torch_mlir.compile(
ln_f,
(hidden_layers_placeholder),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
bytecode_stream = io.BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(path, "w+")
f_.write(str(module))
f_.close()
return
def compile_lm_head(lm_head, hidden_layers, path):
hidden_layers_placeholder = torch_mlir.TensorPlaceholder.like(
hidden_layers, dynamic_axes=[1]
)
module = torch_mlir.compile(
lm_head,
(hidden_layers_placeholder),
torch_mlir.OutputType.LINALG_ON_TENSORS,
use_tracing=False,
verbose=False,
)
bytecode_stream = io.BytesIO()
module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
f_ = open(path, "w+")
f_.write(str(module))
f_.close()
return
def create_mlirs(destination_folder, model_name):
model_config = "bigscience/" + model_name
sample_input_ids = torch.ones([1, 17], dtype=torch.int64)
urllib.request.urlretrieve(
f"https://huggingface.co/bigscience/{model_name}/resolve/main/config.json",
filename=f"{destination_folder}/config.json",
)
urllib.request.urlretrieve(
f"https://huggingface.co/bigscience/bloom/resolve/main/tokenizer.json",
filename=f"{destination_folder}/tokenizer.json",
)
class HuggingFaceLanguage(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = BloomForCausalLM.from_pretrained(model_config)
def forward(self, tokens):
return self.model.forward(tokens)[0]
class HuggingFaceBlock(torch.nn.Module):
def __init__(self, block):
super().__init__()
self.model = block
def forward(self, tokens, alibi, attention_mask):
output = self.model(
hidden_states=tokens,
alibi=alibi,
attention_mask=attention_mask,
use_cache=True,
output_attentions=False,
)
return (output[0], output[1][0], output[1][1])
model = HuggingFaceLanguage()
compile_embeddings(
model.model.transformer.word_embeddings,
sample_input_ids,
f"{destination_folder}/word_embeddings.mlir",
)
inputs_embeds = model.model.transformer.word_embeddings(sample_input_ids)
compile_word_embeddings_layernorm(
model.model.transformer.word_embeddings_layernorm,
inputs_embeds,
f"{destination_folder}/word_embeddings_layernorm.mlir",
)
hidden_states = model.model.transformer.word_embeddings_layernorm(
inputs_embeds
)
input_shape = sample_input_ids.size()
current_sequence_length = hidden_states.shape[1]
past_key_values_length = 0
past_key_values = tuple([None] * len(model.model.transformer.h))
attention_mask = torch.ones(
(hidden_states.shape[0], current_sequence_length), device="cpu"
)
alibi = build_alibi_tensor(
attention_mask,
model.model.transformer.n_head,
hidden_states.dtype,
"cpu",
)
causal_mask = _prepare_attn_mask(
attention_mask, input_shape, inputs_embeds, past_key_values_length
)
head_mask = model.model.transformer.get_head_mask(
None, model.model.transformer.config.n_layer
)
output_attentions = model.model.transformer.config.output_attentions
all_hidden_states = ()
for i, (block, layer_past) in enumerate(
zip(model.model.transformer.h, past_key_values)
):
all_hidden_states = all_hidden_states + (hidden_states,)
proxy_model = HuggingFaceBlock(block)
compile_to_mlir(
proxy_model,
hidden_states,
layer_past=layer_past,
attention_mask=causal_mask,
head_mask=head_mask[i],
use_cache=True,
output_attentions=output_attentions,
alibi=alibi,
block_index=i,
path=f"{destination_folder}/bloom_block_{i}.mlir",
)
compile_ln_f(
model.model.transformer.ln_f,
hidden_states,
f"{destination_folder}/ln_f.mlir",
)
hidden_states = model.model.transformer.ln_f(hidden_states)
compile_lm_head(
model.model.lm_head,
hidden_states,
f"{destination_folder}/lm_head.mlir",
)
def run_large_model(
token_count,
recompile,
model_path,
prompt,
device_list,
script_path,
device,
):
f = open(f"{model_path}/prompt.txt", "w+")
f.write(prompt)
f.close()
for i in range(token_count):
if i == 0:
will_compile = recompile
else:
will_compile = False
f = open(f"{model_path}/prompt.txt", "r")
prompt = f.read()
f.close()
subprocess.run(
[
"python",
script_path,
model_path,
"start",
str(will_compile),
"cpu",
"None",
prompt,
]
)
for i in range(config["n_layer"]):
if device_list is not None:
device_idx = str(device_list[i % len(device_list)])
else:
device_idx = "None"
subprocess.run(
[
"python",
script_path,
model_path,
str(i),
str(will_compile),
device,
device_idx,
prompt,
]
)
subprocess.run(
[
"python",
script_path,
model_path,
"end",
str(will_compile),
"cpu",
"None",
prompt,
]
)
f = open(f"{model_path}/prompt.txt", "r")
output = f.read()
f.close()
print(output)
if __name__ == "__main__":
parser = argparse.ArgumentParser(prog="Bloom-560m")
parser.add_argument("-p", "--model_path")
parser.add_argument("-dl", "--device_list", default=None)
parser.add_argument("-de", "--device", default="cpu")
parser.add_argument("-c", "--recompile", default=False, type=bool)
parser.add_argument("-d", "--download", default=False, type=bool)
parser.add_argument("-t", "--token_count", default=10, type=int)
parser.add_argument("-m", "--model_name", default="bloom-560m")
parser.add_argument("-cm", "--create_mlirs", default=False, type=bool)
parser.add_argument(
"-lm", "--large_model_memory_efficient", default=False, type=bool
)
parser.add_argument(
"-pr",
"--prompt",
default=None,
)
args = parser.parse_args()
if args.create_mlirs and args.large_model_memory_efficient:
print(
"Warning: If you need to use memory efficient mode, you probably want to use 'download' instead"
)
if not os.path.isdir(args.model_path):
os.mkdir(args.model_path)
if args.device_list is not None:
args.device_list = json.loads(args.device_list)
if args.device == "cuda" and args.device_list is not None:
IS_CUDA = True
from cuda.cudart import cudaSetDevice
if args.download and args.create_mlirs:
print(
"WARNING: It is not advised to turn on both download and create_mlirs"
)
if args.download:
download_model(args.model_path, args.model_name)
if args.create_mlirs:
create_mlirs(args.model_path, args.model_name)
from transformers import AutoTokenizer, AutoModelForCausalLM, BloomConfig
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
if args.prompt is not None:
input_ids = tokenizer.encode(args.prompt, return_tensors="pt")
if args.large_model_memory_efficient:
f = open(f"{args.model_path}/config.json")
config = json.load(f)
f.close()
self_path = os.path.dirname(os.path.abspath(__file__))
script_path = os.path.join(self_path, "sharded_bloom_large_models.py")
if args.prompt is not None:
run_large_model(
args.token_count,
args.recompile,
args.model_path,
args.prompt,
args.device_list,
script_path,
args.device,
)
else:
while True:
prompt = input("Enter Prompt: ")
try:
token_count = int(
input("Enter number of tokens you want to generate: ")
)
except:
print(
"Invalid integer entered. Using default value of 10"
)
token_count = 10
run_large_model(
token_count,
args.recompile,
args.model_path,
prompt,
args.device_list,
script_path,
args.device,
)
else:
shardedbloom = ShardedBloom(args.model_path)
shardedbloom.init_layers(
device=args.device,
replace=args.recompile,
device_idx=args.device_list,
)
shardedbloom.load_layers()
if args.prompt is not None:
for _ in range(args.token_count):
next_token = shardedbloom.forward_pass(
torch.tensor(input_ids), device=args.device
)
input_ids = torch.cat(
[input_ids, next_token.unsqueeze(-1)], dim=-1
)
print(tokenizer.decode(input_ids.squeeze()))
else:
while True:
prompt = input("Enter Prompt: ")
try:
token_count = int(
input("Enter number of tokens you want to generate: ")
)
except:
print(
"Invalid integer entered. Using default value of 10"
)
token_count = 10
input_ids = tokenizer.encode(prompt, return_tensors="pt")
for _ in range(token_count):
next_token = shardedbloom.forward_pass(
torch.tensor(input_ids), device=args.device
)
input_ids = torch.cat(
[input_ids, next_token.unsqueeze(-1)], dim=-1
)
print(tokenizer.decode(input_ids.squeeze()))

View File

@@ -1,381 +0,0 @@
import sys
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BloomConfig
import re
from amdshark.amdshark_inference import AMDSharkInference
import torch
import torch.nn as nn
from collections import OrderedDict
from transformers.models.bloom.modeling_bloom import (
BloomBlock,
build_alibi_tensor,
)
import time
import json
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
batch_size, source_length = mask.size()
tgt_len = tgt_len if tgt_len is not None else source_length
expanded_mask = (
mask[:, None, None, :]
.expand(batch_size, 1, tgt_len, source_length)
.to(dtype)
)
inverted_mask = 1.0 - expanded_mask
return inverted_mask.masked_fill(
inverted_mask.to(torch.bool), torch.finfo(dtype).min
)
def _prepare_attn_mask(
attention_mask, input_shape, inputs_embeds, past_key_values_length
):
# create causal mask
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
combined_attention_mask = None
if input_shape[-1] > 1:
combined_attention_mask = _make_causal_mask(
input_shape,
inputs_embeds.dtype,
past_key_values_length=past_key_values_length,
).to(attention_mask.device)
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
expanded_attn_mask = _expand_mask(
attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
)
combined_attention_mask = (
expanded_attn_mask
if combined_attention_mask is None
else expanded_attn_mask + combined_attention_mask
)
return combined_attention_mask
def _make_causal_mask(
input_ids_shape: torch.Size,
dtype: torch.dtype,
past_key_values_length: int = 0,
):
"""
Make causal mask used for bi-directional self-attention.
"""
batch_size, target_length = input_ids_shape
mask = torch.full((target_length, target_length), torch.finfo(dtype).min)
mask_cond = torch.arange(mask.size(-1))
intermediate_mask = mask_cond < (mask_cond + 1).view(mask.size(-1), 1)
mask.masked_fill_(intermediate_mask, 0)
mask = mask.to(dtype)
if past_key_values_length > 0:
mask = torch.cat(
[
torch.zeros(
target_length, past_key_values_length, dtype=dtype
),
mask,
],
dim=-1,
)
expanded_mask = mask[None, None, :, :].expand(
batch_size, 1, target_length, target_length + past_key_values_length
)
return expanded_mask
if __name__ == "__main__":
working_dir = sys.argv[1]
layer_name = sys.argv[2]
will_compile = sys.argv[3]
device = sys.argv[4]
device_idx = sys.argv[5]
prompt = sys.argv[6]
if device_idx.lower().strip() == "none":
device_idx = None
else:
device_idx = int(device_idx)
if will_compile.lower().strip() == "true":
will_compile = True
else:
will_compile = False
f = open(f"{working_dir}/config.json")
config = json.load(f)
f.close()
layers_initialized = False
try:
n_embed = config["n_embed"]
except KeyError:
n_embed = config["hidden_size"]
vocab_size = config["vocab_size"]
n_layer = config["n_layer"]
try:
n_head = config["num_attention_heads"]
except KeyError:
n_head = config["n_head"]
if not os.path.isdir(working_dir):
os.mkdir(working_dir)
if layer_name == "start":
tokenizer = AutoTokenizer.from_pretrained(working_dir)
input_ids = tokenizer.encode(prompt, return_tensors="pt")
mlir_str = ""
if will_compile:
f = open(f"{working_dir}/word_embeddings.mlir", encoding="utf-8")
mlir_str = f.read()
f.close()
mlir_str = bytes(mlir_str, "utf-8")
amdshark_module = AMDSharkInference(
mlir_str,
device="cpu",
mlir_dialect="tm_tensor",
device_idx=None,
)
if will_compile:
amdshark_module.save_module(
module_name=f"{working_dir}/word_embeddings",
extra_args=[
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
"--iree-stream-resource-max-allocation-size=1000000000",
"--iree-codegen-check-ir-before-llvm-conversion=false",
],
)
amdshark_module.load_module(f"{working_dir}/word_embeddings.vmfb")
input_embeds = amdshark_module(
inputs=(input_ids,), function_name="forward"
)
input_embeds = torch.tensor(input_embeds).float()
mlir_str = ""
if will_compile:
f = open(
f"{working_dir}/word_embeddings_layernorm.mlir",
encoding="utf-8",
)
mlir_str = f.read()
f.close()
amdshark_module = AMDSharkInference(
mlir_str,
device="cpu",
mlir_dialect="tm_tensor",
device_idx=None,
)
if will_compile:
amdshark_module.save_module(
module_name=f"{working_dir}/word_embeddings_layernorm",
extra_args=[
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
"--iree-stream-resource-max-allocation-size=1000000000",
"--iree-codegen-check-ir-before-llvm-conversion=false",
],
)
amdshark_module.load_module(
f"{working_dir}/word_embeddings_layernorm.vmfb"
)
hidden_states = amdshark_module(
inputs=(input_embeds,), function_name="forward"
)
hidden_states = torch.tensor(hidden_states).float()
torch.save(hidden_states, f"{working_dir}/hidden_states_0.pt")
attention_mask = torch.ones(
[hidden_states.shape[0], len(input_ids[0])]
)
attention_mask = torch.tensor(attention_mask).float()
alibi = build_alibi_tensor(
attention_mask,
n_head,
hidden_states.dtype,
device="cpu",
)
torch.save(alibi, f"{working_dir}/alibi.pt")
causal_mask = _prepare_attn_mask(
attention_mask, input_ids.size(), input_embeds, 0
)
causal_mask = torch.tensor(causal_mask).float()
torch.save(causal_mask, f"{working_dir}/causal_mask.pt")
elif layer_name in [str(x) for x in range(n_layer)]:
hidden_states = torch.load(
f"{working_dir}/hidden_states_{layer_name}.pt"
)
alibi = torch.load(f"{working_dir}/alibi.pt")
causal_mask = torch.load(f"{working_dir}/causal_mask.pt")
mlir_str = ""
if will_compile:
f = open(
f"{working_dir}/bloom_block_{layer_name}.mlir",
encoding="utf-8",
)
mlir_str = f.read()
f.close()
mlir_str = bytes(mlir_str, "utf-8")
amdshark_module = AMDSharkInference(
mlir_str,
device=device,
mlir_dialect="tm_tensor",
device_idx=device_idx,
)
if will_compile:
amdshark_module.save_module(
module_name=f"{working_dir}/bloom_block_{layer_name}",
extra_args=[
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
"--iree-stream-resource-max-allocation-size=1000000000",
"--iree-codegen-check-ir-before-llvm-conversion=false",
],
)
amdshark_module.load_module(
f"{working_dir}/bloom_block_{layer_name}.vmfb"
)
output = amdshark_module(
inputs=(
hidden_states.detach().numpy(),
alibi.detach().numpy(),
causal_mask.detach().numpy(),
),
function_name="forward",
)
hidden_states = torch.tensor(output[0]).float()
torch.save(
hidden_states,
f"{working_dir}/hidden_states_{int(layer_name) + 1}.pt",
)
elif layer_name == "end":
mlir_str = ""
if will_compile:
f = open(f"{working_dir}/ln_f.mlir", encoding="utf-8")
mlir_str = f.read()
f.close()
mlir_str = bytes(mlir_str, "utf-8")
amdshark_module = AMDSharkInference(
mlir_str,
device="cpu",
mlir_dialect="tm_tensor",
device_idx=None,
)
if will_compile:
amdshark_module.save_module(
module_name=f"{working_dir}/ln_f",
extra_args=[
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
"--iree-stream-resource-max-allocation-size=1000000000",
"--iree-codegen-check-ir-before-llvm-conversion=false",
],
)
amdshark_module.load_module(f"{working_dir}/ln_f.vmfb")
hidden_states = torch.load(f"{working_dir}/hidden_states_{n_layer}.pt")
hidden_states = amdshark_module(
inputs=(hidden_states,), function_name="forward"
)
mlir_str = ""
if will_compile:
f = open(f"{working_dir}/lm_head.mlir", encoding="utf-8")
mlir_str = f.read()
f.close()
mlir_str = bytes(mlir_str, "utf-8")
if config["n_embed"] == 14336:
def get_state_dict():
d = torch.load(
f"{working_dir}/pytorch_model_00001-of-00072.bin"
)
return OrderedDict(
(k.replace("word_embeddings.", ""), v)
for k, v in d.items()
)
def load_causal_lm_head():
linear = nn.utils.skip_init(
nn.Linear, 14336, 250880, bias=False, dtype=torch.float
)
linear.load_state_dict(get_state_dict(), strict=False)
return linear.float()
lm_head = load_causal_lm_head()
logits = lm_head(torch.tensor(hidden_states).float())
else:
amdshark_module = AMDSharkInference(
mlir_str,
device="cpu",
mlir_dialect="tm_tensor",
device_idx=None,
)
if will_compile:
amdshark_module.save_module(
module_name=f"{working_dir}/lm_head",
extra_args=[
"--iree-vm-bytecode-module-output-format=flatbuffer-binary",
"--iree-stream-resource-max-allocation-size=1000000000",
"--iree-codegen-check-ir-before-llvm-conversion=false",
],
)
amdshark_module.load_module(f"{working_dir}/lm_head.vmfb")
logits = amdshark_module(
inputs=(hidden_states,), function_name="forward"
)
logits = torch.tensor(logits).float()
tokenizer = AutoTokenizer.from_pretrained(working_dir)
next_token = tokenizer.decode(torch.argmax(logits[:, -1, :], dim=-1))
f = open(f"{working_dir}/prompt.txt", "w+")
f.write(prompt + next_token)
f.close()

View File

@@ -1,390 +0,0 @@
# Description: an implementation of a deep learning recommendation model (DLRM)
# The model input consists of dense and sparse features. The former is a vector
# of floating point values. The latter is a list of sparse indices into
# embedding tables, which consist of vectors of floating point values.
# The selected vectors are passed to mlp networks denoted by triangles,
# in some cases the vectors are interacted through operators (Ops).
#
# output:
# vector of values
# model: |
# /\
# /__\
# |
# _____________________> Op <___________________
# / | \
# /\ /\ /\
# /__\ /__\ ... /__\
# | | |
# | Op Op
# | ____/__\_____ ____/__\____
# | |_Emb_|____|__| ... |_Emb_|__|___|
# input:
# [ dense features ] [sparse indices] , ..., [sparse indices]
#
# More precise definition of model layers:
# 1) fully connected layers of an mlp
# z = f(y)
# y = Wx + b
#
# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk])
# z = Op(e1,...,ek)
# obtain vectors e1=E[:,p1], ..., ek=E[:,pk]
#
# 3) Operator Op can be one of the following
# Sum(e1,...,ek) = e1 + ... + ek
# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek]
# Cat(e1,...,ek) = [e1', ..., ek']'
# where ' denotes transpose operation
#
# References:
# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang,
# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu,
# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii,
# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko,
# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong,
# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and
# Recommendation Systems", CoRR, arXiv:1906.00091, 2019
import argparse
import sys
import numpy as np
import torch
import torch.nn as nn
from amdshark.amdshark_inference import AMDSharkInference
from amdshark.amdshark_importer import AMDSharkImporter
torch.manual_seed(0)
np.random.seed(0)
### define dlrm in PyTorch ###
class DLRM_Net(nn.Module):
def create_mlp(self, ln, sigmoid_layer):
# build MLP layer by layer
layers = nn.ModuleList()
for i in range(0, ln.size - 1):
n = ln[i]
m = ln[i + 1]
# construct fully connected operator
LL = nn.Linear(int(n), int(m), bias=True)
# initialize the weights
# with torch.no_grad():
# custom Xavier input, output or two-sided fill
mean = 0.0 # std_dev = np.sqrt(variance)
std_dev = np.sqrt(2 / (m + n)) # np.sqrt(1 / m) # np.sqrt(1 / n)
W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32)
std_dev = np.sqrt(1 / m) # np.sqrt(2 / (m + 1))
bt = np.random.normal(mean, std_dev, size=m).astype(np.float32)
LL.weight.data = torch.tensor(W, requires_grad=True)
LL.bias.data = torch.tensor(bt, requires_grad=True)
# approach 2
# LL.weight.data.copy_(torch.tensor(W))
# LL.bias.data.copy_(torch.tensor(bt))
# approach 3
# LL.weight = Parameter(torch.tensor(W),requires_grad=True)
# LL.bias = Parameter(torch.tensor(bt),requires_grad=True)
layers.append(LL)
# construct sigmoid or relu operator
if i == sigmoid_layer:
layers.append(nn.Sigmoid())
else:
layers.append(nn.ReLU())
# approach 1: use ModuleList
# return layers
# approach 2: use Sequential container to wrap all layers
return torch.nn.Sequential(*layers)
def create_emb(self, m, ln, weighted_pooling=None):
emb_l = nn.ModuleList()
v_W_l = []
for i in range(0, ln.size):
n = ln[i]
# construct embedding operator
EE = nn.EmbeddingBag(n, m, mode="sum")
# initialize embeddings
# nn.init.uniform_(EE.weight, a=-np.sqrt(1 / n), b=np.sqrt(1 / n))
W = np.random.uniform(
low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m)
).astype(np.float32)
# approach 1
print(W)
EE.weight.data = torch.tensor(W, requires_grad=True)
# approach 2
# EE.weight.data.copy_(torch.tensor(W))
# approach 3
# EE.weight = Parameter(torch.tensor(W),requires_grad=True)
if weighted_pooling is None:
v_W_l.append(None)
else:
v_W_l.append(torch.ones(n, dtype=torch.float32))
emb_l.append(EE)
return emb_l, v_W_l
def __init__(
self,
m_spa=None,
ln_emb=None,
ln_bot=None,
ln_top=None,
arch_interaction_op=None,
arch_interaction_itself=False,
sigmoid_bot=-1,
sigmoid_top=-1,
weighted_pooling=None,
):
super(DLRM_Net, self).__init__()
if (
(m_spa is not None)
and (ln_emb is not None)
and (ln_bot is not None)
and (ln_top is not None)
and (arch_interaction_op is not None)
):
# save arguments
self.output_d = 0
self.arch_interaction_op = arch_interaction_op
self.arch_interaction_itself = arch_interaction_itself
if weighted_pooling is not None and weighted_pooling != "fixed":
self.weighted_pooling = "learned"
else:
self.weighted_pooling = weighted_pooling
# create operators
self.emb_l, w_list = self.create_emb(
m_spa, ln_emb, weighted_pooling
)
if self.weighted_pooling == "learned":
self.v_W_l = nn.ParameterList()
for w in w_list:
self.v_W_l.append(nn.Parameter(w))
else:
self.v_W_l = w_list
self.bot_l = self.create_mlp(ln_bot, sigmoid_bot)
self.top_l = self.create_mlp(ln_top, sigmoid_top)
def apply_mlp(self, x, layers):
return layers(x)
def apply_emb(self, lS_o, lS_i, emb_l, v_W_l):
# WARNING: notice that we are processing the batch at once. We implicitly
# assume that the data is laid out such that:
# 1. each embedding is indexed with a group of sparse indices,
# corresponding to a single lookup
# 2. for each embedding the lookups are further organized into a batch
# 3. for a list of embedding tables there is a list of batched lookups
# TORCH-MLIR
# We are passing all the embeddings as arguments for easy parsing.
ly = []
for k, sparse_index_group_batch in enumerate(lS_i):
sparse_offset_group_batch = lS_o[k]
# embedding lookup
# We are using EmbeddingBag, which implicitly uses sum operator.
# The embeddings are represented as tall matrices, with sum
# happening vertically across 0 axis, resulting in a row vector
# E = emb_l[k]
if v_W_l[k] is not None:
per_sample_weights = v_W_l[k].gather(
0, sparse_index_group_batch
)
else:
per_sample_weights = None
E = emb_l[k]
V = E(
sparse_index_group_batch,
sparse_offset_group_batch,
per_sample_weights=per_sample_weights,
)
ly.append(V)
return ly
def interact_features(self, x, ly):
if self.arch_interaction_op == "dot":
# concatenate dense and sparse features
(batch_size, d) = x.shape
T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d))
# perform a dot product
Z = torch.bmm(T, torch.transpose(T, 1, 2))
# append dense feature with the interactions (into a row vector)
# approach 1: all
# Zflat = Z.view((batch_size, -1))
# approach 2: unique
_, ni, nj = Z.shape
# approach 1: tril_indices
# offset = 0 if self.arch_interaction_itself else -1
# li, lj = torch.tril_indices(ni, nj, offset=offset)
# approach 2: custom
offset = 1 if self.arch_interaction_itself else 0
li = torch.tensor(
[i for i in range(ni) for j in range(i + offset)]
)
lj = torch.tensor(
[j for i in range(nj) for j in range(i + offset)]
)
Zflat = Z[:, li, lj]
# concatenate dense features and interactions
R = torch.cat([x] + [Zflat], dim=1)
elif self.arch_interaction_op == "cat":
# concatenation features (into a row vector)
R = torch.cat([x] + ly, dim=1)
else:
sys.exit(
"ERROR: --arch-interaction-op="
+ self.arch_interaction_op
+ " is not supported"
)
return R
def forward(self, dense_x, lS_o, *lS_i):
return self.sequential_forward(dense_x, lS_o, lS_i)
def sequential_forward(self, dense_x, lS_o, lS_i):
# process dense features (using bottom mlp), resulting in a row vector
x = self.apply_mlp(dense_x, self.bot_l)
# debug prints
# print("intermediate")
# print(x.detach().cpu().numpy())
# process sparse features(using embeddings), resulting in a list of row vectors
ly = self.apply_emb(lS_o, lS_i, self.emb_l, self.v_W_l)
# for y in ly:
# print(y.detach().cpu().numpy())
# interact features (dense and sparse)
z = self.interact_features(x, ly)
# print(z.detach().cpu().numpy())
# obtain probability of a click (using top mlp)
p = self.apply_mlp(z, self.top_l)
# # clamp output if needed
# if 0.0 < self.loss_threshold and self.loss_threshold < 1.0:
# z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold))
# else:
# z = p
return p
def dash_separated_ints(value):
vals = value.split("-")
for val in vals:
try:
int(val)
except ValueError:
raise argparse.ArgumentTypeError(
"%s is not a valid dash separated list of ints" % value
)
return value
# model related parameters
parser = argparse.ArgumentParser(
description="Train Deep Learning Recommendation Model (DLRM)"
)
parser.add_argument("--arch-sparse-feature-size", type=int, default=2)
parser.add_argument(
"--arch-embedding-size", type=dash_separated_ints, default="4-3-2"
)
# j will be replaced with the table number
parser.add_argument(
"--arch-mlp-bot", type=dash_separated_ints, default="4-3-2"
)
parser.add_argument(
"--arch-mlp-top", type=dash_separated_ints, default="8-2-1"
)
parser.add_argument(
"--arch-interaction-op", type=str, choices=["dot", "cat"], default="dot"
)
parser.add_argument(
"--arch-interaction-itself", action="store_true", default=False
)
parser.add_argument("--weighted-pooling", type=str, default=None)
args = parser.parse_args()
ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
ln_top = np.fromstring(args.arch_mlp_top, dtype=int, sep="-")
m_den = ln_bot[0]
ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
m_spa = args.arch_sparse_feature_size
ln_emb = np.asarray(ln_emb)
num_fea = ln_emb.size + 1 # num sparse + num dense features
# Initialize the model.
dlrm_model = DLRM_Net(
m_spa=m_spa,
ln_emb=ln_emb,
ln_bot=ln_bot,
ln_top=ln_top,
arch_interaction_op=args.arch_interaction_op,
)
# Inputs to the model.
dense_inp = torch.tensor([[0.6965, 0.2861, 0.2269, 0.5513]])
vs0 = torch.tensor([[0], [0], [0]], dtype=torch.int64)
vsi = torch.tensor([1, 2, 3]), torch.tensor([1]), torch.tensor([1])
input_dlrm = (dense_inp, vs0, *vsi)
golden_output = dlrm_model(dense_inp, vs0, *vsi)
mlir_importer = AMDSharkImporter(
dlrm_model,
input_dlrm,
frontend="torch",
)
(dlrm_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
tracing_required=True
)
amdshark_module = AMDSharkInference(
dlrm_mlir, device="vulkan", mlir_dialect="linalg"
)
amdshark_module.compile()
result = amdshark_module.forward(input_dlrm)
np.testing.assert_allclose(
golden_output.detach().numpy(), result, rtol=1e-02, atol=1e-03
)
# Verified via torch-mlir.
# import torch_mlir
# from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
# module = torch_mlir.compile(
# dlrm_model, inputs, use_tracing=True, output_type="linalg-on-tensors"
# )
# backend = refbackend.RefBackendLinalgOnTensorsBackend()
# compiled = backend.compile(module)
# jit_module = backend.load(compiled)
# dense_numpy = dense_inp.numpy()
# vs0_numpy = vs0.numpy()
# vsi_numpy = [inp.numpy() for inp in vsi]
# numpy_inp = (dense_numpy, vs0_numpy, *vsi_numpy)
# print(jit_module.forward(*numpy_inp))

View File

@@ -1,311 +0,0 @@
import torch
from torch import nn
from torchrec.datasets.utils import Batch
from torchrec.modules.crossnet import LowRankCrossNet
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor, KeyedTensor
from torchrec.modules.embedding_configs import EmbeddingBagConfig
from torchrec.modules.embedding_modules import EmbeddingBagCollection
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
from typing import Dict, List, Optional, Tuple
from torchrec.models.dlrm import (
choose,
DenseArch,
DLRM,
InteractionArch,
SparseArch,
OverArch,
)
from amdshark.amdshark_inference import AMDSharkInference
from amdshark.amdshark_importer import AMDSharkImporter
import numpy as np
torch.manual_seed(0)
np.random.seed(0)
def calculate_offsets(tensor_list, prev_values, prev_offsets):
offset_init = 0
offset_list = []
values_list = []
if prev_offsets != None:
offset_init = prev_values.shape[-1]
for tensor in tensor_list:
offset_list.append(offset_init)
offset_init += tensor.shape[0]
concatendated_tensor_list = torch.cat(tensor_list)
if prev_values != None:
concatendated_tensor_list = torch.cat(
[prev_values, concatendated_tensor_list]
)
concatenated_offsets = torch.tensor(offset_list)
if prev_offsets != None:
concatenated_offsets = torch.cat([prev_offsets, concatenated_offsets])
return concatendated_tensor_list, concatenated_offsets
# Have to make combined_keys as dict as to which embedding bags they
# point to. {f1: 0, f3: 0, f2: 1}
# The result will be a triple containing values, indices and pointer tensor.
def to_list(key_jagged, combined_keys):
key_jagged_dict = key_jagged.to_dict()
combined_list = []
for key in combined_keys:
prev_values, prev_offsets = calculate_offsets(
key_jagged_dict[key].to_dense(), None, None
)
print(prev_values)
print(prev_offsets)
combined_list.append(prev_values)
combined_list.append(prev_offsets)
combined_list.append(torch.tensor(combined_keys[key]))
return combined_list
class SparseArchAMDShark(nn.Module):
def create_emb(self, embedding_dim, num_embeddings_list):
embedding_list = nn.ModuleList()
for i in range(0, num_embeddings_list.size):
num_embeddings = num_embeddings_list[i]
EE = nn.EmbeddingBag(num_embeddings, embedding_dim, mode="sum")
W = np.random.uniform(
low=-np.sqrt(1 / num_embeddings),
high=np.sqrt(1 / num_embeddings),
size=(num_embeddings, embedding_dim),
).astype(np.float32)
EE.weight.data = torch.tensor(W, requires_grad=True)
embedding_list.append(EE)
return embedding_list
def __init__(
self,
embedding_dim,
total_features,
num_embeddings_list,
):
super(SparseArchAMDShark, self).__init__()
self.embedding_dim = embedding_dim
self.num_features = total_features
self.embedding_list = self.create_emb(
embedding_dim, num_embeddings_list
)
def forward(self, *batched_inputs):
concatenated_list = []
input_enum, embedding_enum = 0, 0
for k in range(len(batched_inputs) // 3):
values = batched_inputs[input_enum]
input_enum += 1
offsets = batched_inputs[input_enum]
input_enum += 1
embedding_pointer = int(batched_inputs[input_enum])
input_enum += 1
E = self.embedding_list[embedding_pointer]
V = E(values, offsets)
concatenated_list.append(V)
return torch.cat(concatenated_list, dim=1).reshape(
-1, self.num_features, self.embedding_dim
)
def test_sparse_arch() -> None:
D = 3
eb1_config = EmbeddingBagConfig(
name="t1",
embedding_dim=D,
num_embeddings=10,
feature_names=["f1", "f3"],
)
eb2_config = EmbeddingBagConfig(
name="t2",
embedding_dim=D,
num_embeddings=10,
feature_names=["f2"],
)
ebc = EmbeddingBagCollection(tables=[eb1_config, eb2_config])
w1 = ebc.embedding_bags["t1"].weight
w2 = ebc.embedding_bags["t2"].weight
sparse_arch = SparseArch(ebc)
keys = ["f1", "f2", "f3", "f4", "f5"]
offsets = torch.tensor([0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 19])
features = KeyedJaggedTensor.from_offsets_sync(
keys=keys,
values=torch.tensor(
[1, 2, 4, 5, 4, 3, 2, 9, 1, 2, 4, 5, 4, 3, 2, 9, 1, 2, 3]
),
offsets=offsets,
)
sparse_archi = SparseArchAMDShark(D, 3, np.array([10, 10]))
sparse_archi.embedding_list[0].weight = w1
sparse_archi.embedding_list[1].weight = w2
inputs = to_list(features, {"f1": 0, "f3": 0, "f2": 1})
test_results = sparse_archi(*inputs)
sparse_features = sparse_arch(features)
torch.allclose(
sparse_features,
test_results,
rtol=1e-4,
atol=1e-4,
)
test_sparse_arch()
class DLRMAMDShark(nn.Module):
def __init__(
self,
embedding_dim,
total_features,
num_embeddings_list,
dense_in_features: int,
dense_arch_layer_sizes: List[int],
over_arch_layer_sizes: List[int],
) -> None:
super().__init__()
self.sparse_arch: SparseArchAMDShark = SparseArchAMDShark(
embedding_dim, total_features, num_embeddings_list
)
num_sparse_features: int = total_features
self.dense_arch = DenseArch(
in_features=dense_in_features,
layer_sizes=dense_arch_layer_sizes,
)
self.inter_arch = InteractionArch(
num_sparse_features=num_sparse_features,
)
over_in_features: int = (
embedding_dim
+ choose(num_sparse_features, 2)
+ num_sparse_features
)
self.over_arch = OverArch(
in_features=over_in_features,
layer_sizes=over_arch_layer_sizes,
)
def forward(
self, dense_features: torch.Tensor, *sparse_features
) -> torch.Tensor:
embedded_dense = self.dense_arch(dense_features)
embedded_sparse = self.sparse_arch(*sparse_features)
concatenated_dense = self.inter_arch(
dense_features=embedded_dense, sparse_features=embedded_sparse
)
logits = self.over_arch(concatenated_dense)
return logits
def test_dlrm() -> None:
B = 2
D = 8
dense_in_features = 100
eb1_config = EmbeddingBagConfig(
name="t1",
embedding_dim=D,
num_embeddings=100,
feature_names=["f1", "f3"],
)
eb2_config = EmbeddingBagConfig(
name="t2",
embedding_dim=D,
num_embeddings=100,
feature_names=["f2"],
)
ebc = EmbeddingBagCollection(tables=[eb1_config, eb2_config])
sparse_features = KeyedJaggedTensor.from_offsets_sync(
keys=["f1", "f3", "f2"],
values=torch.tensor([1, 2, 4, 5, 4, 3, 2, 9, 1, 2, 3]),
offsets=torch.tensor([0, 2, 4, 6, 8, 10, 11]),
)
ebc = EmbeddingBagCollection(tables=[eb1_config, eb2_config])
sparse_nn = DLRM(
embedding_bag_collection=ebc,
dense_in_features=dense_in_features,
dense_arch_layer_sizes=[20, D],
over_arch_layer_sizes=[5, 1],
)
sparse_nn_nod = DLRMAMDShark(
embedding_dim=8,
total_features=3,
num_embeddings_list=np.array([100, 100]),
dense_in_features=dense_in_features,
dense_arch_layer_sizes=[20, D],
over_arch_layer_sizes=[5, 1],
)
dense_features = torch.rand((B, dense_in_features))
x = to_list(sparse_features, {"f1": 0, "f3": 0, "f2": 1})
w1 = ebc.embedding_bags["t1"].weight
w2 = ebc.embedding_bags["t2"].weight
sparse_nn_nod.sparse_arch.embedding_list[0].weight = w1
sparse_nn_nod.sparse_arch.embedding_list[1].weight = w2
sparse_nn_nod.dense_arch.load_state_dict(sparse_nn.dense_arch.state_dict())
sparse_nn_nod.inter_arch.load_state_dict(sparse_nn.inter_arch.state_dict())
sparse_nn_nod.over_arch.load_state_dict(sparse_nn.over_arch.state_dict())
logits = sparse_nn(
dense_features=dense_features,
sparse_features=sparse_features,
)
logits_nod = sparse_nn_nod(dense_features, *x)
# print(logits)
# print(logits_nod)
# Import the module and print.
mlir_importer = AMDSharkImporter(
sparse_nn_nod,
(dense_features, *x),
frontend="torch",
)
(dlrm_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
tracing_required=True
)
amdshark_module = AMDSharkInference(
dlrm_mlir, device="cpu", mlir_dialect="linalg"
)
amdshark_module.compile()
result = amdshark_module.forward(inputs)
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)
torch.allclose(
logits,
logits_nod,
rtol=1e-4,
atol=1e-4,
)
test_dlrm()

View File

@@ -1,35 +0,0 @@
from PIL import Image
import requests
from transformers import T5Tokenizer, TFT5Model
import tensorflow as tf
from amdshark.amdshark_inference import AMDSharkInference
# Create a set of inputs
t5_inputs = [
tf.TensorSpec(shape=[1, 10], dtype=tf.int32),
tf.TensorSpec(shape=[1, 10], dtype=tf.int32),
]
class T5Module(tf.Module):
def __init__(self):
super(T5Module, self).__init__()
self.m = TFT5Model.from_pretrained("t5-small")
self.m.predict = lambda x, y: self.m(input_ids=x, decoder_input_ids=y)
@tf.function(input_signature=t5_inputs, jit_compile=True)
def forward(self, input_ids, decoder_input_ids):
return self.m.predict(input_ids, decoder_input_ids)
if __name__ == "__main__":
# Prepping Data
tokenizer = T5Tokenizer.from_pretrained("t5-small")
text = "I love the distilled version of models."
inputs = tokenizer(text, return_tensors="tf").input_ids
amdshark_module = AMDSharkInference(T5Module(), (inputs, inputs))
amdshark_module.set_frontend("tensorflow")
amdshark_module.compile()
print(amdshark_module.forward((inputs, inputs)))

View File

@@ -1,43 +0,0 @@
import torch
import torchvision.models as models
from amdshark.amdshark_inference import AMDSharkInference
class VisionModule(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
self.train(False)
def forward(self, input):
return self.model.forward(input)
input = torch.randn(1, 3, 224, 224)
## The vision models present here: https://pytorch.org/vision/stable/models.html
vision_models_list = [
models.resnet18(pretrained=True),
models.alexnet(pretrained=True),
models.vgg16(pretrained=True),
models.squeezenet1_0(pretrained=True),
models.densenet161(pretrained=True),
models.inception_v3(pretrained=True),
models.shufflenet_v2_x1_0(pretrained=True),
models.mobilenet_v2(pretrained=True),
models.mobilenet_v3_small(pretrained=True),
models.resnext50_32x4d(pretrained=True),
models.wide_resnet50_2(pretrained=True),
models.mnasnet1_0(pretrained=True),
models.efficientnet_b0(pretrained=True),
models.regnet_y_400mf(pretrained=True),
models.regnet_x_400mf(pretrained=True),
]
for i, vision_model in enumerate(vision_models_list):
amdshark_module = AMDSharkInference(
VisionModule(vision_model),
(input,),
)
amdshark_module.compile()
amdshark_module.forward((input,))

View File

@@ -1,39 +0,0 @@
import torch
import numpy as np
from amdshark.amdshark_inference import AMDSharkInference
from amdshark.amdshark_importer import AMDSharkImporter
class UnetModule(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = torch.hub.load(
"mateuszbuda/brain-segmentation-pytorch",
"unet",
in_channels=3,
out_channels=1,
init_features=32,
pretrained=True,
)
self.model.eval()
def forward(self, input):
return self.model(input)
input = torch.randn(1, 3, 224, 224)
mlir_importer = AMDSharkImporter(
UnetModule(),
(input,),
frontend="torch",
)
(vision_mlir, func_name), inputs, golden_out = mlir_importer.import_debug(
tracing_required=False
)
amdshark_module = AMDSharkInference(vision_mlir, mlir_dialect="linalg")
amdshark_module.compile()
result = amdshark_module.forward((input,))
np.testing.assert_allclose(golden_out, result, rtol=1e-02, atol=1e-03)

View File

@@ -1,21 +0,0 @@
import requests
from PIL import Image
from io import BytesIO
from pipeline_amdshark_stable_diffusion_upscale import (
AMDSharkStableDiffusionUpscalePipeline,
)
import torch
model_id = "stabilityai/stable-diffusion-x4-upscaler"
pipeline = AMDSharkStableDiffusionUpscalePipeline(model_id)
# let's download an image
url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png"
response = requests.get(url)
low_res_img = Image.open(BytesIO(response.content)).convert("RGB")
low_res_img = low_res_img.resize((128, 128))
prompt = "a white cat"
upscaled_image = pipeline(prompt=prompt, image=low_res_img).images[0]
upscaled_image.save("upsampled_cat.png")

View File

@@ -1,98 +0,0 @@
from diffusers import AutoencoderKL, UNet2DConditionModel
from transformers import CLIPTextModel
from utils import compile_through_fx
import torch
model_id = "stabilityai/stable-diffusion-x4-upscaler"
model_input = {
"clip": (torch.randint(1, 2, (1, 77)),),
"vae": (torch.randn(1, 4, 128, 128),),
"unet": (
torch.randn(2, 7, 128, 128), # latents
torch.tensor([1]).to(torch.float32), # timestep
torch.randn(2, 77, 1024), # embedding
torch.randn(2).to(torch.int64), # noise_level
),
}
def get_clip_mlir(model_name="clip_text", extra_args=[]):
text_encoder = CLIPTextModel.from_pretrained(
model_id,
subfolder="text_encoder",
)
class CLIPText(torch.nn.Module):
def __init__(self):
super().__init__()
self.text_encoder = text_encoder
def forward(self, input):
return self.text_encoder(input)[0]
clip_model = CLIPText()
amdshark_clip = compile_through_fx(
clip_model,
model_input["clip"],
model_name=model_name,
extra_args=extra_args,
)
return amdshark_clip
def get_vae_mlir(model_name="vae", extra_args=[]):
class VaeModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.vae = AutoencoderKL.from_pretrained(
model_id,
subfolder="vae",
)
def forward(self, input):
x = self.vae.decode(input, return_dict=False)[0]
return x
vae = VaeModel()
amdshark_vae = compile_through_fx(
vae,
model_input["vae"],
model_name=model_name,
extra_args=extra_args,
)
return amdshark_vae
def get_unet_mlir(model_name="unet", extra_args=[]):
class UnetModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.unet = UNet2DConditionModel.from_pretrained(
model_id,
subfolder="unet",
)
self.in_channels = self.unet.in_channels
self.train(False)
def forward(self, latent, timestep, text_embedding, noise_level):
unet_out = self.unet.forward(
latent,
timestep,
text_embedding,
noise_level,
return_dict=False,
)[0]
return unet_out
unet = UnetModel()
f16_input_mask = (True, True, True, False)
amdshark_unet = compile_through_fx(
unet,
model_input["unet"],
model_name=model_name,
is_f16=True,
f16_input_mask=f16_input_mask,
extra_args=extra_args,
)
return amdshark_unet

View File

@@ -1,48 +0,0 @@
import sys
from model_wrappers import (
get_vae_mlir,
get_unet_mlir,
get_clip_mlir,
)
from upscaler_args import args
from utils import get_amdshark_model
BATCH_SIZE = len(args.prompts)
if BATCH_SIZE != 1:
sys.exit("Only batch size 1 is supported.")
unet_flag = [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=32}))"
]
vae_flag = [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-convert-conv-nchw-to-nhwc,iree-preprocessing-pad-linalg-ops{pad-size=16}))"
]
clip_flag = [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-preprocessing-pad-linalg-ops{pad-size=16}))"
]
bucket = "gs://amdshark_tank/stable_diffusion/"
def get_unet():
model_name = "upscaler_unet"
if args.import_mlir:
return get_unet_mlir(model_name, unet_flag)
return get_amdshark_model(bucket, model_name, unet_flag)
def get_vae():
model_name = "upscaler_vae"
if args.import_mlir:
return get_vae_mlir(model_name, vae_flag)
return get_amdshark_model(bucket, model_name, vae_flag)
def get_clip():
model_name = "upscaler_clip"
if args.import_mlir:
return get_clip_mlir(model_name, clip_flag)
return get_amdshark_model(bucket, model_name, clip_flag)

View File

@@ -1,489 +0,0 @@
import inspect
from typing import Callable, List, Optional, Union
import numpy as np
import torch
import PIL
from PIL import Image
from diffusers.utils import is_accelerate_available
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel
from diffusers import (
DDIMScheduler,
DDPMScheduler,
LMSDiscreteScheduler,
PNDMScheduler,
)
from diffusers import logging
from diffusers.pipeline_utils import ImagePipelineOutput
from opt_params import get_unet, get_vae, get_clip
from tqdm.auto import tqdm
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
def preprocess(image):
if isinstance(image, torch.Tensor):
return image
elif isinstance(image, PIL.Image.Image):
image = [image]
if isinstance(image[0], PIL.Image.Image):
w, h = image[0].size
w, h = map(
lambda x: x - x % 64, (w, h)
) # resize to integer multiple of 64
image = [np.array(i.resize((w, h)))[None, :] for i in image]
image = np.concatenate(image, axis=0)
image = np.array(image).astype(np.float32) / 255.0
image = image.transpose(0, 3, 1, 2)
image = 2.0 * image - 1.0
image = torch.from_numpy(image)
elif isinstance(image[0], torch.Tensor):
image = torch.cat(image, dim=0)
return image
def amdshark_run_wrapper(model, *args):
np_inputs = tuple([x.detach().numpy() for x in args])
outputs = model("forward", np_inputs)
return torch.from_numpy(outputs)
class AMDSharkStableDiffusionUpscalePipeline:
def __init__(
self,
model_id,
):
self.tokenizer = CLIPTokenizer.from_pretrained(
model_id, subfolder="tokenizer"
)
self.low_res_scheduler = DDPMScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
self.scheduler = DDIMScheduler.from_pretrained(
model_id,
subfolder="scheduler",
)
self.vae = get_vae()
self.unet = get_unet()
self.text_encoder = get_clip()
self.max_noise_level = (350,)
self._execution_device = "cpu"
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt,
):
r"""
Encodes the prompt into text encoder hidden states.
Args:
prompt (`str` or `list(int)`):
prompt to be encoded
device: (`torch.device`):
torch device
num_images_per_prompt (`int`):
number of images that should be generated per prompt
do_classifier_free_guidance (`bool`):
whether to use classifier free guidance or not
negative_prompt (`str` or `List[str]`):
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
if `guidance_scale` is less than `1`).
"""
batch_size = len(prompt) if isinstance(prompt, list) else 1
text_inputs = self.tokenizer(
prompt,
padding="max_length",
max_length=self.tokenizer.model_max_length,
truncation=True,
return_tensors="pt",
)
text_input_ids = text_inputs.input_ids
untruncated_ids = self.tokenizer(
prompt, padding="longest", return_tensors="pt"
).input_ids
if untruncated_ids.shape[-1] >= text_input_ids.shape[
-1
] and not torch.equal(text_input_ids, untruncated_ids):
removed_text = self.tokenizer.batch_decode(
untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
)
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
f" {self.tokenizer.model_max_length} tokens: {removed_text}"
)
# if (
# hasattr(self.text_encoder.config, "use_attention_mask")
# and self.text_encoder.config.use_attention_mask
# ):
# attention_mask = text_inputs.attention_mask.to(device)
# else:
# attention_mask = None
text_embeddings = amdshark_run_wrapper(
self.text_encoder, text_input_ids.to(device)
)
# duplicate text embeddings for each generation per prompt, using mps friendly method
bs_embed, seq_len, _ = text_embeddings.shape
text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
text_embeddings = text_embeddings.view(
bs_embed * num_images_per_prompt, seq_len, -1
)
# get unconditional embeddings for classifier free guidance
if do_classifier_free_guidance:
uncond_tokens: List[str]
if negative_prompt is None:
uncond_tokens = [""] * batch_size
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
)
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
" the batch size of `prompt`."
)
else:
uncond_tokens = negative_prompt
max_length = text_input_ids.shape[-1]
uncond_input = self.tokenizer(
uncond_tokens,
padding="max_length",
max_length=max_length,
truncation=True,
return_tensors="pt",
)
# if (
# hasattr(self.text_encoder.config, "use_attention_mask")
# and self.text_encoder.config.use_attention_mask
# ):
# attention_mask = uncond_input.attention_mask.to(device)
# else:
# attention_mask = None
uncond_embeddings = amdshark_run_wrapper(
self.text_encoder,
uncond_input.input_ids.to(device),
)
uncond_embeddings = uncond_embeddings
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = uncond_embeddings.shape[1]
uncond_embeddings = uncond_embeddings.repeat(
1, num_images_per_prompt, 1
)
uncond_embeddings = uncond_embeddings.view(
batch_size * num_images_per_prompt, seq_len, -1
)
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
return text_embeddings
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
accepts_eta = "eta" in set(
inspect.signature(self.scheduler.step).parameters.keys()
)
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
# check if the scheduler accepts generator
accepts_generator = "generator" in set(
inspect.signature(self.scheduler.step).parameters.keys()
)
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents with 0.18215->0.08333
def decode_latents(self, latents):
latents = 1 / 0.08333 * latents
image = amdshark_run_wrapper(self.vae, latents)
image = (image / 2 + 0.5).clamp(0, 1)
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
return image
def check_inputs(self, prompt, image, noise_level, callback_steps):
if not isinstance(prompt, str) and not isinstance(prompt, list):
raise ValueError(
f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
)
if (
not isinstance(image, torch.Tensor)
and not isinstance(image, PIL.Image.Image)
and not isinstance(image, list)
):
raise ValueError(
f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}"
)
# verify batch size of prompt and image are same if image is a list or tensor
if isinstance(image, list) or isinstance(image, torch.Tensor):
if isinstance(prompt, str):
batch_size = 1
else:
batch_size = len(prompt)
if isinstance(image, list):
image_batch_size = len(image)
else:
image_batch_size = image.shape[0]
if batch_size != image_batch_size:
raise ValueError(
f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}."
" Please make sure that passed `prompt` matches the batch size of `image`."
)
@staticmethod
def numpy_to_pil(images):
"""
Convert a numpy image or a batch of images to a PIL image.
"""
if images.ndim == 3:
images = images[None, ...]
images = (images * 255).round().astype("uint8")
if images.shape[-1] == 1:
# special case for grayscale (single channel) images
pil_images = [
Image.fromarray(image.squeeze(), mode="L") for image in images
]
else:
pil_images = [Image.fromarray(image) for image in images]
return pil_images
def prepare_latents(
self,
batch_size,
num_channels_latents,
height,
width,
dtype,
device,
generator,
latents=None,
):
shape = (batch_size, num_channels_latents, height, width)
if latents is None:
if device == "mps":
# randn does not work reproducibly on mps
latents = torch.randn(
shape, generator=generator, device="cpu", dtype=dtype
).to(device)
else:
latents = torch.randn(
shape, generator=generator, device=device, dtype=dtype
)
else:
if latents.shape != shape:
raise ValueError(
f"Unexpected latents shape, got {latents.shape}, expected {shape}"
)
latents = latents.to(device)
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
return latents
@torch.no_grad()
def __call__(
self,
prompt: Union[str, List[str]],
image: Union[
torch.FloatTensor, PIL.Image.Image, List[PIL.Image.Image]
],
num_inference_steps: int = 75,
guidance_scale: float = 9.0,
noise_level: int = 20,
negative_prompt: Optional[Union[str, List[str]]] = None,
num_images_per_prompt: Optional[int] = 1,
eta: float = 0.0,
generator: Optional[
Union[torch.Generator, List[torch.Generator]]
] = None,
latents: Optional[torch.FloatTensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[
Callable[[int, int, torch.FloatTensor], None]
] = None,
callback_steps: Optional[int] = 1,
):
# 1. Check inputs
self.check_inputs(prompt, image, noise_level, callback_steps)
# 2. Define call parameters
batch_size = 1 if isinstance(prompt, str) else len(prompt)
device = self._execution_device
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
# corresponds to doing no classifier free guidance.
do_classifier_free_guidance = guidance_scale > 1.0
# 3. Encode input prompt
text_embeddings = self._encode_prompt(
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt,
)
# 4. Preprocess image
image = preprocess(image)
image = image.to(dtype=text_embeddings.dtype, device=device)
# 5. set timesteps
self.scheduler.set_timesteps(num_inference_steps, device=device)
timesteps = self.scheduler.timesteps
# 5. Add noise to image
noise_level = torch.tensor(
[noise_level], dtype=torch.long, device=device
)
if device == "mps":
# randn does not work reproducibly on mps
noise = torch.randn(
image.shape,
generator=generator,
device="cpu",
dtype=text_embeddings.dtype,
).to(device)
else:
noise = torch.randn(
image.shape,
generator=generator,
device=device,
dtype=text_embeddings.dtype,
)
image = self.low_res_scheduler.add_noise(image, noise, noise_level)
batch_multiplier = 2 if do_classifier_free_guidance else 1
image = torch.cat([image] * batch_multiplier * num_images_per_prompt)
noise_level = torch.cat([noise_level] * image.shape[0])
# 6. Prepare latent variables
height, width = image.shape[2:]
# num_channels_latents = self.vae.config.latent_channels
num_channels_latents = 4
latents = self.prepare_latents(
batch_size * num_images_per_prompt,
num_channels_latents,
height,
width,
text_embeddings.dtype,
device,
generator,
latents,
)
# 7. Check that sizes of image and latents match
num_channels_image = image.shape[1]
# if (
# num_channels_latents + num_channels_image
# != self.unet.config.in_channels
# ):
# raise ValueError(
# f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
# f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
# f" `num_channels_image`: {num_channels_image} "
# f" = {num_channels_latents+num_channels_image}. Please verify the config of"
# " `pipeline.unet` or your `image` input."
# )
# 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 9. Denoising loop
num_warmup_steps = (
len(timesteps) - num_inference_steps * self.scheduler.order
)
for i, t in tqdm(enumerate(timesteps)):
# expand the latents if we are doing classifier free guidance
latent_model_input = (
torch.cat([latents] * 2)
if do_classifier_free_guidance
else latents
)
# concat latents, mask, masked_image_latents in the channel dimension
latent_model_input = self.scheduler.scale_model_input(
latent_model_input, t
)
latent_model_input = torch.cat([latent_model_input, image], dim=1)
timestep = torch.tensor([t]).to(torch.float32)
# predict the noise residual
noise_pred = amdshark_run_wrapper(
self.unet,
latent_model_input.half(),
timestep,
text_embeddings.half(),
noise_level,
)
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (
noise_pred_text - noise_pred_uncond
)
# compute the previous noisy sample x_t -> x_t-1
latents = self.scheduler.step(
noise_pred, t, latents, **extra_step_kwargs
).prev_sample
# # call the callback, if provided
# if i == len(timesteps) - 1 or (
# (i + 1) > num_warmup_steps
# and (i + 1) % self.scheduler.order == 0
# ):
# progress_bar.update()
# if callback is not None and i % callback_steps == 0:
# callback(i, t, latents)
# 10. Post-processing
# make sure the VAE is in float32 mode, as it overflows in float16
# self.vae.to(dtype=torch.float32)
image = self.decode_latents(latents.float())
# 11. Convert to PIL
if output_type == "pil":
image = self.numpy_to_pil(image)
if not return_dict:
return (image,)
return ImagePipelineOutput(images=image)

View File

@@ -1,98 +0,0 @@
import argparse
p = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
##############################################################################
### Stable Diffusion Params
##############################################################################
p.add_argument(
"--prompts",
nargs="+",
default=["cyberpunk forest by Salvador Dali"],
help="text of which images to be generated.",
)
p.add_argument(
"--negative-prompts",
nargs="+",
default=[""],
help="text you don't want to see in the generated image.",
)
p.add_argument(
"--steps",
type=int,
default=50,
help="the no. of steps to do the sampling.",
)
p.add_argument(
"--seed",
type=int,
default=42,
help="the seed to use.",
)
p.add_argument(
"--guidance_scale",
type=float,
default=7.5,
help="the value to be used for guidance scaling.",
)
##############################################################################
### Model Config and Usage Params
##############################################################################
p.add_argument(
"--device", type=str, default="vulkan", help="device to run the model."
)
p.add_argument(
"--precision", type=str, default="fp16", help="precision to run the model."
)
p.add_argument(
"--import_mlir",
default=False,
action=argparse.BooleanOptionalAction,
help="imports the model from torch module to amdshark_module otherwise downloads the model from amdshark_tank.",
)
p.add_argument(
"--load_vmfb",
default=True,
action=argparse.BooleanOptionalAction,
help="attempts to load the model from a precompiled flatbuffer and compiles + saves it if not found.",
)
p.add_argument(
"--save_vmfb",
default=False,
action=argparse.BooleanOptionalAction,
help="saves the compiled flatbuffer to the local directory",
)
##############################################################################
### IREE - Vulkan supported flags
##############################################################################
p.add_argument(
"--iree-vulkan-target-triple",
type=str,
default="",
help="Specify target triple for vulkan",
)
p.add_argument(
"--vulkan_debug_utils",
default=False,
action=argparse.BooleanOptionalAction,
help="Profiles vulkan device and collects the .rdc info",
)
args = p.parse_args()

View File

@@ -1,230 +0,0 @@
import os
import torch
from amdshark.amdshark_inference import AMDSharkInference
from upscaler_args import args
from amdshark.amdshark_importer import import_with_fx
from amdshark.iree_utils.vulkan_utils import (
set_iree_vulkan_runtime_flags,
get_vulkan_target_triple,
get_iree_vulkan_runtime_flags,
)
def _compile_module(amdshark_module, model_name, extra_args=[]):
if args.load_vmfb or args.save_vmfb:
device = (
args.device
if "://" not in args.device
else "-".join(args.device.split("://"))
)
extended_name = "{}_{}".format(model_name, device)
vmfb_path = os.path.join(os.getcwd(), extended_name + ".vmfb")
if args.load_vmfb and os.path.isfile(vmfb_path) and not args.save_vmfb:
print(f"loading existing vmfb from: {vmfb_path}")
amdshark_module.load_module(vmfb_path, extra_args=extra_args)
else:
if args.save_vmfb:
print("Saving to {}".format(vmfb_path))
else:
print(
"No vmfb found. Compiling and saving to {}".format(
vmfb_path
)
)
path = amdshark_module.save_module(
os.getcwd(), extended_name, extra_args
)
amdshark_module.load_module(path, extra_args=extra_args)
else:
amdshark_module.compile(extra_args)
return amdshark_module
# Downloads the model from amdshark_tank and returns the amdshark_module.
def get_amdshark_model(tank_url, model_name, extra_args=[]):
from amdshark.amdshark_downloader import download_model
from amdshark.parser import amdshark_args
# Set local amdshark_tank cache directory.
# amdshark_args.local_tank_cache = args.local_tank_cache
mlir_model, func_name, inputs, golden_out = download_model(
model_name,
tank_url=tank_url,
frontend="torch",
)
amdshark_module = AMDSharkInference(
mlir_model, device=args.device, mlir_dialect="linalg"
)
return _compile_module(amdshark_module, model_name, extra_args)
# Converts the torch-module into a amdshark_module.
def compile_through_fx(
model, inputs, model_name, is_f16=False, f16_input_mask=None, extra_args=[]
):
mlir_module, func_name = import_with_fx(
model, inputs, is_f16, f16_input_mask
)
amdshark_module = AMDSharkInference(
mlir_module,
device=args.device,
mlir_dialect="linalg",
)
return _compile_module(amdshark_module, model_name, extra_args)
def set_iree_runtime_flags():
vulkan_runtime_flags = get_iree_vulkan_runtime_flags()
if args.enable_rgp:
vulkan_runtime_flags += [
f"--enable_rgp=true",
f"--vulkan_debug_utils=true",
]
set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)
def get_all_devices(driver_name):
"""
Inputs: driver_name
Returns a list of all the available devices for a given driver sorted by
the iree path names of the device as in --list_devices option in iree.
"""
from iree.runtime import get_driver
driver = get_driver(driver_name)
device_list_src = driver.query_available_devices()
device_list_src.sort(key=lambda d: d["path"])
return device_list_src
def get_device_mapping(driver, key_combination=3):
"""This method ensures consistent device ordering when choosing
specific devices for execution
Args:
driver (str): execution driver (vulkan, cuda, rocm, etc)
key_combination (int, optional): choice for mapping value for device name.
1 : path
2 : name
3 : (name, path)
Defaults to 3.
Returns:
dict: map to possible device names user can input mapped to desired combination of name/path.
"""
from amdshark.iree_utils._common import iree_device_map
driver = iree_device_map(driver)
device_list = get_all_devices(driver)
device_map = dict()
def get_output_value(dev_dict):
if key_combination == 1:
return f"{driver}://{dev_dict['path']}"
if key_combination == 2:
return dev_dict["name"]
if key_combination == 3:
return (dev_dict["name"], f"{driver}://{dev_dict['path']}")
# mapping driver name to default device (driver://0)
device_map[f"{driver}"] = get_output_value(device_list[0])
for i, device in enumerate(device_list):
# mapping with index
device_map[f"{driver}://{i}"] = get_output_value(device)
# mapping with full path
device_map[f"{driver}://{device['path']}"] = get_output_value(device)
return device_map
def map_device_to_name_path(device, key_combination=3):
"""Gives the appropriate device data (supported name/path) for user selected execution device
Args:
device (str): user
key_combination (int, optional): choice for mapping value for device name.
1 : path
2 : name
3 : (name, path)
Defaults to 3.
Raises:
ValueError:
Returns:
str / tuple: returns the mapping str or tuple of mapping str for the device depending on key_combination value
"""
driver = device.split("://")[0]
device_map = get_device_mapping(driver, key_combination)
try:
device_mapping = device_map[device]
except KeyError:
raise ValueError(f"Device '{device}' is not a valid device.")
return device_mapping
def set_init_device_flags():
if "vulkan" in args.device:
# set runtime flags for vulkan.
set_iree_runtime_flags()
# set triple flag to avoid multiple calls to get_vulkan_triple_flag
device_name, args.device = map_device_to_name_path(args.device)
if not args.iree_vulkan_target_triple:
triple = get_vulkan_target_triple(device_name)
if triple is not None:
args.iree_vulkan_target_triple = triple
print(
f"Found device {device_name}. Using target triple {args.iree_vulkan_target_triple}."
)
elif "cuda" in args.device:
args.device = "cuda"
elif "cpu" in args.device:
args.device = "cpu"
# set max_length based on availability.
if args.variant in ["anythingv3", "analogdiffusion", "dreamlike"]:
args.max_length = 77
elif args.variant == "openjourney":
args.max_length = 64
# use tuned models only in the case of stablediffusion/fp16 and rdna3 cards.
if (
args.variant in ["openjourney", "dreamlike"]
or args.precision != "fp16"
or "vulkan" not in args.device
or "rdna3" not in args.iree_vulkan_target_triple
):
args.use_tuned = False
print("Tuned models are currently not supported for this setting.")
elif args.use_base_vae and args.variant != "stablediffusion":
args.use_tuned = False
print("Tuned models are currently not supported for this setting.")
if args.use_tuned:
print("Using tuned models for stablediffusion/fp16 and rdna3 card.")
# Utility to get list of devices available.
def get_available_devices():
def get_devices_by_name(driver_name):
from amdshark.iree_utils._common import iree_device_map
device_list = []
try:
driver_name = iree_device_map(driver_name)
device_list_dict = get_all_devices(driver_name)
print(f"{driver_name} devices are available.")
except:
print(f"{driver_name} devices are not available.")
else:
for i, device in enumerate(device_list_dict):
device_list.append(f"{driver_name}://{i} => {device['name']}")
return device_list
set_iree_runtime_flags()
available_devices = []
vulkan_devices = get_devices_by_name("vulkan")
available_devices.extend(vulkan_devices)
cuda_devices = get_devices_by_name("cuda")
available_devices.extend(cuda_devices)
available_devices.append("cpu")
return available_devices

View File

@@ -1,15 +0,0 @@
from amdshark.amdshark_inference import AMDSharkInference
from amdshark.amdshark_downloader import download_model
mlir_model, func_name, inputs, golden_out = download_model(
"v_diffusion", frontend="torch"
)
amdshark_module = AMDSharkInference(
mlir_model, device="vulkan", mlir_dialect="linalg"
)
amdshark_module.compile()
result = amdshark_module.forward(inputs)
print("The obtained result via amdshark is: ", result)
print("The golden result is:", golden_out)

View File

@@ -1,48 +0,0 @@
import torch
from torch.nn.utils import stateless
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from amdshark.amdshark_trainer import AMDSharkTrainer
class MiniLMSequenceClassification(torch.nn.Module):
def __init__(self):
super().__init__()
self.model = AutoModelForSequenceClassification.from_pretrained(
"microsoft/MiniLM-L12-H384-uncased", # The pretrained model.
num_labels=2, # The number of output labels--2 for binary classification.
output_attentions=False, # Whether the model returns attentions weights.
output_hidden_states=False, # Whether the model returns all hidden-states.
torchscript=True,
)
def forward(self, tokens):
return self.model.forward(tokens)[0]
mod = MiniLMSequenceClassification()
def get_sorted_params(named_params):
return [i[1] for i in sorted(named_params.items())]
print(dict(mod.named_buffers()))
inp = (torch.randint(2, (1, 128)),)
def forward(params, buffers, args):
params_and_buffers = {**params, **buffers}
stateless.functional_call(
mod, params_and_buffers, args, {}
).sum().backward()
optim = torch.optim.SGD(get_sorted_params(params), lr=0.01)
# optim.load_state_dict(optim_state)
optim.step()
return params, buffers
amdshark_module = AMDSharkTrainer(mod, inp)
amdshark_module.compile(forward)
amdshark_module.train(num_iters=2)
print("training done")

View File

@@ -1,60 +0,0 @@
import numpy as np
import os
import time
import tensorflow as tf
from amdshark.amdshark_trainer import AMDSharkTrainer
from amdshark.parser import parser
from urllib import request
parser.add_argument(
"--download_mlir_path",
type=str,
default="bert_tf_training.mlir",
help="Specifies path to target mlir file that will be loaded.",
)
load_args, unknown = parser.parse_known_args()
tf.random.set_seed(0)
vocab_size = 100
NUM_CLASSES = 5
SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Download BERT model from tank and train.
if __name__ == "__main__":
predict_sample_input = [
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
]
file_link = "https://storage.googleapis.com/amdshark_tank/users/stanley/bert_tf_training.mlir"
response = request.urlretrieve(file_link, load_args.download_mlir_path)
sample_input_tensors = [
tf.convert_to_tensor(val, dtype=tf.int32)
for val in predict_sample_input
]
num_iter = 10
if not os.path.isfile(load_args.download_mlir_path):
raise ValueError(
f"Tried looking for target mlir in {load_args.download_mlir_path}, but cannot be found."
)
with open(load_args.download_mlir_path, "rb") as input_file:
bert_mlir = input_file.read()
amdshark_module = AMDSharkTrainer(
bert_mlir,
(
sample_input_tensors,
tf.convert_to_tensor(
np.random.randint(5, size=(BATCH_SIZE)), dtype=tf.int32
),
),
)
amdshark_module.set_frontend("mhlo")
amdshark_module.compile()
start = time.time()
print(amdshark_module.train(num_iter))
end = time.time()
total_time = end - start
print("time: " + str(total_time))
print("time/iter: " + str(total_time / num_iter))

View File

@@ -1,98 +0,0 @@
from absl import app
import time
import numpy as np
import tensorflow as tf
from official.nlp.modeling import layers
from official.nlp.modeling import networks
from official.nlp.modeling.models import bert_classifier
from amdshark.amdshark_trainer import AMDSharkTrainer
tf.random.set_seed(0)
vocab_size = 100
NUM_CLASSES = 5
SEQUENCE_LENGTH = 512
BATCH_SIZE = 1
# Create a set of 2-dimensional inputs
bert_input = [
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
tf.TensorSpec(shape=[BATCH_SIZE, SEQUENCE_LENGTH], dtype=tf.int32),
]
class BertModule(tf.Module):
def __init__(self):
super(BertModule, self).__init__()
dict_outputs = False
test_network = networks.BertEncoder(
vocab_size=vocab_size, num_layers=2, dict_outputs=dict_outputs
)
# Create a BERT trainer with the created network.
bert_trainer_model = bert_classifier.BertClassifier(
test_network, num_classes=NUM_CLASSES
)
bert_trainer_model.summary()
# Invoke the trainer model on the inputs. This causes the layer to be built.
self.m = bert_trainer_model
self.m.predict = lambda x: self.m.call(x, training=False)
self.predict = tf.function(input_signature=[bert_input])(
self.m.predict
)
self.m.learn = lambda x, y: self.m.call(x, training=False)
self.loss = tf.keras.losses.SparseCategoricalCrossentropy()
self.optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)
@tf.function(
input_signature=[
bert_input, # inputs
tf.TensorSpec(shape=[BATCH_SIZE], dtype=tf.int32), # labels
],
jit_compile=True,
)
def forward(self, inputs, labels):
with tf.GradientTape() as tape:
# Capture the gradients from forward prop...
probs = self.m(inputs, training=True)
loss = self.loss(labels, probs)
# ...and use them to update the model's weights.
variables = self.m.trainable_variables
gradients = tape.gradient(loss, variables)
self.optimizer.apply_gradients(zip(gradients, variables))
return loss
if __name__ == "__main__":
predict_sample_input = [
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
np.random.randint(5, size=(BATCH_SIZE, SEQUENCE_LENGTH)),
]
sample_input_tensors = [
tf.convert_to_tensor(val, dtype=tf.int32)
for val in predict_sample_input
]
num_iter = 10
amdshark_module = AMDSharkTrainer(
BertModule(),
(
sample_input_tensors,
tf.convert_to_tensor(
np.random.randint(5, size=(BATCH_SIZE)), dtype=tf.int32
),
),
)
amdshark_module.set_frontend("tensorflow")
amdshark_module.compile()
start = time.time()
print(amdshark_module.train(num_iter))
end = time.time()
total_time = end - start
print("time: " + str(total_time))
print("time/iter: " + str(total_time / num_iter))

View File

@@ -1,44 +0,0 @@
import torch
from torch.nn.utils import _stateless
from amdshark.amdshark_trainer import AMDSharkTrainer
class Foo(torch.nn.Module):
def __init__(self):
super(Foo, self).__init__()
self.l1 = torch.nn.Linear(10, 16)
self.relu = torch.nn.ReLU()
self.l2 = torch.nn.Linear(16, 2)
def forward(self, x):
out = self.l1(x)
out = self.relu(out)
out = self.l2(out)
return out
mod = Foo()
inp = (torch.randn(10, 10),)
def get_sorted_params(named_params):
return [i[1] for i in sorted(named_params.items())]
def forward(params, buffers, args):
params_and_buffers = {**params, **buffers}
_stateless.functional_call(
mod, params_and_buffers, args, {}
).sum().backward()
optim = torch.optim.SGD(get_sorted_params(params), lr=0.01)
optim.step()
return params, buffers
# fx_graph = forward(dict(mod.named_parameters()), dict(mod.named_buffers()), inp)
amdshark_module = AMDSharkTrainer(mod, inp)
# Pass the training function in case of torch
amdshark_module.compile(training_fn=forward)
amdshark_module.train(num_iters=10)

View File

@@ -1,41 +0,0 @@
# Stable Diffusion Img2Img model
## Installation
<details>
<summary>Installation (Linux)</summary>
### Activate amdshark.venv Virtual Environment
```shell
source amdshark.venv/bin/activate
# Some older pip installs may not be able to handle the recent PyTorch deps
python -m pip install --upgrade pip
```
### Install dependencies
# Run the setup.sh script
```shell
./setup.sh
```
### Run the Stable diffusion Img2Img model
To run the model with the default set of images and params, run:
```shell
python stable_diffusion_img2img.py
```
To run the model with your set of images, and parameters you need to specify the following params:
1.) Input images directory with the arg `--input_dir` containing 3-5 images.
2.) What to teach the model? Using the arg `--what_to_teach`, allowed values are `object` or `style`.
3.) Placeholder token using the arg `--placeholder_token`, that represents your new concept. It should be passed with the opening and closing angle brackets. For ex: token is `cat-toy`, it should be passed as `<cat-toy>`.
4.) Initializer token using the arg `--initializer_token`, which summarise what is your new concept.
For the result, you need to pass the text prompt with the arg: `--prompt`. The prompt string should contain a "*s" in it, which will be replaced by the placeholder token during the inference.
By default the result images will go into the `sd_result` dir. To specify your output dir use the arg: `--output_dir`.
The default value of max_training_steps is `3000`, which takes some hours to complete. You can pass the smaller value with the arg `--training_steps`. Specify the number of images to be sampled for the result with the `--num_inference_samples` arg.

View File

@@ -1,25 +0,0 @@
#!/bin/bash
TD="$(cd $(dirname $0) && pwd)"
if [ -z "$PYTHON" ]; then
PYTHON="$(which python3)"
fi
function die() {
echo "Error executing command: $*"
exit 1
}
PYTHON_VERSION_X_Y=`${PYTHON} -c 'import sys; version=sys.version_info[:2]; print("{0}.{1}".format(*version))'`
echo "Python: $PYTHON"
echo "Python version: $PYTHON_VERSION_X_Y"
mkdir input_images
wget https://huggingface.co/datasets/valhalla/images/resolve/main/2.jpeg -P input_images/
wget https://huggingface.co/datasets/valhalla/images/resolve/main/3.jpeg -P input_images/
wget https://huggingface.co/datasets/valhalla/images/resolve/main/5.jpeg -P input_images/
wget https://huggingface.co/datasets/valhalla/images/resolve/main/6.jpeg -P input_images/
pip install diffusers["training"]==0.4.1 transformers ftfy opencv-python

View File

@@ -1,600 +0,0 @@
# Textual-inversion fine-tuning for Stable Diffusion using diffusers
# This script shows how to "teach" Stable Diffusion a new concept via
# textual-inversion using 🤗 Hugging Face [🧨 Diffusers library](https://github.com/huggingface/diffusers).
# By using just 3-5 images you can teach new concepts to Stable Diffusion
# and personalize the model on your own images.
import argparse
import itertools
import math
import os
import random
import cv2
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch.utils.data import Dataset
import PIL
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from diffusers import (
AutoencoderKL,
DDPMScheduler,
PNDMScheduler,
StableDiffusionPipeline,
UNet2DConditionModel,
)
from diffusers.hub_utils import init_git_repo, push_to_hub
from diffusers.optimization import get_scheduler
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
from PIL import Image
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
YOUR_TOKEN = "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"
p = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
p.add_argument(
"--input_dir",
type=str,
default="input_images/",
help="the directory contains the images used for fine tuning",
)
p.add_argument(
"--output_dir",
type=str,
default="sd_result",
help="the directory contains the images used for fine tuning",
)
p.add_argument(
"--training_steps",
type=int,
default=3000,
help="the maximum number of training steps",
)
p.add_argument("--seed", type=int, default=42, help="the random seed")
p.add_argument(
"--what_to_teach",
type=str,
choices=["object", "style"],
default="object",
help="what is it that you are teaching?",
)
p.add_argument(
"--placeholder_token",
type=str,
default="<cat-toy>",
help="It is the token you are going to use to represent your new concept",
)
p.add_argument(
"--initializer_token",
type=str,
default="toy",
help="It is a word that can summarise what is your new concept",
)
p.add_argument(
"--inference_steps",
type=int,
default=50,
help="the number of steps for inference",
)
p.add_argument(
"--num_inference_samples",
type=int,
default=4,
help="the number of samples for inference",
)
p.add_argument(
"--prompt",
type=str,
default="a grafitti in a wall with a *s on it",
help="the text prompt to use",
)
args = p.parse_args()
if "*s" not in args.prompt:
raise ValueError(
f'The prompt should have a "*s" which will be replaced by a placeholder token.'
)
prompt1, prompt2 = args.prompt.split("*s")
args.prompt = prompt1 + args.placeholder_token + prompt2
pretrained_model_name_or_path = "CompVis/stable-diffusion-v1-4"
# Load input images.
images = []
for filename in os.listdir(args.input_dir):
img = cv2.imread(os.path.join(args.input_dir, filename))
if img is not None:
images.append(img)
# Setup the prompt templates for training
imagenet_templates_small = [
"a photo of a {}",
"a rendering of a {}",
"a cropped photo of the {}",
"the photo of a {}",
"a photo of a clean {}",
"a photo of a dirty {}",
"a dark photo of the {}",
"a photo of my {}",
"a photo of the cool {}",
"a close-up photo of a {}",
"a bright photo of the {}",
"a cropped photo of a {}",
"a photo of the {}",
"a good photo of the {}",
"a photo of one {}",
"a close-up photo of the {}",
"a rendition of the {}",
"a photo of the clean {}",
"a rendition of a {}",
"a photo of a nice {}",
"a good photo of a {}",
"a photo of the nice {}",
"a photo of the small {}",
"a photo of the weird {}",
"a photo of the large {}",
"a photo of a cool {}",
"a photo of a small {}",
]
imagenet_style_templates_small = [
"a painting in the style of {}",
"a rendering in the style of {}",
"a cropped painting in the style of {}",
"the painting in the style of {}",
"a clean painting in the style of {}",
"a dirty painting in the style of {}",
"a dark painting in the style of {}",
"a picture in the style of {}",
"a cool painting in the style of {}",
"a close-up painting in the style of {}",
"a bright painting in the style of {}",
"a cropped painting in the style of {}",
"a good painting in the style of {}",
"a close-up painting in the style of {}",
"a rendition in the style of {}",
"a nice painting in the style of {}",
"a small painting in the style of {}",
"a weird painting in the style of {}",
"a large painting in the style of {}",
]
# Setup the dataset
class TextualInversionDataset(Dataset):
def __init__(
self,
data_root,
tokenizer,
learnable_property="object", # [object, style]
size=512,
repeats=100,
interpolation="bicubic",
flip_p=0.5,
set="train",
placeholder_token="*",
center_crop=False,
):
self.data_root = data_root
self.tokenizer = tokenizer
self.learnable_property = learnable_property
self.size = size
self.placeholder_token = placeholder_token
self.center_crop = center_crop
self.flip_p = flip_p
self.image_paths = [
os.path.join(self.data_root, file_path)
for file_path in os.listdir(self.data_root)
]
self.num_images = len(self.image_paths)
self._length = self.num_images
if set == "train":
self._length = self.num_images * repeats
self.interpolation = {
"linear": PIL.Image.LINEAR,
"bilinear": PIL.Image.BILINEAR,
"bicubic": PIL.Image.BICUBIC,
"lanczos": PIL.Image.LANCZOS,
}[interpolation]
self.templates = (
imagenet_style_templates_small
if learnable_property == "style"
else imagenet_templates_small
)
self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
def __len__(self):
return self._length
def __getitem__(self, i):
example = {}
image = Image.open(self.image_paths[i % self.num_images])
if not image.mode == "RGB":
image = image.convert("RGB")
placeholder_string = self.placeholder_token
text = random.choice(self.templates).format(placeholder_string)
example["input_ids"] = self.tokenizer(
text,
padding="max_length",
truncation=True,
max_length=self.tokenizer.model_max_length,
return_tensors="pt",
).input_ids[0]
# default to score-sde preprocessing
img = np.array(image).astype(np.uint8)
if self.center_crop:
crop = min(img.shape[0], img.shape[1])
(
h,
w,
) = (
img.shape[0],
img.shape[1],
)
img = img[
(h - crop) // 2 : (h + crop) // 2,
(w - crop) // 2 : (w + crop) // 2,
]
image = Image.fromarray(img)
image = image.resize(
(self.size, self.size), resample=self.interpolation
)
image = self.flip_transform(image)
image = np.array(image).astype(np.uint8)
image = (image / 127.5 - 1.0).astype(np.float32)
example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
return example
# Setting up the model
# Load the tokenizer and add the placeholder token as a additional special token.
# Please read and if you agree accept the LICENSE
# [here](https://huggingface.co/CompVis/stable-diffusion-v1-4) if you see an error
tokenizer = CLIPTokenizer.from_pretrained(
pretrained_model_name_or_path,
subfolder="tokenizer",
use_auth_token=YOUR_TOKEN,
)
# Add the placeholder token in tokenizer
num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
if num_added_tokens == 0:
raise ValueError(
f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
" `placeholder_token` that is not already in the tokenizer."
)
# Get token ids for our placeholder and initializer token.
# This code block will complain if initializer string is not a single token
# Convert the initializer_token, placeholder_token to ids
token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
# Check if initializer_token is a single token or a sequence of tokens
if len(token_ids) > 1:
raise ValueError("The initializer token must be a single token.")
initializer_token_id = token_ids[0]
placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
# Load the Stable Diffusion model
# Load models and create wrapper for stable diffusion
text_encoder = CLIPTextModel.from_pretrained(
pretrained_model_name_or_path,
subfolder="text_encoder",
use_auth_token=YOUR_TOKEN,
)
vae = AutoencoderKL.from_pretrained(
pretrained_model_name_or_path,
subfolder="vae",
use_auth_token=YOUR_TOKEN,
)
unet = UNet2DConditionModel.from_pretrained(
pretrained_model_name_or_path,
subfolder="unet",
use_auth_token=YOUR_TOKEN,
)
# We have added the `placeholder_token` in the `tokenizer` so we resize the token embeddings here,
# this will a new embedding vector in the token embeddings for our `placeholder_token`
text_encoder.resize_token_embeddings(len(tokenizer))
# Initialise the newly added placeholder token with the embeddings of the initializer token
token_embeds = text_encoder.get_input_embeddings().weight.data
token_embeds[placeholder_token_id] = token_embeds[initializer_token_id]
# In Textual-Inversion we only train the newly added embedding vector,
# so lets freeze rest of the model parameters here.
def freeze_params(params):
for param in params:
param.requires_grad = False
# Freeze vae and unet
freeze_params(vae.parameters())
freeze_params(unet.parameters())
# Freeze all parameters except for the token embeddings in text encoder
params_to_freeze = itertools.chain(
text_encoder.text_model.encoder.parameters(),
text_encoder.text_model.final_layer_norm.parameters(),
text_encoder.text_model.embeddings.position_embedding.parameters(),
)
freeze_params(params_to_freeze)
# Creating our training data
train_dataset = TextualInversionDataset(
data_root=args.input_dir,
tokenizer=tokenizer,
size=512,
placeholder_token=args.placeholder_token,
repeats=100,
learnable_property=args.what_to_teach, # Option selected above between object and style
center_crop=False,
set="train",
)
def create_dataloader(train_batch_size=1):
return torch.utils.data.DataLoader(
train_dataset, batch_size=train_batch_size, shuffle=True
)
# Create noise_scheduler for training.
noise_scheduler = DDPMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
num_train_timesteps=1000,
tensor_format="pt",
)
# Define hyperparameters for our training
hyperparameters = {
"learning_rate": 5e-04,
"scale_lr": True,
"max_train_steps": args.training_steps,
"train_batch_size": 1,
"gradient_accumulation_steps": 4,
"seed": args.seed,
"output_dir": "sd-concept-output",
}
def training_function(text_encoder, vae, unet):
logger = get_logger(__name__)
train_batch_size = hyperparameters["train_batch_size"]
gradient_accumulation_steps = hyperparameters[
"gradient_accumulation_steps"
]
learning_rate = hyperparameters["learning_rate"]
max_train_steps = hyperparameters["max_train_steps"]
output_dir = hyperparameters["output_dir"]
accelerator = Accelerator(
gradient_accumulation_steps=gradient_accumulation_steps,
)
train_dataloader = create_dataloader(train_batch_size)
if hyperparameters["scale_lr"]:
learning_rate = (
learning_rate
* gradient_accumulation_steps
* train_batch_size
* accelerator.num_processes
)
# Initialize the optimizer
optimizer = torch.optim.AdamW(
text_encoder.get_input_embeddings().parameters(), # only optimize the embeddings
lr=learning_rate,
)
text_encoder, optimizer, train_dataloader = accelerator.prepare(
text_encoder, optimizer, train_dataloader
)
# Move vae and unet to device
vae.to(accelerator.device)
unet.to(accelerator.device)
# Keep vae and unet in eval model as we don't train these
vae.eval()
unet.eval()
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
num_update_steps_per_epoch = math.ceil(
len(train_dataloader) / gradient_accumulation_steps
)
num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
# Train!
total_batch_size = (
train_batch_size
* accelerator.num_processes
* gradient_accumulation_steps
)
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Instantaneous batch size per device = {train_batch_size}")
logger.info(
f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
)
logger.info(
f" Gradient Accumulation steps = {gradient_accumulation_steps}"
)
logger.info(f" Total optimization steps = {max_train_steps}")
# Only show the progress bar once on each machine.
progress_bar = tqdm(
range(max_train_steps), disable=not accelerator.is_local_main_process
)
progress_bar.set_description("Steps")
global_step = 0
for epoch in range(num_train_epochs):
text_encoder.train()
for step, batch in enumerate(train_dataloader):
with accelerator.accumulate(text_encoder):
# Convert images to latent space
latents = (
vae.encode(batch["pixel_values"])
.latent_dist.sample()
.detach()
)
latents = latents * 0.18215
# Sample noise that we'll add to the latents
noise = torch.randn(latents.shape).to(latents.device)
bsz = latents.shape[0]
# Sample a random timestep for each image
timesteps = torch.randint(
0,
noise_scheduler.num_train_timesteps,
(bsz,),
device=latents.device,
).long()
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = noise_scheduler.add_noise(
latents, noise, timesteps
)
# Get the text embedding for conditioning
encoder_hidden_states = text_encoder(batch["input_ids"])[0]
# Predict the noise residual
noise_pred = unet(
noisy_latents, timesteps, encoder_hidden_states
).sample
loss = (
F.mse_loss(noise_pred, noise, reduction="none")
.mean([1, 2, 3])
.mean()
)
accelerator.backward(loss)
# Zero out the gradients for all token embeddings except the newly added
# embeddings for the concept, as we only want to optimize the concept embeddings
if accelerator.num_processes > 1:
grads = (
text_encoder.module.get_input_embeddings().weight.grad
)
else:
grads = text_encoder.get_input_embeddings().weight.grad
# Get the index for tokens that we want to zero the grads for
index_grads_to_zero = (
torch.arange(len(tokenizer)) != placeholder_token_id
)
grads.data[index_grads_to_zero, :] = grads.data[
index_grads_to_zero, :
].fill_(0)
optimizer.step()
optimizer.zero_grad()
# Checks if the accelerator has performed an optimization step behind the scenes
if accelerator.sync_gradients:
progress_bar.update(1)
global_step += 1
logs = {"loss": loss.detach().item()}
progress_bar.set_postfix(**logs)
if global_step >= max_train_steps:
break
accelerator.wait_for_everyone()
# Create the pipeline using using the trained modules and save it.
if accelerator.is_main_process:
pipeline = StableDiffusionPipeline(
text_encoder=accelerator.unwrap_model(text_encoder),
vae=vae,
unet=unet,
tokenizer=tokenizer,
scheduler=PNDMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
skip_prk_steps=True,
),
safety_checker=StableDiffusionSafetyChecker.from_pretrained(
"CompVis/stable-diffusion-safety-checker"
),
feature_extractor=CLIPFeatureExtractor.from_pretrained(
"openai/clip-vit-base-patch32"
),
)
pipeline.save_pretrained(output_dir)
# Also save the newly trained embeddings
learned_embeds = (
accelerator.unwrap_model(text_encoder)
.get_input_embeddings()
.weight[placeholder_token_id]
)
learned_embeds_dict = {
args.placeholder_token: learned_embeds.detach().cpu()
}
torch.save(
learned_embeds_dict, os.path.join(output_dir, "learned_embeds.bin")
)
import accelerate
accelerate.notebook_launcher(
training_function, args=(text_encoder, vae, unet), num_processes=1
)
# Set up the pipeline
pipe = StableDiffusionPipeline.from_pretrained(
hyperparameters["output_dir"],
# torch_dtype=torch.float16,
)
all_images = []
for _ in range(args.num_inference_samples):
images = pipe(
[args.prompt],
num_inference_steps=args.inference_steps,
guidance_scale=7.5,
).images
all_images.extend(images)
# output_path = os.path.abspath(os.path.join(os.getcwd(), args.output_dir))
if not os.path.isdir(args.output_dir):
os.mkdir(args.output_dir)
[
image.save(f"{args.output_dir}/{i}.jpeg")
for i, image in enumerate(all_images)
]

View File

@@ -1,43 +0,0 @@
# Stable Diffusion Fine Tuning
## Installation (Linux)
### Activate amdshark.venv Virtual Environment
```shell
source amdshark.venv/bin/activate
# Some older pip installs may not be able to handle the recent PyTorch deps
python -m pip install --upgrade pip
```
## Install dependencies
### Run the following installation commands:
```
pip install -U git+https://github.com/huggingface/diffusers.git
pip install accelerate transformers ftfy
```
### Build torch-mlir with the following branch:
Please cherry-pick this branch of torch-mlir: https://github.com/vivekkhandelwal1/torch-mlir/tree/sd-ops
and build it locally. You can find the instructions for using locally build Torch-MLIR,
here: https://github.com/nod-ai/AMD-SHARK-Studio#how-to-use-your-locally-built-iree--torch-mlir-with-amdshark
## Run the Stable diffusion fine tuning
To run the model with the default set of images and params, run:
```shell
python stable_diffusion_fine_tuning.py
```
By default the training is run through the PyTorch path. If you want to train the model using the Torchdynamo path of Torch-MLIR, you need to specify `--use_torchdynamo=True`.
The default number of training steps are `2000`, which would take many hours to complete based on your system config. You can pass the smaller value with the arg `--training_steps`. You can specify the number of images to be sampled for the result with the `--num_inference_samples` arg. For the number of inference steps you can use `--inference_steps` flag.
For example, you can run the training for a limited set of steps via the dynamo path by using the following command:
```
python stable_diffusion_fine_tuning.py --training_steps=1 --inference_steps=1 --num_inference_samples=1 --train_batch_size=1 --use_torchdynamo=True
```
You can also specify the device to be used via the flag `--device`. The default value is `cpu`, for GPU execution you can specify `--device="cuda"`.

View File

@@ -1,914 +0,0 @@
# Install the required libs
# pip install -U git+https://github.com/huggingface/diffusers.git
# pip install accelerate transformers ftfy
# Import required libraries
import argparse
import itertools
import math
import os
from typing import List
import random
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch.utils.data import Dataset
import PIL
import logging
import torch_mlir
from torch_mlir.dynamo import make_simple_dynamo_backend
import torch._dynamo as dynamo
from torch.fx.experimental.proxy_tensor import make_fx
from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
from amdshark.amdshark_inference import AMDSharkInference
torch._dynamo.config.verbose = True
from diffusers import (
AutoencoderKL,
DDPMScheduler,
PNDMScheduler,
StableDiffusionPipeline,
UNet2DConditionModel,
)
from diffusers.optimization import get_scheduler
from diffusers.pipelines.stable_diffusion import (
StableDiffusionSafetyChecker,
)
from PIL import Image
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import (
CLIPFeatureExtractor,
CLIPTextModel,
CLIPTokenizer,
)
# Enter your HuggingFace Token
# Note: You can comment this prompt and just set your token instead of passing it through cli for every execution.
hf_token = input("Please enter your huggingface token here: ")
YOUR_TOKEN = hf_token
def image_grid(imgs, rows, cols):
assert len(imgs) == rows * cols
w, h = imgs[0].size
grid = Image.new("RGB", size=(cols * w, rows * h))
grid_w, grid_h = grid.size
for i, img in enumerate(imgs):
grid.paste(img, box=(i % cols * w, i // cols * h))
return grid
# `pretrained_model_name_or_path` which Stable Diffusion checkpoint you want to use
# Options: 1.) "stabilityai/stable-diffusion-2"
# 2.) "stabilityai/stable-diffusion-2-base"
# 3.) "CompVis/stable-diffusion-v1-4"
# 4.) "runwayml/stable-diffusion-v1-5"
pretrained_model_name_or_path = "stabilityai/stable-diffusion-2"
# Add here the URLs to the images of the concept you are adding. 3-5 should be fine
urls = [
"https://huggingface.co/datasets/valhalla/images/resolve/main/2.jpeg",
"https://huggingface.co/datasets/valhalla/images/resolve/main/3.jpeg",
"https://huggingface.co/datasets/valhalla/images/resolve/main/5.jpeg",
"https://huggingface.co/datasets/valhalla/images/resolve/main/6.jpeg",
## You can add additional images here
]
# Downloading Images
import requests
import glob
from io import BytesIO
def download_image(url):
try:
response = requests.get(url)
except:
return None
return Image.open(BytesIO(response.content)).convert("RGB")
images = list(filter(None, [download_image(url) for url in urls]))
save_path = "./my_concept"
if not os.path.exists(save_path):
os.mkdir(save_path)
[image.save(f"{save_path}/{i}.jpeg") for i, image in enumerate(images)]
p = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
p.add_argument(
"--input_dir",
type=str,
default="my_concept/",
help="the directory contains the images used for fine tuning",
)
p.add_argument(
"--output_dir",
type=str,
default="sd_result",
help="the directory contains the images used for fine tuning",
)
p.add_argument(
"--training_steps",
type=int,
default=2000,
help="the maximum number of training steps",
)
p.add_argument(
"--train_batch_size",
type=int,
default=4,
help="The batch size for training",
)
p.add_argument(
"--save_steps",
type=int,
default=250,
help="the number of steps after which to save the learned concept",
)
p.add_argument("--seed", type=int, default=42, help="the random seed")
p.add_argument(
"--what_to_teach",
type=str,
choices=["object", "style"],
default="object",
help="what is it that you are teaching?",
)
p.add_argument(
"--placeholder_token",
type=str,
default="<cat-toy>",
help="It is the token you are going to use to represent your new concept",
)
p.add_argument(
"--initializer_token",
type=str,
default="toy",
help="It is a word that can summarise what is your new concept",
)
p.add_argument(
"--inference_steps",
type=int,
default=50,
help="the number of steps for inference",
)
p.add_argument(
"--num_inference_samples",
type=int,
default=4,
help="the number of samples for inference",
)
p.add_argument(
"--prompt",
type=str,
default="a grafitti in a wall with a *s on it",
help="the text prompt to use",
)
p.add_argument(
"--device",
type=str,
default="cpu",
help="The device to use",
)
p.add_argument(
"--use_torchdynamo",
type=bool,
default=False,
help="This flag is used to determine whether the training has to be done through the torchdynamo path or not.",
)
args = p.parse_args()
torch.manual_seed(args.seed)
if "*s" not in args.prompt:
raise ValueError(
f'The prompt should have a "*s" which will be replaced by a placeholder token.'
)
prompt1, prompt2 = args.prompt.split("*s")
args.prompt = prompt1 + args.placeholder_token + prompt2
# `images_path` is a path to directory containing the training images.
images_path = args.input_dir
while not os.path.exists(str(images_path)):
print(
"The images_path specified does not exist, use the colab file explorer to copy the path :"
)
images_path = input("")
save_path = images_path
# Setup and check the images you have just added
images = []
for file_path in os.listdir(save_path):
try:
image_path = os.path.join(save_path, file_path)
images.append(Image.open(image_path).resize((512, 512)))
except:
print(
f"{image_path} is not a valid image, please make sure to remove this file from the directory otherwise the training could fail."
)
image_grid(images, 1, len(images))
########### Create Dataset ##########
# Setup the prompt templates for training
imagenet_templates_small = [
"a photo of a {}",
"a rendering of a {}",
"a cropped photo of the {}",
"the photo of a {}",
"a photo of a clean {}",
"a photo of a dirty {}",
"a dark photo of the {}",
"a photo of my {}",
"a photo of the cool {}",
"a close-up photo of a {}",
"a bright photo of the {}",
"a cropped photo of a {}",
"a photo of the {}",
"a good photo of the {}",
"a photo of one {}",
"a close-up photo of the {}",
"a rendition of the {}",
"a photo of the clean {}",
"a rendition of a {}",
"a photo of a nice {}",
"a good photo of a {}",
"a photo of the nice {}",
"a photo of the small {}",
"a photo of the weird {}",
"a photo of the large {}",
"a photo of a cool {}",
"a photo of a small {}",
]
imagenet_style_templates_small = [
"a painting in the style of {}",
"a rendering in the style of {}",
"a cropped painting in the style of {}",
"the painting in the style of {}",
"a clean painting in the style of {}",
"a dirty painting in the style of {}",
"a dark painting in the style of {}",
"a picture in the style of {}",
"a cool painting in the style of {}",
"a close-up painting in the style of {}",
"a bright painting in the style of {}",
"a cropped painting in the style of {}",
"a good painting in the style of {}",
"a close-up painting in the style of {}",
"a rendition in the style of {}",
"a nice painting in the style of {}",
"a small painting in the style of {}",
"a weird painting in the style of {}",
"a large painting in the style of {}",
]
# Setup the dataset
class TextualInversionDataset(Dataset):
def __init__(
self,
data_root,
tokenizer,
learnable_property="object", # [object, style]
size=512,
repeats=100,
interpolation="bicubic",
flip_p=0.5,
set="train",
placeholder_token="*",
center_crop=False,
):
self.data_root = data_root
self.tokenizer = tokenizer
self.learnable_property = learnable_property
self.size = size
self.placeholder_token = placeholder_token
self.center_crop = center_crop
self.flip_p = flip_p
self.image_paths = [
os.path.join(self.data_root, file_path)
for file_path in os.listdir(self.data_root)
]
self.num_images = len(self.image_paths)
self._length = self.num_images
if set == "train":
self._length = self.num_images * repeats
self.interpolation = {
"linear": PIL.Image.LINEAR,
"bilinear": PIL.Image.BILINEAR,
"bicubic": PIL.Image.BICUBIC,
"lanczos": PIL.Image.LANCZOS,
}[interpolation]
self.templates = (
imagenet_style_templates_small
if learnable_property == "style"
else imagenet_templates_small
)
self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
def __len__(self):
return self._length
def __getitem__(self, i):
example = {}
image = Image.open(self.image_paths[i % self.num_images])
if not image.mode == "RGB":
image = image.convert("RGB")
placeholder_string = self.placeholder_token
text = random.choice(self.templates).format(placeholder_string)
example["input_ids"] = self.tokenizer(
text,
padding="max_length",
truncation=True,
max_length=self.tokenizer.model_max_length,
return_tensors="pt",
).input_ids[0]
# default to score-sde preprocessing
img = np.array(image).astype(np.uint8)
if self.center_crop:
crop = min(img.shape[0], img.shape[1])
(
h,
w,
) = (
img.shape[0],
img.shape[1],
)
img = img[
(h - crop) // 2 : (h + crop) // 2,
(w - crop) // 2 : (w + crop) // 2,
]
image = Image.fromarray(img)
image = image.resize(
(self.size, self.size), resample=self.interpolation
)
image = self.flip_transform(image)
image = np.array(image).astype(np.uint8)
image = (image / 127.5 - 1.0).astype(np.float32)
example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
return example
########## Setting up the model ##########
# Load the tokenizer and add the placeholder token as a additional special token.
tokenizer = CLIPTokenizer.from_pretrained(
pretrained_model_name_or_path,
subfolder="tokenizer",
)
# Add the placeholder token in tokenizer
num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
if num_added_tokens == 0:
raise ValueError(
f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
" `placeholder_token` that is not already in the tokenizer."
)
# Get token ids for our placeholder and initializer token.
# This code block will complain if initializer string is not a single token
# Convert the initializer_token, placeholder_token to ids
token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
# Check if initializer_token is a single token or a sequence of tokens
if len(token_ids) > 1:
raise ValueError("The initializer token must be a single token.")
initializer_token_id = token_ids[0]
placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
# Load the Stable Diffusion model
# Load models and create wrapper for stable diffusion
# pipeline = StableDiffusionPipeline.from_pretrained(pretrained_model_name_or_path)
# del pipeline
text_encoder = CLIPTextModel.from_pretrained(
pretrained_model_name_or_path, subfolder="text_encoder"
)
vae = AutoencoderKL.from_pretrained(
pretrained_model_name_or_path, subfolder="vae"
)
unet = UNet2DConditionModel.from_pretrained(
pretrained_model_name_or_path, subfolder="unet"
)
# We have added the placeholder_token in the tokenizer so we resize the token embeddings here
# this will a new embedding vector in the token embeddings for our placeholder_token
text_encoder.resize_token_embeddings(len(tokenizer))
# Initialise the newly added placeholder token with the embeddings of the initializer token
token_embeds = text_encoder.get_input_embeddings().weight.data
token_embeds[placeholder_token_id] = token_embeds[initializer_token_id]
# In Textual-Inversion we only train the newly added embedding vector
# so lets freeze rest of the model parameters here
def freeze_params(params):
for param in params:
param.requires_grad = False
# Freeze vae and unet
freeze_params(vae.parameters())
freeze_params(unet.parameters())
# Freeze all parameters except for the token embeddings in text encoder
params_to_freeze = itertools.chain(
text_encoder.text_model.encoder.parameters(),
text_encoder.text_model.final_layer_norm.parameters(),
text_encoder.text_model.embeddings.position_embedding.parameters(),
)
freeze_params(params_to_freeze)
# Move vae and unet to device
# For the dynamo path default compilation device is `cpu`, since torch-mlir
# supports only that. Therefore, convert to device only for PyTorch path.
if not args.use_torchdynamo:
vae.to(args.device)
unet.to(args.device)
# Keep vae in eval mode as we don't train it
vae.eval()
# Keep unet in train mode to enable gradient checkpointing
unet.train()
class VaeModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.vae = vae
def forward(self, input):
x = self.vae.encode(input, return_dict=False)[0]
return x
class UnetModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.unet = unet
def forward(self, x, y, z):
return self.unet.forward(x, y, z, return_dict=False)[0]
amdshark_vae = VaeModel()
amdshark_unet = UnetModel()
####### Creating our training data ########
# Let's create the Dataset and Dataloader
train_dataset = TextualInversionDataset(
data_root=save_path,
tokenizer=tokenizer,
size=vae.sample_size,
placeholder_token=args.placeholder_token,
repeats=100,
learnable_property=args.what_to_teach, # Option selected above between object and style
center_crop=False,
set="train",
)
def create_dataloader(train_batch_size=1):
return torch.utils.data.DataLoader(
train_dataset, batch_size=train_batch_size, shuffle=True
)
# Create noise_scheduler for training
noise_scheduler = DDPMScheduler.from_config(
pretrained_model_name_or_path, subfolder="scheduler"
)
######## Training ###########
# Define hyperparameters for our training. If you are not happy with your results,
# you can tune the `learning_rate` and the `max_train_steps`
# Setting up all training args
hyperparameters = {
"learning_rate": 5e-04,
"scale_lr": True,
"max_train_steps": args.training_steps,
"save_steps": args.save_steps,
"train_batch_size": args.train_batch_size,
"gradient_accumulation_steps": 1,
"gradient_checkpointing": True,
"mixed_precision": "fp16",
"seed": 42,
"output_dir": "sd-concept-output",
}
# creating output directory
cwd = os.getcwd()
out_dir = os.path.join(cwd, hyperparameters["output_dir"])
while not os.path.exists(str(out_dir)):
try:
os.mkdir(out_dir)
except OSError as error:
print("Output directory not created")
###### Torch-MLIR Compilation ######
def _remove_nones(fx_g: torch.fx.GraphModule) -> List[int]:
removed_indexes = []
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, (list, tuple)):
node_arg = list(node_arg)
node_args_len = len(node_arg)
for i in range(node_args_len):
curr_index = node_args_len - (i + 1)
if node_arg[curr_index] is None:
removed_indexes.append(curr_index)
node_arg.pop(curr_index)
node.args = (tuple(node_arg),)
break
if len(removed_indexes) > 0:
fx_g.graph.lint()
fx_g.graph.eliminate_dead_code()
fx_g.recompile()
removed_indexes.sort()
return removed_indexes
def _unwrap_single_tuple_return(fx_g: torch.fx.GraphModule) -> bool:
"""
Replace tuple with tuple element in functions that return one-element tuples.
Returns true if an unwrapping took place, and false otherwise.
"""
unwrapped_tuple = False
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple):
if len(node_arg) == 1:
node.args = (node_arg[0],)
unwrapped_tuple = True
break
if unwrapped_tuple:
fx_g.graph.lint()
fx_g.recompile()
return unwrapped_tuple
def _returns_nothing(fx_g: torch.fx.GraphModule) -> bool:
for node in fx_g.graph.nodes:
if node.op == "output":
assert (
len(node.args) == 1
), "Output node must have a single argument"
node_arg = node.args[0]
if isinstance(node_arg, tuple):
return len(node_arg) == 0
return False
def transform_fx(fx_g):
for node in fx_g.graph.nodes:
if node.op == "call_function":
if node.target in [
torch.ops.aten.empty,
]:
# aten.empty should be filled with zeros.
if node.target in [torch.ops.aten.empty]:
with fx_g.graph.inserting_after(node):
new_node = fx_g.graph.call_function(
torch.ops.aten.zero_,
args=(node,),
)
node.append(new_node)
node.replace_all_uses_with(new_node)
new_node.args = (node,)
fx_g.graph.lint()
@make_simple_dynamo_backend
def refbackend_torchdynamo_backend(
fx_graph: torch.fx.GraphModule, example_inputs: List[torch.Tensor]
):
# handling usage of empty tensor without initializing
transform_fx(fx_graph)
fx_graph.recompile()
if _returns_nothing(fx_graph):
return fx_graph
removed_none_indexes = _remove_nones(fx_graph)
was_unwrapped = _unwrap_single_tuple_return(fx_graph)
mlir_module = torch_mlir.compile(
fx_graph, example_inputs, output_type="linalg-on-tensors"
)
bytecode_stream = BytesIO()
mlir_module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
amdshark_module = AMDSharkInference(
mlir_module=bytecode, device=args.device, mlir_dialect="tm_tensor"
)
amdshark_module.compile()
def compiled_callable(*inputs):
inputs = [x.numpy() for x in inputs]
result = amdshark_module("forward", inputs)
if was_unwrapped:
result = [
result,
]
if not isinstance(result, list):
result = torch.from_numpy(result)
else:
result = tuple(torch.from_numpy(x) for x in result)
result = list(result)
for removed_index in removed_none_indexes:
result.insert(removed_index, None)
result = tuple(result)
return result
return compiled_callable
def predictions(torch_func, jit_func, batchA, batchB):
res = jit_func(batchA.numpy(), batchB.numpy())
if res is not None:
prediction = res
else:
prediction = None
return prediction
logger = logging.getLogger(__name__)
# def save_progress(text_encoder, placeholder_token_id, accelerator, save_path):
def save_progress(text_encoder, placeholder_token_id, save_path):
logger.info("Saving embeddings")
learned_embeds = (
# accelerator.unwrap_model(text_encoder)
text_encoder.get_input_embeddings().weight[placeholder_token_id]
)
learned_embeds_dict = {
args.placeholder_token: learned_embeds.detach().cpu()
}
torch.save(learned_embeds_dict, save_path)
train_batch_size = hyperparameters["train_batch_size"]
gradient_accumulation_steps = hyperparameters["gradient_accumulation_steps"]
learning_rate = hyperparameters["learning_rate"]
if hyperparameters["scale_lr"]:
learning_rate = (
learning_rate
* gradient_accumulation_steps
* train_batch_size
# * accelerator.num_processes
)
# Initialize the optimizer
optimizer = torch.optim.AdamW(
text_encoder.get_input_embeddings().parameters(), # only optimize the embeddings
lr=learning_rate,
)
# Training function
def train_func(batch_pixel_values, batch_input_ids):
# Convert images to latent space
latents = amdshark_vae(batch_pixel_values).sample().detach()
latents = latents * 0.18215
# Sample noise that we'll add to the latents
noise = torch.randn_like(latents)
bsz = latents.shape[0]
# Sample a random timestep for each image
timesteps = torch.randint(
0,
noise_scheduler.num_train_timesteps,
(bsz,),
device=latents.device,
).long()
# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
# Get the text embedding for conditioning
encoder_hidden_states = text_encoder(batch_input_ids)[0]
# Predict the noise residual
noise_pred = amdshark_unet(
noisy_latents,
timesteps,
encoder_hidden_states,
)
# Get the target for loss depending on the prediction type
if noise_scheduler.config.prediction_type == "epsilon":
target = noise
elif noise_scheduler.config.prediction_type == "v_prediction":
target = noise_scheduler.get_velocity(latents, noise, timesteps)
else:
raise ValueError(
f"Unknown prediction type {noise_scheduler.config.prediction_type}"
)
loss = (
F.mse_loss(noise_pred, target, reduction="none").mean([1, 2, 3]).mean()
)
loss.backward()
# Zero out the gradients for all token embeddings except the newly added
# embeddings for the concept, as we only want to optimize the concept embeddings
grads = text_encoder.get_input_embeddings().weight.grad
# Get the index for tokens that we want to zero the grads for
index_grads_to_zero = torch.arange(len(tokenizer)) != placeholder_token_id
grads.data[index_grads_to_zero, :] = grads.data[
index_grads_to_zero, :
].fill_(0)
optimizer.step()
optimizer.zero_grad()
return loss
def training_function():
max_train_steps = hyperparameters["max_train_steps"]
output_dir = hyperparameters["output_dir"]
gradient_checkpointing = hyperparameters["gradient_checkpointing"]
train_dataloader = create_dataloader(train_batch_size)
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
num_update_steps_per_epoch = math.ceil(
len(train_dataloader) / gradient_accumulation_steps
)
num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
# Train!
total_batch_size = (
train_batch_size
* gradient_accumulation_steps
# train_batch_size * accelerator.num_processes * gradient_accumulation_steps
)
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Instantaneous batch size per device = {train_batch_size}")
logger.info(
f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
)
logger.info(
f" Gradient Accumulation steps = {gradient_accumulation_steps}"
)
logger.info(f" Total optimization steps = {max_train_steps}")
# Only show the progress bar once on each machine.
progress_bar = tqdm(
# range(max_train_steps), disable=not accelerator.is_local_main_process
range(max_train_steps)
)
progress_bar.set_description("Steps")
global_step = 0
params_ = [i for i in text_encoder.get_input_embeddings().parameters()]
if args.use_torchdynamo:
print("******** TRAINING STARTED - TORCHYDNAMO PATH ********")
else:
print("******** TRAINING STARTED - PYTORCH PATH ********")
print("Initial weights:")
print(params_, params_[0].shape)
for epoch in range(num_train_epochs):
text_encoder.train()
for step, batch in enumerate(train_dataloader):
if args.use_torchdynamo:
dynamo_callable = dynamo.optimize(
refbackend_torchdynamo_backend
)(train_func)
lam_func = lambda x, y: dynamo_callable(
torch.from_numpy(x), torch.from_numpy(y)
)
loss = predictions(
train_func,
lam_func,
batch["pixel_values"],
batch["input_ids"],
# params[0].detach(),
)
else:
loss = train_func(batch["pixel_values"], batch["input_ids"])
print(loss)
# Checks if the accelerator has performed an optimization step behind the scenes
progress_bar.update(1)
global_step += 1
if global_step % hyperparameters["save_steps"] == 0:
save_path = os.path.join(
output_dir,
f"learned_embeds-step-{global_step}.bin",
)
save_progress(
text_encoder,
placeholder_token_id,
save_path,
)
logs = {"loss": loss.detach().item()}
progress_bar.set_postfix(**logs)
if global_step >= max_train_steps:
break
# Create the pipeline using using the trained modules and save it.
params__ = [i for i in text_encoder.get_input_embeddings().parameters()]
print("******** TRAINING PROCESS FINISHED ********")
print("Updated weights:")
print(params__, params__[0].shape)
pipeline = StableDiffusionPipeline.from_pretrained(
pretrained_model_name_or_path,
# text_encoder=accelerator.unwrap_model(text_encoder),
text_encoder=text_encoder,
tokenizer=tokenizer,
vae=vae,
unet=unet,
)
pipeline.save_pretrained(output_dir)
# Also save the newly trained embeddings
save_path = os.path.join(output_dir, f"learned_embeds.bin")
save_progress(text_encoder, placeholder_token_id, save_path)
training_function()
for param in itertools.chain(unet.parameters(), text_encoder.parameters()):
if param.grad is not None:
del param.grad # free some memory
torch.cuda.empty_cache()
# Set up the pipeline
from diffusers import DPMSolverMultistepScheduler
pipe = StableDiffusionPipeline.from_pretrained(
hyperparameters["output_dir"],
scheduler=DPMSolverMultistepScheduler.from_pretrained(
hyperparameters["output_dir"], subfolder="scheduler"
),
)
if not args.use_torchdynamo:
pipe.to(args.device)
# Run the Stable Diffusion pipeline
# Don't forget to use the placeholder token in your prompt
all_images = []
for _ in range(args.num_inference_samples):
images = pipe(
[args.prompt],
num_inference_steps=args.inference_steps,
guidance_scale=7.5,
).images
all_images.extend(images)
output_path = os.path.abspath(os.path.join(os.getcwd(), args.output_dir))
if not os.path.isdir(args.output_dir):
os.mkdir(args.output_dir)
[
image.save(f"{args.output_dir}/{i}.jpeg")
for i, image in enumerate(all_images)
]

View File

@@ -1,86 +0,0 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Any
import iree
import iree.runtime as ireert
import numpy as np
import torch
from iree.runtime import DeviceArray
from torch_mlir._mlir_libs._mlir.ir import Module
from torch_mlir.compiler_utils import (
run_pipeline_with_repro_report,
)
from torch_mlir.eager_mode.torch_mlir_eager_backend import (
TorchMLIREagerBackend,
TensorMetaData,
)
from torch_mlir_e2e_test.eager_backends.refbackend import (
NUMPY_TO_TORCH_DTYPE_DICT,
)
from amdshark.iree_utils.compile_utils import (
get_iree_compiled_module,
IREE_DEVICE_MAP,
)
class EagerModeIREELinalgOnTensorsBackend(TorchMLIREagerBackend):
"""Main entry-point for the iree backend for torch-mlir eager mode.
EagerModeIREELinalgOnTensorsBackend uses iree.DeviceArray representations of tensors and
thus all of the wrapping and unwrapping and munging here is done to between torch.Tensor and iree.DeviceArray,
with np.ndarray as an intermediary.
"""
def __init__(self, device: str):
self.torch_device_str = device
self.config = ireert.Config(IREE_DEVICE_MAP[device])
self.raw_device_str = device
def get_torch_metadata(
self, tensor: DeviceArray, kwargs: Dict[str, Any]
) -> TensorMetaData:
return TensorMetaData(
size=tensor.shape,
dtype=NUMPY_TO_TORCH_DTYPE_DICT[tensor.dtype.type],
device=torch.device(self.torch_device_str),
requires_grad=tensor.dtype.type
in {np.float, np.float32, np.float64}
and kwargs.get("requires_grad", False),
)
def compile(self, imported_module: Module):
run_pipeline_with_repro_report(
imported_module,
"torch-function-to-torch-backend-pipeline,torch-backend-to-linalg-on-tensors-backend-pipeline",
"EagerMode",
)
callable, _ = get_iree_compiled_module(
imported_module, self.raw_device_str
)
return callable
def copy_into(self, dst, src):
"""Copy output back to appropriate arg that it should alias."""
np.copyto(dst, src)
def transfer_from_device_to_torch(self, e):
return torch.from_numpy(e.to_host())
def transfer_from_torch_to_device(
self, tensor: torch.Tensor
) -> DeviceArray:
return iree.runtime.asdevicearray(self.config.device, tensor.numpy())

View File

@@ -1,164 +0,0 @@
# Copyright 2023 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
## Common utilities to be shared by iree utilities.
import functools
import os
import sys
import subprocess
def run_cmd(cmd, debug=False, raise_err=False):
"""
Inputs:
cmd : cli command string.
debug : if True, prints debug info
raise_err : if True, raise exception to caller
"""
if debug:
print("IREE run command: \n\n")
print(cmd)
print("\n\n")
try:
result = subprocess.run(
cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True,
)
stdout = result.stdout.decode()
stderr = result.stderr.decode()
return stdout, stderr
except subprocess.CalledProcessError as e:
if raise_err:
raise Exception from e
else:
print(e.output)
sys.exit(f"Exiting program due to error running {cmd}")
def iree_device_map(device):
uri_parts = device.split("://", 2)
iree_driver = (
_IREE_DEVICE_MAP[uri_parts[0]]
if uri_parts[0] in _IREE_DEVICE_MAP
else uri_parts[0]
)
if len(uri_parts) == 1:
return iree_driver
elif "rocm" in uri_parts:
return "rocm"
else:
return f"{iree_driver}://{uri_parts[1]}"
def get_supported_device_list():
return list(_IREE_DEVICE_MAP.keys())
_IREE_DEVICE_MAP = {
"cpu": "local-task",
"cpu-task": "local-task",
"cpu-sync": "local-sync",
"cuda": "cuda",
"vulkan": "vulkan",
"metal": "metal",
"rocm": "rocm",
"hip": "hip",
"intel-gpu": "level_zero",
}
def iree_target_map(device):
if "://" in device:
device = device.split("://")[0]
return _IREE_TARGET_MAP[device] if device in _IREE_TARGET_MAP else device
_IREE_TARGET_MAP = {
"cpu": "llvm-cpu",
"cpu-task": "llvm-cpu",
"cpu-sync": "llvm-cpu",
"cuda": "cuda",
"vulkan": "vulkan-spirv",
"metal": "metal",
"rocm": "rocm",
"hip": "rocm",
"intel-gpu": "opencl-spirv",
}
# Finds whether the required drivers are installed for the given device.
@functools.cache
def check_device_drivers(device):
"""
Checks necessary drivers present for gpu and vulkan devices
False => drivers present!
"""
if "://" in device:
device = device.split("://")[0]
from iree.runtime import get_driver
device_mapped = iree_device_map(device)
try:
_ = get_driver(device_mapped)
except ValueError as ve:
print(
f"[ERR] device `{device}` not registered with IREE. "
"Ensure IREE is configured for use with this device.\n"
f"Full Error: \n {repr(ve)}"
)
return True
except RuntimeError as re:
print(f"[ERR] Failed to get driver for {device} with error:\n{repr(re)}")
return True
# Unknown device. We assume drivers are installed.
return False
# Installation info for the missing device drivers.
def device_driver_info(device):
device_driver_err_map = {
"cuda": {
"debug": "Try `nvidia-smi` on system to check.",
"solution": " from https://www.nvidia.in/Download/index.aspx?lang=en-in for your system.",
},
"vulkan": {
"debug": "Try `vulkaninfo` on system to check.",
"solution": " from https://vulkan.lunarg.com/sdk/home for your distribution.",
},
"metal": {
"debug": "Check if Bare metal is supported and enabled on your system.",
"solution": ".",
},
"rocm": {
"debug": f"Try `{'hip' if sys.platform == 'win32' else 'rocm'}info` on system to check.",
"solution": " from https://rocm.docs.amd.com/en/latest/rocm.html for your system.",
},
}
if device in device_driver_err_map:
err_msg = (
f"Required drivers for {device} not found. {device_driver_err_map[device]['debug']} "
f"Please install the required drivers{device_driver_err_map[device]['solution']} "
f"For further assistance please reach out to the community on discord [https://discord.com/invite/RUqY2h2s9u]"
f" and/or file a bug at https://github.com/nod-ai/AMD-SHARK-Studio/issues"
)
return err_msg
else:
return f"{device} is not supported."

View File

@@ -1,154 +0,0 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from amdshark.iree_utils._common import run_cmd, iree_device_map
from amdshark.iree_utils.cpu_utils import get_cpu_count
import numpy as np
import os
import re
import platform
UNIT_TO_SECOND_MAP = {"us": 1e-6, "ms": 0.001, "s": 1}
def tensor_to_type_str(input_tensors: tuple, mlir_dialect: str):
"""
Input: A tuple of input tensors i.e tuple(torch.tensor)
Output: list of string that represent mlir types (i.e 1x24xf64)
# TODO: Support more than floats, and ints
"""
list_of_type = []
for input_tensor in input_tensors:
type_string = "x".join([str(dim) for dim in input_tensor.shape])
if mlir_dialect in ["linalg", "tosa"]:
dtype_string = str(input_tensor.dtype).replace("torch.", "")
elif mlir_dialect in ["mhlo", "tflite"]:
dtype = input_tensor.dtype
try:
dtype_string = re.findall("'[^\"]*'", str(dtype))[0].replace(
"'", ""
)
except IndexError:
dtype_string = str(dtype)
regex_split = re.compile("([a-zA-Z]+)([0-9]+)")
match = regex_split.match(dtype_string)
mlir_type_string = str(match.group(1)[0]) + str(match.group(2))
type_string += f"x{mlir_type_string}"
list_of_type.append(type_string)
return list_of_type
def build_benchmark_args(
input_file: str,
device: str,
input_tensors: tuple,
mlir_dialect: str,
training=False,
):
"""
Inputs: input_file leading to vmfb, input_tensor to function, target device,
and whether it is training or not.
Outputs: string that execute benchmark-module on target model.
"""
path = os.path.join(os.environ["VIRTUAL_ENV"], "bin")
if platform.system() == "Windows":
benchmarker_path = os.path.join(path, "iree-benchmark-module.exe")
time_extractor = None
else:
benchmarker_path = os.path.join(path, "iree-benchmark-module")
time_extractor = "| awk 'END{{print $2 $3}}'"
benchmark_cl = [benchmarker_path, f"--module={input_file}"]
# TODO: The function named can be passed as one of the args.
fn_name = "forward"
if training == True:
# TODO: Replace name of train with actual train fn name.
fn_name = "train"
benchmark_cl.append(f"--function={fn_name}")
benchmark_cl.append(f"--device={iree_device_map(device)}")
mlir_input_types = tensor_to_type_str(input_tensors, mlir_dialect)
for mlir_input in mlir_input_types:
benchmark_cl.append(f"--input={mlir_input}")
if device == "cpu":
num_cpus = get_cpu_count()
if num_cpus is not None:
benchmark_cl.append(f"--task_topology_max_group_count={num_cpus}")
# if time_extractor:
# benchmark_cl.append(time_extractor)
benchmark_cl.append(f"--print_statistics=true")
return benchmark_cl
def build_benchmark_args_non_tensor_input(
input_file: str,
device: str,
inputs: tuple,
mlir_dialect: str,
function_name: str,
):
"""
Inputs: input_file leading to vmfb, input_tensor to function, target device,
and whether it is training or not.
Outputs: string that execute benchmark-module on target model.
"""
path = os.path.join(os.environ["VIRTUAL_ENV"], "bin")
if platform.system() == "Windows":
benchmarker_path = os.path.join(path, "iree-benchmark-module.exe")
time_extractor = None
else:
benchmarker_path = os.path.join(path, "iree-benchmark-module")
time_extractor = "| awk 'END{{print $2 $3}}'"
benchmark_cl = [benchmarker_path, f"--module={input_file}"]
# TODO: The function named can be passed as one of the args.
if function_name:
benchmark_cl.append(f"--function={function_name}")
benchmark_cl.append(f"--device={iree_device_map(device)}")
for input in inputs:
benchmark_cl.append(f"--input={input}")
if platform.system() != "Windows":
time_extractor = "| awk 'END{{print $2 $3}}'"
benchmark_cl.append(time_extractor)
return benchmark_cl
def run_benchmark_module(benchmark_cl):
"""
Run benchmark command, extract result and return iteration/seconds, host
peak memory, and device peak memory.
# TODO: Add an example of the benchmark command.
Input: benchmark command.
"""
benchmark_path = benchmark_cl[0]
assert os.path.exists(
benchmark_path
), "Cannot find iree_benchmark_module, Please contact AMDSHARK maintainer on discord."
bench_stdout, bench_stderr = run_cmd(" ".join(benchmark_cl))
try:
regex_split = re.compile("(\d+[.]*\d*)( *)([a-zA-Z]+)")
match = regex_split.search(bench_stdout)
time_ms = float(match.group(1))
unit = match.group(3)
except AttributeError:
regex_split = re.compile("(\d+[.]*\d*)([a-zA-Z]+)")
match = regex_split.search(bench_stdout)
time_ms = float(match.group(1))
unit = match.group(2)
iter_per_second = 1.0 / (time_ms * 0.001)
# Extract peak memory.
host_regex = re.compile(r".*HOST_LOCAL:\s*([0-9]+)B peak")
host_peak_b = int(host_regex.search(bench_stderr).group(1))
device_regex = re.compile(r".*DEVICE_LOCAL:\s*([0-9]+)B peak")
device_peak_b = int(device_regex.search(bench_stderr).group(1))
return iter_per_second, host_peak_b, device_peak_b

View File

@@ -1,704 +0,0 @@
# Copyright 2023 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import numpy as np
import os
import re
import tempfile
from pathlib import Path
import iree.runtime as ireert
import iree.compiler as ireec
from amdshark.parser import amdshark_args
from .trace import DetailLogger
from ._common import iree_device_map, iree_target_map
from .cpu_utils import get_iree_cpu_rt_args
from .benchmark_utils import *
# Get the iree-compile arguments given device.
def get_iree_device_args(device, extra_args=[]):
print("Configuring for device:" + device)
device, device_num = clean_device_info(device)
if "cpu" in device:
from amdshark.iree_utils.cpu_utils import get_iree_cpu_args
u_kernel_flag = ["--iree-llvmcpu-enable-ukernels"]
stack_size_flag = ["--iree-llvmcpu-stack-allocation-limit=256000"]
return (
get_iree_cpu_args()
+ u_kernel_flag
+ stack_size_flag
)
if device == "cuda":
from amdshark.iree_utils.gpu_utils import get_iree_gpu_args
return get_iree_gpu_args()
if device == "vulkan":
from amdshark.iree_utils.vulkan_utils import get_iree_vulkan_args
return get_iree_vulkan_args(
device_num=device_num, extra_args=extra_args
)
if device == "metal":
from amdshark.iree_utils.metal_utils import get_iree_metal_args
return get_iree_metal_args(extra_args=extra_args)
if device == "rocm":
from amdshark.iree_utils.gpu_utils import get_iree_rocm_args
return get_iree_rocm_args(device_num=device_num, extra_args=extra_args)
if device == "hip":
from amdshark.iree_utils.gpu_utils import get_iree_rocm_args
return get_iree_rocm_args(device_num=device_num, extra_args=extra_args, hip_driver=True)
return []
def get_iree_target_triple(device):
args = get_iree_device_args(device)
for flag in args:
if "triple" in flag:
triple = flag.split("=")[-1]
return triple
return ""
def clean_device_info(raw_device):
# return appropriate device and device_id for consumption by Studio pipeline
# Multiple devices only supported for vulkan and rocm (as of now).
# default device must be selected for all others
device_id = None
device = (
raw_device
if "=>" not in raw_device
else raw_device.split("=>")[1].strip()
)
if "://" in device:
device, device_id = device.split("://")
if len(device_id) <= 2:
device_id = int(device_id)
if device not in ["hip", "rocm", "vulkan"]:
device_id = None
if device in ["hip", "rocm", "vulkan"] and device_id == None:
device_id = 0
return device, device_id
# Get the iree-compiler arguments given frontend.
def get_iree_frontend_args(frontend):
if frontend in ["torch", "pytorch", "linalg", "tm_tensor"]:
return ["--iree-llvmcpu-target-cpu-features=host"]
elif frontend in ["tensorflow", "tf", "mhlo", "stablehlo"]:
return [
"--iree-llvmcpu-target-cpu-features=host",
"--iree-input-demote-i64-to-i32",
]
else:
# Frontend not found.
return []
# Common args to be used given any frontend or device.
def get_iree_common_args(debug=False):
common_args = [
"--iree-util-zero-fill-elided-attrs",
"--mlir-elide-elementsattrs-if-larger=10",
]
if debug == True:
common_args.extend(
[
"--iree-opt-strip-assertions=false",
"--verify=true",
]
)
else:
common_args.extend(
[
"--iree-opt-strip-assertions=true",
"--verify=false",
]
)
return common_args
# Args that are suitable only for certain models or groups of models.
# amdshark_args are passed down from pytests to control which models compile with these flags,
# but they can also be set in amdshark/parser.py
def get_model_specific_args():
ms_args = []
if amdshark_args.enable_conv_transform == True:
ms_args += [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-flow-convert-conv-nchw-to-nhwc))"
]
if amdshark_args.enable_img2col_transform == True:
ms_args += [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-preprocessing-convert-conv2d-to-img2col))"
]
if amdshark_args.use_winograd == True:
ms_args += [
"--iree-preprocessing-pass-pipeline=builtin.module(func.func(iree-linalg-ext-convert-conv2d-to-winograd))"
]
return ms_args
def create_dispatch_dirs(bench_dir, device):
protected_files = ["ordered-dispatches.txt"]
bench_dir_path = bench_dir.split("/")
bench_dir_path[-1] = "temp_" + bench_dir_path[-1]
tmp_bench_dir = "/".join(bench_dir_path)
for f_ in os.listdir(bench_dir):
if os.path.isfile(f"{bench_dir}/{f_}") and f_ not in protected_files:
dir_name = re.sub("\.\S*$", "", f_)
if os.path.exists(f"{bench_dir}/{dir_name}"):
os.system(f"rm -rf {bench_dir}/{dir_name}")
os.system(f"mkdir {bench_dir}/{dir_name}")
os.system(f"mv {bench_dir}/{f_} {bench_dir}/{dir_name}/{f_}")
for f_ in os.listdir(tmp_bench_dir):
if os.path.isfile(f"{tmp_bench_dir}/{f_}"):
dir_name = ""
for d_ in os.listdir(bench_dir):
if re.search(f"{d_}(?=\D)", f_):
dir_name = d_
if dir_name != "":
os.system(
f"mv {tmp_bench_dir}/{f_} {bench_dir}/{dir_name}/{dir_name}_benchmark.mlir"
)
def dump_isas(bench_dir):
for d_ in os.listdir(bench_dir):
if os.path.isdir(f"{bench_dir}/{d_}"):
for f_ in os.listdir(f"{bench_dir}/{d_}"):
if f_.endswith(".spv"):
os.system(
f"amdllpc -gfxip 11.0 {bench_dir}/{d_}/{f_} -v > \
{bench_dir}/{d_}/isa.txt"
)
def compile_benchmark_dirs(bench_dir, device, dispatch_benchmarks):
benchmark_runtimes = {}
dispatch_list = []
all_dispatches = False
if dispatch_benchmarks.lower().strip() == "all":
all_dispatches = True
else:
try:
dispatch_list = [
int(dispatch_index)
for dispatch_index in dispatch_benchmarks.split(" ")
]
except:
print("ERROR: Invalid dispatch benchmarks")
return None
for d_ in os.listdir(bench_dir):
if os.path.isdir(f"{bench_dir}/{d_}"):
in_dispatches = False
for dispatch in dispatch_list:
if str(dispatch) in d_:
in_dispatches = True
if all_dispatches or in_dispatches:
for f_ in os.listdir(f"{bench_dir}/{d_}"):
if "benchmark.mlir" in f_:
dispatch_file = open(f"{bench_dir}/{d_}/{f_}", "r")
module = dispatch_file.read()
dispatch_file.close()
flatbuffer_blob = ireec.compile_str(
module, target_backends=[iree_target_map(device)]
)
vmfb_file = open(
f"{bench_dir}/{d_}/{d_}_benchmark.vmfb", "wb"
)
vmfb_file.write(flatbuffer_blob)
vmfb_file.close()
config = get_iree_runtime_config(device)
vm_module = ireert.VmModule.from_buffer(
config.vm_instance,
flatbuffer_blob,
warn_if_copy=False,
)
benchmark_cl = build_benchmark_args_non_tensor_input(
input_file=f"{bench_dir}/{d_}/{d_}_benchmark.vmfb",
device=device,
inputs=(0,),
mlir_dialect="linalg",
function_name="",
)
benchmark_bash = open(
f"{bench_dir}/{d_}/{d_}_benchmark.sh", "w+"
)
benchmark_bash.write("#!/bin/bash\n")
benchmark_bash.write(" ".join(benchmark_cl))
benchmark_bash.close()
iter_per_second, _, _ = run_benchmark_module(
benchmark_cl
)
benchmark_file = open(
f"{bench_dir}/{d_}/{d_}_data.txt", "w+"
)
benchmark_file.write(f"DISPATCH: {d_}\n")
benchmark_file.write(str(iter_per_second) + "\n")
benchmark_file.write(
"AMDSHARK BENCHMARK RESULT: "
+ str(1 / (iter_per_second * 0.001))
+ "\n"
)
benchmark_file.close()
benchmark_runtimes[d_] = 1 / (iter_per_second * 0.001)
elif ".mlir" in f_ and "benchmark" not in f_:
dispatch_file = open(f"{bench_dir}/{d_}/{f_}", "r")
module = dispatch_file.read()
dispatch_file.close()
module = re.sub(
"hal.executable private",
"hal.executable public",
module,
)
flatbuffer_blob = ireec.compile_str(
module,
target_backends=[iree_target_map(device)],
extra_args=["--compile-mode=hal-executable"],
)
spirv_file = open(
f"{bench_dir}/{d_}/{d_}_spirv.vmfb", "wb"
)
spirv_file.write(flatbuffer_blob)
spirv_file.close()
ordered_dispatches = [
(k, v)
for k, v in sorted(
benchmark_runtimes.items(), key=lambda item: item[1]
)
][::-1]
f_ = open(f"{bench_dir}/ordered-dispatches.txt", "w+")
for dispatch in ordered_dispatches:
f_.write(f"{dispatch[0]}: {dispatch[1]}ms\n")
f_.close()
def compile_module_to_flatbuffer(
module,
device,
frontend,
model_config_path,
extra_args,
model_name="None",
debug=False,
compile_str=False,
write_to=None,
):
# Setup Compile arguments wrt to frontends.
input_type = "auto"
args = get_iree_frontend_args(frontend)
args += get_iree_device_args(device, extra_args)
args += get_iree_common_args(debug=debug)
args += get_model_specific_args()
args += extra_args
args += amdshark_args.additional_compile_args
if frontend in ["tensorflow", "tf"]:
input_type = "auto"
elif frontend in ["stablehlo", "tosa"]:
input_type = frontend
elif frontend in ["tflite", "tflite-tosa"]:
input_type = "tosa"
elif frontend in ["tm_tensor"]:
input_type = ireec.InputType.TM_TENSOR
elif frontend in ["torch", "pytorch"]:
input_type = "torch"
if compile_str:
flatbuffer_blob = ireec.compile_str(
module,
target_backends=[iree_target_map(device)],
extra_args=args,
input_type=input_type,
)
else:
assert os.path.isfile(module)
flatbuffer_blob = ireec.compile_file(
str(module),
input_type=input_type,
target_backends=[iree_target_map(device)],
extra_args=args,
)
if write_to is not None:
with open(write_to, "wb") as f:
f.write(flatbuffer_blob)
return None
return flatbuffer_blob
def get_iree_module(
flatbuffer_blob,
device,
device_idx=None,
rt_flags: list = [],
external_weight_file=None,
):
if external_weight_file is not None:
index = ireert.ParameterIndex()
index.load(external_weight_file)
# Returns the compiled module and the configs.
for flag in rt_flags:
ireert.flags.parse_flag(flag)
if device_idx is not None:
device = iree_device_map(device)
print("registering device id: ", device_idx)
haldriver = ireert.get_driver(device)
hal_device_id = haldriver.query_available_devices()[device_idx][
"device_id"
]
haldevice = haldriver.create_device(
hal_device_id,
allocators=amdshark_args.device_allocator,
)
config = ireert.Config(device=haldevice)
config.id = hal_device_id
else:
config = get_iree_runtime_config(device)
vm_module = ireert.VmModule.from_buffer(
config.vm_instance, flatbuffer_blob, warn_if_copy=False
)
modules = []
if external_weight_file is not None:
modules.append(index.create_provider(scope="model"))
ctx = ireert.SystemContext(vm_modules=modules, config=config)
ctx.add_vm_module(vm_module)
ModuleCompiled = getattr(ctx.modules, vm_module.name)
return ModuleCompiled, config
def load_vmfb_using_mmap(
flatbuffer_blob_or_path,
device: str,
device_idx: int = None,
rt_flags: list = [],
external_weight_file: str = None,
):
print(f"Loading module {flatbuffer_blob_or_path}...")
if "task" in device:
print(
f"[DEBUG] setting iree runtime flags for cpu:\n{' '.join(get_iree_cpu_rt_args())}"
)
for flag in get_iree_cpu_rt_args():
rt_flags.append(flag)
for flag in rt_flags:
print(flag)
ireert.flags.parse_flags(flag)
if "rocm" in device:
device = "rocm"
with DetailLogger(timeout=2.5) as dl:
# First get configs.
if device_idx is not None:
dl.log(f"Mapping device id: {device_idx}")
device = iree_device_map(device)
haldriver = ireert.get_driver(device)
dl.log(f"ireert.get_driver()")
hal_device_id = haldriver.query_available_devices()[device_idx][
"device_id"
]
haldevice = haldriver.create_device(
hal_device_id,
allocators=amdshark_args.device_allocator,
)
dl.log(f"ireert.create_device()")
config = ireert.Config(device=haldevice)
config.id = hal_device_id
dl.log(f"ireert.Config()")
else:
config = get_iree_runtime_config(device)
dl.log("get_iree_runtime_config")
if "task" in device:
print(
f"[DEBUG] setting iree runtime flags for cpu:\n{' '.join(get_iree_cpu_rt_args())}"
)
for flag in get_iree_cpu_rt_args():
ireert.flags.parse_flags(flag)
# Now load vmfb.
# Two scenarios we have here :-
# 1. We either have the vmfb already saved and therefore pass the path of it.
# (This would arise if we're invoking `load_module` from a AMDSharkInference obj)
# OR 2. We are compiling on the fly, therefore we have the flatbuffer blob to play with.
# (This would arise if we're invoking `compile` from a AMDSharkInference obj)
temp_file_to_unlink = None
if isinstance(flatbuffer_blob_or_path, Path):
flatbuffer_blob_or_path = flatbuffer_blob_or_path.__str__()
if (
isinstance(flatbuffer_blob_or_path, str)
and ".vmfb" in flatbuffer_blob_or_path
):
vmfb_file_path = flatbuffer_blob_or_path
mmaped_vmfb = ireert.VmModule.mmap(
config.vm_instance, flatbuffer_blob_or_path
)
vm_modules = []
if external_weight_file is not None:
index = ireert.ParameterIndex()
index.load(external_weight_file)
param_module = ireert.create_io_parameters_module(
config.vm_instance, index.create_provider(scope="model")
)
vm_modules.append(param_module)
vm_modules.append(mmaped_vmfb)
vm_modules.append(
ireert.create_hal_module(config.vm_instance, config.device)
)
dl.log(f"mmap {flatbuffer_blob_or_path}")
if "vulkan" in device:
# Vulkan pipeline creation consumes significant amount of time.
print(
"\tCompiling Vulkan shaders. This may take a few minutes."
)
ctx = ireert.SystemContext(config=config, vm_modules=vm_modules)
dl.log(f"ireert.SystemContext created")
for flag in amdshark_args.additional_runtime_args:
ireert.flags.parse_flags(flag)
dl.log(f"module initialized")
mmaped_vmfb = getattr(ctx.modules, mmaped_vmfb.name)
else:
with tempfile.NamedTemporaryFile(delete=False) as tf:
tf.write(flatbuffer_blob_or_path)
tf.flush()
vmfb_file_path = tf.name
temp_file_to_unlink = vmfb_file_path
mmaped_vmfb = ireert.VmModule.mmap(instance, vmfb_file_path)
dl.log(f"mmap temp {vmfb_file_path}")
return mmaped_vmfb, config, temp_file_to_unlink
def get_iree_compiled_module(
module,
device: str,
frontend: str = "torch",
model_config_path: str = None,
extra_args: list = [],
rt_flags: list = [],
device_idx: int = None,
mmap: bool = False,
debug: bool = False,
compile_str: bool = False,
external_weight_file: str = None,
write_to: bool = None,
):
"""Given a module returns the compiled .vmfb and configs"""
flatbuffer_blob = compile_module_to_flatbuffer(
module=module,
device=device,
frontend=frontend,
model_config_path=model_config_path,
extra_args=extra_args,
debug=debug,
compile_str=compile_str,
write_to=write_to,
)
temp_file_to_unlink = None
# TODO: Currently mmap=True control flow path has been switched off for mmap.
# Got to find a cleaner way to unlink/delete the temporary file since
# we're setting delete=False when creating NamedTemporaryFile. That's why
# I'm getting hold of the name of the temporary file in `temp_file_to_unlink`.
if mmap:
if write_to is not None:
flatbuffer_blob = write_to
vmfb, config, temp_file_to_unlink = load_vmfb_using_mmap(
flatbuffer_blob,
device,
device_idx,
rt_flags,
external_weight_file=external_weight_file,
)
else:
vmfb, config = get_iree_module(
flatbuffer_blob,
device,
device_idx=device_idx,
rt_flags=rt_flags,
external_weight_file=external_weight_file,
)
ret_params = {
"vmfb": vmfb,
"config": config,
"temp_file_to_unlink": temp_file_to_unlink,
}
return ret_params
def load_flatbuffer(
flatbuffer_path: str,
device: str,
device_idx: int = None,
mmap: bool = False,
rt_flags: list = [],
):
temp_file_to_unlink = None
if mmap:
vmfb, config, temp_file_to_unlink = load_vmfb_using_mmap(
flatbuffer_path, device, device_idx, rt_flags
)
else:
with open(os.path.join(flatbuffer_path), "rb") as f:
flatbuffer_blob = f.read()
vmfb, config = get_iree_module(
flatbuffer_blob,
device,
device_idx=device_idx,
rt_flags=rt_flags,
)
ret_params = {
"vmfb": vmfb,
"config": config,
"temp_file_to_unlink": temp_file_to_unlink,
}
return ret_params
def export_iree_module_to_vmfb(
module,
device: str,
directory: str,
mlir_dialect: str = "linalg",
model_config_path: str = None,
module_name: str = None,
extra_args: list = [],
debug: bool = False,
compile_str: bool = False,
):
# Compiles the module given specs and saves it as .vmfb file.
flatbuffer_blob = compile_module_to_flatbuffer(
module=module,
device=device,
frontend=mlir_dialect,
model_config_path=model_config_path,
extra_args=extra_args,
debug=debug,
compile_str=compile_str,
)
if module_name is None:
device_name = (
device if "://" not in device else "-".join(device.split("://"))
)
module_name = f"{mlir_dialect}_{device_name}"
filename = os.path.join(directory, module_name + ".vmfb")
with open(filename, "wb") as f:
f.write(flatbuffer_blob)
print(f"Saved vmfb in {filename}.")
return filename
def export_module_to_mlir_file(module, frontend, directory: str):
# TODO: write proper documentation.
mlir_str = module
if frontend in ["tensorflow", "tf", "mhlo", "stablehlo", "tflite"]:
mlir_str = module.decode("utf-8")
elif frontend in ["pytorch", "torch"]:
mlir_str = module.operation.get_asm()
filename = os.path.join(directory, "model.mlir")
with open(filename, "w") as f:
f.write(mlir_str)
print(f"Saved mlir in {filename}.")
return filename
def get_results(
compiled_vm,
function_name,
input,
config,
frontend="torch",
send_to_host=True,
debug_timeout: float = 5.0,
device: str = None,
):
"""Runs a .vmfb file given inputs and config and returns output."""
with DetailLogger(debug_timeout) as dl:
device_inputs = []
if device == "rocm" and hasattr(config, "id"):
haldriver = ireert.get_driver("rocm")
haldevice = haldriver.create_device(
config.id,
allocators=amdshark_args.device_allocator,
)
for input_array in input:
dl.log(f"Load to device: {input_array.shape}")
device_inputs.append(
ireert.asdevicearray(config.device, input_array)
)
dl.log(f"Invoke function: {function_name}")
result = compiled_vm[function_name](*device_inputs)
dl.log(f"Invoke complete")
result_tensors = []
if isinstance(result, tuple):
if send_to_host:
for val in result:
dl.log(f"Result to host: {val.shape}")
result_tensors.append(np.asarray(val, val.dtype))
else:
for val in result:
result_tensors.append(val)
return result_tensors
elif isinstance(result, dict):
data = list(result.items())
if send_to_host:
res = np.array(data, dtype=object)
return np.copy(res)
return data
else:
if send_to_host and result is not None:
dl.log("Result to host")
return result.to_host()
return result
dl.log("Execution complete")
@functools.cache
def get_iree_runtime_config(device):
device = iree_device_map(device)
haldriver = ireert.get_driver(device)
if "metal" in device and amdshark_args.device_allocator == "caching":
print(
"[WARNING] metal devices can not have a `caching` allocator."
"\nUsing default allocator `None`"
)
haldevice = haldriver.create_device_by_uri(
device,
# metal devices have a failure with caching allocators atm. blcking this util it gets fixed upstream.
allocators=amdshark_args.device_allocator
if "metal" not in device
else None,
)
config = ireert.Config(device=haldevice)
return config

View File

@@ -1,65 +0,0 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# All the iree_cpu related functionalities go here.
import functools
import subprocess
import platform
from amdshark.parser import amdshark_args
def get_cpu_count():
import multiprocessing
try:
cpu_count = multiprocessing.cpu_count()
return cpu_count
except NotImplementedError:
return None
# Get the default cpu args.
@functools.cache
def get_iree_cpu_args():
uname = platform.uname()
os_name, proc_name = uname.system, uname.machine
if os_name == "Darwin":
kernel_version = uname.release
target_triple = f"{proc_name}-apple-darwin{kernel_version}"
elif os_name == "Linux":
target_triple = f"{proc_name}-linux-gnu"
elif os_name == "Windows":
target_triple = "x86_64-pc-windows-msvc"
else:
error_message = f"OS Type f{os_name} not supported and triple can't be determined, open issue to dAMDSHARK team please :)"
raise Exception(error_message)
print(f"Target triple found:{target_triple}")
return [
f"--iree-llvmcpu-target-triple={target_triple}",
]
# Get iree runtime flags for cpu
@functools.cache
def get_iree_cpu_rt_args():
default = get_cpu_count()
default = default if default <= 8 else default - 2
cpu_count = (
default
if amdshark_args.task_topology_max_group_count is None
else amdshark_args.task_topology_max_group_count
)
return [f"--task_topology_max_group_count={cpu_count}"]

View File

@@ -1,209 +0,0 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# All the iree_gpu related functionalities go here.
import functools
import iree.runtime as ireert
import ctypes
import sys
from subprocess import CalledProcessError
from amdshark.parser import amdshark_args
from amdshark.iree_utils._common import run_cmd
# TODO: refactor to rocm and cuda utils
# Get the default gpu args given the architecture.
@functools.cache
def get_iree_gpu_args():
ireert.flags.FUNCTION_INPUT_VALIDATION = False
ireert.flags.parse_flags("--cuda_allow_inline_execution")
# TODO: Give the user_interface to pass the sm_arch.
sm_arch = get_cuda_sm_cc()
if (
sm_arch
in ["sm_70", "sm_72", "sm_75", "sm_80", "sm_84", "sm_86", "sm_89"]
) and (amdshark_args.enable_tf32 == True):
return [
f"--iree-hal-cuda-llvm-target-arch={sm_arch}",
]
else:
return []
def check_rocm_device_arch_in_args(extra_args):
# Check if the target arch flag for rocm device present in extra_args
for flag in extra_args:
if "iree-rocm-target-chip" in flag:
flag_arch = flag.split("=")[1]
return flag_arch
return None
def get_rocm_device_arch(device_num=0, extra_args=[], hip_driver=False):
# ROCM Device Arch selection:
# 1 : User given device arch using `--iree-rocm-target-chip` flag
# 2 : Device arch from `iree-run-module --dump_devices=rocm` for device on index <device_num>
# 3 : default arch : gfx1100
arch_in_flag = check_rocm_device_arch_in_args(extra_args)
if arch_in_flag is not None:
print(
f"User Specified rocm target device arch from flag : {arch_in_flag} will be used"
)
return arch_in_flag
arch_in_device_dump = None
# get rocm arch from iree dump devices
def get_devices_info_from_dump(dump, driver):
from os import linesep
if driver == "hip":
dump_clean = list(
filter(
lambda s: "AMD" in s,
dump.split(linesep),
)
)
else:
dump_clean = list(
filter(
lambda s: f"--device={driver}" in s or "gpu-arch-name:" in s,
dump.split(linesep),
)
)
arch_pairs = [
(
dump_clean[i].split("=")[1].strip(),
dump_clean[i + 1].split(":")[1].strip(),
)
for i in range(0, len(dump_clean), 2)
]
return arch_pairs
dump_device_info = None
driver = "hip" if hip_driver else "rocm"
try:
dump_device_info = run_cmd(
"iree-run-module --dump_devices=" + driver, raise_err=True
)
except Exception as e:
print("could not execute `iree-run-module --dump_devices=" + driver + "`")
if dump_device_info is not None:
device_num = 0 if device_num is None else device_num
device_arch_pairs = get_devices_info_from_dump(dump_device_info[0], driver)
if len(device_arch_pairs) > device_num: # can find arch in the list
arch_in_device_dump = device_arch_pairs[device_num][1]
if arch_in_device_dump is not None:
print(f"Found ROCm device arch : {arch_in_device_dump}")
return arch_in_device_dump
default_rocm_arch = "gfx1100"
print(
"Did not find ROCm architecture from `--iree-rocm-target-chip` flag"
"\n or from `iree-run-module --dump_devices` command."
f"\nUsing {default_rocm_arch} as ROCm arch for compilation."
)
return default_rocm_arch
# Get the default gpu args given the architecture.
def get_iree_rocm_args(device_num=0, extra_args=[], hip_driver=False):
ireert.flags.FUNCTION_INPUT_VALIDATION = False
rocm_flags = []
if check_rocm_device_arch_in_args(extra_args) is None:
rocm_arch = get_rocm_device_arch(device_num, extra_args, hip_driver=hip_driver)
rocm_flags.append(f"--iree-rocm-target-chip={rocm_arch}")
return rocm_flags
# Some constants taken from cuda.h
CUDA_SUCCESS = 0
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36
@functools.cache
def get_cuda_sm_cc():
libnames = ("libcuda.so", "libcuda.dylib", "nvcuda.dll")
for libname in libnames:
try:
cuda = ctypes.CDLL(libname)
except OSError:
continue
else:
break
else:
raise OSError("could not load any of: " + " ".join(libnames))
nGpus = ctypes.c_int()
name = b" " * 100
cc_major = ctypes.c_int()
cc_minor = ctypes.c_int()
result = ctypes.c_int()
device = ctypes.c_int()
context = ctypes.c_void_p()
error_str = ctypes.c_char_p()
result = cuda.cuInit(0)
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
print(
"cuInit failed with error code %d: %s"
% (result, error_str.value.decode())
)
return 1
result = cuda.cuDeviceGetCount(ctypes.byref(nGpus))
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
print(
"cuDeviceGetCount failed with error code %d: %s"
% (result, error_str.value.decode())
)
return 1
print("Found %d device(s)." % nGpus.value)
for i in range(nGpus.value):
result = cuda.cuDeviceGet(ctypes.byref(device), i)
if result != CUDA_SUCCESS:
cuda.cuGetErrorString(result, ctypes.byref(error_str))
print(
"cuDeviceGet failed with error code %d: %s"
% (result, error_str.value.decode())
)
return 1
print("Device: %d" % i)
if (
cuda.cuDeviceGetName(ctypes.c_char_p(name), len(name), device)
== CUDA_SUCCESS
):
print(" Name: %s" % (name.split(b"\0", 1)[0].decode()))
if (
cuda.cuDeviceComputeCapability(
ctypes.byref(cc_major), ctypes.byref(cc_minor), device
)
== CUDA_SUCCESS
):
print(
" Compute Capability: %d.%d"
% (cc_major.value, cc_minor.value)
)
sm = f"sm_{cc_major.value}{cc_minor.value}"
return sm

View File

@@ -1,102 +0,0 @@
# Copyright 2023 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# All the iree_vulkan related functionalities go here.
import functools
from amdshark.iree_utils._common import run_cmd
import iree.runtime as ireert
from sys import platform
from amdshark.iree_utils.vulkan_target_env_utils import get_vulkan_target_env_flag
@functools.cache
def get_metal_device_name(device_num=0):
iree_device_dump = run_cmd("iree-run-module --dump_devices")
iree_device_dump = iree_device_dump[0].split("\n\n")
metal_device_list = [
s.split("\n#")[2] for s in iree_device_dump if "--device=metal" in s
]
if len(metal_device_list) == 0:
raise ValueError("No device name found in device dump!")
if len(metal_device_list) > 1:
print("Following devices found:")
for i, dname in enumerate(metal_device_list):
print(f"{i}. {dname}")
print(f"Choosing device: {metal_device_list[device_num]}")
return metal_device_list[device_num]
def get_os_name():
if platform.startswith("linux"):
return "linux"
elif platform == "darwin":
return "macos"
elif platform == "win32":
return "windows"
else:
print("Cannot detect OS type, defaulting to linux.")
return "linux"
def get_metal_target_triple(device_name):
"""This method provides a target triple str for specified vulkan device.
Args:
device_name (str): name of the hardware device to be used with vulkan
Returns:
str or None: target triple or None if no match found for given name
"""
return "macos"
def get_metal_triple_flag(device_name="", device_num=0, extra_args=[]):
for flag in extra_args:
if "-iree-metal-target-platform=" in flag:
print(f"Using target triple {flag.split('=')[1]}")
return None
if device_name == "" or device_name == [] or device_name is None:
metal_device = get_metal_device_name(device_num=device_num)
else:
metal_device = device_name
triple = get_metal_target_triple(metal_device)
if triple is not None:
print(
f"Found metal device {metal_device}. Using metal target platform {triple}"
)
return f"-iree-metal-target-platform={triple}"
print(
"""Optimized kernel for your target device is not added yet.
Contact AMDSHARK Admin on discord[https://discord.com/invite/RUqY2h2s9u]
or pull up an issue."""
)
print(f"Target : {metal_device}")
return None
def get_iree_metal_args(device_num=0, extra_args=[]):
# Add any metal spefic compilation flags here
res_metal_flag = []
if len(extra_args) > 0:
res_metal_flag.extend(extra_args)
return res_metal_flag
def set_iree_metal_runtime_flags(flags):
for flag in flags:
ireert.flags.parse_flags(flag)
return

View File

@@ -1,76 +0,0 @@
# Copyright 2023 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple
import os
import threading
import time
def _enable_detail_trace() -> bool:
return os.getenv("AMDSHARK_DETAIL_TRACE", "0") == "1"
class DetailLogger:
"""Context manager which can accumulate detailed log messages.
Detailed log is only emitted if the operation takes a long time
or errors.
"""
def __init__(self, timeout: float):
self._timeout = timeout
self._messages: List[Tuple[float, str]] = []
self._start_time = time.time()
self._active = not _enable_detail_trace()
self._lock = threading.RLock()
self._cond = threading.Condition(self._lock)
self._thread = None
def __enter__(self):
self._thread = threading.Thread(target=self._run)
self._thread.start()
return self
def __exit__(self, type, value, traceback):
with self._lock:
self._active = False
self._cond.notify()
if traceback:
self.dump_on_error(f"exception")
def _run(self):
with self._lock:
timed_out = not self._cond.wait(self._timeout)
if timed_out:
self.dump_on_error(f"took longer than {self._timeout}s")
def log(self, msg):
with self._lock:
timestamp = time.time()
if self._active:
self._messages.append((timestamp, msg))
else:
print(f" +{(timestamp - self._start_time) * 1000}ms: {msg}")
def dump_on_error(self, summary: str):
with self._lock:
if self._active:
print(f"::: Detailed report ({summary}):")
for timestamp, msg in self._messages:
print(
f" +{(timestamp - self._start_time) * 1000}ms: {msg}"
)
self._active = False

View File

@@ -1,538 +0,0 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
import functools
@functools.cache
def get_vulkan_target_env(vulkan_target_triple):
arch, product, os = vulkan_target_triple.split("=")[1].split("-")
triple = (arch, product, os)
# get version
version = get_version(triple=triple)
# TODO get revision
revision = 120
# extensions
extensions = get_extensions(triple)
# get vendor
vendor = get_vendor(triple)
# get device type
device_type = get_device_type(triple)
# get capabilities
capabilities = get_vulkan_target_capabilities(triple)
target_env = f"<#spirv.vce<{version}, r({revision}), {extensions}>, {vendor}:{device_type}, #spirv.resource_limits< {capabilities} >>"
return target_env
def get_vulkan_target_env_flag(vulkan_target_triple):
target_env = get_vulkan_target_env(vulkan_target_triple)
target_env_flag = f"--iree-vulkan-target-env={target_env}"
return target_env_flag
def get_version(triple):
arch, product, os = triple
if os in ["android30", "android31"]:
return "v1.1"
if product in ["android30", "android31"]:
return "v1.1"
if arch in ["unknown"]:
return "v1.1"
return "v1.3"
@functools.cache
def get_extensions(triple):
def make_ext_list(ext_list):
res = ", ".join(ext_list)
return f"[{res}]"
arch, product, os = triple
if arch == "m1":
ext = [
"SPV_KHR_16bit_storage",
"SPV_KHR_8bit_storage",
"SPV_KHR_shader_float16_int8",
"SPV_KHR_storage_buffer_storage_class",
"SPV_KHR_variable_pointers",
]
return make_ext_list(ext_list=ext)
if arch == "valhall":
ext = [
"SPV_KHR_16bit_storage",
"SPV_KHR_8bit_storage",
"SPV_KHR_shader_float16_int8",
"SPV_KHR_spirv_1_4",
"SPV_KHR_storage_buffer_storage_class",
"SPV_KHR_variable_pointers",
]
return make_ext_list(ext_list=ext)
if arch == "adreno":
ext = [
"SPV_KHR_16bit_storage",
"SPV_KHR_shader_float16_int8",
"SPV_KHR_spirv_1_4",
"SPV_KHR_storage_buffer_storage_class",
"SPV_KHR_variable_pointers",
]
if os == "android31":
ext.append("SPV_KHR_8bit_storage")
return make_ext_list(ext_list=ext)
if get_vendor(triple) == "SwiftShader":
ext = ["SPV_KHR_storage_buffer_storage_class"]
return make_ext_list(ext_list=ext)
if arch == "unknown":
ext = [
"SPV_KHR_storage_buffer_storage_class",
"SPV_KHR_variable_pointers",
]
return make_ext_list(ext_list=ext)
ext = [
"SPV_KHR_16bit_storage",
"SPV_KHR_8bit_storage",
"SPV_KHR_shader_float16_int8",
"SPV_KHR_spirv_1_4",
"SPV_KHR_storage_buffer_storage_class",
"SPV_KHR_variable_pointers",
"VK_EXT_subgroup_size_control",
]
if get_vendor(triple) == "NVIDIA" or arch == "rdna3":
ext.append("SPV_KHR_cooperative_matrix")
if get_vendor(triple) == ["NVIDIA", "AMD", "Intel"]:
ext.append("SPV_KHR_shader_integer_dot_product")
return make_ext_list(ext_list=ext)
@functools.cache
def get_vendor(triple):
arch, product, os = triple
if arch == "unknown":
return "Unknown"
if arch in ["rdna1", "rdna2", "rdna3", "rgcn3", "rgcn4", "rgcn5"]:
return "AMD"
if arch == "valhall":
return "ARM"
if arch == "m1":
return "Apple"
if arch in ["arc", "UHD"]:
return "Intel"
if arch in ["turing", "ampere", "pascal"]:
return "NVIDIA"
if arch == "adreno":
return "Qualcomm"
if arch == "cpu":
if product == "swiftshader":
return "SwiftShader"
return "Unknown"
print(f"Vendor for target triple - {triple} not found. Using unknown")
return "Unknown"
@functools.cache
def get_device_type(triple):
arch, product, _ = triple
if arch == "unknown":
return "Unknown"
if arch == "cpu":
return "CPU"
if arch in ["turing", "ampere", "arc", "pascal"]:
return "DiscreteGPU"
if arch in ["rdna1", "rdna2", "rdna3", "rgcn3", "rgcn5"]:
if product == "ivega10":
return "IntegratedGPU"
return "DiscreteGPU"
if arch in ["m1", "valhall", "adreno"]:
return "IntegratedGPU"
print(f"Device type for target triple - {triple} not found. Using unknown")
return "Unknown"
# get all the capabilities for the device
# TODO: make a dataclass for capabilites and init using vulkaninfo
@functools.cache
def get_vulkan_target_capabilities(triple):
def get_subgroup_val(l):
return int(sum([subgroup_feature[sgf] for sgf in l]))
cap = OrderedDict()
arch, product, os = triple
subgroup_feature = {
"Basic": 1,
"Vote": 2,
"Arithmetic": 4,
"Ballot": 8,
"Shuffle": 16,
"ShuffleRelative": 32,
"Clustered": 64,
"Quad": 128,
"PartitionedNV": 256,
}
cap["max_compute_shared_memory_size"] = 16384
cap["max_compute_workgroup_invocations"] = 128
cap["max_compute_workgroup_size"] = [128, 128, 64]
cap["subgroup_size"] = 32
cap["subgroupFeatures"] = ["Basic"]
cap["min_subgroup_size"] = None
cap["max_subgroup_size"] = None
cap["shaderFloat16"] = False
cap["shaderFloat64"] = False
cap["shaderInt8"] = False
cap["shaderInt16"] = False
cap["shaderInt64"] = False
cap["storageBuffer16BitAccess"] = False
cap["storagePushConstant16"] = False
cap["uniformAndStorageBuffer16BitAccess"] = False
cap["storageBuffer8BitAccess"] = False
cap["storagePushConstant8"] = False
cap["uniformAndStorageBuffer8BitAccess"] = False
cap["variablePointers"] = False
cap["variablePointersStorageBuffer"] = False
cap["coopmatCases"] = None
if arch in ["rdna1", "rdna2", "rdna3"]:
cap["max_compute_shared_memory_size"] = 65536
cap["max_compute_workgroup_invocations"] = 1024
cap["max_compute_workgroup_size"] = [1024, 1024, 1024]
cap["subgroup_size"] = 64
cap["min_subgroup_size"] = 32
cap["max_subgroup_size"] = 64
cap["subgroupFeatures"] = [
"Basic",
"Vote",
"Arithmetic",
"Ballot",
"Shuffle",
"ShuffleRelative",
"Clustered",
"Quad",
]
cap["shaderFloat16"] = True
cap["shaderFloat64"] = True
cap["shaderInt8"] = True
cap["shaderInt16"] = True
cap["shaderInt64"] = True
cap["shaderIntegerDotProduct"] = True
cap["storageBuffer16BitAccess"] = True
cap["storagePushConstant16"] = True
cap["uniformAndStorageBuffer16BitAccess"] = True
cap["storageBuffer8BitAccess"] = True
cap["storagePushConstant8"] = True
cap["uniformAndStorageBuffer8BitAccess"] = True
cap["variablePointers"] = True
cap["variablePointersStorageBuffer"] = True
if arch == "rdna3":
# TODO: Get scope value
cap["coopmatCases"] = [
"m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, acc_sat = false, scope = <Subgroup>",
"m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, acc_sat = false, scope = <Subgroup>"
]
if product == "rx5700xt":
cap["storagePushConstant16"] = False
cap["storagePushConstant8"] = False
elif arch in ["rgcn5", "rgcn4", "rgcn3"]:
cap["max_compute_shared_memory_size"] = 65536
cap["max_compute_workgroup_invocations"] = 1024
cap["max_compute_workgroup_size"] = [1024, 1024, 1024]
cap["subgroup_size"] = 64
cap["subgroupFeatures"] = [
"Basic",
"Vote",
"Arithmetic",
"Ballot",
"Shuffle",
"ShuffleRelative",
"Clustered",
"Quad",
]
cap["min_subgroup_size"] = 64
cap["max_subgroup_size"] = 64
if arch == "rgcn5":
cap["shaderFloat16"] = True
cap["shaderFloat64"] = True
cap["storageBuffer16BitAccess"] = True
cap["shaderInt8"] = True
cap["shaderInt16"] = True
cap["shaderInt64"] = True
cap["shaderIntegerDotProduct"] = True
cap["storagePushConstant16"] = False
cap["uniformAndStorageBuffer16BitAccess"] = True
cap["storageBuffer8BitAccess"] = True
cap["storagePushConstant8"] = False
cap["uniformAndStorageBuffer8BitAccess"] = True
cap["variablePointers"] = True
cap["variablePointersStorageBuffer"] = True
elif arch == "m1":
cap["max_compute_shared_memory_size"] = 32768
cap["max_compute_workgroup_invocations"] = 1024
cap["max_compute_workgroup_size"] = [1024, 1024, 1024]
cap["subgroup_size"] = 32
cap["subgroupFeatures"] = [
"Basic",
"Vote",
"Arithmetic",
"Ballot",
"Shuffle",
"ShuffleRelative",
"Quad",
]
cap["shaderFloat16"] = True
cap["shaderFloat64"] = True
cap["shaderInt8"] = True
cap["shaderInt16"] = True
cap["shaderInt64"] = True
cap["shaderIntegerDotProduct"] = False
cap["storageBuffer16BitAccess"] = True
cap["storagePushConstant16"] = True
cap["uniformAndStorageBuffer16BitAccess"] = True
cap["storageBuffer8BitAccess"] = True
cap["storagePushConstant8"] = True
cap["uniformAndStorageBuffer8BitAccess"] = True
cap["variablePointers"] = True
cap["variablePointersStorageBuffer"] = True
elif arch == "valhall":
cap["max_compute_shared_memory_size"] = 32768
cap["max_compute_workgroup_invocations"] = 512
cap["max_compute_workgroup_size"] = [512, 512, 512]
cap["subgroup_size"] = 16
cap["subgroupFeatures"] = [
"Basic",
"Vote",
"Arithmetic",
"Ballot",
"Clustered",
"Quad",
]
if os == "android31":
cap["subgroupFeatures"].append("Shuffle")
cap["subgroupFeatures"].append("ShuffleRelative")
cap["shaderFloat16"] = True
cap["shaderInt8"] = True
cap["shaderInt16"] = True
cap["storageBuffer16BitAccess"] = True
cap["storagePushConstant16"] = True
cap["uniformAndStorageBuffer16BitAccess"] = True
cap["storageBuffer8BitAccess"] = True
cap["storagePushConstant8"] = True
cap["uniformAndStorageBuffer8BitAccess"] = True
cap["variablePointers"] = True
cap["variablePointersStorageBuffer"] = True
elif arch == "arc":
cap["max_compute_shared_memory_size"] = 32768
cap["max_compute_workgroup_invocations"] = 1024
cap["max_compute_workgroup_size"] = [1024, 1024, 64]
cap["subgroup_size"] = 32
cap["subgroupFeatures"] = [
"Basic",
"Vote",
"Arithmetic",
"Ballot",
"Shuffle",
"ShuffleRelative",
"Clustered",
"Quad",
]
cap["shaderFloat16"] = True
cap["shaderFloat64"] = False
cap["shaderInt8"] = True
cap["shaderInt16"] = True
cap["shaderInt64"] = False
cap["shaderIntegerDotProduct"] = True
cap["storageBuffer16BitAccess"] = True
cap["storagePushConstant16"] = True
cap["uniformAndStorageBuffer16BitAccess"] = True
cap["storageBuffer8BitAccess"] = True
cap["storagePushConstant8"] = True
cap["uniformAndStorageBuffer8BitAccess"] = True
cap["variablePointers"] = True
cap["variablePointersStorageBuffer"] = True
elif arch == "cpu":
if product == "swiftshader":
cap["max_compute_shared_memory_size"] = 16384
cap["subgroup_size"] = 4
cap["subgroupFeatures"] = [
"Basic",
"Vote",
"Arithmetic",
"Ballot",
"Shuffle",
"ShuffleRelative",
]
elif arch in ["pascal"]:
cap["max_compute_shared_memory_size"] = 49152
cap["max_compute_workgroup_invocations"] = 1536
cap["max_compute_workgroup_size"] = [1536, 1024, 64]
cap["subgroup_size"] = 32
cap["min_subgroup_size"] = 32
cap["max_subgroup_size"] = 32
cap["subgroupFeatures"] = [
"Basic",
"Vote",
"Arithmetic",
"Ballot",
"Shuffle",
"ShuffleRelative",
"Clustered",
"Quad",
]
cap["shaderFloat16"] = False
cap["shaderFloat64"] = True
cap["shaderInt8"] = True
cap["shaderInt16"] = True
cap["shaderInt64"] = True
cap["shaderIntegerDotProduct"] = True
cap["storageBuffer16BitAccess"] = True
cap["storagePushConstant16"] = True
cap["uniformAndStorageBuffer16BitAccess"] = True
cap["storageBuffer8BitAccess"] = True
cap["storagePushConstant8"] = True
cap["uniformAndStorageBuffer8BitAccess"] = True
cap["variablePointers"] = True
cap["variablePointersStorageBuffer"] = True
elif arch in ["ampere", "turing"]:
cap["max_compute_shared_memory_size"] = 49152
cap["max_compute_workgroup_invocations"] = 1024
cap["max_compute_workgroup_size"] = [1024, 1024, 1024]
cap["subgroup_size"] = 32
cap["min_subgroup_size"] = 32
cap["max_subgroup_size"] = 32
cap["subgroupFeatures"] = [
"Basic",
"Vote",
"Arithmetic",
"Ballot",
"Shuffle",
"ShuffleRelative",
"Clustered",
"Quad",
]
cap["shaderFloat16"] = True
cap["shaderFloat64"] = True
cap["shaderInt8"] = True
cap["shaderInt16"] = True
cap["shaderInt64"] = True
cap["shaderIntegerDotProduct"] = True
cap["storageBuffer16BitAccess"] = True
cap["storagePushConstant16"] = True
cap["uniformAndStorageBuffer16BitAccess"] = True
cap["storageBuffer8BitAccess"] = True
cap["storagePushConstant8"] = True
cap["uniformAndStorageBuffer8BitAccess"] = True
cap["variablePointers"] = True
cap["variablePointersStorageBuffer"] = True
cap["coopmatCases"] = [
"mSize = 8, nSize = 8, kSize = 32, aType = i8, bType = i8, cType = i32, resultType = i32, accSat = false, scope = #vk.scope<Subgroup>",
"mSize = 16, nSize = 16, kSize = 16, aType = f16, bType = f16, cType = f16, resultType = f16, accSat = false, scope = #vk.scope<Subgroup>",
"mSize = 16, nSize = 16, kSize = 16, aType = f16, bType = f16, cType = f32, resultType = f32, accSat = false, scope = #vk.scope<Subgroup>",
]
elif arch == "adreno":
cap["max_compute_shared_memory_size"] = 32768
cap["max_compute_workgroup_invocations"] = 1024
cap["max_compute_workgroup_size"] = [1024, 1024, 64]
cap["subgroup_size"] = 64
cap["subgroupFeatures"] = [
"Basic",
"Vote",
"Arithmetic",
"Ballot",
"Shuffle",
"ShuffleRelative",
"Quad",
]
cap["shaderFloat16"] = True
cap["shaderInt8"] = True
cap["shaderInt16"] = True
cap["storageBuffer16BitAccess"] = True
if os == "android31":
cap["uniformAndStorageBuffer8BitAccess"] = True
cap["variablePointers"] = True
cap["variablePointersStorageBuffer"] = True
elif arch == "unknown":
cap["subgroup_size"] = 64
cap["variablePointers"] = False
cap["variablePointersStorageBuffer"] = False
else:
print(
f"Architecture {arch} not matched. Using default vulkan target device capability"
)
def get_comma_sep_str(ele_list):
l = ""
for ele in ele_list:
l += f"{ele}, "
l = f"[{l[:-2]}]"
return l
res = ""
for k, v in cap.items():
if v is None or v == False:
continue
if isinstance(v, bool):
res += f"{k} = {'unit' if v == True else None}, "
elif isinstance(v, list):
if k == "subgroupFeatures":
res += f"subgroup_features = {get_subgroup_val(v)}: i32, "
elif k == "max_compute_workgroup_size":
res += f"max_compute_workgroup_size = dense<{get_comma_sep_str(v)}>: vector<{len(v)}xi32>, "
elif k == "coopmatCases":
cmc = ""
for case in v:
cmc += f"#spirv.coop_matrix_props_khr<{case}>, "
res += f"cooperative_matrix_properties_khr = [{cmc[:-2]}], "
else:
res += f"{k} = {get_comma_sep_str(v)}, "
else:
res += f"{k} = {v}, "
res = res[:-2]
return res

View File

@@ -1,221 +0,0 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# All the iree_vulkan related functionalities go here.
import functools
from os import linesep
from amdshark.iree_utils._common import run_cmd
import iree.runtime as ireert
from sys import platform
from amdshark.iree_utils.vulkan_target_env_utils import get_vulkan_target_env_flag
from amdshark.parser import amdshark_args
@functools.cache
def get_all_vulkan_devices():
from iree.runtime import get_driver
try:
driver = get_driver("vulkan")
device_list_src = driver.query_available_devices()
except:
device_list_src = {}
return [d["name"] for d in device_list_src]
@functools.cache
def get_vulkan_device_name(device_num=0):
if isinstance(device_num, int):
vulkaninfo_list = get_all_vulkan_devices()
if len(vulkaninfo_list) == 0:
raise ValueError("No device name found in VulkanInfo!")
if len(vulkaninfo_list) > 1:
print("Following devices found:")
for i, dname in enumerate(vulkaninfo_list):
print(f"{i}. {dname}")
print(f"Choosing device: vulkan://{device_num}")
vulkan_device_name = vulkaninfo_list[device_num]
else:
from iree.runtime import get_driver
vulkan_device_driver = get_driver(device_num)
vulkan_device_name = vulkan_device_driver.query_available_devices()[0]
print(vulkan_device_name)
return vulkan_device_name
def get_os_name():
if platform.startswith("linux"):
return "linux"
elif platform == "darwin":
return "macos"
elif platform == "win32":
return "windows"
else:
print("Cannot detect OS type, defaulting to linux.")
return "linux"
@functools.cache
def get_vulkan_target_triple(device_name):
"""This method provides a target triple str for specified vulkan device.
Args:
device_name (str): name of the hardware device to be used with vulkan
Returns:
str or None: target triple or None if no match found for given name
"""
# TODO: Replace this with a dict or something smarter.
system_os = get_os_name()
# Apple Targets
if all(x in device_name for x in ("Apple", "M1")):
triple = "m1-moltenvk-macos"
elif all(x in device_name for x in ("Apple", "M2")):
triple = "m1-moltenvk-macos"
# Nvidia Targets
elif all(x in device_name for x in ("RTX", "2080")):
triple = f"turing-rtx2080-{system_os}"
elif all(x in device_name for x in ("A100", "SXM4")):
triple = f"ampere-a100-{system_os}"
elif all(x in device_name for x in ("RTX", "3090")):
triple = f"ampere-rtx3090-{system_os}"
elif all(x in device_name for x in ("RTX", "3080")):
triple = f"ampere-rtx3080-{system_os}"
elif all(x in device_name for x in ("RTX", "3070")):
triple = f"ampere-rtx3070-{system_os}"
elif all(x in device_name for x in ("RTX", "3060")):
triple = f"ampere-rtx3060-{system_os}"
elif all(x in device_name for x in ("RTX", "3050")):
triple = f"ampere-rtx3050-{system_os}"
# We use ampere until lovelace target triples are plumbed in.
elif all(x in device_name for x in ("RTX", "4090")):
triple = f"ampere-rtx4090-{system_os}"
elif all(x in device_name for x in ("RTX", "4080")):
triple = f"ampere-rtx4080-{system_os}"
elif all(x in device_name for x in ("RTX", "4070")):
triple = f"ampere-rtx4070-{system_os}"
elif all(x in device_name for x in ("RTX", "4000")):
triple = f"turing-rtx4000-{system_os}"
elif all(x in device_name for x in ("RTX", "5000")):
triple = f"turing-rtx5000-{system_os}"
elif all(x in device_name for x in ("RTX", "6000")):
triple = f"turing-rtx6000-{system_os}"
elif all(x in device_name for x in ("RTX", "8000")):
triple = f"turing-rtx8000-{system_os}"
elif all(x in device_name for x in ("TITAN", "RTX")):
triple = f"turing-titanrtx-{system_os}"
elif all(x in device_name for x in ("GTX", "1060")):
triple = f"pascal-gtx1060-{system_os}"
elif all(x in device_name for x in ("GTX", "1070")):
triple = f"pascal-gtx1070-{system_os}"
elif all(x in device_name for x in ("GTX", "1080")):
triple = f"pascal-gtx1080-{system_os}"
# Amd Targets
# Linux: Radeon RX 7900 XTX
# Windows: AMD Radeon RX 7900 XTX
elif all(x in device_name for x in ("RX", "7800")):
triple = f"rdna3-7800-{system_os}"
elif all(x in device_name for x in ("RX", "7900")):
triple = f"rdna3-7900-{system_os}"
elif all(x in device_name for x in ("Radeon", "780M")):
triple = f"rdna3-780m-{system_os}"
elif all(x in device_name for x in ("AMD", "PRO", "W7900")):
triple = f"rdna3-w7900-{system_os}"
elif any(x in device_name for x in ("AMD", "Radeon")):
triple = f"rdna2-unknown-{system_os}"
# Intel Targets
elif any(x in device_name for x in ("A770", "A750")):
triple = f"arc-770-{system_os}"
elif "v620" in device_name:
triple = f"rdna2-v620-{system_os}"
# Adreno Targets
elif all(x in device_name for x in ("Adreno", "740")):
triple = f"adreno-a740-{system_os}"
else:
triple = None
return triple
def get_vulkan_triple_flag(device_name="", device_num=0, extra_args=[]):
for flag in extra_args:
if "-iree-vulkan-target-triple=" in flag:
print(f"Using target triple {flag.split('=')[1]}")
return None
if device_name == "" or device_name == [] or device_name is None:
vulkan_device = get_vulkan_device_name(device_num=device_num)
else:
vulkan_device = device_name
triple = get_vulkan_target_triple(vulkan_device)
if triple is not None:
print(
f"Found vulkan device {vulkan_device}. Using target triple {triple}"
)
return f"--iree-vulkan-target-triple={triple}"
print(
"""Optimized kernel for your target device is not added yet.
Contact AMDSHARK Admin on discord[https://discord.com/invite/RUqY2h2s9u]
or pull up an issue."""
)
print(f"Target : {vulkan_device}")
return None
def get_iree_vulkan_args(device_num=0, extra_args=[]):
# res_vulkan_flag = ["--iree-flow-demote-i64-to-i32"]
res_vulkan_flag = []
res_vulkan_flag += [
"--iree-stream-resource-max-allocation-size=3221225472",
"--iree-flow-inline-constants-max-byte-length=0"
]
vulkan_triple_flag = None
for arg in extra_args:
if "-iree-vulkan-target-triple=" in arg:
print(f"Using target triple {arg} from command line args")
vulkan_triple_flag = arg
break
if vulkan_triple_flag is None:
vulkan_triple_flag = get_vulkan_triple_flag(
device_num=device_num, extra_args=extra_args
)
res_vulkan_flag += [vulkan_triple_flag]
return res_vulkan_flag
@functools.cache
def get_iree_vulkan_runtime_flags():
vulkan_runtime_flags = [
f"--vulkan_validation_layers={'true' if amdshark_args.vulkan_debug_utils else 'false'}",
f"--vulkan_debug_verbosity={'4' if amdshark_args.vulkan_debug_utils else '0'}"
f"--vulkan-robust-buffer-access={'true' if amdshark_args.vulkan_debug_utils else 'false'}",
]
return vulkan_runtime_flags
def set_iree_vulkan_runtime_flags(flags):
for flag in flags:
ireert.flags.parse_flags(flag)
return

View File

@@ -1,468 +0,0 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
This function takes the model mlir file and the tuned config file as input,
and output a new mlir file with lowering configs annotated on certain ops.
There are two ways to utilize the function:
1. Call model_annotation function within another python script
from amdshark.model_annotation import model_annotation
with create_context() as ctx:
module = model_annotation(ctx, input_contents=..., config_path=..., search_op=...)
2. Run model_annotation.py directly
python model_annotation.py -model path_to_original_mlir -config_path path_to_config_file
"""
import json
import os
import sys
from typing import Dict, List
import iree.compiler._mlir_libs
from iree.compiler import ir
def model_annotation(
ctx: ir.Context,
*,
input_contents: str,
config_path: str,
search_op: str,
winograd: bool = False,
):
if os.path.isfile(input_contents):
with open(input_contents, "rb") as f:
input_contents = f.read()
module = ir.Module.parse(input_contents)
if config_path == "":
return module
if winograd:
with open(config_path, "r") as f:
data = json.load(f)
configs = data["c,f"]
else:
configs = load_model_configs(config_path)
# The Python API does not expose a general walk() function, so we just
# do it ourselves.
walk_children(module.operation, configs, search_op, winograd)
if not module.operation.verify():
raise RuntimeError("Modified program does not verify!")
return module
def load_model_configs(config_path: str):
config = {}
with open(config_path, "r") as f:
for line in f:
data = json.loads(line)
if "identifier" not in data.keys():
continue
if data["identifier"] == "matmul":
matrix_size = [data["m"], data["n"], data["k"]]
elif data["identifier"] == "bmm":
matrix_size = [data["b"], data["m"], data["n"], data["k"]]
elif data["identifier"] == "generic":
matrix_size = [1, data["b"], data["m"], data["n"], data["k"]]
elif data["identifier"] == "conv":
matrix_size = [
data["n"],
data["ih"],
data["iw"],
data["c"],
data["kh"],
data["kw"],
data["f"],
data["oh"],
data["ow"],
data["d"],
data["s"],
data["p"],
]
config[shape_list_to_string(matrix_size)] = data
f.close()
return config
def walk_children(
op: ir.Operation, configs: List[Dict], search_op: str, winograd: bool
):
if search_op == "matmul":
op_names = ["linalg.matmul", "mhlo.dot"]
elif search_op == "bmm":
op_names = ["linalg.batch_matmul", "mhlo.dot_general"]
elif search_op == "conv":
op_names = ["mhlo.convolution", "linalg.conv_2d_nhwc_hwcf"]
elif search_op == "generic":
op_names = ["linalg.generic"]
elif search_op == "all":
op_names = [
"mhlo.dot",
"mhlo.dot_general",
"mhlo.convolution",
"linalg.matmul",
"linalg.batch_matmul",
"linalg.conv_2d_nhwc_hwcf",
"linalg.generic",
]
else:
raise ValueError(f"{search_op} op is not tunable.")
for region in op.regions:
for block in region.blocks:
for child_op in block.operations:
# TODO: This is dumb. Both Operation and OpView should expose
# 'operation' and 'name' attributes.
if isinstance(child_op, ir.OpView):
child_op = child_op.operation
if winograd and child_op.name in [
"linalg.conv_2d_nchw_fchw",
"linalg.conv_2d_nhwc_hwcf",
]:
add_winograd_attribute(child_op, configs)
if child_op.name in op_names:
if child_op.name == "linalg.generic":
# This is for generic op that has contractionOpInterface
# which is basically einsum("mk,bkn->bmn")
op_result = str(child_op.results[0])
op_iterator = str(
child_op.attributes["iterator_types"]
)
if len(child_op.operands) != 3:
continue
if "reduction" not in op_iterator:
continue
if (
"arith.addf" not in op_result
or "arith.mulf" not in op_result
):
continue
if "arith.subf" in op_result:
continue
child_op_shape = get_op_shape(child_op, search_op)
if (
child_op_shape in configs.keys()
and configs[child_op_shape]["options"][0] != None
):
add_attributes(
child_op, configs[child_op_shape]["options"][0]
)
walk_children(child_op, configs, search_op, winograd)
def get_op_shape(op: ir.Operation, search_op: str):
shape_list = []
if search_op in ["generic", "all"]:
if op.name in ["linalg.generic"]:
input1 = str(op.operands[0].type)
input2 = str(op.operands[1].type)
m = input1.split("tensor<")[1].split("x")[0]
b = input2.split("tensor<")[1].split("x")[0]
k = input2.split("tensor<")[1].split("x")[1]
n = input2.split("tensor<")[1].split("x")[2]
shape_list = [1, int(b), int(m), int(n), int(k)]
if search_op in ["matmul", "all"]:
if op.name in ["mhlo.dot"]:
op_result = str(op.results[0])
m = op_result.split("tensor<")[1].split("x")[0]
k = op_result.split("tensor<")[1].split("x")[1]
n = op_result.split("tensor<")[2].split("x")[1]
shape_list = [int(m), int(n), int(k)]
elif op.name in ["linalg.matmul"]:
op_result = str(op.results[0]).split("ins(")[1]
m = op_result.split("tensor<")[1].split("x")[0]
k = op_result.split("tensor<")[1].split("x")[1]
n = op_result.split("tensor<")[2].split("x")[1]
shape_list = [int(m), int(n), int(k)]
if search_op in ["bmm", "all"]:
if op.name in ["mhlo.dot_general"]:
op_result = str(op.results[0])
b = op_result.split("tensor<")[1].split("x")[1]
m = op_result.split("tensor<")[1].split("x")[2]
k = op_result.split("tensor<")[1].split("x")[3]
n = op_result.split("tensor<")[3].split("x")[3]
shape_list = [int(b), int(m), int(n), int(k)]
elif op.name in ["linalg.batch_matmul"]:
op_result = str(op.results[0]).split("ins(")[1]
b = op_result.split("tensor<")[1].split("x")[0]
m = op_result.split("tensor<")[1].split("x")[1]
k = op_result.split("tensor<")[1].split("x")[2]
n = op_result.split("tensor<")[3].split("x")[2]
shape_list = [int(b), int(m), int(n), int(k)]
if search_op in ["conv", "all"]:
if op.name in ["mhlo.convolution"]:
op_result = str(op.results[0])
dilation = (
str(op.attributes["rhs_dilation"])
.split("dense<")[1]
.split(">")[0]
)
stride = (
str(op.attributes["window_strides"])
.split("dense<")[1]
.split(">")[0]
)
pad = (
str(op.attributes["padding"]).split("dense<")[1].split(">")[0]
)
n = op_result.split("tensor<")[1].split("x")[0]
ih = op_result.split("tensor<")[1].split("x")[1]
iw = op_result.split("tensor<")[1].split("x")[2]
c = op_result.split("tensor<")[1].split("x")[3]
kh = op_result.split("tensor<")[2].split("x")[0]
kw = op_result.split("tensor<")[2].split("x")[1]
f = op_result.split("tensor<")[2].split("x")[3]
oh = op_result.split("tensor<")[3].split("x")[1]
ow = op_result.split("tensor<")[3].split("x")[2]
shape_list = [
int(n),
int(ih),
int(iw),
int(c),
int(kh),
int(kw),
int(f),
int(oh),
int(ow),
int(dilation),
int(stride),
int(pad),
]
elif op.name in ["linalg.conv_2d_nhwc_hwcf"]:
op_result = str(op.results[0]).split("ins(")[1]
dilation = (
str(op.attributes["dilations"])
.split("dense<")[1]
.split(">")[0]
)
stride = (
str(op.attributes["strides"]).split("dense<")[1].split(">")[0]
)
pad = 0
n = op_result.split("tensor<")[1].split("x")[0]
ih = op_result.split("tensor<")[1].split("x")[1]
iw = op_result.split("tensor<")[1].split("x")[2]
c = op_result.split("tensor<")[1].split("x")[3]
kh = op_result.split("tensor<")[2].split("x")[0]
kw = op_result.split("tensor<")[2].split("x")[1]
f = op_result.split("tensor<")[2].split("x")[3]
oh = op_result.split("tensor<")[3].split("x")[1]
ow = op_result.split("tensor<")[3].split("x")[2]
shape_list = [
int(n),
int(ih),
int(iw),
int(c),
int(kh),
int(kw),
int(f),
int(oh),
int(ow),
int(dilation),
int(stride),
int(pad),
]
shape_str = shape_list_to_string(shape_list)
return shape_str
def add_attributes(op: ir.Operation, config: List[Dict]):
# Parse the config file
split_k = None
pipeline_depth = None
store_stage = None
subgroup_size = None
if "GPU" in config["pipeline"]:
pipeline = (
"LLVMGPUMatmulSimt"
if config["pipeline"] == "GPU"
else "LLVMGPUMatmulTensorCore"
)
tile_sizes = [config["work_group_tile_sizes"]]
workgroup_size = config["work_group_sizes"]
if "pipeline_depth" in config.keys():
pipeline_depth = config["pipeline_depth"]
if "split_k" in config.keys():
split_k = config["split_k"]
elif "SPIRV" in config["pipeline"]:
pipeline = config["pipeline"]
if pipeline == "SPIRVMatmulPromoteVectorize":
tile_sizes = [
config["work_group_tile_sizes"]
+ [config["reduction_tile_sizes"][-1]],
]
else:
tile_sizes = [
config["work_group_tile_sizes"],
config["parallel_tile_sizes"],
config["reduction_tile_sizes"],
]
workgroup_size = config["work_group_sizes"]
if "vector_tile_sizes" in config.keys():
tile_sizes += [config["vector_tile_sizes"]]
if "window_tile_sizes" in config.keys():
tile_sizes += [config["window_tile_sizes"]]
if "subgroup_size" in config.keys():
subgroup_size = config["subgroup_size"]
if "pipeline_depth" in config.keys():
pipeline_depth = config["pipeline_depth"]
if "store_stage" in config.keys():
store_stage = config["store_stage"]
else:
# For IREE CPU pipelines
pipeline = config["pipeline"]
tile_sizes = [
config["work_group_tile_sizes"],
config["parallel_tile_sizes"],
config["reduction_tile_sizes"],
]
workgroup_size = []
# Add compilation info as an attribute. We don't have a Python binding for CompilationInfo,
# so we just parse its string form.
if pipeline_depth != None:
translation_info = f"{pipeline} pipeline_depth = {pipeline_depth}"
if store_stage != None:
translation_info += f" store_stage = {store_stage}"
else:
translation_info = f"{pipeline}"
compilation_info = (
f"#iree_codegen.compilation_info<"
f"lowering_config = <tile_sizes = {repr(tile_sizes)}>, "
f"translation_info = <{translation_info}>, "
f"workgroup_size = {repr(workgroup_size)} "
)
if subgroup_size != None:
compilation_info += f", subgroup_size = {subgroup_size}>"
else:
compilation_info += ">"
attr = ir.Attribute.parse(compilation_info)
op.attributes["compilation_info"] = attr
# Add other attributes if required.
if split_k:
add_attribute_by_name(op, "iree_flow_split_k", split_k)
def add_winograd_attribute(op: ir.Operation, config: List):
op_result = str(op.results[0]).split("ins(")[1]
dilation = int(
str(op.attributes["dilations"]).split("dense<")[1].split(">")[0]
)
stride = int(
str(op.attributes["strides"]).split("dense<")[1].split(">")[0]
)
if op.name == "linalg.conv_2d_nchw_fchw":
f = int(op_result.split("tensor<")[2].split("x")[0])
c = int(op_result.split("tensor<")[2].split("x")[1])
kh = int(op_result.split("tensor<")[2].split("x")[2])
kw = int(op_result.split("tensor<")[2].split("x")[3])
else:
kh = int(op_result.split("tensor<")[2].split("x")[0])
kw = int(op_result.split("tensor<")[2].split("x")[1])
c = int(op_result.split("tensor<")[2].split("x")[2])
f = int(op_result.split("tensor<")[2].split("x")[3])
if (
dilation == 1
and stride == 1
and kh == 3
and kw == 3
and [c, f] in config
):
op.attributes["iree_winograd_conv"] = ir.IntegerAttr.get(
ir.IntegerType.get_signless(64), 1
)
def add_attribute_by_name(op: ir.Operation, name: str, val: int):
attr = ir.IntegerAttr.get(ir.IntegerType.get_signless(64), val)
op.attributes[name] = attr
def shape_list_to_string(input):
return "x".join([str(d) for d in input])
def create_context() -> ir.Context:
context = ir.Context()
context.allow_unregistered_dialects = True
return context
if __name__ == "__main__":
import argparse
from pathlib import Path
def path_expand(s):
return Path(s).expanduser().resolve()
parser = argparse.ArgumentParser()
parser.add_argument(
"-model",
type=path_expand,
default="model.mlir",
help="Path to the input mlir file",
)
parser.add_argument(
"-config_path",
type=path_expand,
default="best_configs.json",
help="Path where stores the op config file",
)
parser.add_argument(
"-output_path",
type=path_expand,
default="tuned_model.mlir",
help="Path to save the annotated mlir file",
)
parser.add_argument(
"-search_op",
type=str,
default="all",
help="Op to be optimized. options are matmul, bmm, conv.",
)
args = parser.parse_args()
with create_context() as ctx:
module = model_annotation(
ctx,
input_contents=args.model,
config_path=args.config_path,
search_op=args.search_op,
)
mlir_str = str(module)
with open(args.output_path, "w") as f:
f.write(mlir_str)
print(f"Saved mlir in {args.output_path}.")

View File

@@ -1,170 +0,0 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import shlex
import subprocess
class SplitStrToListAction(argparse.Action):
def __init__(self, option_strings, dest, *args, **kwargs):
super(SplitStrToListAction, self).__init__(
option_strings=option_strings, dest=dest, *args, **kwargs
)
def __call__(self, parser, namespace, values, option_string=None):
del parser, option_string
setattr(namespace, self.dest, shlex.split(" "))
parser = argparse.ArgumentParser(description="AMDSHARK runner.")
parser.add_argument(
"--device",
type=str,
default="cpu",
help="Device on which amdshark_runner runs. options are cpu, cuda, and vulkan",
)
parser.add_argument(
"--additional_compile_args",
default=list(),
nargs=1,
action=SplitStrToListAction,
help="Additional arguments to pass to the compiler. These are appended as the last arguments.",
)
parser.add_argument(
"--additional_runtime_args",
default=list(),
nargs=1,
action=SplitStrToListAction,
help="Additional arguments to pass to the IREE runtime. These are appended as the last arguments.",
)
parser.add_argument(
"--enable_tf32",
type=bool,
default=False,
help="Enables TF32 precision calculations on supported GPUs.",
)
parser.add_argument(
"--model_config_path",
help="Directory to where the tuned model config file is located.",
default=None,
)
parser.add_argument(
"--num_warmup_iterations",
type=int,
default=5,
help="Run the model for the specified number of warmup iterations.",
)
parser.add_argument(
"--num_iterations",
type=int,
default=100,
help="Run the model for the specified number of iterations.",
)
parser.add_argument(
"--onnx_bench",
default=False,
action="store_true",
help="When enabled, pytest bench results will include ONNX benchmark results.",
)
parser.add_argument(
"--amdshark_prefix",
default=None,
help="gs://amdshark_tank/<this_flag>/model_directories",
)
parser.add_argument(
"--update_tank",
default=True,
action="store_true",
help="When enabled, AMDSHARK downloader will update local amdshark_tank if local hash is different from latest upstream hash.",
)
parser.add_argument(
"--force_update_tank",
default=False,
action="store_true",
help="When enabled, AMDSHARK downloader will force an update of local amdshark_tank artifacts for each request.",
)
parser.add_argument(
"--local_tank_cache",
default=None,
help="Specify where to save downloaded amdshark_tank artifacts. If this is not set, the default is ~/.local/amdshark_tank/.",
)
parser.add_argument(
"--dispatch_benchmarks",
default=None,
help='dispatches to return benchamrk data on. use "All" for all, and None for none.',
)
parser.add_argument(
"--dispatch_benchmarks_dir",
default="temp_dispatch_benchmarks",
help='directory where you want to store dispatch data generated with "--dispatch_benchmarks"',
)
parser.add_argument(
"--enable_conv_transform",
default=False,
action="store_true",
help="Enables the --iree-flow-enable-conv-nchw-to-nhwc-transform flag.",
)
parser.add_argument(
"--enable_img2col_transform",
default=False,
action="store_true",
help="Enables the --iree-flow-enable-conv-img2col-transform flag.",
)
parser.add_argument(
"--use_winograd",
default=False,
action="store_true",
help="Enables the --iree-flow-enable-conv-winograd-transform flag.",
)
parser.add_argument(
"--device_allocator",
type=str,
nargs="*",
default=["caching"],
help="Specifies one or more HAL device allocator specs "
"to augment the base device allocator",
choices=["debug", "caching"],
)
parser.add_argument(
"--task_topology_max_group_count",
type=str,
default=None,
help="passthrough flag for the iree flag of the same name. If None, defaults to cpu-count",
)
parser.add_argument(
"--vulkan_debug_utils",
default=False,
action=argparse.BooleanOptionalAction,
help="Profiles vulkan device and collects the .rdc info.",
)
parser.add_argument(
"--vulkan_validation_layers",
default=False,
action=argparse.BooleanOptionalAction,
help="Flag for disabling vulkan validation layers when benchmarking.",
)
amdshark_args, unknown = parser.parse_known_args()

View File

@@ -1,315 +0,0 @@
# Copyright 2022 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from iree.runtime import query_available_drivers, get_driver
from amdshark.amdshark_downloader import download_model
from amdshark.amdshark_inference import AMDSharkInference
from typing import List, Optional, Tuple
import numpy as np
import argparse
from amdshark.iree_utils._common import _IREE_DEVICE_MAP
import multiprocessing
from amdshark.amdshark_runner import supported_dialects
import logging
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures.thread import ThreadPoolExecutor
import time
import numpy as np
IREE_TO_AMDSHARK_DRIVER_MAP = {v: k for k, v in _IREE_DEVICE_MAP.items()}
def stress_test_compiled_model(
amdshark_module_path: str,
function_name: str,
device: str,
inputs: List[np.ndarray],
golden_out: List[np.ndarray],
batch_size: int,
max_iterations: int,
max_duration_seconds: float,
inference_timeout_seconds: float,
tolerance_nulp: int,
stress_test_index: int,
):
logging.info(
f"Running stress test {stress_test_index} on device {device}."
)
# All interactions with the module must run in a single thread.
# We are using execution in a sperate thread in order to be able
# to wait with a timeout on the inference operation.
module_executor = ThreadPoolExecutor(1)
amdshark_module = module_executor.submit(
AMDSharkInference,
mlir_module=bytes(),
function_name=function_name,
device=device,
).result()
module_executor.submit(
amdshark_module.load_module, amdshark_module_path
).result()
input_batches = [np.repeat(arr, batch_size, axis=0) for arr in inputs]
golden_output_batches = np.repeat(golden_out, batch_size, axis=0)
report_interval_seconds = 10
start_time = time.time()
previous_report_time = start_time
first_iteration_output = None
for i in range(max_iterations):
output = module_executor.submit(
amdshark_module.forward, input_batches
).result(inference_timeout_seconds)
if first_iteration_output is None:
np.testing.assert_array_almost_equal_nulp(
golden_output_batches, output, nulp=tolerance_nulp
)
first_iteration_output = output
else:
np.testing.assert_array_equal(output, first_iteration_output)
current_time = time.time()
if report_interval_seconds < current_time - previous_report_time:
logging.info(
f"Stress test {stress_test_index} on device "
f"{device} at iteration {i+1}"
)
previous_report_time = current_time
if max_duration_seconds < current_time - start_time:
return
logging.info(f"Stress test {stress_test_index} on device {device} done.")
def get_device_type(device_name: str):
return device_name.split("://", 1)[0]
def get_device_types(device_names: str):
return [get_device_type(device_name) for device_name in device_names]
def query_devices(device_types: Optional[List[str]] = None) -> List[str]:
devices = []
if device_types is None:
device_types = [
IREE_TO_AMDSHARK_DRIVER_MAP[name]
for name in query_available_drivers()
if name in IREE_TO_AMDSHARK_DRIVER_MAP
]
for device_type in device_types:
driver = get_driver(_IREE_DEVICE_MAP[device_type])
device_infos = driver.query_available_devices()
for device_info in device_infos:
uri_path = (
device_info["path"]
if device_info["path"] != ""
else str(device_info["device_id"])
)
device_uri = f"{device_type}://{uri_path}"
devices.append(device_uri)
return devices
def compile_stress_test_module(
device_types: List[str], mlir_model: str, func_name: str, mlir_dialect: str
) -> List[str]:
amdshark_module_paths = []
for device_type in device_types:
logging.info(
f"Compiling stress test model for device type {device_type}."
)
amdshark_module = AMDSharkInference(
mlir_model,
func_name,
mlir_dialect=mlir_dialect,
device=device_type,
)
amdshark_module_paths.append(amdshark_module.save_module())
return amdshark_module_paths
def stress_test(
model_name: str,
dynamic_model: bool = False,
device_types: Optional[List[str]] = None,
device_names: Optional[List[str]] = None,
batch_size: int = 1,
max_iterations: int = 10**7,
max_duration_seconds: float = 3600,
inference_timeout_seconds: float = 60,
mlir_dialect: str = "linalg",
frontend: str = "torch",
oversubscription_factor: int = 1,
tolerance_nulp: int = 50000,
):
logging.info(f"Downloading stress test model {model_name}.")
mlir_model, func_name, inputs, golden_out = download_model(
model_name=model_name, dynamic=dynamic_model, frontend=frontend
)
if device_names is None or device_types is not None:
device_names = [] if device_names is None else device_names
with ProcessPoolExecutor() as executor:
# query_devices needs to run in a separate process,
# because it will interfere with other processes that are forked later.
device_names.extend(
executor.submit(query_devices, device_types).result()
)
device_types_set = list(set(get_device_types(device_names)))
with ProcessPoolExecutor() as executor:
# This needs to run in a subprocess because when compiling for CUDA,
# some stuff get intialized and cuInit will fail in a forked process
# later. It should be just compiling, but alas.
amdshark_module_paths_set = executor.submit(
compile_stress_test_module,
device_types_set,
mlir_model,
func_name,
mlir_dialect,
).result()
device_type_amdshark_module_path_map = {
device_type: module_path
for device_type, module_path in zip(
device_types_set, amdshark_module_paths_set
)
}
device_name_amdshark_module_path_map = {
device_name: device_type_amdshark_module_path_map[
get_device_type(device_name)
]
for device_name in device_names
}
# This needs to run in a spearate process, because it uses the drvier chache
# in IREE and a subsequent call to `iree.runtime.SystemContext.add_vm_module`
# in a forked process will hang.
with multiprocessing.Pool(
len(device_name_amdshark_module_path_map) * oversubscription_factor
) as process_pool:
process_pool.starmap(
stress_test_compiled_model,
[
(
module_path,
func_name,
device_name,
inputs,
golden_out,
batch_size,
max_iterations,
max_duration_seconds,
inference_timeout_seconds,
tolerance_nulp,
stress_test_index,
)
for stress_test_index, (device_name, module_path) in enumerate(
list(device_name_amdshark_module_path_map.items())
* oversubscription_factor
)
],
)
if __name__ == "__main__":
logging.basicConfig(encoding="utf-8", level=logging.INFO)
parser = argparse.ArgumentParser(
description="Downloads, compiles and runs a model from the tank to stress test the system."
)
parser.add_argument(
"--model", type=str, help="Model name in the tank.", default="alexnet"
)
parser.add_argument(
"--dynamic",
help="Use dynamic version of the model.",
action="store_true",
default=False,
)
parser.add_argument(
"--frontend", type=str, help="Frontend of the model.", default="torch"
)
parser.add_argument(
"--mlir-dialect",
type=str,
help="MLIR dialect of the model.",
default="linalg",
choices=supported_dialects,
)
parser.add_argument(
"--device-types",
type=str,
nargs="*",
choices=_IREE_DEVICE_MAP.keys(),
help="Runs the stress test on all devices with that type. "
"If absent and no deveices are specified "
"will run against all available devices.",
)
parser.add_argument(
"--devices",
type=str,
nargs="*",
help="List of devices to run the stress test on. "
"If device-types is specified will run against the union of the two.",
)
parser.add_argument(
"--batch-size",
type=int,
help="Number of inputs to feed into the model",
default=1,
)
parser.add_argument(
"--oversubscription",
type=int,
help="Oversubscrption factor. Each device will execute the model simultaneously "
"this many number of times.",
default=1,
)
parser.add_argument(
"--max-iterations",
type=int,
help="Maximum number of iterations to run the stress test per device.",
default=10**7,
)
parser.add_argument(
"--max-duration",
type=float,
help="Maximum number of seconds to run the stress test.",
default=3600,
)
parser.add_argument(
"--inference-timeout",
type=float,
help="Timeout in seconds for a single model inference operation.",
default=60,
)
parser.add_argument(
"--tolerance-nulp",
type=int,
help="The maximum number of unit in the last place for tolerance "
"when verifing results with the golden reference output.",
default=50000,
)
args = parser.parse_known_args()[0]
stress_test(
model_name=args.model,
dynamic_model=args.dynamic,
frontend=args.frontend,
mlir_dialect=args.mlir_dialect,
device_types=args.device_types,
device_names=args.devices,
batch_size=args.batch_size,
oversubscription_factor=args.oversubscription,
max_iterations=args.max_iterations,
max_duration_seconds=args.max_duration,
inference_timeout_seconds=args.inference_timeout,
tolerance_nulp=args.tolerance_nulp,
)

View File

@@ -1,144 +0,0 @@
# RUN: %PYTHON %s
import numpy as np
from amdshark.amdshark_importer import AMDSharkImporter
import pytest
from amdshark.parser import amdshark_args
from amdshark.amdshark_inference import AMDSharkInference
from amdshark.tflite_utils import TFLitePreprocessor
import sys
# model_path = "https://tfhub.dev/tensorflow/lite-model/albert_lite_base/squadv1/1?lite-format=tflite"
# Inputs modified to be useful albert inputs.
def generate_inputs(input_details):
for input in input_details:
print(str(input["shape"]), input["dtype"].__name__)
args = []
args.append(
np.random.randint(
low=0,
high=256,
size=input_details[0]["shape"],
dtype=input_details[0]["dtype"],
)
)
args.append(
np.ones(
shape=input_details[1]["shape"], dtype=input_details[1]["dtype"]
)
)
args.append(
np.zeros(
shape=input_details[2]["shape"], dtype=input_details[2]["dtype"]
)
)
return args
def compare_results(mlir_results, tflite_results, details):
print("Compare mlir_results VS tflite_results: ")
assert len(mlir_results) == len(
tflite_results
), "Number of results do not match"
for i in range(len(details)):
mlir_result = mlir_results[i]
tflite_result = tflite_results[i]
mlir_result = mlir_result.astype(np.single)
tflite_result = tflite_result.astype(np.single)
assert mlir_result.shape == tflite_result.shape, "shape doesnot match"
max_error = np.max(np.abs(mlir_result - tflite_result))
print("Max error (%d): %f", i, max_error)
class AlbertTfliteModuleTester:
def __init__(
self,
dynamic=False,
device="cpu",
save_mlir=False,
save_vmfb=False,
):
self.dynamic = dynamic
self.device = device
self.save_mlir = save_mlir
self.save_vmfb = save_vmfb
def create_and_check_module(self):
amdshark_args.save_mlir = self.save_mlir
amdshark_args.save_vmfb = self.save_vmfb
tflite_preprocessor = TFLitePreprocessor(model_name="albert_lite_base")
raw_model_file_path = tflite_preprocessor.get_raw_model_file()
inputs = tflite_preprocessor.get_inputs()
tflite_interpreter = tflite_preprocessor.get_interpreter()
my_amdshark_importer = AMDSharkImporter(
module=tflite_interpreter,
inputs=inputs,
frontend="tflite",
raw_model_file=raw_model_file_path,
)
mlir_model, func_name = my_amdshark_importer.import_mlir()
amdshark_module = AMDSharkInference(
mlir_module=mlir_model,
function_name=func_name,
device=self.device,
mlir_dialect="tflite",
)
# Case1: Use amdshark_importer default generate inputs
amdshark_module.compile()
mlir_results = amdshark_module.forward(inputs)
## post process results for compare
input_details, output_details = tflite_preprocessor.get_model_details()
mlir_results = list(mlir_results)
for i in range(len(output_details)):
dtype = output_details[i]["dtype"]
mlir_results[i] = mlir_results[i].astype(dtype)
tflite_results = tflite_preprocessor.get_golden_output()
compare_results(mlir_results, tflite_results, output_details)
# Case2: Use manually set inputs
input_details, output_details = tflite_preprocessor.get_model_details()
inputs = generate_inputs(input_details) # new inputs
amdshark_module = AMDSharkInference(
mlir_module=mlir_model,
function_name=func_name,
device=self.device,
mlir_dialect="tflite",
)
amdshark_module.compile()
mlir_results = amdshark_module.forward(inputs)
## post process results for compare
tflite_results = tflite_preprocessor.get_golden_output()
compare_results(mlir_results, tflite_results, output_details)
# print(mlir_results)
# A specific case can be run by commenting different cases. Runs all the test
# across cpu, gpu and vulkan according to available drivers.
pytest_param = pytest.mark.parametrize(
("dynamic", "device"),
[
pytest.param(False, "cpu"),
# TODO: Language models are failing for dynamic case..
pytest.param(True, "cpu", marks=pytest.mark.skip),
],
)
@pytest_param
@pytest.mark.xfail(
sys.platform == "darwin", reason="known macos tflite install issue"
)
def test_albert(dynamic, device):
module_tester = AlbertTfliteModuleTester(dynamic=dynamic, device=device)
module_tester.create_and_check_module()
if __name__ == "__main__":
test_albert(False, "cpu")

View File

@@ -1,31 +0,0 @@
# Copyright 2022 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import subprocess
import sys
import importlib.util
def test_stress_test():
subprocess.check_call(
[
sys.executable,
importlib.util.find_spec("amdshark.stress_test").origin,
"--model=squeezenet1_0",
"--devices",
"cpu",
"--max-iterations=1",
]
)

View File

@@ -1,62 +0,0 @@
import unittest
from unittest.mock import mock_open, patch
from apps.stable_diffusion.web.ui.txt2img_ui import (
export_settings,
load_settings,
all_gradio_labels,
)
class TestExportSettings(unittest.TestCase):
@patch("builtins.open", new_callable=mock_open)
@patch("json.dump")
def test_export_settings(self, mock_json_dump, mock_file):
test_values = ["value1", "value2", "value3"]
expected_output = {
"txt2img": {
label: value
for label, value in zip(all_gradio_labels, test_values)
}
}
export_settings(*test_values)
mock_file.assert_called_once_with("./ui/settings.json", "w")
mock_json_dump.assert_called_once_with(
expected_output, mock_file(), indent=4
)
@patch("apps.stable_diffusion.web.ui.txt2img_ui.json.load")
@patch(
"builtins.open",
new_callable=mock_open,
read_data='{"txt2img": {"some_setting": "some_value"}}',
)
def test_load_settings_file_exists(self, mock_file, mock_json_load):
mock_json_load.return_value = {
"txt2img": {
"txt2img_custom_model": "custom_model_value",
"custom_vae": "custom_vae_value",
}
}
settings = load_settings()
self.assertEqual(settings[0], "custom_model_value")
self.assertEqual(settings[1], "custom_vae_value")
@patch("apps.stable_diffusion.web.ui.txt2img_ui.json.load")
@patch("builtins.open", side_effect=FileNotFoundError)
def test_load_settings_file_not_found(self, mock_file, mock_json_load):
settings = load_settings()
default_lora_weights = "None"
self.assertEqual(settings[4], default_lora_weights)
@patch("apps.stable_diffusion.web.ui.txt2img_ui.json.load")
@patch("builtins.open", new_callable=mock_open, read_data="{}")
def test_load_settings_key_error(self, mock_file, mock_json_load):
mock_json_load.return_value = {}
settings = load_settings()
default_lora_weights = "None"
self.assertEqual(settings[4], default_lora_weights)

View File

@@ -1,208 +0,0 @@
import tensorflow as tf
import numpy as np
import os
import csv
import urllib.request
class TFLiteModelUtil:
def __init__(self, raw_model_file):
self.raw_model_file = str(raw_model_file)
self.tflite_interpreter = None
self.input_details = None
self.output_details = None
self.inputs = []
def setup_tflite_interpreter(self):
self.tflite_interpreter = tf.lite.Interpreter(
model_path=self.raw_model_file
)
self.tflite_interpreter.allocate_tensors()
# default input initialization
return self.get_model_details()
def get_model_details(self):
print("Get tflite input output details")
self.input_details = self.tflite_interpreter.get_input_details()
self.output_details = self.tflite_interpreter.get_output_details()
return self.input_details, self.output_details
def invoke_tflite(self, inputs):
self.inputs = inputs
print("invoke_tflite")
for i, input in enumerate(self.inputs):
self.tflite_interpreter.set_tensor(
self.input_details[i]["index"], input
)
self.tflite_interpreter.invoke()
# post process tflite_result for compare with mlir_result,
# for tflite the output is a list of numpy.tensor
tflite_results = []
for output_detail in self.output_details:
tflite_results.append(
np.array(
self.tflite_interpreter.get_tensor(output_detail["index"])
)
)
for i in range(len(self.output_details)):
# print("output_details ", i, "shape", self.output_details[i]["shape"].__name__,
# ", dtype: ", self.output_details[i]["dtype"].__name__)
out_dtype = self.output_details[i]["dtype"]
tflite_results[i] = tflite_results[i].astype(out_dtype)
return tflite_results
class TFLitePreprocessor:
def __init__(
self,
model_name,
input_details=None,
output_details=None,
model_path=None,
):
self.model_name = model_name
self.input_details = (
input_details # used for tflite, optional for tf/pytorch
)
self.output_details = (
output_details # used for tflite, optional for tf/pytorch
)
self.inputs = []
self.model_path = model_path # url to download the model
self.raw_model_file = (
None # local address for raw tf/tflite/pytorch model
)
self.mlir_file = (
None # local address for .mlir file of tf/tflite/pytorch model
)
self.mlir_model = None # read of .mlir file
self.output_tensor = (
None # the raw tf/pytorch/tflite_output_tensor, not mlir_tensor
)
self.interpreter = (
None # could be tflite/tf/torch_interpreter in utils
)
self.input_file = None
self.output_file = None
# create tmp model file directory
if self.model_path is None and self.model_name is None:
print(
"Error. No model_path, No model name,Please input either one."
)
return
print("Setting up for TMP_WORK_DIR")
self.workdir = os.path.join(
os.path.dirname(__file__), "./../gen_amdshark_tank"
)
os.makedirs(self.workdir, exist_ok=True)
print(f"TMP_WORK_DIR = {self.workdir}")
# compile and run tfhub tflite
load_model_success = self.load_tflite_model()
if not load_model_success:
print("Error, load tflite model fail")
return
if (self.input_details is None) or (self.output_details is None):
# print("Setting up tflite interpreter to get model input details")
self.setup_interpreter()
inputs = self.generate_inputs(self.input_details) # device_inputs
self.setup_inputs(inputs)
def load_tflite_model(self):
# use model name get dir.
tflite_model_name_dir = os.path.join(
self.workdir, str(self.model_name)
)
os.makedirs(tflite_model_name_dir, exist_ok=True)
print(f"TMP_TFLITE_MODELNAME_DIR = {tflite_model_name_dir}")
self.raw_model_file = "/".join(
[tflite_model_name_dir, str(self.model_name) + "_tflite.tflite"]
)
self.mlir_file = "/".join(
[tflite_model_name_dir, str(self.model_name) + "_tflite.mlir"]
)
self.input_file = "/".join([tflite_model_name_dir, "inputs"])
self.output_file = "/".join([tflite_model_name_dir, "golden_out"])
# np.save("/".join([tflite_model_name_dir, "function_name"]), np.array("main"))
if os.path.exists(self.raw_model_file):
print(
"Local address for .tflite model file Exists: ",
self.raw_model_file,
)
else:
print("No local tflite file, Download tflite model")
if self.model_path is None:
# get model file from tflite_model_list.csv or download from gs://bucket
print("No model_path, get from tflite_model_list.csv")
tflite_model_list_path = os.path.join(
os.path.dirname(__file__),
"../tank/tflite/tflite_model_list.csv",
)
tflite_model_list = csv.reader(open(tflite_model_list_path))
for row in tflite_model_list:
if str(row[0]) == str(self.model_name):
self.model_path = row[1]
print("tflite_model_name", str(row[0]))
print("tflite_model_link", self.model_path)
if self.model_path is None:
print("Error, No model path find in tflite_model_list.csv")
return False
urllib.request.urlretrieve(self.model_path, self.raw_model_file)
return True
def setup_interpreter(self):
self.interpreter = TFLiteModelUtil(self.raw_model_file)
(
self.input_details,
self.output_details,
) = self.interpreter.setup_tflite_interpreter()
def generate_inputs(self, input_details):
self.inputs = []
for tmp_input in input_details:
print(
"input_details shape:",
str(tmp_input["shape"]),
" type:",
tmp_input["dtype"].__name__,
)
self.inputs.append(
np.ones(shape=tmp_input["shape"], dtype=tmp_input["dtype"])
)
return self.inputs
def setup_inputs(self, inputs):
# print("Setting up inputs")
self.inputs = inputs
def get_mlir_model(self):
return self.mlir_model
def get_mlir_file(self):
return self.mlir_file
def get_inputs(self):
return self.inputs
def get_golden_output(self):
self.output_tensor = self.interpreter.invoke_tflite(self.inputs)
np.savez(self.output_file, *self.output_tensor)
return self.output_tensor
def get_model_details(self):
return self.input_details, self.output_details
def get_raw_model_file(self):
return self.raw_model_file
def get_interpreter(self):
return self.interpreter

View File

@@ -1,220 +0,0 @@
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Also available under a BSD-style license. See LICENSE.
import contextlib
import re
import traceback
import warnings
from typing import Any
import numpy as np
import torch
from torch.utils._pytree import tree_map
from torch_mlir.eager_mode.ir_building import build_mlir_module
from torch_mlir.eager_mode.torch_mlir_dispatch import (
UnsupportedByTorchMlirEagerMode,
normalize_args_kwargs,
check_get_aliased_arg,
)
from torch_mlir.eager_mode import EAGER_MODE_DEBUG
from torch_mlir.eager_mode.torch_mlir_tensor import (
TorchMLIRTensor,
check_requires_grad,
make_wrapper_subclass_from_torch_tensor,
make_bare_wrapper_subclass,
UNSUPPORTED_OPS,
no_dispatch,
)
from torch_mlir.eager_mode import torch_mlir_tensor
from amdshark.iree_eager_backend import EagerModeIREELinalgOnTensorsBackend
backend = EagerModeIREELinalgOnTensorsBackend("cpu")
torch_mlir_tensor.backend = backend
rtol = 1e-04
atol = 1e-05
class TorchMLIRLockstepTensor(TorchMLIRTensor):
"""This class overrides the dispatching for TorchMLIRTensor to allow for an op-by-op numerical comparison between PyTorch and the Torch-MLIR -> IREE backend compilation pipeline. This only supports the IREE backend and focuses on op-by-op level verification.
TODO: Extend this to do a cumulative trace with summary statistics at the end. Possibly requires a wrapper environment to store full trace info.
"""
def __new__(cls, elem, **kwargs):
if kwargs.get("constructing_from_device_tensor", False):
tensor_meta_data = backend.get_torch_metadata(elem, kwargs)
r = make_bare_wrapper_subclass(
cls=cls,
size=tensor_meta_data.size,
strides=tensor_meta_data.strides,
storage_offset=tensor_meta_data.storage_offset,
dtype=tensor_meta_data.dtype,
layout=tensor_meta_data.layout,
device=tensor_meta_data.device,
requires_grad=tensor_meta_data.requires_grad,
)
r.elem = elem
elif isinstance(elem, torch.nn.Parameter):
r = make_wrapper_subclass_from_torch_tensor(
cls, elem.data, **kwargs
)
# This is a hack to handle non-contiguous data through IREE-backend
nt = elem.detach().data.numpy()
if not nt.flags["C_CONTIGUOUS"]:
nt = np.ascontiguousarray(nt, dtype=nt.dtype)
r.elem = backend.transfer_from_torch_to_device(
torch.from_numpy(nt)
)
elif isinstance(elem, torch.Tensor):
r = make_wrapper_subclass_from_torch_tensor(cls, elem, **kwargs)
# Ditto TODO: Find a better way to handle this
nt = elem.numpy()
if not nt.flags["C_CONTIGUOUS"]:
nt = np.ascontiguousarray(nt, dtype=nt.dtype)
r.elem = backend.transfer_from_torch_to_device(
torch.from_numpy(nt)
)
# This branch handles the case when a python scalar is passed to some op
# or is returned from some aten op, such as _local_scalar_dense.
elif isinstance(elem, (int, float, bool)):
return elem
else:
raise ValueError(f"Unknown element type: {type(elem)}")
return r
def __repr__(self):
if self.grad_fn:
return f"TorchMLIRLockstepTensor({self.elem}, backend={backend.__class__.__name__}, grad_fn={self.grad_fn})"
else:
return f"TorchMLIRLockstepTensor({self.elem}, backend={backend.__class__.__name__})"
"""This does essentially the same dispatch as TorchMLIRTensor but operates as if debug mode is enabled. The numeric verification happens after the Torch-MLIR result is obtained by comparing against the
"""
@classmethod
def __torch_dispatch__(cls, func, _types, args=(), kwargs=None):
requires_grad = check_requires_grad(*args, **kwargs)
try:
with no_dispatch():
if hasattr(func, "op_name"):
op_name = func.op_name
elif hasattr(func, "__name__"):
# Handle builtin_function_or_method.
op_name = func.__name__
else:
raise RuntimeError(f"op {func} has no name")
if UNSUPPORTED_OPS.match(op_name):
raise UnsupportedByTorchMlirEagerMode(op_name)
if not hasattr(func, "_schema"):
raise RuntimeError(f"op {func} has no schema.")
normalized_kwargs = normalize_args_kwargs(func, args, kwargs)
if "layout" in normalized_kwargs and normalized_kwargs[
"layout"
] not in {0, None}:
raise UnsupportedByTorchMlirEagerMode(
f"{normalized_kwargs['layout']} layout not supported."
)
if "memory_format" in normalized_kwargs and normalized_kwargs[
"memory_format"
] not in {0, None}:
raise UnsupportedByTorchMlirEagerMode(
f"{normalized_kwargs['memory_format']} memory format not supported."
)
eager_module = build_mlir_module(func, normalized_kwargs)
device_tensor_args = [
kwarg.elem
for _, kwarg in normalized_kwargs.items()
if isinstance(kwarg, cls)
]
assert len(eager_module.body.operations[0].arguments) == len(
device_tensor_args
), "Number of parameters and number of arguments differs."
op_mlir_backend_callable = backend.compile(eager_module)
out = op_mlir_backend_callable(*device_tensor_args)
out = tree_map(
lambda x: cls(
x,
requires_grad=requires_grad,
constructing_from_device_tensor=True,
),
out,
)
# Numeric verification; Value for comparison comes from PyTorch eager
with no_dispatch():
unwrapped_args = tree_map(cls.unwrap, args)
unwrapped_kwargs = tree_map(cls.unwrap, kwargs)
if "_reshape_alias" in op_name:
native_out = torch.ops.aten.view(
unwrapped_args[0], unwrapped_args[1]
)
else:
native_out = func(*unwrapped_args, **unwrapped_kwargs)
native_out = tree_map(
lambda x: cls(x, requires_grad=requires_grad), native_out
).elem
tmp_out = out.elem
try:
np.testing.assert_allclose(
native_out.to_host(),
tmp_out.to_host(),
rtol=rtol,
atol=atol,
)
except Exception as e:
shaped_args = [
arg.shape if torch.is_tensor(arg) else arg
for arg in unwrapped_args
]
shaped_kwargs = [
kwarg.shape if torch.is_tensor(kwarg) else kwarg
for kwarg in unwrapped_kwargs
]
warnings.warn(
f"Lockstep accuracy verification failed with error: *{str(e)}*; "
f"Dispatched function name: *{str(func)}*; "
f"Dispatched function args: *{str(shaped_args)}*; "
f"Dispatched function kwargs: *{str(shaped_kwargs)}*; "
)
except Exception as e:
warnings.warn(traceback.format_exc())
if isinstance(e, UnsupportedByTorchMlirEagerMode):
warnings.warn(
f"Couldn't use TorchMLIR eager because current incompatibility: *{str(e)}*; running through PyTorch eager."
)
else:
warnings.warn(
f"Couldn't use TorchMLIR eager because of error: *{str(e)}*; "
f"Running through PyTorch eager"
)
with no_dispatch():
unwrapped_args = tree_map(cls.unwrap, args)
unwrapped_kwargs = tree_map(cls.unwrap, kwargs)
if "_reshape_alias" in op_name:
out = torch.ops.aten.view(
unwrapped_args[0], unwrapped_args[1]
)
else:
out = func(*unwrapped_args, **unwrapped_kwargs)
out = tree_map(lambda x: cls(x, requires_grad=requires_grad), out)
maybe_aliased_arg_name = check_get_aliased_arg(func)
if maybe_aliased_arg_name is not None:
warnings.warn(
f"Found aliased arg, but didn't copy tensor contents. This could lead to incorrect results for E2E model execution but doesn't affect the validity of the lockstep op verification."
)
# TODO: Find a way to handle argument aliasing for IREE backend
# backend.copy_into(normalized_kwargs[maybe_aliased_arg_name].elem, out.elem)
return out

View File

@@ -1,90 +0,0 @@
# Copyright 2020 The Nod Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from torch_mlir.ir import StringAttr
import torch_mlir
from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
import tempfile
from amdshark.parser import amdshark_args
import io
mlir_type_mapping_dict = {
"linalg": torch_mlir.OutputType.LINALG_ON_TENSORS,
"stablehlo": torch_mlir.OutputType.STABLEHLO,
"tosa": torch_mlir.OutputType.TOSA,
}
def get_module_name_for_asm_dump(module):
"""Gets a name suitable for an assembly dump.
The name is not guaranteed to be unique.
"""
if not "torch.debug_module_name" in module.operation.attributes:
return "UnnammedModule"
return StringAttr(
module.operation.attributes["torch.debug_module_name"]
).value
def run_on_refbackend(torch_module, inputs):
backend = refbackend.RefBackendLinalgOnTensorsBackend()
compiled = backend.compile(torch_module)
jit_module = backend.load(compiled)
np_inputs = [x.numpy() for x in inputs]
return jit_module.forward(np_inputs[0])
# Creates dynamic dims for all dims.
# TODO: Pass user specified dynamic dims.
def create_dynamic_placeholders(inputs):
placeholders = []
for inp in inputs:
placeholder = torch_mlir.TensorPlaceholder.like(
inp, dynamic_axes=[i for i in range(len(inp.shape))]
)
placeholders.append(placeholder)
return tuple(placeholders)
def get_torch_mlir_module(
module,
input: tuple,
dynamic: bool,
jit_trace: bool,
return_str: bool = False,
mlir_type: str = "linalg",
):
"""Get the MLIR's linalg-on-tensors module from the torchscipt module."""
ignore_traced_shapes = False
if dynamic:
input = create_dynamic_placeholders(input)
if jit_trace:
ignore_traced_shapes = True
tempfile.tempdir = "."
mlir_module = torch_mlir.compile(
module,
input,
output_type=mlir_type_mapping_dict[mlir_type],
use_tracing=jit_trace,
ignore_traced_shapes=ignore_traced_shapes,
)
if return_str:
return mlir_module.operation.get_asm()
bytecode_stream = io.BytesIO()
mlir_module.operation.write_bytecode(bytecode_stream)
bytecode = bytecode_stream.getvalue()
return bytecode

View File

@@ -1,389 +0,0 @@
import numpy as np
import json
from random import (
randint,
seed as seed_random,
getstate as random_getstate,
setstate as random_setstate,
)
from pathlib import Path
from apps.amdshark_studio.modules.shared_cmd_opts import cmd_opts
from cpuinfo import get_cpu_info
# TODO: migrate these utils to studio
from amdshark.iree_utils.vulkan_utils import (
set_iree_vulkan_runtime_flags,
get_vulkan_target_triple,
get_iree_vulkan_runtime_flags,
)
def get_available_devices():
def get_devices_by_name(driver_name):
from amdshark.iree_utils._common import iree_device_map
device_list = []
try:
driver_name = iree_device_map(driver_name)
device_list_dict = get_all_devices(driver_name)
print(f"{driver_name} devices are available.")
except:
print(f"{driver_name} devices are not available.")
else:
cpu_name = get_cpu_info()["brand_raw"]
for i, device in enumerate(device_list_dict):
device_name = (
cpu_name if device["name"] == "default" else device["name"]
)
if "local" in driver_name:
device_list.append(
f"{device_name} => {driver_name.replace('local', 'cpu')}"
)
else:
# for drivers with single devices
# let the default device be selected without any indexing
if len(device_list_dict) == 1:
device_list.append(f"{device_name} => {driver_name}")
else:
device_list.append(f"{device_name} => {driver_name}://{i}")
return device_list
set_iree_runtime_flags()
available_devices = []
rocm_devices = get_devices_by_name("rocm")
available_devices.extend(rocm_devices)
cpu_device = get_devices_by_name("cpu-sync")
available_devices.extend(cpu_device)
cpu_device = get_devices_by_name("cpu-task")
available_devices.extend(cpu_device)
from amdshark.iree_utils.vulkan_utils import (
get_all_vulkan_devices,
)
vulkaninfo_list = get_all_vulkan_devices()
vulkan_devices = []
id = 0
for device in vulkaninfo_list:
vulkan_devices.append(f"{device.strip()} => vulkan://{id}")
id += 1
if id != 0:
print(f"vulkan devices are available.")
available_devices.extend(vulkan_devices)
metal_devices = get_devices_by_name("metal")
available_devices.extend(metal_devices)
cuda_devices = get_devices_by_name("cuda")
available_devices.extend(cuda_devices)
hip_devices = get_devices_by_name("hip")
available_devices.extend(hip_devices)
for idx, device_str in enumerate(available_devices):
if "AMD Radeon(TM) Graphics =>" in device_str:
igpu_id_candidates = [
x.split("w/")[-1].split("=>")[0]
for x in available_devices
if "M Graphics" in x
]
for igpu_name in igpu_id_candidates:
if igpu_name:
available_devices[idx] = device_str.replace(
"AMD Radeon(TM) Graphics", igpu_name
)
break
return available_devices
def set_init_device_flags():
if "vulkan" in cmd_opts.device:
# set runtime flags for vulkan.
set_iree_runtime_flags()
# set triple flag to avoid multiple calls to get_vulkan_triple_flag
device_name, cmd_opts.device = map_device_to_name_path(cmd_opts.device)
if not cmd_opts.iree_vulkan_target_triple:
triple = get_vulkan_target_triple(device_name)
if triple is not None:
cmd_opts.iree_vulkan_target_triple = triple
print(
f"Found device {device_name}. Using target triple "
f"{cmd_opts.iree_vulkan_target_triple}."
)
elif "cuda" in cmd_opts.device:
cmd_opts.device = "cuda"
elif "metal" in cmd_opts.device:
device_name, cmd_opts.device = map_device_to_name_path(cmd_opts.device)
if not cmd_opts.iree_metal_target_platform:
from amdshark.iree_utils.metal_utils import get_metal_target_triple
triple = get_metal_target_triple(device_name)
if triple is not None:
cmd_opts.iree_metal_target_platform = triple.split("-")[-1]
print(
f"Found device {device_name}. Using target triple "
f"{cmd_opts.iree_metal_target_platform}."
)
elif "cpu" in cmd_opts.device:
cmd_opts.device = "cpu"
def set_iree_runtime_flags():
# TODO: This function should be device-agnostic and piped properly
# to general runtime driver init.
vulkan_runtime_flags = get_iree_vulkan_runtime_flags()
if cmd_opts.enable_rgp:
vulkan_runtime_flags += [
f"--enable_rgp=true",
f"--vulkan_debug_utils=true",
]
if cmd_opts.device_allocator_heap_key:
vulkan_runtime_flags += [
f"--device_allocator=caching:device_local={cmd_opts.device_allocator_heap_key}",
]
set_iree_vulkan_runtime_flags(flags=vulkan_runtime_flags)
def parse_device(device_str, target_override=""):
from amdshark.iree_utils.compile_utils import (
clean_device_info,
get_iree_target_triple,
iree_target_map,
)
rt_driver, device_id = clean_device_info(device_str)
target_backend = iree_target_map(rt_driver)
if device_id:
rt_device = f"{rt_driver}://{device_id}"
else:
rt_device = rt_driver
if target_override:
return target_backend, rt_device, target_override
match target_backend:
case "vulkan-spirv":
triple = get_iree_target_triple(device_str)
return target_backend, rt_device, triple
case "rocm":
triple = get_rocm_target_chip(device_str)
return target_backend, rt_device, triple
case "llvm-cpu":
return "llvm-cpu", "local-task", "x86_64-linux-gnu"
def get_rocm_target_chip(device_str):
# TODO: Use a data file to map device_str to target chip.
rocm_chip_map = {
"6700": "gfx1031",
"6800": "gfx1030",
"6900": "gfx1030",
"7900": "gfx1100",
"MI300X": "gfx942",
"MI300A": "gfx940",
"MI210": "gfx90a",
"MI250": "gfx90a",
"MI100": "gfx908",
"MI50": "gfx906",
"MI60": "gfx906",
"780M": "gfx1103",
}
for key in rocm_chip_map:
if key in device_str:
return rocm_chip_map[key]
raise AssertionError(
f"Device {device_str} not recognized. Please file an issue at https://github.com/nod-ai/AMD-SHARK-Studio/issues."
)
def get_all_devices(driver_name):
"""
Inputs: driver_name
Returns a list of all the available devices for a given driver sorted by
the iree path names of the device as in --list_devices option in iree.
"""
from iree.runtime import get_driver
driver = get_driver(driver_name)
device_list_src = driver.query_available_devices()
device_list_src.sort(key=lambda d: d["path"])
return device_list_src
def get_device_mapping(driver, key_combination=3):
"""This method ensures consistent device ordering when choosing
specific devices for execution
Args:
driver (str): execution driver (vulkan, cuda, rocm, etc)
key_combination (int, optional): choice for mapping value for
device name.
1 : path
2 : name
3 : (name, path)
Defaults to 3.
Returns:
dict: map to possible device names user can input mapped to desired
combination of name/path.
"""
from amdshark.iree_utils._common import iree_device_map
driver = iree_device_map(driver)
device_list = get_all_devices(driver)
device_map = dict()
def get_output_value(dev_dict):
if key_combination == 1:
return f"{driver}://{dev_dict['path']}"
if key_combination == 2:
return dev_dict["name"]
if key_combination == 3:
return dev_dict["name"], f"{driver}://{dev_dict['path']}"
# mapping driver name to default device (driver://0)
device_map[f"{driver}"] = get_output_value(device_list[0])
for i, device in enumerate(device_list):
# mapping with index
device_map[f"{driver}://{i}"] = get_output_value(device)
# mapping with full path
device_map[f"{driver}://{device['path']}"] = get_output_value(device)
return device_map
def get_opt_flags(model, precision="fp16"):
iree_flags = []
if len(cmd_opts.iree_vulkan_target_triple) > 0:
iree_flags.append(
f"-iree-vulkan-target-triple={cmd_opts.iree_vulkan_target_triple}"
)
if "rocm" in cmd_opts.device:
from amdshark.iree_utils.gpu_utils import get_iree_rocm_args
rocm_args = get_iree_rocm_args()
iree_flags.extend(rocm_args)
if cmd_opts.iree_constant_folding == False:
iree_flags.append("--iree-opt-const-expr-hoisting=False")
iree_flags.append(
"--iree-codegen-linalg-max-constant-fold-elements=9223372036854775807"
)
if cmd_opts.data_tiling == False:
iree_flags.append("--iree-opt-data-tiling=False")
if "vae" not in model:
# Due to lack of support for multi-reduce, we always collapse reduction
# dims before dispatch formation right now.
iree_flags += ["--iree-flow-collapse-reduction-dims"]
return iree_flags
def map_device_to_name_path(device, key_combination=3):
"""Gives the appropriate device data (supported name/path) for user
selected execution device
Args:
device (str): user
key_combination (int, optional): choice for mapping value for
device name.
1 : path
2 : name
3 : (name, path)
Defaults to 3.
Raises:
ValueError:
Returns:
str / tuple: returns the mapping str or tuple of mapping str for
the device depending on key_combination value
"""
driver = device.split("://")[0]
device_map = get_device_mapping(driver, key_combination)
try:
device_mapping = device_map[device]
except KeyError:
raise ValueError(f"Device '{device}' is not a valid device.")
return device_mapping
def get_devices_by_name(driver_name):
from amdshark.iree_utils._common import iree_device_map
device_list = []
try:
driver_name = iree_device_map(driver_name)
device_list_dict = get_all_devices(driver_name)
print(f"{driver_name} devices are available.")
except:
print(f"{driver_name} devices are not available.")
else:
cpu_name = get_cpu_info()["brand_raw"]
for i, device in enumerate(device_list_dict):
device_name = (
cpu_name if device["name"] == "default" else device["name"]
)
if "local" in driver_name:
device_list.append(
f"{device_name} => {driver_name.replace('local', 'cpu')}"
)
else:
# for drivers with single devices
# let the default device be selected without any indexing
if len(device_list_dict) == 1:
device_list.append(f"{device_name} => {driver_name}")
else:
device_list.append(f"{device_name} => {driver_name}://{i}")
return device_list
set_iree_runtime_flags()
available_devices = []
from amdshark.iree_utils.vulkan_utils import (
get_all_vulkan_devices,
)
vulkaninfo_list = get_all_vulkan_devices()
vulkan_devices = []
id = 0
for device in vulkaninfo_list:
vulkan_devices.append(f"{device.strip()} => vulkan://{id}")
id += 1
if id != 0:
print(f"vulkan devices are available.")
available_devices.extend(vulkan_devices)
metal_devices = get_devices_by_name("metal")
available_devices.extend(metal_devices)
cuda_devices = get_devices_by_name("cuda")
available_devices.extend(cuda_devices)
rocm_devices = get_devices_by_name("rocm")
available_devices.extend(rocm_devices)
cpu_device = get_devices_by_name("cpu-sync")
available_devices.extend(cpu_device)
cpu_device = get_devices_by_name("cpu-task")
available_devices.extend(cpu_device)
return available_devices
# Generate and return a new seed if the provided one is not in the
# supported range (including -1)
def sanitize_seed(seed: int | str):
seed = int(seed)
uint32_info = np.iinfo(np.uint32)
uint32_min, uint32_max = uint32_info.min, uint32_info.max
if seed < uint32_min or seed >= uint32_max:
seed = randint(uint32_min, uint32_max)
return seed
# take a seed expression in an input format and convert it to
# a list of integers, where possible
def parse_seed_input(seed_input: str | list | int):
if isinstance(seed_input, str):
try:
seed_input = json.loads(seed_input)
except (ValueError, TypeError):
seed_input = None
if isinstance(seed_input, int):
return [seed_input]
if isinstance(seed_input, list) and all(type(seed) is int for seed in seed_input):
return seed_input
raise TypeError(
"Seed input must be an integer or an array of integers in JSON format"
)

View File

@@ -1 +0,0 @@

View File

@@ -2,7 +2,7 @@
import os
import PIL
import numpy as np
from apps.amdshark_studio.web.utils.file_utils import (
from apps.shark_studio.web.utils.file_utils import (
get_generated_imgs_path,
)
from datetime import datetime

View File

@@ -6,13 +6,13 @@ import warnings
import json
from threading import Thread
from apps.amdshark_studio.modules.timer import startup_timer
from apps.shark_studio.modules.timer import startup_timer
from apps.amdshark_studio.web.utils.tmp_configs import (
from apps.shark_studio.web.utils.tmp_configs import (
config_tmp,
clear_tmp_mlir,
clear_tmp_imgs,
amdshark_tmp,
shark_tmp,
)
@@ -25,17 +25,25 @@ def imports():
)
warnings.filterwarnings(action="ignore", category=UserWarning, module="torchvision")
warnings.filterwarnings(action="ignore", category=UserWarning, module="torch")
warnings.filterwarnings(action="ignore", category=UserWarning, module="diffusers")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="diffusers")
warnings.filterwarnings(
action="ignore", category=FutureWarning, module="huggingface-hub"
)
warnings.filterwarnings(
action="ignore", category=UserWarning, module="huggingface-hub"
)
import gradio # noqa: F401
# import gradio # noqa: F401
startup_timer.record("import gradio")
# startup_timer.record("import gradio")
import apps.amdshark_studio.web.utils.globals as global_obj
import apps.shark_studio.web.utils.globals as global_obj
global_obj._init()
startup_timer.record("initialize globals")
from apps.amdshark_studio.modules import (
from apps.shark_studio.modules import (
img_processing,
) # noqa: F401
@@ -44,23 +52,20 @@ def imports():
def initialize():
configure_sigint_handler()
# Setup to use amdshark_tmp for gradio's temporary image files and clear any
# Setup to use shark_tmp for gradio's temporary image files and clear any
# existing temporary images there if they exist. Then we can import gradio.
# It has to be in this order or gradio ignores what we've set up.
config_tmp()
# clear_tmp_mlir()
clear_tmp_imgs()
# config_tmp()
# clear_tmp_imgs()
from apps.amdshark_studio.web.utils.file_utils import (
from apps.shark_studio.web.utils.file_utils import (
create_model_folders,
)
# Create custom models folders if they don't exist
create_model_folders()
import gradio as gr
# initialize_rest(reload_script_modules=False)
@@ -83,7 +88,7 @@ def dumpstacks():
code.append(f"""File: "{filename}", line {lineno}, in {name}""")
if line:
code.append(" " + line.strip())
with open(os.path.join(amdshark_tmp, "stack_dump.log"), "w") as f:
with open(os.path.join(shark_tmp, "stack_dump.log"), "w") as f:
f.write("\n".join(code))
@@ -100,7 +105,7 @@ def setup_middleware(app):
def configure_cors_middleware(app):
from starlette.middleware.cors import CORSMiddleware
from apps.amdshark_studio.modules.shared_cmd_opts import cmd_opts
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
cors_options = {
"allow_methods": ["*"],

View File

@@ -2,13 +2,13 @@ from turbine_models.custom_models import stateless_llama
from turbine_models.model_runner import vmfbRunner
from turbine_models.gen_external_params.gen_external_params import gen_external_params
import time
from amdshark.iree_utils.compile_utils import compile_module_to_flatbuffer
from apps.amdshark_studio.web.utils.file_utils import (
from shark.iree_utils.compile_utils import compile_module_to_flatbuffer
from apps.shark_studio.web.utils.file_utils import (
get_resource_path,
get_checkpoints_path,
)
from apps.amdshark_studio.modules.shared_cmd_opts import cmd_opts
from apps.amdshark_studio.api.utils import parse_device
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
from apps.shark_studio.api.utils import parse_device
from urllib.request import urlopen
import iree.runtime as ireert
from itertools import chain
@@ -366,7 +366,7 @@ def get_mfma_spec_path(target_chip, save_dir):
def llm_chat_api(InputData: dict):
from datetime import datetime as dt
import apps.amdshark_studio.web.utils.globals as global_obj
import apps.shark_studio.web.utils.globals as global_obj
print(f"Input keys : {InputData.keys()}")

View File

@@ -12,29 +12,23 @@ from tqdm.auto import tqdm
from pathlib import Path
from random import randint
from turbine_models.custom_models.sd_inference.sd_pipeline import AMDSharkSDPipeline
from turbine_models.custom_models.sdxl_inference.sdxl_compiled_pipeline import (
AMDSharkSDXLPipeline,
)
from apps.amdshark_studio.api.controlnet import control_adapter_map
from apps.amdshark_studio.api.utils import parse_device
from apps.amdshark_studio.web.utils.state import status_label
from apps.amdshark_studio.web.utils.file_utils import (
from apps.shark_studio.api.controlnet import control_adapter_map
from apps.shark_studio.api.utils import parse_device
from apps.shark_studio.web.utils.state import status_label
from apps.shark_studio.web.utils.file_utils import (
safe_name,
get_resource_path,
get_checkpoints_path,
)
from apps.amdshark_studio.modules.img_processing import (
from apps.shark_studio.modules.img_processing import (
save_output_img,
)
from apps.amdshark_studio.modules.ckpt_processing import (
preprocessCKPT,
save_irpa,
)
from subprocess import check_output
EMPTY_SD_MAP = {
"clip": None,
@@ -67,7 +61,6 @@ def load_script(source, module_name):
:param module_name: name of module to register in sys.modules
:return: loaded module
"""
spec = importlib.util.spec_from_file_location(module_name, source)
module = importlib.util.module_from_spec(spec)
sys.modules[module_name] = module
@@ -99,7 +92,10 @@ class StableDiffusion:
import_ir: bool = True,
is_controlled: bool = False,
external_weights: str = "safetensors",
progress=gr.Progress(),
):
progress(0, desc="Initializing pipeline...")
self.ui_device = device
self.precision = precision
self.compiled_pipeline = False
self.base_model_id = base_model_id
@@ -112,12 +108,23 @@ class StableDiffusion:
"custom_pipeline",
)
self.turbine_pipe = custom_module.StudioPipeline
self.dynamic_steps = False
self.model_map = custom_module.MODEL_MAP
elif self.is_sdxl:
self.turbine_pipe = AMDSharkSDXLPipeline
from turbine_models.custom_models.sdxl_inference.sdxl_compiled_pipeline import (
SharkSDXLPipeline,
)
self.turbine_pipe = SharkSDXLPipeline
self.dynamic_steps = False
self.model_map = EMPTY_SDXL_MAP
else:
self.turbine_pipe = AMDSharkSDPipeline
from turbine_models.custom_models.sd_inference.sd_pipeline import (
SharkSDPipeline,
)
self.turbine_pipe = SharkSDPipeline
self.dynamic_steps = True
self.model_map = EMPTY_SD_MAP
max_length = 64
target_backend, self.rt_device, triple = parse_device(device, target_triple)
@@ -158,7 +165,7 @@ class StableDiffusion:
external_weights = None
elif target_backend == "llvm-cpu":
decomp_attn = False
progress(0.5, desc="Initializing pipeline...")
self.sd_pipe = self.turbine_pipe(
hf_model_name=base_model_id,
scheduler_id=scheduler,
@@ -178,13 +185,20 @@ class StableDiffusion:
external_weights=external_weights,
custom_vae=custom_vae,
)
print(f"\n[LOG] Pipeline initialized with pipe_id: {self.pipe_id}.")
progress(1, desc="Pipeline initialized!...")
gc.collect()
def prepare_pipe(
self, custom_weights, adapters, embeddings, is_img2img, compiled_pipeline
self,
custom_weights,
adapters,
embeddings,
is_img2img,
compiled_pipeline,
progress=gr.Progress(),
):
print(f"\n[LOG] Preparing pipeline...")
progress(0, desc="Preparing models...")
self.is_img2img = False
mlirs = copy.deepcopy(self.model_map)
vmfbs = copy.deepcopy(self.model_map)
@@ -194,6 +208,11 @@ class StableDiffusion:
self.compiled_pipeline = compiled_pipeline
if custom_weights:
from apps.shark_studio.modules.ckpt_processing import (
preprocessCKPT,
save_irpa,
)
custom_weights = os.path.join(
get_checkpoints_path("checkpoints"),
safe_name(self.base_model_id.split("/")[-1]),
@@ -208,14 +227,20 @@ class StableDiffusion:
"diffusion_pytorch_model.safetensors",
)
weights[key] = save_irpa(unet_weights_path, "unet.")
elif key in ["clip", "prompt_encoder"]:
if not self.is_sdxl:
if key in ["mmdit"]:
mmdit_weights_path = os.path.join(
diffusers_weights_path,
"mmdit",
"diffusion_pytorch_model_fp16.safetensors",
)
weights[key] = save_irpa(mmdit_weights_path, "mmdit.")
elif key in ["clip", "prompt_encoder", "text_encoder"]:
if not self.is_sdxl and not self.is_custom:
sd1_path = os.path.join(
diffusers_weights_path, "text_encoder", "model.safetensors"
)
weights[key] = save_irpa(sd1_path, "text_encoder_model.")
else:
elif self.is_sdxl:
clip_1_path = os.path.join(
diffusers_weights_path, "text_encoder", "model.safetensors"
)
@@ -228,7 +253,27 @@ class StableDiffusion:
save_irpa(clip_1_path, "text_encoder_model_1."),
save_irpa(clip_2_path, "text_encoder_model_2."),
]
elif self.is_custom:
clip_g_path = os.path.join(
diffusers_weights_path,
"text_encoder",
"model.fp16.safetensors",
)
clip_l_path = os.path.join(
diffusers_weights_path,
"text_encoder_2",
"model.fp16.safetensors",
)
t5xxl_path = os.path.join(
diffusers_weights_path,
"text_encoder_3",
"model.fp16.safetensors",
)
weights[key] = [
save_irpa(clip_g_path, "clip_g.transformer."),
save_irpa(clip_l_path, "clip_l.transformer."),
save_irpa(t5xxl_path, "t5xxl.transformer."),
]
elif key in ["vae_decode"] and weights[key] is None:
vae_weights_path = os.path.join(
diffusers_weights_path,
@@ -237,16 +282,18 @@ class StableDiffusion:
)
weights[key] = save_irpa(vae_weights_path, "vae.")
progress(0.25, desc=f"Preparing pipeline for {self.ui_device}...")
vmfbs, weights = self.sd_pipe.check_prepared(
mlirs, vmfbs, weights, interactive=False
)
print(f"\n[LOG] Loading pipeline to device {self.rt_device}.")
progress(0.5, desc=f"Artifacts ready!")
progress(0.75, desc=f"Loading models and weights...")
self.sd_pipe.load_pipeline(
vmfbs, weights, self.rt_device, self.compiled_pipeline
)
print(
"\n[LOG] Pipeline successfully prepared for runtime. Generating images..."
)
progress(1, desc="Pipeline loaded! Generating images...")
return
def generate_images(
@@ -261,7 +308,9 @@ class StableDiffusion:
resample_type,
control_mode,
hints,
progress=gr.Progress(),
):
img = self.sd_pipe.generate_images(
prompt,
negative_prompt,
@@ -273,50 +322,7 @@ class StableDiffusion:
return img
def amdshark_sd_fn_dict_input(
sd_kwargs: dict,
):
print("\n[LOG] Submitting Request...")
for key in sd_kwargs:
if sd_kwargs[key] in [None, []]:
sd_kwargs[key] = None
if sd_kwargs[key] in ["None"]:
sd_kwargs[key] = ""
if key == "seed":
sd_kwargs[key] = int(sd_kwargs[key])
# TODO: move these checks into the UI code so we don't have gradio warnings in a generalized dict input function.
if not sd_kwargs["device"]:
gr.Warning("No device specified. Please specify a device.")
return None, ""
if sd_kwargs["height"] not in [512, 1024]:
gr.Warning("Height must be 512 or 1024. This is a temporary limitation.")
return None, ""
if sd_kwargs["height"] != sd_kwargs["width"]:
gr.Warning("Height and width must be the same. This is a temporary limitation.")
return None, ""
if sd_kwargs["base_model_id"] == "stabilityai/sdxl-turbo":
if sd_kwargs["steps"] > 10:
gr.Warning("Max steps for sdxl-turbo is 10. 1 to 4 steps are recommended.")
return None, ""
if sd_kwargs["guidance_scale"] > 3:
gr.Warning(
"sdxl-turbo CFG scale should be less than 2.0 if using negative prompt, 0 otherwise."
)
return None, ""
if sd_kwargs["target_triple"] == "":
if parse_device(sd_kwargs["device"], sd_kwargs["target_triple"])[2] == "":
gr.Warning(
"Target device architecture could not be inferred. Please specify a target triple, e.g. 'gfx1100' for a Radeon 7900xtx."
)
return None, ""
generated_imgs = yield from amdshark_sd_fn(**sd_kwargs)
return generated_imgs
def amdshark_sd_fn(
def shark_sd_fn(
prompt,
negative_prompt,
sd_init_image: list,
@@ -340,14 +346,17 @@ def amdshark_sd_fn(
resample_type: str,
controlnets: dict,
embeddings: dict,
seed_increment: str | int = 1,
output_type: str = "png",
# progress=gr.Progress(),
):
sd_kwargs = locals()
if not isinstance(sd_init_image, list):
sd_init_image = [sd_init_image]
is_img2img = True if sd_init_image[0] is not None else False
from apps.amdshark_studio.modules.shared_cmd_opts import cmd_opts
import apps.amdshark_studio.web.utils.globals as global_obj
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
import apps.shark_studio.web.utils.globals as global_obj
adapters = {}
is_controlled = False
@@ -413,6 +422,9 @@ def amdshark_sd_fn(
"control_mode": control_mode,
"hints": hints,
}
if global_obj.get_sd_obj() and global_obj.get_sd_obj().dynamic_steps:
submit_run_kwargs["steps"] = submit_pipe_kwargs["steps"]
submit_pipe_kwargs.pop("steps")
if (
not global_obj.get_sd_obj()
or global_obj.get_pipe_kwargs() != submit_pipe_kwargs
@@ -438,6 +450,12 @@ def amdshark_sd_fn(
global_obj.get_sd_obj().prepare_pipe(**submit_prep_kwargs)
generated_imgs = []
if submit_run_kwargs["seed"] in [-1, "-1"]:
submit_run_kwargs["seed"] = randint(0, 4294967295)
seed_increment = "random"
# print(f"\n[LOG] Random seed: {seed}")
# progress(None, desc=f"Generating...")
for current_batch in range(batch_count):
start_time = time.time()
out_imgs = global_obj.get_sd_obj().generate_images(**submit_run_kwargs)
@@ -450,23 +468,79 @@ def amdshark_sd_fn(
# break
# else:
for batch in range(batch_size):
save_output_img(
out_imgs[batch],
seed,
sd_kwargs,
)
if output_type == "png":
save_output_img(
out_imgs[batch],
seed,
sd_kwargs,
)
generated_imgs.extend(out_imgs)
# TODO: make seed changes over batch counts more configurable.
submit_run_kwargs["seed"] = submit_run_kwargs["seed"] + 1
yield generated_imgs, status_label(
"Stable Diffusion", current_batch + 1, batch_count, batch_size
)
if batch_count > 1:
submit_run_kwargs["seed"] = get_next_seed(seed, seed_increment)
return (generated_imgs, "")
def shark_sd_fn_dict_input(sd_kwargs: dict, *, progress=gr.Progress()):
print("\n[LOG] Submitting Request...")
for key in sd_kwargs:
if sd_kwargs[key] in [None, []]:
sd_kwargs[key] = None
if sd_kwargs[key] in ["None"]:
sd_kwargs[key] = ""
if key in ["steps", "height", "width", "batch_count", "batch_size"]:
sd_kwargs[key] = int(sd_kwargs[key])
if key == "seed":
sd_kwargs[key] = int(sd_kwargs[key])
# TODO: move these checks into the UI code so we don't have gradio warnings in a generalized dict input function.
if not sd_kwargs["device"]:
gr.Warning("No device specified. Please specify a device.")
return None, ""
if sd_kwargs["height"] not in [512, 1024]:
gr.Warning("Height must be 512 or 1024. This is a temporary limitation.")
return None, ""
if sd_kwargs["height"] != sd_kwargs["width"]:
gr.Warning("Height and width must be the same. This is a temporary limitation.")
return None, ""
if sd_kwargs["base_model_id"] == "stabilityai/sdxl-turbo":
if sd_kwargs["steps"] > 10:
gr.Warning("Max steps for sdxl-turbo is 10. 1 to 4 steps are recommended.")
return None, ""
if sd_kwargs["guidance_scale"] > 3:
gr.Warning(
"sdxl-turbo CFG scale should be less than 2.0 if using negative prompt, 0 otherwise."
)
return None, ""
if sd_kwargs["target_triple"] == "":
if not parse_device(sd_kwargs["device"], sd_kwargs["target_triple"])[2]:
gr.Warning(
"Target device architecture could not be inferred. Please specify a target triple, e.g. 'gfx1100' for a Radeon 7900xtx."
)
return None, ""
generated_imgs = yield from shark_sd_fn(**sd_kwargs)
return generated_imgs
def get_next_seed(seed, seed_increment: str | int = 10):
if isinstance(seed_increment, int):
# print(f"\n[LOG] Seed after batch increment: {seed + seed_increment}")
return int(seed + seed_increment)
elif seed_increment == "random":
seed = randint(0, 4294967295)
# print(f"\n[LOG] Random seed: {seed}")
return seed
def unload_sd():
print("Unloading models.")
import apps.amdshark_studio.web.utils.globals as global_obj
import apps.shark_studio.web.utils.globals as global_obj
global_obj.clear_cache()
gc.collect()
@@ -489,17 +563,17 @@ def safe_name(name):
if __name__ == "__main__":
from apps.amdshark_studio.modules.shared_cmd_opts import cmd_opts
import apps.amdshark_studio.web.utils.globals as global_obj
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
import apps.shark_studio.web.utils.globals as global_obj
global_obj._init()
sd_json = view_json_file(
get_resource_path(os.path.join(cmd_opts.config_dir, "default_sd_config.json"))
get_resource_path(os.path.join(cmd_opts.config_dir, cmd_opts.default_config))
)
sd_kwargs = json.loads(sd_json)
for arg in vars(cmd_opts):
if arg in sd_kwargs:
sd_kwargs[arg] = getattr(cmd_opts, arg)
for i in amdshark_sd_fn_dict_input(sd_kwargs):
# for arg in vars(cmd_opts):
# if arg in sd_kwargs:
# sd_kwargs[arg] = getattr(cmd_opts, arg)
for i in shark_sd_fn_dict_input(sd_kwargs):
print(i)

View File

@@ -0,0 +1,288 @@
import numpy as np
import json
from random import (
randint,
seed as seed_random,
getstate as random_getstate,
setstate as random_setstate,
)
from pathlib import Path
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
from cpuinfo import get_cpu_info
def iree_device_map(device):
uri_parts = device.split("://", 2)
iree_driver = (
_IREE_DEVICE_MAP[uri_parts[0]]
if uri_parts[0] in _IREE_DEVICE_MAP
else uri_parts[0]
)
if len(uri_parts) == 1:
return iree_driver
elif "rocm" in uri_parts:
return "rocm"
else:
return f"{iree_driver}://{uri_parts[1]}"
def get_supported_device_list():
return list(_IREE_DEVICE_MAP.keys())
_IREE_DEVICE_MAP = {
"cpu": "local-task",
"cpu-task": "local-task",
"cpu-sync": "local-sync",
"cuda": "cuda",
"vulkan": "vulkan",
"metal": "metal",
"rocm": "rocm",
"hip": "hip",
"intel-gpu": "level_zero",
}
def iree_target_map(device):
if "://" in device:
device = device.split("://")[0]
return _IREE_TARGET_MAP[device] if device in _IREE_TARGET_MAP else device
_IREE_TARGET_MAP = {
"cpu": "llvm-cpu",
"cpu-task": "llvm-cpu",
"cpu-sync": "llvm-cpu",
"cuda": "cuda",
"vulkan": "vulkan-spirv",
"metal": "metal",
"rocm": "rocm",
"hip": "rocm",
"intel-gpu": "opencl-spirv",
}
def get_available_devices():
return ["rocm", "cpu"]
def get_devices_by_name(driver_name):
device_list = []
try:
driver_name = iree_device_map(driver_name)
device_list_dict = get_all_devices(driver_name)
print(f"{driver_name} devices are available.")
except:
print(f"{driver_name} devices are not available.")
else:
cpu_name = get_cpu_info()["brand_raw"]
for i, device in enumerate(device_list_dict):
device_name = (
cpu_name if device["name"] == "default" else device["name"]
)
if "local" in driver_name:
device_list.append(
f"{device_name} => {driver_name.replace('local', 'cpu')}"
)
else:
# for drivers with single devices
# let the default device be selected without any indexing
if len(device_list_dict) == 1:
device_list.append(f"{device_name} => {driver_name}")
else:
device_list.append(f"{device_name} => {driver_name}://{i}")
return device_list
# set_iree_runtime_flags()
available_devices = []
rocm_devices = get_devices_by_name("rocm")
available_devices.extend(rocm_devices)
# cpu_device = get_devices_by_name("cpu-sync")
# available_devices.extend(cpu_device)
cpu_device = get_devices_by_name("cpu-task")
available_devices.extend(cpu_device)
# from shark.iree_utils.vulkan_utils import (
# get_all_vulkan_devices,
# )
# vulkaninfo_list = get_all_vulkan_devices()
# vulkan_devices = []
# id = 0
# for device in vulkaninfo_list:
# vulkan_devices.append(f"{device.strip()} => vulkan://{id}")
# id += 1
# if id != 0:
# print(f"vulkan devices are available.")
# available_devices.extend(vulkan_devices)
# metal_devices = get_devices_by_name("metal")
# available_devices.extend(metal_devices)
# cuda_devices = get_devices_by_name("cuda")
# available_devices.extend(cuda_devices)
# hip_devices = get_devices_by_name("hip")
# available_devices.extend(hip_devices)
for idx, device_str in enumerate(available_devices):
if "AMD Radeon(TM) Graphics =>" in device_str:
igpu_id_candidates = [
x.split("w/")[-1].split("=>")[0]
for x in available_devices
if "M Graphics" in x
]
for igpu_name in igpu_id_candidates:
if igpu_name:
available_devices[idx] = device_str.replace(
"AMD Radeon(TM) Graphics", igpu_name
)
break
return available_devices
def clean_device_info(raw_device):
# return appropriate device and device_id for consumption by Studio pipeline
# Multiple devices only supported for vulkan and rocm (as of now).
# default device must be selected for all others
device_id = None
device = raw_device if "=>" not in raw_device else raw_device.split("=>")[1].strip()
if "://" in device:
device, device_id = device.split("://")
if len(device_id) <= 2:
device_id = int(device_id)
if device not in ["hip", "rocm", "vulkan"]:
device_id = None
if device in ["hip", "rocm", "vulkan"] and device_id == None:
device_id = 0
return device, device_id
def parse_device(device_str, target_override=""):
rt_driver, device_id = clean_device_info(device_str)
target_backend = iree_target_map(rt_driver)
if device_id:
rt_device = f"{rt_driver}://{device_id}"
else:
rt_device = rt_driver
if target_override:
if "cpu" in device_str:
rt_device = "local-task"
return target_backend, rt_device, target_override
match target_backend:
case "vulkan-spirv":
triple = get_iree_target_triple(device_str)
return target_backend, rt_device, triple
case "rocm":
triple = get_rocm_target_chip(device_str)
return target_backend, rt_device, triple
case "llvm-cpu":
if "Ryzen 9" in device_str:
return target_backend, "local-task", "znver4"
else:
return "llvm-cpu", "local-task", "x86_64-linux-gnu"
def get_rocm_target_chip(device_str):
# TODO: Use a data file to map device_str to target chip.
rocm_chip_map = {
"6700": "gfx1031",
"6800": "gfx1030",
"6900": "gfx1030",
"7900": "gfx1100",
"MI300X": "gfx942",
"MI300A": "gfx940",
"MI210": "gfx90a",
"MI250": "gfx90a",
"MI100": "gfx908",
"MI50": "gfx906",
"MI60": "gfx906",
"780M": "gfx1103",
}
for key in rocm_chip_map:
if key in device_str:
return rocm_chip_map[key]
return None
def get_all_devices(driver_name):
"""
Inputs: driver_name
Returns a list of all the available devices for a given driver sorted by
the iree path names of the device as in --list_devices option in iree.
"""
from iree.runtime import get_driver
driver = get_driver(driver_name)
device_list_src = driver.query_available_devices()
device_list_src.sort(key=lambda d: d["path"])
del driver
return device_list_src
# def get_device_mapping(driver, key_combination=3):
# """This method ensures consistent device ordering when choosing
# specific devices for execution
# Args:
# driver (str): execution driver (vulkan, cuda, rocm, etc)
# key_combination (int, optional): choice for mapping value for
# device name.
# 1 : path
# 2 : name
# 3 : (name, path)
# Defaults to 3.
# Returns:
# dict: map to possible device names user can input mapped to desired
# combination of name/path.
# """
# driver = iree_device_map(driver)
# device_list = get_all_devices(driver)
# device_map = dict()
# def get_output_value(dev_dict):
# if key_combination == 1:
# return f"{driver}://{dev_dict['path']}"
# if key_combination == 2:
# return dev_dict["name"]
# if key_combination == 3:
# return dev_dict["name"], f"{driver}://{dev_dict['path']}"
# # mapping driver name to default device (driver://0)
# device_map[f"{driver}"] = get_output_value(device_list[0])
# for i, device in enumerate(device_list):
# # mapping with index
# device_map[f"{driver}://{i}"] = get_output_value(device)
# # mapping with full path
# device_map[f"{driver}://{device['path']}"] = get_output_value(device)
# return device_map
# def get_opt_flags(model, precision="fp16"):
# iree_flags = []
# if len(cmd_opts.iree_vulkan_target_triple) > 0:
# iree_flags.append(
# f"-iree-vulkan-target-triple={cmd_opts.iree_vulkan_target_triple}"
# )
# if "rocm" in cmd_opts.device:
# from shark.iree_utils.gpu_utils import get_iree_rocm_args
# rocm_args = get_iree_rocm_args()
# iree_flags.extend(rocm_args)
# if cmd_opts.iree_constant_folding == False:
# iree_flags.append("--iree-opt-const-expr-hoisting=False")
# iree_flags.append(
# "--iree-codegen-linalg-max-constant-fold-elements=9223372036854775807"
# )
# if cmd_opts.data_tiling == False:
# iree_flags.append("--iree-opt-data-tiling=False")
# if "vae" not in model:
# # Due to lack of support for multi-reduce, we always collapse reduction
# # dims before dispatch formation right now.
# iree_flags += ["--iree-flow-collapse-reduction-dims"]
# return iree_flags

View File

@@ -4,7 +4,7 @@ import re
import requests
import torch
import safetensors
from iree.turbine.aot.params import (
from shark_turbine.aot.params import (
ParameterArchiveBuilder,
)
from io import BytesIO
@@ -12,7 +12,7 @@ from pathlib import Path
from tqdm import tqdm
from omegaconf import OmegaConf
from diffusers import StableDiffusionPipeline
from apps.amdshark_studio.modules.shared_cmd_opts import cmd_opts
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
download_from_original_stable_diffusion_ckpt,
create_vae_diffusers_config,
@@ -71,7 +71,14 @@ def save_irpa(weights_path, prepend_str):
new_key = prepend_str + key
archive.add_tensor(new_key, weights[key])
irpa_file = weights_path.replace(".safetensors", ".irpa")
if "safetensors" in weights_path:
irpa_file = weights_path.replace(".safetensors", ".irpa")
elif "irpa" in weights_path:
irpa_file = weights_path
else:
return Exception(
"Invalid file format. Please provide a .safetensors or .irpa file."
)
archive.save(irpa_file)
return irpa_file

View File

@@ -5,7 +5,7 @@ import json
import safetensors
from dataclasses import dataclass
from safetensors.torch import load_file
from apps.amdshark_studio.web.utils.file_utils import (
from apps.shark_studio.web.utils.file_utils import (
get_checkpoint_pathfile,
get_path_stem,
)

View File

@@ -25,14 +25,16 @@ resampler_list = resamplers.keys()
# save output images and the inputs corresponding to it.
def save_output_img(output_img, img_seed, extra_info=None):
from apps.amdshark_studio.web.utils.file_utils import (
from apps.shark_studio.web.utils.file_utils import (
get_generated_imgs_path,
get_generated_imgs_todays_subdir,
)
from apps.amdshark_studio.modules.shared_cmd_opts import cmd_opts
from apps.shark_studio.modules.shared_cmd_opts import cmd_opts
if extra_info is None:
extra_info = {}
elif "progress" in extra_info.keys():
extra_info.pop("progress")
generated_imgs_path = Path(
get_generated_imgs_path(), get_generated_imgs_todays_subdir()
)

View File

@@ -30,8 +30,8 @@ def logger_test(x):
def read_sd_logs():
sys.stdout.flush()
with open("amdshark_tmp/sd.log", "r") as f:
with open("shark_tmp/sd.log", "r") as f:
return f.read()
sys.stdout = Logger("amdshark_tmp/sd.log", filter="[LOG]")
sys.stdout = Logger("shark_tmp/sd.log", filter="[LOG]")

View File

@@ -1,14 +1,14 @@
from amdshark.iree_utils.compile_utils import (
from shark.iree_utils.compile_utils import (
get_iree_compiled_module,
load_vmfb_using_mmap,
clean_device_info,
get_iree_target_triple,
)
from apps.amdshark_studio.web.utils.file_utils import (
from apps.shark_studio.web.utils.file_utils import (
get_checkpoints_path,
get_resource_path,
)
from apps.amdshark_studio.modules.shared_cmd_opts import (
from apps.shark_studio.modules.shared_cmd_opts import (
cmd_opts,
)
from iree import runtime as ireert
@@ -17,7 +17,7 @@ import gc
import os
class AMDSharkPipelineBase:
class SharkPipelineBase:
# This class is a lightweight base for managing an
# inference API class. It should provide methods for:
# - compiling a set (model map) of torch IR modules

Some files were not shown because too many files have changed in this diff Show More