PTX Reintegration and Passing Tests (#1512)

* move assembly, assembly_ptx

* successful but broken rendering of ptx asm

* clear ins before render asm

* slightly less broken :')

* we needed thread syncs (see the bar.sync sketch after this list)

* fix float16 loading, rounding modifiers and other casting stuff, passing casts_from_half (see the cvt sketch after this list)

* Fix runtime_args for gpuocelot

* our casts were flipped on both ends

* more casting

* add ternary where op (see the setp/selp sketch after this list)

* dealing with storing/loading bool (see the predicate round-trip sketch after this list)

* add test for casting to bool from negative (see the test sketch after this list)

* Fix args.valid on ConstOp

* add to CI, TODO: fix runtime_args for test_uops

* fix placement of runtime_args to work with lazy.Device

* undo ci changes so I can push

* fix lints

* start cleanup and fix things we broke fixing lints

* add checks for PTX-specific asm instructions

* revert added test -- doesn't pass on llvm

* skip tests for underflow, overflow

* another fix for how we're setting runtime args

* Less broken cleanup

* add to CI

* add more env variables for ci test

* fix ci to install pycuda for ptx

* ci: copy cuda test command

* cleanup

* assert to make sure we're actually running ptx in ci

* remove test assert

* move is_ptx arg

* move assembly, assembly_ptx back to extras

* fix imports

* initial merge fixes

* clear registers, fix UOps.LOAD with invalid value

* draft merge fixes

* remove prints

* quick lint and merge fixes

* cleanup

* remove PTXProgram wrapper

* final cleanup

* temp change for ci rerun

* ci rerun

* rollback ISA version
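
Where the "thread syncs" fix applies: PTX has no implicit ordering between threads in a block, so any shared-memory handoff needs an explicit barrier. A minimal sketch of the instruction involved; the surrounding stores and register names are invented, not taken from this PR:

```python
# Hypothetical rendered-PTX fragment: a block-wide barrier between a
# shared-memory write and a dependent read from another thread.
PTX_BARRIER = """
st.shared.f32  [%rd1], %f1;   // publish this thread's partial result
bar.sync       0;             // wait for every thread in the block
ld.shared.f32  %f2, [%rd2];   // now safe to read a neighbor's value
"""
```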
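
On the float16 and rounding-modifier work: PTX's cvt instruction requires an explicit rounding modifier on narrowing float conversions and float-to-int casts. A hedged sketch of the forms involved; operand names are placeholders, not this PR's output:

```python
# Illustrative cvt forms with the required rounding modifiers.
PTX_CASTS = """
cvt.f32.f16      %f1, %h1;   // widen f16 -> f32: exact, no modifier needed
cvt.rn.f16.f32   %h2, %f2;   // narrow f32 -> f16: round to nearest even
cvt.rzi.s32.f32  %r1, %f3;   // f32 -> s32: round toward zero, C-style
"""
```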
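
The ternary where op maps naturally onto PTX's setp/selp pair: build a predicate from the condition, then select between the two values. A sketch under assumed register names:

```python
# where(cond, a, b) lowered by hand; register names are invented.
PTX_WHERE = """
setp.ne.f32  %p1, %f1, 0f00000000;  // %p1 = (cond != 0.0)
selp.f32     %f4, %f2, %f3, %p1;    // %f4 = %p1 ? a : b
"""
```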
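
Storing and loading bool is fiddly because PTX predicate registers (.pred) cannot be read from or written to memory directly; the value has to round-trip through an integer register. A hedged sketch of that round trip, with invented registers:

```python
# Predicate -> byte on store, byte -> predicate on load.
PTX_BOOL_ROUNDTRIP = """
selp.u16      %rs1, 1, 0, %p1;   // materialize the predicate as 0 or 1
st.global.u8  [%rd1], %rs1;      // store a single byte
ld.global.u8  %rs2, [%rd1];      // load it back, zero-extended
setp.ne.u16   %p2, %rs2, 0;      // rebuild the predicate: nonzero -> true
"""
```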
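
For the negative-to-bool cast test, the property being pinned down is that any nonzero source value, negatives included, casts to True. A minimal test in that spirit; the test name, import paths, and values are assumptions, not the PR's actual code:

```python
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.helpers import dtypes  # dtypes lived under helpers at the time

def test_bool_cast_from_negative():
  # -1.0 is nonzero, so it must become True; 0.0 must stay False.
  out = Tensor([-1.0, 0.0, 2.0]).cast(dtypes.bool).numpy()
  np.testing.assert_equal(out, np.array([True, False, True]))
```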

Author: Ethan Sorrell
Date: 2023-08-16 18:20:20 -05:00
Committed by: GitHub
Parent: 8763037f0e
Commit: cb62911f6b

6 changed files with 131 additions and 85 deletions


@@ -201,7 +201,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        backend: [llvm, clang, gpu, cuda]
+        backend: [llvm, clang, gpu, cuda, ptx]
     name: Tests on (${{ matrix.backend }})
     runs-on: ${{ matrix.backend == 'gpu' && 'ubuntu-20.04' || 'ubuntu-latest' }}
@@ -220,7 +220,7 @@ jobs:
         path: '~/.cache/pip'
         key: ${{ matrix.backend }}
     - name: Set env
-      run: printf "${{ matrix.backend == 'llvm' && 'ENABLE_METHOD_CACHE=1\nLLVM=1' || matrix.backend == 'clang' && 'CLANG=1\nENABLED_METHOD_CACHE=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n'}}" >> $GITHUB_ENV
+      run: printf "${{ matrix.backend == 'llvm' && 'ENABLE_METHOD_CACHE=1\nLLVM=1' || matrix.backend == 'clang' && 'CLANG=1\nENABLED_METHOD_CACHE=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nPTX=1' }}" >> $GITHUB_ENV
     - name: Find faster apt mirror
       # uses: vegardit/fast-apt-mirror.sh@v1
     # - name: Install packages (gpu)
@@ -231,12 +231,12 @@ jobs:
         sudo apt-get update -y && \
         sudo apt-get install -y intel-oneapi-runtime-compilers intel-oneapi-runtime-opencl
     - name: Install packages (cuda)
-      if: matrix.backend == 'cuda'
+      if: matrix.backend == 'cuda' || matrix.backend == 'ptx'
       run: |
         sudo apt-get update -y && \
         sudo apt-get install -y --no-install-recommends git g++ cmake ninja-build llvm-15-dev zlib1g-dev libglew-dev flex bison libfl-dev libboost-thread-dev libboost-filesystem-dev nvidia-cuda-toolkit-gcc
     - name: Cache gpuocelot
-      if: matrix.backend == 'cuda'
+      if: matrix.backend == 'cuda' || matrix.backend == 'ptx'
       id: cache-build
       uses: actions/cache@v3
       env:
@@ -245,7 +245,7 @@ jobs:
         path: ${{ github.workspace }}/gpuocelot/ocelot
         key: ubuntu22.04-gpuocelot-19626fc00b6ee321638c3111074269c69050e091
     - name: Clone/compile gpuocelot
-      if: matrix.backend == 'cuda' && steps.cache-build.outputs.cache-hit != 'true'
+      if: (matrix.backend == 'cuda' || matrix.backend == 'ptx') && steps.cache-build.outputs.cache-hit != 'true'
       run: |
         git clone --recurse-submodules https://github.com/gpuocelot/gpuocelot.git ${{ github.workspace }}/gpuocelot
         cd ${{ github.workspace }}/gpuocelot/ocelot
@@ -255,20 +255,23 @@ jobs:
         cmake .. -Wno-dev -G Ninja -DOCELOT_BUILD_TOOLS=OFF
         ninja
     - name: Install gpuocelot
-      if: matrix.backend == 'cuda'
+      if: matrix.backend == 'cuda' || matrix.backend == 'ptx'
       run: |
         cd ${{ github.workspace }}/gpuocelot/ocelot/build
         sudo ninja install
     - name: Install dependencies
-      run: pip install -e '.[testing${{matrix.backend=='llvm'&&',llvm'||matrix.backend=='cuda'&&',cuda'||''}}]' --extra-index-url https://download.pytorch.org/whl/cpu
+      run: pip install -e '.[testing${{matrix.backend=='llvm'&&',llvm'||matrix.backend=='cuda'&&',cuda'||matrix.backend=='ptx'&&',cuda'||''}}]' --extra-index-url https://download.pytorch.org/whl/cpu
     - name: Check Device.DEFAULT
       run: python -c "from tinygrad.lazy import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU'], Device.DEFAULT"
     - name: Run pytest (not cuda)
-      if: matrix.backend!='cuda'
+      if: matrix.backend!='cuda' && matrix.backend!='ptx'
       run: python -m pytest -n=auto test/ -k '${{matrix.backend=='llvm'&&'not (test_nn.py and test_conv_transpose2d)'||'test'}}' -m 'not exclude_${{matrix.backend}}'
     - name: Run pytest (cuda)
       if: matrix.backend=='cuda'
       run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models
+    - name: Run pytest (ptx)
+      if: matrix.backend=='ptx'
+      run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models

   testunicorn:
     name: ARM64 unicorn Test
@@ -285,7 +288,7 @@ jobs:
       uses: actions/cache@v3
       with:
         path: '~/.cache/pip'
-        key: unicorn 
+        key: unicorn
     - name: Install cross-assembler
       run: |
         sudo apt-get update -y && \