PTX Reintegration and Passing Tests (#1512)

* move assembly, assembly_ptx

* successful but broken rendering of ptx asm

* clear ins before render asm

* slightly less broken :')

* we needed thread syncs (see the bar.sync sketch after this list)

* fix float16 loading, rounding modifiers and other casting stuff, passing casts_from_half (see the cvt sketch after this list)

* Fix runtime_args for gpuocelot

* our casts were flipped on both ends

* more casting

* add ternary where op (see the setp/selp sketch after this list)

* dealing with storing/loading bool (see the predicate round-trip sketch after this list)

* add test for casting to bool from negative (see the test sketch after this list)

* Fix args.valid on ConstOp

* add to CI, TODO: fix runtime_args for test_uops

* fix placement of runtime_args to work with lazy.Device

* undo ci changes so I can push

* fix lints

* start cleanup and fix things we broke fixing lints

* add checks for PTX-specific asm instructions

* revert added test -- doesn't pass on llvm

* skip tests for underflow, overflow

* another fix for how we're setting runtime args

* Less broken cleanup

* add to CI

* add more env variables for ci test

* fix ci to install pycuda for ptx

* ci: copy cuda test command

* cleanup

* assert to make sure we're actually running ptx in ci

* remove test assert

* move is_ptx arg

* move assembly, assembly_ptx back to extras

* fix imports

* initial merge fixes

* clear registers, fix UOps.LOAD with invalid value

* draft merge fixes

* remove prints

* quick lint and merge fixes

* cleanup

* remove PTXProgram wrapper

* final cleanup

* temp change for ci rerun

* ci rerun

* rollback ISA version
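
Where the "thread syncs" fix applies: PTX has no implicit ordering between threads in a block, so any shared-memory handoff needs an explicit barrier. A minimal sketch of the instruction involved; the surrounding stores and register names are invented, not taken from this PR:

```python
# Hypothetical rendered-PTX fragment: a block-wide barrier between a
# shared-memory write and a dependent read from another thread.
PTX_BARRIER = """
st.shared.f32  [%rd1], %f1;   // publish this thread's partial result
bar.sync       0;             // wait for every thread in the block
ld.shared.f32  %f2, [%rd2];   // now safe to read a neighbor's value
"""
```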
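
On the float16 and rounding-modifier work: PTX's cvt instruction requires an explicit rounding modifier on narrowing float conversions and float-to-int casts. A hedged sketch of the forms involved; operand names are placeholders, not this PR's output:

```python
# Illustrative cvt forms with the required rounding modifiers.
PTX_CASTS = """
cvt.f32.f16      %f1, %h1;   // widen f16 -> f32: exact, no modifier needed
cvt.rn.f16.f32   %h2, %f2;   // narrow f32 -> f16: round to nearest even
cvt.rzi.s32.f32  %r1, %f3;   // f32 -> s32: round toward zero, C-style
"""
```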
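
The ternary where op maps naturally onto PTX's setp/selp pair: build a predicate from the condition, then select between the two values. A sketch under assumed register names:

```python
# where(cond, a, b) lowered by hand; register names are invented.
PTX_WHERE = """
setp.ne.f32  %p1, %f1, 0f00000000;  // %p1 = (cond != 0.0)
selp.f32     %f4, %f2, %f3, %p1;    // %f4 = %p1 ? a : b
"""
```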
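
Storing and loading bool is fiddly because PTX predicate registers (.pred) cannot be read from or written to memory directly; the value has to round-trip through an integer register. A hedged sketch of that round trip, with invented registers:

```python
# Predicate -> byte on store, byte -> predicate on load.
PTX_BOOL_ROUNDTRIP = """
selp.u16      %rs1, 1, 0, %p1;   // materialize the predicate as 0 or 1
st.global.u8  [%rd1], %rs1;      // store a single byte
ld.global.u8  %rs2, [%rd1];      // load it back, zero-extended
setp.ne.u16   %p2, %rs2, 0;      // rebuild the predicate: nonzero -> true
"""
```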
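
For the negative-to-bool cast test, the property being pinned down is that any nonzero source value, negatives included, casts to True. A minimal test in that spirit; the test name, import paths, and values are assumptions, not the PR's actual code:

```python
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.helpers import dtypes  # dtypes lived under helpers at the time

def test_bool_cast_from_negative():
  # -1.0 is nonzero, so it must become True; 0.0 must stay False.
  out = Tensor([-1.0, 0.0, 2.0]).cast(dtypes.bool).numpy()
  np.testing.assert_equal(out, np.array([True, False, True]))
```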

Author: Ethan Sorrell
Date: 2023-08-16 18:20:20 -05:00
Committed by: GitHub
Parent: 8763037f0e
Commit: cb62911f6b

6 changed files with 131 additions and 85 deletions


@@ -201,7 +201,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        backend: [llvm, clang, gpu, cuda]
+        backend: [llvm, clang, gpu, cuda, ptx]
     name: Tests on (${{ matrix.backend }})
     runs-on: ${{ matrix.backend == 'gpu' && 'ubuntu-20.04' || 'ubuntu-latest' }}
@@ -220,7 +220,7 @@ jobs:
         path: '~/.cache/pip'
         key: ${{ matrix.backend }}
     - name: Set env
-      run: printf "${{ matrix.backend == 'llvm' && 'ENABLE_METHOD_CACHE=1\nLLVM=1' || matrix.backend == 'clang' && 'CLANG=1\nENABLED_METHOD_CACHE=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n'}}" >> $GITHUB_ENV
+      run: printf "${{ matrix.backend == 'llvm' && 'ENABLE_METHOD_CACHE=1\nLLVM=1' || matrix.backend == 'clang' && 'CLANG=1\nENABLED_METHOD_CACHE=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n' || matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\nPTX=1' }}" >> $GITHUB_ENV
     - name: Find faster apt mirror
       # uses: vegardit/fast-apt-mirror.sh@v1
     # - name: Install packages (gpu)
@@ -231,12 +231,12 @@ jobs:
         sudo apt-get update -y && \
         sudo apt-get install -y intel-oneapi-runtime-compilers intel-oneapi-runtime-opencl
     - name: Install packages (cuda)
-      if: matrix.backend == 'cuda'
+      if: matrix.backend == 'cuda' || matrix.backend == 'ptx'
       run: |
         sudo apt-get update -y && \
         sudo apt-get install -y --no-install-recommends git g++ cmake ninja-build llvm-15-dev zlib1g-dev libglew-dev flex bison libfl-dev libboost-thread-dev libboost-filesystem-dev nvidia-cuda-toolkit-gcc
     - name: Cache gpuocelot
-      if: matrix.backend == 'cuda'
+      if: matrix.backend == 'cuda' || matrix.backend == 'ptx'
       id: cache-build
       uses: actions/cache@v3
       env:
@@ -245,7 +245,7 @@ jobs:
         path: ${{ github.workspace }}/gpuocelot/ocelot
         key: ubuntu22.04-gpuocelot-19626fc00b6ee321638c3111074269c69050e091
     - name: Clone/compile gpuocelot
-      if: matrix.backend == 'cuda' && steps.cache-build.outputs.cache-hit != 'true'
+      if: (matrix.backend == 'cuda' || matrix.backend == 'ptx') && steps.cache-build.outputs.cache-hit != 'true'
       run: |
         git clone --recurse-submodules https://github.com/gpuocelot/gpuocelot.git ${{ github.workspace }}/gpuocelot
         cd ${{ github.workspace }}/gpuocelot/ocelot
@@ -255,20 +255,23 @@ jobs:
         cmake .. -Wno-dev -G Ninja -DOCELOT_BUILD_TOOLS=OFF
         ninja
     - name: Install gpuocelot
-      if: matrix.backend == 'cuda'
+      if: matrix.backend == 'cuda' || matrix.backend == 'ptx'
       run: |
         cd ${{ github.workspace }}/gpuocelot/ocelot/build
         sudo ninja install
     - name: Install dependencies
-      run: pip install -e '.[testing${{matrix.backend=='llvm'&&',llvm'||matrix.backend=='cuda'&&',cuda'||''}}]' --extra-index-url https://download.pytorch.org/whl/cpu
+      run: pip install -e '.[testing${{matrix.backend=='llvm'&&',llvm'||matrix.backend=='cuda'&&',cuda'||matrix.backend=='ptx'&&',cuda'||''}}]' --extra-index-url https://download.pytorch.org/whl/cpu
     - name: Check Device.DEFAULT
       run: python -c "from tinygrad.lazy import Device; assert Device.DEFAULT in ['LLVM','CLANG','CUDA','GPU'], Device.DEFAULT"
     - name: Run pytest (not cuda)
-      if: matrix.backend!='cuda'
+      if: matrix.backend!='cuda' && matrix.backend!='ptx'
       run: python -m pytest -n=auto test/ -k '${{matrix.backend=='llvm'&&'not (test_nn.py and test_conv_transpose2d)'||'test'}}' -m 'not exclude_${{matrix.backend}}'
     - name: Run pytest (cuda)
       if: matrix.backend=='cuda'
       run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models
+    - name: Run pytest (ptx)
+      if: matrix.backend=='ptx'
+      run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models

   testunicorn:
     name: ARM64 unicorn Test
@@ -285,7 +288,7 @@ jobs:
       uses: actions/cache@v3
       with:
         path: '~/.cache/pip'
-        key: unicorn 
+        key: unicorn
     - name: Install cross-assembler
       run: |
         sudo apt-get update -y && \