From d2ff55e9c6b7e70224ef00a199791ea3c9b6f982 Mon Sep 17 00:00:00 2001 From: JaSpa99 Date: Thu, 13 Feb 2025 05:24:29 +0100 Subject: [PATCH] OSX GPUOcelot (#8209) * add patches * add osx test in ci * macos specific uvm, gpfifo mask * only do that for now * Revert "add patches" This reverts commit 80d3112a571c0b3fa0dc1a676cd38220688f05d4. * use fork for now * workflow only one worker * merge osxtests with tests * Revert "merge osxtests with tests" This reverts commit 3461c8f46cbe1b9ea38274852608faeaacd70df7. * macos pagesize 16384 --------- Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com> Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com> --- .github/actions/setup-tinygrad/action.yml | 14 ++++++++++---- .github/workflows/test.yml | 9 +++++++++ test/mockgpu/nv/nvgpu.py | 2 +- tinygrad/runtime/ops_nv.py | 8 ++++---- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/.github/actions/setup-tinygrad/action.yml b/.github/actions/setup-tinygrad/action.yml index 28431625b9..5176ad3321 100644 --- a/.github/actions/setup-tinygrad/action.yml +++ b/.github/actions/setup-tinygrad/action.yml @@ -143,14 +143,20 @@ runs: # **** CUDA **** - - name: Install packages (cuda) - if: inputs.cuda == 'true' + - name: Install cuda packages (Linux) + if: inputs.cuda == 'true' && runner.os == 'Linux' shell: bash run: | echo 'Acquire::http::Pipeline-Depth "5";' | sudo tee -a /etc/apt/apt.conf.d/99parallel sudo apt update -y || true sudo apt install -y --no-install-recommends git g++ cmake ninja-build llvm-15-dev zlib1g-dev libglew-dev \ flex bison libfl-dev libboost-thread-dev libboost-filesystem-dev nvidia-cuda-toolkit-gcc libzstd-dev + - name: Install gpuocelot dependencies (MacOS) + if: inputs.cuda == 'true' && runner.os == 'macOS' + shell: bash + run: | + brew update + brew install cmake ninja llvm@15 zlib glew flex bison boost zstd ncurses - name: Cache gpuocelot if: inputs.cuda == 'true' id: cache-build @@ -159,7 +165,7 @@ runs: cache-name: cache-gpuocelot-build with: path: ${{ github.workspace }}/gpuocelot/ocelot - key: ubuntu22.04-gpuocelot-b16039dc940dc6bc4ea0a98380495769ff35ed99-rebuild-0 + key: ${{ runner.os }}-gpuocelot-b16039dc940dc6bc4ea0a98380495769ff35ed99-rebuild-0 - name: Clone/compile gpuocelot if: inputs.cuda == 'true' && steps.cache-build.outputs.cache-hit != 'true' shell: bash @@ -176,7 +182,7 @@ runs: shell: bash run: | cd ${{ github.workspace }}/gpuocelot/ocelot/build - sudo cp libgpuocelot.so /usr/lib/libgpuocelot.so + sudo cp libgpuocelot.${{ runner.os == 'macOS' && 'dylib' || 'so' }} /usr/${{ runner.os == 'macOS' && 'local/' || ''}}lib/ # **** WebGPU **** diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ea8e174bfd..e264872e50 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -513,6 +513,7 @@ jobs: deps: testing python-version: '3.11' amd: 'true' + cuda: 'true' - name: Run real world test run: JIT=2 METAL=1 python -m pytest -n=auto test/models/test_real_world.py --durations=20 - name: Test models (Metal) @@ -540,6 +541,14 @@ jobs: FORWARD_ONLY: 1 run: | python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20 + - name: Run pytest (ptx) + env: + MOCKGPU: 1 + PTX: 1 + NV: 1 + FORWARD_ONLY: 1 + run: | + python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20 - name: Run process replay tests uses: ./.github/actions/process-replay diff --git a/test/mockgpu/nv/nvgpu.py b/test/mockgpu/nv/nvgpu.py index f450a16ab1..63c03998e8 100644 --- a/test/mockgpu/nv/nvgpu.py +++ b/test/mockgpu/nv/nvgpu.py @@ -55,7 +55,7 @@ class GPFIFO: def _reset_buf_state(self): self.buf, self.buf_ptr = None, 0 def _set_buf_state(self, gpfifo_entry): - ptr = ((gpfifo_entry >> 2) & 0xfffffffff) << 2 + ptr = ((gpfifo_entry >> 2) & 0x3fffffffff) << 2 sz = ((gpfifo_entry >> 42) & 0x1fffff) << 2 self.buf = to_mv(ptr, sz).cast("I") self.buf_sz = sz // 4 diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index e5222878d1..d42b6b3ab3 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -7,7 +7,7 @@ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, H from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU from tinygrad.ops import sint from tinygrad.device import BufferSpec, CPUProgram -from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod +from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod, OSX from tinygrad.renderer.ptx import PTXRenderer from tinygrad.renderer.cstyle import NVRenderer from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler @@ -295,8 +295,8 @@ class NVDevice(HCQCompiled[NVSignal]): # TODO: Need a proper allocator for va addresses # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings # VA space is 48bits. - low_uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=0x1000000000, base=0x1000000000, wrap=False) - uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=(1 << 48) - 1, base=0x2000000000, wrap=False) + low_uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=0x1000000000, base=0x8000000000 if OSX else 0x1000000000, wrap=False) + uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=(1 << 48) - 1, base=low_uvm_vaddr_allocator.base + low_uvm_vaddr_allocator.size, wrap=False) host_object_enumerator: int = 0x1000 def _new_gpu_fd(self): @@ -314,7 +314,7 @@ class NVDevice(HCQCompiled[NVSignal]): def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag="") -> HCQBuffer: # Uncached memory is "system". Use huge pages only for gpu memory. - page_size = (4 << 10) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << 10)) + page_size = (4 << (12 if OSX else 10)) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << (12 if OSX else 10))) size = round_up(size, page_size) va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access)