OSX GPUOcelot (#8209)

* add patches

* add osx test in ci

* macos specific uvm, gpfifo mask

* only do that for now

* Revert "add patches"

This reverts commit 80d3112a57.

* use fork for now

* workflow only one worker

* merge osxtests with tests

* Revert "merge osxtests with tests"

This reverts commit 3461c8f46c.

* macos pagesize 16384

---------

Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com>
Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
Commit: d2ff55e9c6 (parent: f4f56d7c15)
Author: JaSpa99
Date: 2025-02-13 05:24:29 +01:00
Committed by: GitHub
4 changed files with 24 additions and 9 deletions

.github/actions/setup-tinygrad/action.yml

@@ -143,14 +143,20 @@ runs:
     # **** CUDA ****
-    - name: Install packages (cuda)
-      if: inputs.cuda == 'true'
+    - name: Install cuda packages (Linux)
+      if: inputs.cuda == 'true' && runner.os == 'Linux'
       shell: bash
       run: |
         echo 'Acquire::http::Pipeline-Depth "5";' | sudo tee -a /etc/apt/apt.conf.d/99parallel
         sudo apt update -y || true
         sudo apt install -y --no-install-recommends git g++ cmake ninja-build llvm-15-dev zlib1g-dev libglew-dev \
           flex bison libfl-dev libboost-thread-dev libboost-filesystem-dev nvidia-cuda-toolkit-gcc libzstd-dev
+    - name: Install gpuocelot dependencies (MacOS)
+      if: inputs.cuda == 'true' && runner.os == 'macOS'
+      shell: bash
+      run: |
+        brew update
+        brew install cmake ninja llvm@15 zlib glew flex bison boost zstd ncurses
     - name: Cache gpuocelot
       if: inputs.cuda == 'true'
       id: cache-build
@@ -159,7 +165,7 @@ runs:
         cache-name: cache-gpuocelot-build
       with:
         path: ${{ github.workspace }}/gpuocelot/ocelot
-        key: ubuntu22.04-gpuocelot-b16039dc940dc6bc4ea0a98380495769ff35ed99-rebuild-0
+        key: ${{ runner.os }}-gpuocelot-b16039dc940dc6bc4ea0a98380495769ff35ed99-rebuild-0
     - name: Clone/compile gpuocelot
       if: inputs.cuda == 'true' && steps.cache-build.outputs.cache-hit != 'true'
       shell: bash
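Note: replacing the hard-coded ubuntu22.04 prefix with ${{ runner.os }} keeps the Linux and macOS gpuocelot builds in separate caches (and, since the key changes, forces one fresh build on Linux). A trivial sketch of the keys this resolves to:

    # resolved cache keys per runner (the hash is the pinned gpuocelot revision from the diff)
    for runner_os in ("Linux", "macOS"):
      print(f"{runner_os}-gpuocelot-b16039dc940dc6bc4ea0a98380495769ff35ed99-rebuild-0")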
@@ -176,7 +182,7 @@ runs:
       shell: bash
       run: |
         cd ${{ github.workspace }}/gpuocelot/ocelot/build
-        sudo cp libgpuocelot.so /usr/lib/libgpuocelot.so
+        sudo cp libgpuocelot.${{ runner.os == 'macOS' && 'dylib' || 'so' }} /usr/${{ runner.os == 'macOS' && 'local/' || ''}}lib/
     # **** WebGPU ****
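Note: ${{ cond && a || b }} is the usual GitHub Actions expression idiom for a ternary (it yields a when cond is true, else b, provided a is truthy). A minimal Python sketch of what the copy command resolves to per OS; resolve_copy is a hypothetical helper for illustration, not part of the workflow:

    def resolve_copy(runner_os: str) -> str:
      ext = "dylib" if runner_os == "macOS" else "so"    # runner.os == 'macOS' && 'dylib' || 'so'
      prefix = "local/" if runner_os == "macOS" else ""  # runner.os == 'macOS' && 'local/' || ''
      return f"sudo cp libgpuocelot.{ext} /usr/{prefix}lib/"

    assert resolve_copy("macOS") == "sudo cp libgpuocelot.dylib /usr/local/lib/"
    assert resolve_copy("Linux") == "sudo cp libgpuocelot.so /usr/lib/"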

.github/workflows/test.yml

@@ -513,6 +513,7 @@ jobs:
         deps: testing
         python-version: '3.11'
         amd: 'true'
+        cuda: 'true'
     - name: Run real world test
       run: JIT=2 METAL=1 python -m pytest -n=auto test/models/test_real_world.py --durations=20
     - name: Test models (Metal)
@@ -540,6 +541,14 @@ jobs:
         FORWARD_ONLY: 1
       run: |
         python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20
+    - name: Run pytest (ptx)
+      env:
+        MOCKGPU: 1
+        PTX: 1
+        NV: 1
+        FORWARD_ONLY: 1
+      run: |
+        python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20
     - name: Run process replay tests
       uses: ./.github/actions/process-replay
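Note: the new step runs the same HCQ/tiny tests as the step above it, but with PTX=1 so kernels are rendered through PTXRenderer instead of NVRenderer, still against the mocked NV driver (MOCKGPU=1). A hedged sketch of reproducing it locally from the repo root (assumes pytest and pytest-xdist are installed):

    import os, sys, subprocess
    env = dict(os.environ, MOCKGPU="1", PTX="1", NV="1", FORWARD_ONLY="1")
    subprocess.run([sys.executable, "-m", "pytest", "-n=auto",
                    "test/test_hcq.py", "test/test_tiny.py", "--durations=20"], env=env, check=True)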

test/mockgpu/nv/nvgpu.py

@@ -55,7 +55,7 @@ class GPFIFO:
   def _reset_buf_state(self): self.buf, self.buf_ptr = None, 0
   def _set_buf_state(self, gpfifo_entry):
-    ptr = ((gpfifo_entry >> 2) & 0xfffffffff) << 2
+    ptr = ((gpfifo_entry >> 2) & 0x3fffffffff) << 2
     sz = ((gpfifo_entry >> 42) & 0x1fffff) << 2
     self.buf = to_mv(ptr, sz).cast("I")
     self.buf_sz = sz // 4
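Note: a GPFIFO entry stores the 4-byte-aligned buffer address starting at bit 2, with the size field at bit 42. The old mask 0xfffffffff keeps only 36 bits, capping decodable addresses below 2**38; the macOS low-UVM base 0x8000000000 (2**39) introduced below would be silently truncated. Widening the mask to 0x3fffffffff (38 bits) covers the full 40-bit address field. A minimal sketch, ignoring the size bits; decode_ptr is an illustrative helper:

    def decode_ptr(entry: int, mask: int) -> int: return ((entry >> 2) & mask) << 2

    ptr = 0x8000000000                             # new macOS low-UVM base (2**39)
    entry = ptr                                    # the address already occupies bits 2..41
    assert decode_ptr(entry, 0xfffffffff) == 0     # old 36-bit mask truncates it away
    assert decode_ptr(entry, 0x3fffffffff) == ptr  # new 38-bit mask round-trips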

tinygrad/runtime/ops_nv.py

@@ -7,7 +7,7 @@ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, H
 from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU
 from tinygrad.ops import sint
 from tinygrad.device import BufferSpec, CPUProgram
-from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
+from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod, OSX
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.cstyle import NVRenderer
 from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler
@@ -295,8 +295,8 @@ class NVDevice(HCQCompiled[NVSignal]):
   # TODO: Need a proper allocator for va addresses
   # 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
   # VA space is 48bits.
-  low_uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=0x1000000000, base=0x1000000000, wrap=False)
-  uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=(1 << 48) - 1, base=0x2000000000, wrap=False)
+  low_uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=0x1000000000, base=0x8000000000 if OSX else 0x1000000000, wrap=False)
+  uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=(1 << 48) - 1, base=low_uvm_vaddr_allocator.base + low_uvm_vaddr_allocator.size, wrap=False)
   host_object_enumerator: int = 0x1000

   def _new_gpu_fd(self):
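Note: on macOS the low VA window (used for CPU-accessible mappings) moves up to 0x8000000000, and the high allocator's base is now derived from the low one instead of being hard-coded, so the two ranges stay adjacent on both platforms. The resulting layout as simple arithmetic, with the values taken from the diff:

    OSX = True   # flip to False for the Linux layout
    low_base, low_size = (0x8000000000 if OSX else 0x1000000000), 0x1000000000
    high_base = low_base + low_size           # replaces the hard-coded 0x2000000000
    assert high_base == (0x9000000000 if OSX else 0x2000000000)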
@@ -314,7 +314,7 @@ class NVDevice(HCQCompiled[NVSignal]):
   def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag="") -> HCQBuffer:
     # Uncached memory is "system". Use huge pages only for gpu memory.
-    page_size = (4 << 10) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << 10))
+    page_size = (4 << (12 if OSX else 10)) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << (12 if OSX else 10)))
     size = round_up(size, page_size)
     va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access)
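Note: this matches the "macos pagesize 16384" bullet in the commit message. Apple silicon uses 16 KiB CPU pages, so the small-page size becomes 4 << 12 = 16384 on OSX instead of 4 << 10 = 4096, while 2 MiB huge pages remain reserved for GPU-only allocations of at least 8 MiB. Worked out in a small sketch (page_size here is an illustrative standalone function mirroring the expression above):

    OSX = True
    small = 4 << (12 if OSX else 10)
    assert small == 16384                      # 4096 on Linux

    def page_size(size, uncached=False, host=False):
      return small if uncached or host else ((2 << 20) if size >= (8 << 20) else small)

    assert page_size(16 << 20) == 2 << 20      # large GPU alloc -> huge page
    assert page_size(1 << 20) == small         # small alloc -> CPU page size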