mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-01-08 22:48:25 -05:00
OSX GPUOcelot (#8209)
* add patches * add osx test in ci * macos specific uvm, gpfifo mask * only do that for now * Revert "add patches" This reverts commit 80d3112a57. * use fork for now * workflow only one worker * merge osxtests with tests * Revert "merge osxtests with tests" This reverts commit 3461c8f46c. * macos pagesize 16384 --------- Co-authored-by: nimlgen <138685161+nimlgen@users.noreply.github.com> Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
This commit is contained in:
14
.github/actions/setup-tinygrad/action.yml
vendored
14
.github/actions/setup-tinygrad/action.yml
vendored
@@ -143,14 +143,20 @@ runs:
|
|||||||
|
|
||||||
# **** CUDA ****
|
# **** CUDA ****
|
||||||
|
|
||||||
- name: Install packages (cuda)
|
- name: Install cuda packages (Linux)
|
||||||
if: inputs.cuda == 'true'
|
if: inputs.cuda == 'true' && runner.os == 'Linux'
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
echo 'Acquire::http::Pipeline-Depth "5";' | sudo tee -a /etc/apt/apt.conf.d/99parallel
|
echo 'Acquire::http::Pipeline-Depth "5";' | sudo tee -a /etc/apt/apt.conf.d/99parallel
|
||||||
sudo apt update -y || true
|
sudo apt update -y || true
|
||||||
sudo apt install -y --no-install-recommends git g++ cmake ninja-build llvm-15-dev zlib1g-dev libglew-dev \
|
sudo apt install -y --no-install-recommends git g++ cmake ninja-build llvm-15-dev zlib1g-dev libglew-dev \
|
||||||
flex bison libfl-dev libboost-thread-dev libboost-filesystem-dev nvidia-cuda-toolkit-gcc libzstd-dev
|
flex bison libfl-dev libboost-thread-dev libboost-filesystem-dev nvidia-cuda-toolkit-gcc libzstd-dev
|
||||||
|
- name: Install gpuocelot dependencies (MacOS)
|
||||||
|
if: inputs.cuda == 'true' && runner.os == 'macOS'
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
brew update
|
||||||
|
brew install cmake ninja llvm@15 zlib glew flex bison boost zstd ncurses
|
||||||
- name: Cache gpuocelot
|
- name: Cache gpuocelot
|
||||||
if: inputs.cuda == 'true'
|
if: inputs.cuda == 'true'
|
||||||
id: cache-build
|
id: cache-build
|
||||||
@@ -159,7 +165,7 @@ runs:
|
|||||||
cache-name: cache-gpuocelot-build
|
cache-name: cache-gpuocelot-build
|
||||||
with:
|
with:
|
||||||
path: ${{ github.workspace }}/gpuocelot/ocelot
|
path: ${{ github.workspace }}/gpuocelot/ocelot
|
||||||
key: ubuntu22.04-gpuocelot-b16039dc940dc6bc4ea0a98380495769ff35ed99-rebuild-0
|
key: ${{ runner.os }}-gpuocelot-b16039dc940dc6bc4ea0a98380495769ff35ed99-rebuild-0
|
||||||
- name: Clone/compile gpuocelot
|
- name: Clone/compile gpuocelot
|
||||||
if: inputs.cuda == 'true' && steps.cache-build.outputs.cache-hit != 'true'
|
if: inputs.cuda == 'true' && steps.cache-build.outputs.cache-hit != 'true'
|
||||||
shell: bash
|
shell: bash
|
||||||
@@ -176,7 +182,7 @@ runs:
|
|||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
cd ${{ github.workspace }}/gpuocelot/ocelot/build
|
cd ${{ github.workspace }}/gpuocelot/ocelot/build
|
||||||
sudo cp libgpuocelot.so /usr/lib/libgpuocelot.so
|
sudo cp libgpuocelot.${{ runner.os == 'macOS' && 'dylib' || 'so' }} /usr/${{ runner.os == 'macOS' && 'local/' || ''}}lib/
|
||||||
|
|
||||||
# **** WebGPU ****
|
# **** WebGPU ****
|
||||||
|
|
||||||
|
|||||||
9
.github/workflows/test.yml
vendored
9
.github/workflows/test.yml
vendored
@@ -513,6 +513,7 @@ jobs:
|
|||||||
deps: testing
|
deps: testing
|
||||||
python-version: '3.11'
|
python-version: '3.11'
|
||||||
amd: 'true'
|
amd: 'true'
|
||||||
|
cuda: 'true'
|
||||||
- name: Run real world test
|
- name: Run real world test
|
||||||
run: JIT=2 METAL=1 python -m pytest -n=auto test/models/test_real_world.py --durations=20
|
run: JIT=2 METAL=1 python -m pytest -n=auto test/models/test_real_world.py --durations=20
|
||||||
- name: Test models (Metal)
|
- name: Test models (Metal)
|
||||||
@@ -540,6 +541,14 @@ jobs:
|
|||||||
FORWARD_ONLY: 1
|
FORWARD_ONLY: 1
|
||||||
run: |
|
run: |
|
||||||
python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20
|
python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20
|
||||||
|
- name: Run pytest (ptx)
|
||||||
|
env:
|
||||||
|
MOCKGPU: 1
|
||||||
|
PTX: 1
|
||||||
|
NV: 1
|
||||||
|
FORWARD_ONLY: 1
|
||||||
|
run: |
|
||||||
|
python3 -m pytest -n=auto test/test_hcq.py test/test_tiny.py --durations=20
|
||||||
- name: Run process replay tests
|
- name: Run process replay tests
|
||||||
uses: ./.github/actions/process-replay
|
uses: ./.github/actions/process-replay
|
||||||
|
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ class GPFIFO:
|
|||||||
|
|
||||||
def _reset_buf_state(self): self.buf, self.buf_ptr = None, 0
|
def _reset_buf_state(self): self.buf, self.buf_ptr = None, 0
|
||||||
def _set_buf_state(self, gpfifo_entry):
|
def _set_buf_state(self, gpfifo_entry):
|
||||||
ptr = ((gpfifo_entry >> 2) & 0xfffffffff) << 2
|
ptr = ((gpfifo_entry >> 2) & 0x3fffffffff) << 2
|
||||||
sz = ((gpfifo_entry >> 42) & 0x1fffff) << 2
|
sz = ((gpfifo_entry >> 42) & 0x1fffff) << 2
|
||||||
self.buf = to_mv(ptr, sz).cast("I")
|
self.buf = to_mv(ptr, sz).cast("I")
|
||||||
self.buf_sz = sz // 4
|
self.buf_sz = sz // 4
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, H
|
|||||||
from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU
|
from tinygrad.runtime.support.hcq import HWInterface, MOCKGPU
|
||||||
from tinygrad.ops import sint
|
from tinygrad.ops import sint
|
||||||
from tinygrad.device import BufferSpec, CPUProgram
|
from tinygrad.device import BufferSpec, CPUProgram
|
||||||
from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod
|
from tinygrad.helpers import getenv, mv_address, init_c_struct_t, to_mv, round_up, data64, data64_le, DEBUG, prod, OSX
|
||||||
from tinygrad.renderer.ptx import PTXRenderer
|
from tinygrad.renderer.ptx import PTXRenderer
|
||||||
from tinygrad.renderer.cstyle import NVRenderer
|
from tinygrad.renderer.cstyle import NVRenderer
|
||||||
from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler
|
from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, PTX, NVPTXCompiler, NVCompiler
|
||||||
@@ -295,8 +295,8 @@ class NVDevice(HCQCompiled[NVSignal]):
|
|||||||
# TODO: Need a proper allocator for va addresses
|
# TODO: Need a proper allocator for va addresses
|
||||||
# 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
|
# 0x1000000000 - 0x2000000000, reserved for system/cpu mappings
|
||||||
# VA space is 48bits.
|
# VA space is 48bits.
|
||||||
low_uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=0x1000000000, base=0x1000000000, wrap=False)
|
low_uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=0x1000000000, base=0x8000000000 if OSX else 0x1000000000, wrap=False)
|
||||||
uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=(1 << 48) - 1, base=0x2000000000, wrap=False)
|
uvm_vaddr_allocator: BumpAllocator = BumpAllocator(size=(1 << 48) - 1, base=low_uvm_vaddr_allocator.base + low_uvm_vaddr_allocator.size, wrap=False)
|
||||||
host_object_enumerator: int = 0x1000
|
host_object_enumerator: int = 0x1000
|
||||||
|
|
||||||
def _new_gpu_fd(self):
|
def _new_gpu_fd(self):
|
||||||
@@ -314,7 +314,7 @@ class NVDevice(HCQCompiled[NVSignal]):
|
|||||||
|
|
||||||
def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag="") -> HCQBuffer:
|
def _gpu_alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, tag="") -> HCQBuffer:
|
||||||
# Uncached memory is "system". Use huge pages only for gpu memory.
|
# Uncached memory is "system". Use huge pages only for gpu memory.
|
||||||
page_size = (4 << 10) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << 10))
|
page_size = (4 << (12 if OSX else 10)) if uncached or host else ((2 << 20) if size >= (8 << 20) else (4 << (12 if OSX else 10)))
|
||||||
size = round_up(size, page_size)
|
size = round_up(size, page_size)
|
||||||
va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access)
|
va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user