Remove webgpu, back to 5k lines (#3040)

* remove webgpu

* max 5000 lines
This commit is contained in:
George Hotz
2024-01-08 09:10:07 -08:00
committed by GitHub
parent cf2eea961c
commit 8cbcd1b342
8 changed files with 44 additions and 84 deletions

View File

@@ -63,8 +63,8 @@ jobs:
source venv/bin/activate
pip install $GITHUB_WORKSPACE
python -c "from tinygrad.tensor import Tensor; print(Tensor([1,2,3,4,5]))"
- name: Repo line count <6000 lines
run: MAX_LINE_COUNT=6000 python sz.py
- name: Repo line count <5000 lines
run: MAX_LINE_COUNT=5000 python sz.py
testcpuimagenet:
name: CPU and ImageNet to C Tests
@@ -214,48 +214,48 @@ jobs:
name: Test Beam Search
run: PYTHONPATH="." GPU=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
testwebgpu:
name: WebGPU Tests
runs-on: macos-13
timeout-minutes: 20
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Cache python packages
uses: actions/cache@v3
with:
path: /Users/runner/Library/Python/3.11/lib/python/site-packages
key: webgpu-testing-user3-packages-${{ hashFiles('**/setup.py') }}
- name: Install Dependencies
run: pip install --user -e '.[webgpu,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Cache downloads
uses: actions/cache@v3
with:
path: ~/Library/Caches/tinygrad/downloads/
key: downloads-cache-webgpu-${{ env.DOWNLOAD_CACHE_VERSION }}
- name: Check Device.DEFAULT (WEBGPU) and print some source
run: |
WEBGPU=1 python -c "from tinygrad import Device; assert Device.DEFAULT == 'WEBGPU', Device.DEFAULT"
WEBGPU=1 DEBUG=4 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
#testwebgpu:
# name: WebGPU Tests
# runs-on: macos-13
# timeout-minutes: 20
# steps:
# - name: Checkout Code
# uses: actions/checkout@v3
# - name: Set up Python 3.11
# uses: actions/setup-python@v4
# with:
# python-version: 3.11
# - name: Cache python packages
# uses: actions/cache@v3
# with:
# path: /Users/runner/Library/Python/3.11/lib/python/site-packages
# key: webgpu-testing-user3-packages-${{ hashFiles('**/setup.py') }}
# - name: Install Dependencies
# run: pip install --user -e '.[webgpu,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
# - name: Cache downloads
# uses: actions/cache@v3
# with:
# path: ~/Library/Caches/tinygrad/downloads/
# key: downloads-cache-webgpu-${{ env.DOWNLOAD_CACHE_VERSION }}
# - name: Check Device.DEFAULT (WEBGPU) and print some source
# run: |
# WEBGPU=1 python -c "from tinygrad import Device; assert Device.DEFAULT == 'WEBGPU', Device.DEFAULT"
# WEBGPU=1 DEBUG=4 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
#- name: Run webgpu pytest
# run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto
- name: Run selected webgpu tests
run: |
WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto test/test_ops.py test/test_dtype.py \
test/test_jit.py test/test_symbolic_ops.py test/test_symbolic_jit.py test/test_linearizer.py \
test/test_linearizer_failures.py test/test_nn.py
- name: Build WEBGPU Efficientnet
run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m examples.compile_efficientnet
- name: Install Puppeteer
run: npm install puppeteer
- name: Run WEBGPU Efficientnet
run: node test/web/test_webgpu.js
- name: Test LLaMA compile speed
run: PYTHONPATH="." METAL=1 python test/external/external_test_speed_llama.py
# - name: Run selected webgpu tests
# run: |
# WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto test/test_ops.py test/test_dtype.py \
# test/test_jit.py test/test_symbolic_ops.py test/test_symbolic_jit.py test/test_linearizer.py \
# test/test_linearizer_failures.py test/test_nn.py
# - name: Build WEBGPU Efficientnet
# run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m examples.compile_efficientnet
# - name: Install Puppeteer
# run: npm install puppeteer
# - name: Run WEBGPU Efficientnet
# run: node test/web/test_webgpu.js
# - name: Test LLaMA compile speed
# run: PYTHONPATH="." METAL=1 python test/external/external_test_speed_llama.py
testmetal:
name: Metal Tests

View File

@@ -82,10 +82,8 @@ tinygrad already supports numerous accelerators, including:
- [x] [LLVM](tinygrad/runtime/ops_llvm.py)
- [x] [METAL](tinygrad/runtime/ops_metal.py)
- [x] [CUDA](tinygrad/runtime/ops_cuda.py)
- [x] [Triton](extra/triton/triton.py)
- [x] [PyTorch](tinygrad/runtime/ops_torch.py)
- [x] [HIP](tinygrad/runtime/ops_hip.py)
- [x] [WebGPU](tinygrad/runtime/ops_webgpu.py)
And it is easy to add more! Your accelerator of choice only needs to support a total of ~25 low level ops.
More information can be found in the [documentation for adding new accelerators](/docs/adding_new_accelerators.md).

View File

@@ -28,7 +28,6 @@ setup(name='tinygrad',
'llvm': ["llvmlite"],
'arm': ["unicorn"],
'triton': ["triton-nightly>=2.1.0.dev20231014192330"],
'webgpu': ["wgpu>=v0.12.0"],
'linting': [
"pylint",
"mypy",

View File

@@ -99,8 +99,7 @@ class LazyBuffer:
return LazyBuffer.loadop(LoadOps.CONST, tuple(), self.dtype, device, arg=self.base.arg)._view(self.st)
# if it's a shrink, do the shrink before the copy with CONTIGUOUS
# TODO: why is this required on WEBGPU?
if prod(self.st.shape) < prod(self.base.st.shape) or device == "WEBGPU":
if prod(self.st.shape) < prod(self.base.st.shape):
return create_lazybuffer(device, ShapeTracker.from_shape(self.shape), self.dtype, LoadOps.COPY, srcs=(self.contiguous(),))
# copy the base and apply the shapetracker on the new device
@@ -118,7 +117,7 @@ class LazyBuffer:
if op == TernaryOps.WHERE: assert srcs[0].dtype == dtypes.bool, "TernaryOps.WHERE must have the first arg be bool"
out_dtype = srcs[-1].dtype if op not in (BinaryOps.CMPLT, BinaryOps.CMPEQ) else dtypes.bool
ret = create_lazybuffer(self.device, ShapeTracker.from_shape(self.shape), out_dtype, op, arg, tuple(srcs))
return ret.cast(dtypes.float32) if (out_dtype == dtypes.bool and self.device == "WEBGPU") else ret
return ret
# *** reduce ops ***

View File

@@ -289,38 +289,3 @@ __device__ half16 make_half16(half x, half y, half z, half w, half a, half b, ha
"""
type_map = {dtypes.bfloat16: "hip_bfloat16"}
HIPRenderer = functools.partial(uops_to_cstyle, HIPLanguage())
# TODO: how much of this can be merged with above?
class WGSLLanguage(CStyleLanguage):
  """CStyleLanguage specialization that emits WGSL-flavoured compute shader source.

  Overrides the work-item index expressions, variable prefixes, barrier statement,
  a handful of op renderers and the dtype -> type-name table, and supplies its own
  kernel / cast / const / if / local-buffer renderers.
  """
  # global ("g") and local ("l") work-item indices are read from the gindex / lindex
  # builtins declared in render_kernel and converted to i32
  code_for_workitem = {"g": lambda x: f"i32(gindex.{'xyz'[x]})", "l": lambda x: f"i32(lindex.{'xyz'[x]})"}
  size_prefix = "let"
  barrier="workgroupBarrier();"
  generic_var_prefix = "var "
  external_local_bufs = True
  # start from the base C-style op table and override comparisons (rendered as f32),
  # MULACC (fma) and WHERE (select, note the swapped operand order and bool() on the condition)
  code_for_op = { **CStyleLanguage().code_for_op,
                  BinaryOps.CMPLT: lambda x,y,dtype: f"f32({x}<{y})", BinaryOps.CMPEQ: lambda x,y,dtype: f"f32({x}=={y})",
                  TernaryOps.MULACC: lambda x,y,z,dtype: f"fma({x},{y},{z})", TernaryOps.WHERE: lambda a,b,c,dtype: f"select({c},{b},bool({a}))" }
  # HACK: write bool as f32
  type_map = {dtypes.float: "f32", dtypes.half: "f16", dtypes.int32: "i32", dtypes.uint32: "u32", dtypes.bool: "f32"}

  def render_local(self, name: str, dtype:DType, size: int):
    """Return the declaration of a workgroup-scoped array `name` holding `size` elements of `dtype`."""
    return f"var<workgroup> {name}: array<{self.type_map[dtype]},{size}>;"

  def render_const(self, x:Union[float,int], var_dtype) -> str:
    """Render a constant; NaN and +/-infinity go through the nan()/inf() helpers emitted by render_kernel."""
    if math.isnan(x): return "nan()"
    elif math.isinf(x): return ("-" if x < 0 else "") + "inf(1.0)"
    # finite values defer to the base renderer, wrapped in parentheses
    return f"({super().render_const(x, var_dtype)})"

  def render_if(self, cond: str):
    """Open an `if` block, converting the condition with bool() first."""
    return f"if (bool({cond})) {{"

  def render_kernel(self, function_name:str, kernel:List[str], bufs:List[Tuple[str,DType]], local_size:List[int], prekernel:List[str]) -> str:
    """Assemble the full shader source for one kernel.

    The output is, in order: the nan()/inf() helper functions, the `prekernel` lines,
    one `@group(0) @binding(i)` declaration per entry of `bufs` (bindings numbered in
    list order), and the `@compute` entry point wrapping the `kernel` lines.
    """
    # workgroup size is emitted in reversed order; an empty/None local_size becomes [1]
    local_size = local_size[::-1] if local_size else [1]
    bind_it = iter(range(len(bufs)))
    prg = "fn nan() -> f32 { let bits = 0xffffffffu; return bitcast<f32>(bits); }\nfn inf(a: f32) -> f32 { return a/0.0; }\n"
    # PtrDType entries become read_write storage arrays; everything else is a uniform i32
    prg += "\n".join(prekernel+[f"@group(0) @binding({next(bind_it)}) {'var<storage,read_write>' if isinstance(dtype, PtrDType) else 'var<uniform>'} {name}: {f'array<{self.type_map[dtype]}>' if isinstance(dtype, PtrDType) else 'i32'};" for name,dtype in bufs])  # noqa: E501
    prg += f"\n@compute @workgroup_size({','.join([str(x) for x in local_size])}) fn {function_name}(@builtin(workgroup_id) gindex: vec3<u32>, @builtin(local_invocation_id) lindex: vec3<u32>) {{\n" + "\n".join(kernel) + "\n}"  # noqa: E501
    return prg

  def render_cast(self, x:List[str], var_dtype:DType, bitcast=False) -> str:
    """Render a conversion (or a bitcast when `bitcast` is true) of `x[0]` to `var_dtype`.

    Raises:
      NotImplementedError: if `var_dtype` has no entry in `type_map`.
    """
    # membership test instead of subscripting: a missing dtype used to raise KeyError
    # before the intended NotImplementedError could ever be reached
    if var_dtype not in self.type_map: raise NotImplementedError(f"no cast for {var_dtype}")
    wgsl_type = self.type_map[var_dtype]
    return f"bitcast<{wgsl_type}>({x[0]})" if bitcast else f"{wgsl_type}({x[0]})"
WGSLRenderer = functools.partial(uops_to_cstyle, WGSLLanguage())

View File

@@ -863,7 +863,6 @@ class Tensor:
def __imatmul__(self, x) -> Tensor: return self.assign(self.matmul(x))
def __ixor__(self, x) -> Tensor: return self.assign(self.xor(x))
# in webgpu bool cannot be used as a storage buffer type
def __lt__(self, x) -> Tensor: return mlops.Less.apply(*self._broadcasted(x, False))
def __gt__(self, x) -> Tensor: return mlops.Less.apply(*self._broadcasted(x, True))
def __ge__(self, x) -> Tensor: return (self<x).logical_not()