diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 537cc04e6e..9e61940345 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -622,10 +622,7 @@ jobs:
   wintests:
     strategy:
       fail-fast: false
-      matrix:
-        backend: [llvm]
-
-    name: Tests on Windows (${{ matrix.backend }})
+    name: Tests on Windows (llvm+clang)  # no matrix: this one job runs the llvm and the clang steps
     runs-on: windows-latest
     timeout-minutes: 45
     steps:
@@ -642,20 +639,28 @@ jobs:
         with:
           path: ${{ env.Python3_ROOT_DIR }}\Lib\site-packages
-          key: windows-${{ matrix.backend }}-packages-${{ hashFiles('**/setup.py') }}
+          # this job has no matrix, so the key must not depend on matrix.backend (it would expand to an empty string)
+          key: windows-packages-${{ hashFiles('**/setup.py') }}
       - name: Install dependencies
         run: pip install --user -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-      - name: Check Device.DEFAULT and print some source
-        env:
-          DEBUG: 5
-          LLVM: 1
-          PYTHONPATH: ${{ github.workspace }}
+      - name: Check Device.DEFAULT and print some source (llvm)
+        shell: bash
         run: |
-          python3 test/test_ops.py TestOps.test_add
-      - name: Run pytest
-        env:
-          DEBUG: 5
-          LLVM: 1
-        run: python -m pytest -n=auto test/test_tiny.py --durations=20
+          # quote the workspace path: on Windows it contains backslashes that bash would otherwise strip
+          PYTHONPATH='${{ github.workspace }}' LLVM=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'LLVM', Device.DEFAULT"
+          DEBUG=5 PYTHONPATH='${{ github.workspace }}' LLVM=1 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
+      - name: Check Device.DEFAULT and print some source (clang)
+        shell: bash
+        run: |
+          PYTHONPATH='${{ github.workspace }}' CLANG=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'CLANG', Device.DEFAULT"
+          DEBUG=5 PYTHONPATH='${{ github.workspace }}' CLANG=1 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
+      - name: Run pytest (llvm)
+        shell: bash
+        run: |
+          DEBUG=5 LLVM=1 python -m pytest -n=auto test/test_tiny.py --durations=20
+      - name: Run pytest (clang)
+        shell: bash
+        run: |
+          DEBUG=5 CLANG=1 python -m pytest -n=auto test/test_tiny.py --durations=20
 
   #testunicorn:
   #  name: ARM64 unicorn Test
diff --git a/tinygrad/device.py b/tinygrad/device.py
index 04182a3c33..e7b502432b 100644
--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@@ -222,23 +222,35 @@ MAP_JIT = 0x0800
 class CPUProgram:
   helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32' if sys.platform == "win32" else 'gcc_s'))
   def __init__(self, name:str, lib:bytes):
-    assert sys.platform != "win32", "clang is not supported for windows yet"
-    from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
-    # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
-    # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
-    self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)
+    if sys.platform == "win32":
+      PAGE_EXECUTE_READWRITE = 0x40
+      MEM_COMMIT = 0x1000
+      MEM_RESERVE = 0x2000
+      # LPVOID VirtualAlloc(LPVOID, SIZE_T, DWORD, DWORD): declare pointer-sized types so nothing is passed or returned as a C int
+      ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_void_p
+      ctypes.windll.kernel32.VirtualAlloc.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_uint32, ctypes.c_uint32]
+      ptr = ctypes.windll.kernel32.VirtualAlloc(None, len(lib), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE)
+      # VirtualAlloc returns NULL on failure; copying into address 0 would crash the interpreter instead of raising
+      if not ptr: raise MemoryError(f"VirtualAlloc failed to allocate {len(lib)} executable bytes")
+      ctypes.memmove(ptr, lib, len(lib))
+      self.fxn = ctypes.CFUNCTYPE(None)(ptr)
+    else:
+      from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
+      # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
+      # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
+      self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)
 
-    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
-    self.mem.write(lib)
-    if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)
+      if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
+      self.mem.write(lib)
+      if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)
 
-    # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
-    # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
-    # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
-    # Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
-    CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
+      # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
+      # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
+      # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
+      # Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
+      CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
 
-    self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
+      self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
 
   def __call__(self, *bufs, vals=(), wait=False):
     args = list(bufs) + list(vals)
diff --git a/tinygrad/helpers.py b/tinygrad/helpers.py
index 5895b0cc68..9c055acd87 100644
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@@ -272,7 +272,7 @@ def cpu_objdump(lib, objdump_tool='objdump'):
 def capstone_flatdump(lib: bytes):
   import capstone
   match platform.machine():
-    case 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
+    case 'x86_64' | 'AMD64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)  # Windows reports x86-64 as 'AMD64'
     case 'aarch64' | 'arm64': cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)
     case machine: raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}")
   for instr in cs.disasm(lib, 0):
diff --git a/tinygrad/renderer/cstyle.py b/tinygrad/renderer/cstyle.py
index 4f2e604f05..6b9a9f266d 100644
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -1,5 +1,5 @@
 from typing import Optional, Union, Literal, Callable, cast
-import os, math
+import os, math, sys
 from collections import defaultdict, Counter
 from tinygrad.ops import GroupOp, Ops, UOp, PatternMatcher, UPat
 from tinygrad.helpers import strip_parens, getenv, prod, dedup, AMX
@@ -178,7 +178,8 @@ class ClangRenderer(CStyleLanguage):
     tensor_cores = [TensorCore(dims=(sz,sz,1), threads=1, elements_per_thread=(sz,sz,sz*sz), dtype_in=dt, dtype_out=dt,
                                swizzle=(None, ((),(4,5,6,7,0,1,2,3))), opts=("u0","u0","u0","u0","u1","u1","u1","u1"))
                                for dt,sz in [(dt, 64 // dt.itemsize) for dt in [dtypes.float]]]
-
+  if sys.platform == 'win32':
+    kernel_prefix = "__attribute__((ms_abi)) "  # ms_abi: kernels use the Microsoft calling convention on Windows
   def render_vector_prefix(self, dt:DType) -> str:
     return f"typedef {self.render_dtype(dt.scalar())} {self.render_dtype(dt)} __attribute__((aligned({(sz:=dt.itemsize)}),vector_size({sz})));"
 
diff --git a/tinygrad/runtime/ops_clang.py b/tinygrad/runtime/ops_clang.py
index 689c04c460..2baf572382 100644
--- a/tinygrad/runtime/ops_clang.py
+++ b/tinygrad/runtime/ops_clang.py
@@ -1,4 +1,4 @@
-import platform, tempfile, pathlib, subprocess
+import platform, tempfile, pathlib, subprocess, sys
 from tinygrad.helpers import cpu_objdump, capstone_flatdump
 from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
 from tinygrad.runtime.support.elf import jit_loader
@@ -26,7 +26,8 @@ class ClangJITCompiler(Compiler):
   def compile(self, src:str) -> bytes:
     # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
     # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
-    args = ['-march=native', f'--target={platform.machine()}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
+    target = 'x86_64' if sys.platform == 'win32' else platform.machine()  # NOTE(review): assumes every win32 host is x86_64
+    args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
     arch_args = ['-ffixed-x18'] if platform.machine() == 'arm64' else []
     obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
     return jit_loader(obj)