From 47e0c439762bfdd7381a31ebe85fbab1b75c6556 Mon Sep 17 00:00:00 2001
From: wozeparrot <wozeparrot@gmail.com>
Date: Mon, 13 Oct 2025 08:04:41 -0700
Subject: [PATCH] feat: Tensor.{load, store} (#12629)

---
 test/unit/test_tinyfs.py       | 22 ++++++++++++++
 tinygrad/runtime/ops_tinyfs.py |  6 ++--
 tinygrad/tensor.py             | 53 ++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+), 3 deletions(-)
 create mode 100644 test/unit/test_tinyfs.py

diff --git a/test/unit/test_tinyfs.py b/test/unit/test_tinyfs.py
new file mode 100644
index 0000000000..9fe4fed13f
--- /dev/null
+++ b/test/unit/test_tinyfs.py
@@ -0,0 +1,22 @@
+import unittest
+from tinygrad import Tensor
+
+class TestLoadStore(unittest.TestCase):
+  def test_load_shape(self):
+    t = Tensor(bytes(16)).load(1024).kernelize()
+    assert t.shape == (1024,), t.shape
+
+  def test_store_shape(self):
+    t = Tensor.zeros(1024).store().kernelize()
+    assert t.shape == (16,), t.shape
+
+  def test_load_large_shape(self):
+    t = Tensor(bytes(16)).load(10_000_000).kernelize()
+    assert t.shape == (10_000_000,), t.shape
+
+  def test_store_large_shape(self):
+    t = Tensor.zeros(10_000_000).store().kernelize()
+    assert t.shape == (16,), t.shape
+
+if __name__ == "__main__":
+  unittest.main()
diff --git a/tinygrad/runtime/ops_tinyfs.py b/tinygrad/runtime/ops_tinyfs.py
index 048d908763..69d5ff54e3 100644
--- a/tinygrad/runtime/ops_tinyfs.py
+++ b/tinygrad/runtime/ops_tinyfs.py
@@ -2,9 +2,9 @@ import socket, uuid, json, asyncio, threading
 from contextlib import asynccontextmanager
 from tinygrad.device import Compiled, Allocator
 from tinygrad.helpers import DEBUG, getenv
+from tinygrad import Tensor
 
 TINYFS_ENDPOINT = getenv("TINYFS_ENDPOINT", "localhost:6767")
-CHUNK_SIZE = 2**20
 
 class TinyFSDevice(Compiled):
   def __init__(self, device:str):
@@ -116,8 +116,8 @@ class TinyFSAllocator(Allocator[TinyFSDevice]):
     async def _worker(item):
       i, loc, h = item
       async with self.dev.connection(loc) as (reader, writer):
-        ptr = i * CHUNK_SIZE
-        size = min(len(dest[ptr:ptr+CHUNK_SIZE]), CHUNK_SIZE)
+        ptr = i * Tensor.CHUNK_SIZE
+        size = min(len(dest[ptr:ptr+Tensor.CHUNK_SIZE]), Tensor.CHUNK_SIZE)
 
         writer.write(f"CHUNK_OUT {size}\r\n".encode())
         writer.write(h)
diff --git a/tinygrad/tensor.py b/tinygrad/tensor.py
index 282d86818e..fc40dfaac7 100644
--- a/tinygrad/tensor.py
+++ b/tinygrad/tensor.py
@@ -411,6 +411,59 @@ class Tensor(MathTrait):
     """
     return self.replace(self.shard(devices, axis))
 
+  CHUNK_SIZE = 2**20
+  def load(self, size:int) -> Tensor:
+    """
+    Load a tensor from storage.
+
+    self should be a tensor of the hash to load
+    """
+    # TODO: this should work locally as well
+    assert self.dtype == dtypes.uint8, "hash is expected to be uint8"
+    h = self.contiguous().flatten()
+    assert h.shape[0] == 16, "expected hash"
+
+    base_chunks = math.ceil(size / Tensor.CHUNK_SIZE)
+    tree_depth = math.ceil(math.log(base_chunks, Tensor.CHUNK_SIZE // 16))
+    data, level_chunks = h, 0
+    for i in reversed(range(tree_depth + 1)):
+      data = data.to("tinyfs:load")
+
+      # if not last level, its still hashes
+      if i > 0 or tree_depth == 0:
+        level_chunks = max(1, math.ceil(base_chunks / (Tensor.CHUNK_SIZE // 16)**(i-1)))
+        pad_amt = 16 * level_chunks
+      else: pad_amt = Tensor.CHUNK_SIZE * level_chunks
+      if (tsize := data.shape[0]) < pad_amt: data = data.pad((0, pad_amt - tsize))
+      data = data[:pad_amt].contiguous()
+      if i != 0: data = data.to(self.device)
+
+    return data[:size]
+
+  def store(self) -> Tensor:
+    """
+    Store a tensor to storage.
+    """
+    # TODO: this should work locally as well
+    data = self.contiguous().flatten().bitcast(dtypes.uint8)
+
+    # pad to a multiple of 1mb
+    if (tsize := data.shape[0]) % Tensor.CHUNK_SIZE != 0: data = data.pad((0, Tensor.CHUNK_SIZE - tsize % Tensor.CHUNK_SIZE))
+    size = data.shape[0]
+
+    base_chunks = math.ceil(size / Tensor.CHUNK_SIZE)
+    tree_depth = math.ceil(math.log(base_chunks, Tensor.CHUNK_SIZE // 16))
+
+    to_device = "CPU" if isinstance(self.device, str) and self.device.startswith("DISK") else self.device
+
+    level_chunks = base_chunks
+    for _ in range(tree_depth + 1):
+      data = data.to("tinyfs:store")[:level_chunks * 16].contiguous().to(to_device)
+      if (tsize := data.shape[0]) % Tensor.CHUNK_SIZE != 0: data = data.pad((0, Tensor.CHUNK_SIZE - tsize % Tensor.CHUNK_SIZE))
+      level_chunks = math.ceil(data.shape[0] / Tensor.CHUNK_SIZE)
+
+    return data[:16].contiguous()
+
   @staticmethod
   def from_uop(y:UOp, **kwargs) -> Tensor:
     if y.op is Ops.BIND: return Tensor(y, **kwargs, requires_grad=False)