diff --git a/extra/onnx.py b/extra/onnx.py
index 03b6a8b51e..669175edfe 100644
--- a/extra/onnx.py
+++ b/extra/onnx.py
@@ -5,7 +5,7 @@ from typing import Any, Sequence, cast, Literal, Callable, get_args, NamedTuple
 import dataclasses, functools, io, math, types, warnings, pathlib, sys, enum, os, struct
 from tinygrad.nn.state import TensorIO
 from tinygrad.tensor import Tensor, _broadcast_shape, ReductionStr
-from tinygrad.helpers import getenv, DEBUG, all_same, prod, flatten, make_tuple, argsort, is_numpy_ndarray, get_single_element
+from tinygrad.helpers import getenv, DEBUG, all_same, prod, flatten, make_tuple, argsort, is_numpy_ndarray, get_single_element, polyN
 from tinygrad.dtype import DType, ConstType, dtypes, _from_np_dtype
 from tinygrad.device import is_dtype_supported, Device
 
@@ -755,6 +755,7 @@ def get_onnx_ops() -> dict[str, types.FunctionType|dict[OpSetId, types.FunctionT
       if nearest_mode not in mode_operations: raise ValueError(f"invalid {nearest_mode=}")
       indexes = [mode_operations[nearest_mode](idx).int() for idx in indexes]
       X = X[(..., *Tensor.meshgrid(*indexes))]
+
     if mode == "linear":
       expand = list(X.shape)
       for i in range(-len(sizes), 0):
@@ -762,7 +763,48 @@ def get_onnx_ops() -> dict[str, types.FunctionType|dict[OpSetId, types.FunctionT
         reshape[i] = expand[i] = sizes[i]
         low, high, perc = [y.reshape(reshape).expand(expand) for y in (index.floor().int(), index.ceil().int(), index - index.floor())]
         X = X.gather(i, low).lerp(X.gather(i, high), perc)
-    if mode == "cubic": raise NotImplementedError("cubic interpolation is not implemented")
+
+    if mode == "cubic":
+      A = cubic_coeff_a
+
+      def W(x:Tensor):
+        # Keys cubic convolution weight for each offset in x, with A = cubic_coeff_a; only |x| matters
+        # see piecewise function in: https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
+        x = x.abs()
+        w0_1 = polyN(x, [A + 2, -(A + 3), 0, 1])  # branch selected where |x| <= 1
+        w1_2 = polyN(x, [A, -5 * A, 8 * A, -4 * A])  # branch selected where 1 < |x| < 2
+        return (x <= 1).where(w0_1, (x < 2).where(w1_2, 0))  # weight is 0 where |x| >= 2
+
+      expand = list(X.shape)
+      for i in range(-len(sizes), 0):
+        input_sz = X.shape[i]
+        reshape, index = [1] * X.ndim, indexes[i]
+        reshape[i] = expand[i] = sizes[i]
+
+        p = index.floor().int()
+        ratio = index - p
+
+        # Neighbor indices
+        idx0, idx1, idx2, idx3 = [p + d for d in [-1, 0, 1, 2]]
+        # Weight of each neighbor, computed from its offset to the fractional sample position (ratio)
+        c0, c1, c2, c3 = [W(ratio - d) for d in [-1, 0, 1, 2]]
+
+        if exclude_outside:
+          c0 = ((idx0 >= 0) & (idx0 < input_sz)).where(c0, 0)
+          c1 = ((idx1 >= 0) & (idx1 < input_sz)).where(c1, 0)
+          c2 = ((idx2 >= 0) & (idx2 < input_sz)).where(c2, 0)
+          c3 = ((idx3 >= 0) & (idx3 < input_sz)).where(c3, 0)
+
+          total = c0 + c1 + c2 + c3
+          c0, c1, c2, c3 = c0 / (total + 1e-9), c1 / (total + 1e-9), c2 / (total + 1e-9), c3 / (total + 1e-9)
+
+        # Reshape and expand
+        expanded_indices = [y.clip(0, input_sz - 1).reshape(reshape).expand(expand) for y in [idx0, idx1, idx2, idx3]]
+        expanded_coeffs = [y.reshape(reshape).expand(expand) for y in [c0, c1, c2, c3]]
+
+        # Gather values and apply coefficients
+        gathered_values = [X.gather(i, idx) for idx in expanded_indices]
+        X = sum(v * c for v, c in zip(gathered_values, expanded_coeffs))
     return X.permute(*argsort(perm)) if perm else X
   def Upsample(X, scales, mode): return Resize(X=X, scales=scales, mode=mode)  # deprecated
 
diff --git a/test/external/external_test_onnx_backend.py b/test/external/external_test_onnx_backend.py
index 9f82d5d143..618388ce46 100644
--- a/test/external/external_test_onnx_backend.py
+++ b/test/external/external_test_onnx_backend.py
@@ -68,6 +68,8 @@ backend_test.exclude('test_maxunpool_export_with_output_shape_cpu')
 backend_test.exclude('test_averagepool_3d_dilations_large_count_include_pad_is_1_ceil_mode_is_True_cpu')
 # tested in external_test_onnx_ops.py::TestMainOnnxOps.test_resize_downsample_scales_linear_align_corners
 backend_test.exclude('test_resize_downsample_scales_linear_align_corners_cpu')
+# tested in external_test_onnx_ops.py::TestMainOnnxOps.test_resize_downsample_scales_cubic_align_corners
+backend_test.exclude('test_resize_downsample_scales_cubic_align_corners_cpu')
 
 # about different dtypes
 if not is_dtype_supported(dtypes.float64):
@@ -168,10 +170,6 @@ backend_test.exclude('test_deform_conv_*')
 backend_test.exclude('test_lppool_*')
 backend_test.exclude('test_scan_*')
 backend_test.exclude('test_split_to_sequence_*')
-backend_test.exclude('test_resize_downsample_scales_cubic_*') # unsure how to implement cubic
-backend_test.exclude('test_resize_downsample_sizes_cubic_*') # unsure how to implement cubic
-backend_test.exclude('test_resize_upsample_scales_cubic_*') # unsure how to implement cubic
-backend_test.exclude('test_resize_upsample_sizes_cubic_*') # unsure how to implement cubic
 backend_test.exclude('test_ai_onnx_ml_tree_ensemble_*') # https://github.com/onnx/onnx/blob/main/onnx/reference/ops/aionnxml/op_tree_ensemble.py#L121
 
 # rest of the failing tests
@@ -181,6 +179,8 @@ backend_test.exclude('test_resize_tf_crop_and_resize_axes_3_2_cpu') # tf_crop_an
 backend_test.exclude('test_resize_tf_crop_and_resize_extrapolation_value_cpu') # tf_crop_and_resize value not implemented
 backend_test.exclude('test_resize_downsample_scales_linear_antialias_cpu') # antialias not implemented
 backend_test.exclude('test_resize_downsample_sizes_linear_antialias_cpu') # antialias not implemented
+backend_test.exclude('test_resize_downsample_scales_cubic_antialias_cpu') # antialias not implemented
+backend_test.exclude('test_resize_downsample_sizes_cubic_antialias_cpu') # antialias not implemented
 backend_test.exclude('test_ai_onnx_ml_label_encoder_tensor_value_only_mapping_cpu') # bad data type string
 backend_test.exclude('test_ai_onnx_ml_label_encoder_tensor_mapping_cpu') # bad data type string
 
diff --git a/test/external/external_test_onnx_ops.py b/test/external/external_test_onnx_ops.py
index 4de19105bb..ad620e594d 100644
--- a/test/external/external_test_onnx_ops.py
+++ b/test/external/external_test_onnx_ops.py
@@ -96,6 +96,10 @@ class TestMainOnnxOps(TestOnnxOps):
     # excluded 3.5 because some values divide into slight numerical differences, which when rounded gives wrong results
     self._test_resize_scales([0.01, 0.25, 0.5, 0.51, 0.6, 1.0, 1.5, 2.0, 20.0], mode="nearest")
 
+  def test_resize_cubic_mode(self):  # same scale sweep for mode="cubic", once with exclude_outside=1 and once with 0
+    self._test_resize_scales([0.01, 0.25, 0.5, 0.51, 0.6, 1.0, 1.5, 2.0, 3.5, 20.0], mode="cubic", exclude_outside=1)
+    self._test_resize_scales([0.01, 0.25, 0.5, 0.51, 0.6, 1.0, 1.5, 2.0, 3.5, 20.0], mode="cubic", exclude_outside=0)
+
   def test_resize_downsample_scales_linear_align_corners(self):
     # https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-131
     X = np.array([[[[1, 2, 3, 4], [5, 6, 7, 8]]]], dtype=np.float32)
@@ -105,6 +109,15 @@ class TestMainOnnxOps(TestOnnxOps):
     outputs = ["out"]
     self.helper_test_single_op("Resize", inputs, attributes, outputs)
 
+  def test_resize_downsample_scales_cubic_align_corners(self):
+    # https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-131 (NOTE(review): same anchor as the linear test - confirm)
+    X = np.array([[[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]]], dtype=np.float32)  # 1x1x4x4 input
+    scales = np.array([1.0, 1.0, 0.8, 0.8], dtype=np.float32)  # 0.8x on the last two axes only
+    inputs = {"X": X, "roi": np.array([], dtype=np.float32), "scales": scales}  # roi is passed empty
+    attributes = {"mode": "cubic", "coordinate_transformation_mode": "align_corners"}
+    outputs = ["out"]
+    self.helper_test_single_op("Resize", inputs, attributes, outputs)
+
   def test_maxunpool_export_with_output_shape(self):
     # https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-91
     xT = np.array([[[[5, 6], [7, 8]]]], dtype=np.float32)