From fbf17f00310c9dbedb6665d3ec1c58e3d359575d Mon Sep 17 00:00:00 2001
From: George Hotz <geohot@gmail.com>
Date: Sun, 4 Jun 2023 17:01:50 +0000
Subject: [PATCH] intel benchmark matmul gets 60 TFLOPS?

---
 extra/intel/benchmark_matmul.py       | 57 +++++++++++++++++++++++++++
 extra/intel/joint_matrix_bfloat16.cpp | 10 +++--
 2 files changed, 64 insertions(+), 3 deletions(-)
 create mode 100644 extra/intel/benchmark_matmul.py

diff --git a/extra/intel/benchmark_matmul.py b/extra/intel/benchmark_matmul.py
new file mode 100644
index 0000000000..5999039de0
--- /dev/null
+++ b/extra/intel/benchmark_matmul.py
@@ -0,0 +1,57 @@
+import time
+
+onnx_path = "/tmp/my.onnx"
+N = 2048
+CNT = 400
+
+"""
+import torch
+import torch.nn as nn
+#dtype = torch.bfloat16
+dtype = torch.float32
+class MatMul(nn.Module):
+  def __init__(self):
+    super().__init__()
+    self.a = nn.Linear(N, N, bias=False)
+  def forward(self, x):
+    x = x.to(dtype)
+    for i in range(CNT): x = self.a(x).relu()
+    return x.to(torch.float32)
+
+torch_model = MatMul().to(dtype)
+torch.onnx.export(torch_model, torch.randn(N, N), onnx_path)
+"""
+
+"""
+import onnx
+from tinygrad.tensor import Tensor
+from extra.onnx import get_run_onnx
+out = get_run_onnx(onnx.load(onnx_path))({"onnx::MatMul_0": Tensor.zeros(N, N)})
+for x in out.values(): x.realize()
+"""
+
+from openvino.runtime import Core
+core = Core()
+devices = core.available_devices
+for device in devices:
+  device_name = core.get_property(device, "FULL_DEVICE_NAME")
+  print(f"{device}: {device_name}")
+model = core.read_model(onnx_path)
+compiled_model = core.compile_model(model, device_name='GPU.0')
+print(compiled_model)
+ireq = compiled_model.create_infer_request()
+for model_input in compiled_model.inputs:
+  tensor = ireq.get_tensor(model_input)
+  tensor.data[:] = 2
+  print(tensor)
+print("request")
+ireq.infer()
+ireq.infer()
+print("did one")
+
+REPS = 20
+st = time.perf_counter()
+for i in range(REPS): ireq.infer()
+et = time.perf_counter() - st
+print(f"{et*1000:.2f} ms {(CNT*N*N*N*REPS*2/et)*1e-9:.2f} GFLOPS")
+
diff --git a/extra/intel/joint_matrix_bfloat16.cpp b/extra/intel/joint_matrix_bfloat16.cpp
index 717222b67f..b21d6089d2 100644
--- a/extra/intel/joint_matrix_bfloat16.cpp
+++ b/extra/intel/joint_matrix_bfloat16.cpp
@@ -81,15 +81,19 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A, big_matri
 
   queue q;
   auto start = std::chrono::steady_clock::now();
-  q.submit(program).wait();
+  auto e = q.submit(program);
+  auto submit = std::chrono::steady_clock::now();
+  e.wait();
   auto end = std::chrono::steady_clock::now();
-  std::cout << "compute: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms" << std::endl;
+  std::cout << "submit:  " << std::chrono::duration_cast<std::chrono::milliseconds>(submit - start).count() << " ms" << std::endl;
+  std::cout << "compute: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - submit).count() << " ms" << std::endl;
 
   // ahh, freeing is slow
 }
 
 //#define SCALE 1024
-#define SCALE 64
+//#define SCALE 64
+#define SCALE 256
 static constexpr size_t MATRIX_M = TM * SCALE;
 static constexpr size_t MATRIX_N = TN * SCALE;
 static constexpr size_t MATRIX_K = TK * SCALE;