From afd0be8a9c889abdc1bb68a18cf091694e1c507a Mon Sep 17 00:00:00 2001
From: George Hotz
Date: Sun, 4 Jun 2023 06:43:09 +0000
Subject: [PATCH] intel example

---
 extra/intel/.gitignore                |   1 +
 extra/intel/README                    |   2 +
 extra/intel/go.sh                     |   3 +
 extra/intel/joint_matrix_bfloat16.cpp | 169 ++++++++++++++++++++++++++
 4 files changed, 175 insertions(+)
 create mode 100644 extra/intel/.gitignore
 create mode 100644 extra/intel/README
 create mode 100755 extra/intel/go.sh
 create mode 100644 extra/intel/joint_matrix_bfloat16.cpp

diff --git a/extra/intel/.gitignore b/extra/intel/.gitignore
new file mode 100644
index 0000000000..cba7efc8ef
--- /dev/null
+++ b/extra/intel/.gitignore
@@ -0,0 +1 @@
+a.out
diff --git a/extra/intel/README b/extra/intel/README
new file mode 100644
index 0000000000..6a6ed7cca2
--- /dev/null
+++ b/extra/intel/README
@@ -0,0 +1,2 @@
+source /opt/intel/oneapi/compiler/latest/env/vars.sh
+sycl-ls
diff --git a/extra/intel/go.sh b/extra/intel/go.sh
new file mode 100755
index 0000000000..8c67088c05
--- /dev/null
+++ b/extra/intel/go.sh
@@ -0,0 +1,3 @@
+#!/bin/bash -e
+/opt/intel/oneapi/compiler/latest/linux/bin-llvm/clang++ joint_matrix_bfloat16.cpp -fsycl
+SYCL_PI_TRACE=1 ./a.out
diff --git a/extra/intel/joint_matrix_bfloat16.cpp b/extra/intel/joint_matrix_bfloat16.cpp
new file mode 100644
index 0000000000..717222b67f
--- /dev/null
+++ b/extra/intel/joint_matrix_bfloat16.cpp
@@ -0,0 +1,169 @@
+//==-------- joint_matrix_bfloat16.cpp - DPC++ joint_matrix----------- ----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: matrix
+
+// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+#include <iostream>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental::matrix;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
+
+//#define SG_SZ 16
+#define SG_SZ 8
+
+#define TM 8
+#define TN SG_SZ
+//#define TK 16
+#define TK 16
+
+#define BF16_EPSILON 0.00781250
+
+template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
+private:
+  T *mat;
+
+public:
+  T *get_data() { return mat; }
+  void set_data(T *data) { mat = data; }
+  big_matrix(T *data) : mat(data) {}
+};
+
+template <typename T1, typename T2, size_t M, size_t N, size_t K>
+void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A, big_matrix<T2, K / 2, N * 2> &B) {
+  size_t NDRangeM = M / TM;
+  size_t NDRangeN = N / TN;
+  buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, K));
+  buffer<bfloat16, 2> bufB(B.get_data(), range<2>(K, N));
+  buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
+
+  auto program = [&](handler &cgh) {
+    auto accC = bufC.get_access<access::mode::read_write>(cgh);
+    auto accA = bufA.get_access<access::mode::read_write>(cgh);
+    auto accB = bufB.get_access<access::mode::read_write>(cgh);
+
+    cgh.parallel_for(
+        nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
+        [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
+        {
+          // The submatrix API has to be accessed by all the workitems in a
+          // subgroup; these functions are called once per subgroup, with no
+          // code divergence between the workitems.
+          const auto global_idx = spmd_item.get_global_id(0);
+          const auto global_idy = spmd_item.get_global_id(1);
+          const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+          const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+          sub_group sg = spmd_item.get_sub_group();
+          joint_matrix<sub_group, bfloat16, use::a, TM, TK, layout::row_major> sub_a;
+          // For B, we assume B has been already VNNIed.
+          joint_matrix<sub_group, bfloat16, use::b, TK, TN, ext::intel::experimental::matrix::layout::packed> sub_b;
+          joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_c;
+          joint_matrix_load(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N, layout::row_major);
+
+          for (int k = 0; k < K / TK; k += 1) { //
+            joint_matrix_load(sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, K);
+            joint_matrix_load(sg, sub_b, accB.get_pointer() + (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2, N * 2);
+            sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c);
+          }
+          joint_matrix_store(sg, sub_c, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N, layout::row_major);
+        }); // parallel for
+  };
+
+  queue q;
+  auto start = std::chrono::steady_clock::now();
+  q.submit(program).wait();
+  auto end = std::chrono::steady_clock::now();
+  std::cout << "compute: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms" << std::endl;
+
+  // ahh, freeing is slow
+}
+
+//#define SCALE 1024
+#define SCALE 64
+static constexpr size_t MATRIX_M = TM * SCALE;
+static constexpr size_t MATRIX_N = TN * SCALE;
+static constexpr size_t MATRIX_K = TK * SCALE;
+bfloat16 A[MATRIX_M][MATRIX_K];
+bfloat16 B[MATRIX_K / 2][MATRIX_N * 2];
+float C[MATRIX_M][MATRIX_N];
+float D[MATRIX_M][MATRIX_N];
+
+float make_fp32(bfloat16 x) {
+  unsigned int y = *((int *)&x);
+  y = y << 16;
+  float *res = reinterpret_cast<float *>(&y);
+  return *res;
+}
+
+void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N,
+                         int K) {
+  for (int m = 0; m < M; m++)
+    for (int n = 0; n < N; n++) {
+      for (int k = 0; k < K; k++) {
+        // Because B was assumed VNNIed
+        bfloat16 *va = (bfloat16 *)(A_mem + m * K + k);
+        bfloat16 *vb = (bfloat16 *)(B_mem + k * N + n);
+        float acc = *((float *)(C_mem + m * N + n));
+        for (int i = 0; i < 2; i++) {
+          acc += (make_fp32(va[i]) * make_fp32(vb[i]));
+        }
+        *((float *)(C_mem + m * N + n)) = acc;
+      }
+    }
+}
+
+int main() {
+  for (int i = 0; i < MATRIX_M; i++) {
+    for (int j = 0; j < MATRIX_K; j++) {
+      A[i][j] = bfloat16(1.0f * (i + j));
+    }
+  }
+  for (int i = 0; i < MATRIX_K / 2; i++) {
+    for (int j = 0; j < MATRIX_N * 2; j++) {
+      B[i][j] = bfloat16(2.0f * i + 3.0f * j);
+    }
+  }
+  for (int i = 0; i < MATRIX_M; i++) {
+    for (int j = 0; j < MATRIX_N; j++) {
+      C[i][j] = 1.0;
+      D[i][j] = 1.0;
+    }
+  }
+
+  std::cout << "M" << MATRIX_M << "N" << MATRIX_N << "K" << MATRIX_K << std::endl;
+
+  big_matrix<float, MATRIX_M, MATRIX_N> MC((float *)&C);
+  big_matrix<float, MATRIX_M, MATRIX_N> MD((float *)&D);
+  big_matrix<bfloat16, MATRIX_M, MATRIX_K> MA((bfloat16 *)&A);
+  big_matrix<bfloat16, MATRIX_K / 2, MATRIX_N * 2> MB((bfloat16 *)&B);
+
+  matrix_multiply(MC, MA, MB);
+
+  /*start = std::chrono::steady_clock::now();
+  matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, MATRIX_N, MATRIX_K / 2);
+  end = std::chrono::steady_clock::now();
+  std::cout << "Elapsed time in milliseconds (reference): " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms" << std::endl;
+
+  bool res = true;
+  for (int i = 0; i < MATRIX_M; i++) {
+    for (int j = 0; j < MATRIX_N; j++) {
+      if ((fabs(C[i][j]) - fabs(D[i][j])) > BF16_EPSILON)
+        res = false;
+    }
+  }
+  std::cout << (res ? "passed" : "failed") << std::endl;
+  return !res;*/
+
+  return 0;
+}