From e38cf4dc8604634daddcc6a6b05ab1187ec83efc Mon Sep 17 00:00:00 2001 From: Tuowen Zhao Date: Mon, 4 Oct 2021 16:23:27 +0000 Subject: Update formatting --- .clang-format | 2 ++ Makefile | 2 +- main.cpp | 97 ++++++++++++++++++++++++----------------------------------- subgroup.cpp | 40 ++++++++++++------------ 4 files changed, 63 insertions(+), 78 deletions(-) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..07d9df6 --- /dev/null +++ b/.clang-format @@ -0,0 +1,2 @@ +BasedOnStyle: LLVM +ColumnLimit: 100 \ No newline at end of file diff --git a/Makefile b/Makefile index aea8c80..15323e3 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ SYCL=/opt/intel/oneapi/compiler/latest/linux -CXX=$(SYCL)/bin/clang++ +CXX=$(SYCL)/bin/dpcpp all:sycltest subgroup diff --git a/main.cpp b/main.cpp index 095bbed..5af0173 100644 --- a/main.cpp +++ b/main.cpp @@ -1,12 +1,12 @@ #include // #include -#include -#include #include +#include +#include #define N 256 #define GZ 16 -#define STRIDE (2*GZ + N) +#define STRIDE (2 * GZ + N) #define TILEK 4 #define TILEJ 4 #define TILEI 16 @@ -17,28 +17,24 @@ std::string demangle(const char *name) { int status = -4; // some arbitrary value to eliminate the compiler warning // enable c++11 by passing the flag -std=c++11 to g++ - std::unique_ptr res{ - abi::__cxa_demangle(name, NULL, NULL, &status), - std::free - }; + std::unique_ptr res{abi::__cxa_demangle(name, NULL, NULL, &status), + std::free}; return (status == 0) ? res.get() : name; } using namespace cl::sycl; -template -class subgr; +template class subgr; -template -class SGfunctor { +template class SGfunctor { public: SGfunctor(accessor sg_sz, - accessor sg_i) : sg_sz(sg_sz), sg_i(sg_i) {} + accessor sg_i) + : sg_sz(sg_sz), sg_i(sg_i) {} - [[intel::reqd_sub_group_size(16)]] - void operator()(nd_item<1> NdItem) const { - ONEAPI::sub_group SG = NdItem.get_sub_group(); + [[intel::reqd_sub_group_size(16)]] void operator()(nd_item<1> NdItem) const { + ONEAPI::sub_group SG = NdItem.get_sub_group(); uint32_t wggid = NdItem.get_global_id(0); uint32_t sgid = SG.get_local_id().get(0); if (wggid == 0) @@ -52,8 +48,7 @@ private: accessor sg_sz; }; -template -void subkrnl(device &Device, size_t G = 256, size_t L = 64) { +template void subkrnl(device &Device, size_t G = 256, size_t L = 64) { buffer sg_buf{1}; buffer sg_info{G}; queue Queue(Device); @@ -101,40 +96,26 @@ void run27pt(device &Device) { auto out = out_buf.get_access(cgh); auto c = c_buf.get_access(cgh); // Executing kernel - cgh.parallel_for( - NumOfWorkItems, [=](nd_item<3> WIid) { - uint32_t i = WIid.get_global_id(0) + GZ; - uint32_t j = WIid.get_global_id(1) + GZ; - uint32_t k = WIid.get_global_id(2) + GZ; - // Fill buffer with indexes - out[id<3>(i, j, k)] = c[0] * in[id<3>(i - 1, j - 1, k - 1)] - + c[1] * in[id<3>(i, j - 1, k - 1)] - + c[2] * in[id<3>(i + 1, j - 1, k - 1)] - + c[3] * in[id<3>(i - 1, j, k - 1)] - + c[4] * in[id<3>(i, j, k - 1)] - + c[5] * in[id<3>(i + 1, j, k - 1)] - + c[6] * in[id<3>(i - 1, j + 1, k - 1)] - + c[7] * in[id<3>(i, j + 1, k - 1)] - + c[8] * in[id<3>(i + 1, j + 1, k - 1)] - + c[9] * in[id<3>(i - 1, j - 1, k)] - + c[10] * in[id<3>(i, j - 1, k)] - + c[11] * in[id<3>(i + 1, j - 1, k)] - + c[12] * in[id<3>(i - 1, j, k)] - + c[13] * in[id<3>(i, j, k)] - + c[14] * in[id<3>(i + 1, j, k)] - + c[15] * in[id<3>(i - 1, j + 1, k)] - + c[16] * in[id<3>(i, j + 1, k)] - + c[17] * in[id<3>(i + 1, j + 1, k)] - + c[18] * in[id<3>(i - 1, j - 1, k + 1)] - + c[19] * in[id<3>(i, j - 1, k + 1)] - + c[20] * in[id<3>(i + 1, j - 1, k + 1)] - + c[21] * in[id<3>(i - 1, j, k + 1)] - + c[22] * in[id<3>(i, j, k + 1)] - + c[23] * in[id<3>(i + 1, j, k + 1)] - + c[24] * in[id<3>(i - 1, j + 1, k + 1)] - + c[25] * in[id<3>(i, j + 1, k + 1)] - + c[26] * in[id<3>(i + 1, j + 1, k + 1)]; - }); + cgh.parallel_for(NumOfWorkItems, [=](nd_item<3> WIid) { + uint32_t i = WIid.get_global_id(0) + GZ; + uint32_t j = WIid.get_global_id(1) + GZ; + uint32_t k = WIid.get_global_id(2) + GZ; + // Fill buffer with indexes + out[i][j][k] = c[0] * in[id<3>(i - 1, j - 1, k - 1)] + c[1] * in[id<3>(i, j - 1, k - 1)] + + c[2] * in[id<3>(i + 1, j - 1, k - 1)] + c[3] * in[id<3>(i - 1, j, k - 1)] + + c[4] * in[id<3>(i, j, k - 1)] + c[5] * in[id<3>(i + 1, j, k - 1)] + + c[6] * in[id<3>(i - 1, j + 1, k - 1)] + c[7] * in[id<3>(i, j + 1, k - 1)] + + c[8] * in[id<3>(i + 1, j + 1, k - 1)] + c[9] * in[id<3>(i - 1, j - 1, k)] + + c[10] * in[id<3>(i, j - 1, k)] + c[11] * in[id<3>(i + 1, j - 1, k)] + + c[12] * in[id<3>(i - 1, j, k)] + c[13] * in[id<3>(i, j, k)] + + c[14] * in[id<3>(i + 1, j, k)] + c[15] * in[id<3>(i - 1, j + 1, k)] + + c[16] * in[id<3>(i, j + 1, k)] + c[17] * in[id<3>(i + 1, j + 1, k)] + + c[18] * in[id<3>(i - 1, j - 1, k + 1)] + c[19] * in[id<3>(i, j - 1, k + 1)] + + c[20] * in[id<3>(i + 1, j - 1, k + 1)] + c[21] * in[id<3>(i - 1, j, k + 1)] + + c[22] * in[id<3>(i, j, k + 1)] + c[23] * in[id<3>(i + 1, j, k + 1)] + + c[24] * in[id<3>(i - 1, j + 1, k + 1)] + c[25] * in[id<3>(i, j + 1, k + 1)] + + c[26] * in[id<3>(i + 1, j + 1, k + 1)]; + }); }; auto st_event = Queue.submit(kernel); for (int i = 0; i < ITER - 2; ++i) { @@ -151,22 +132,24 @@ void run27pt(device &Device) { { // double ed = omp_get_wtime(); double elapsed = (ed_event.get_profiling_info() - - st_event.get_profiling_info()) * 1e-9; + st_event.get_profiling_info()) * + 1e-9; std::cout << "elapsed: " << elapsed / ITER << std::endl; std::cout << "flops: " << N * N * N * 53.0 * ITER / elapsed * 1e-9 << std::endl; } { // double ed = omp_get_wtime(); double elapsed = (ed_event.get_profiling_info() - - ed_event.get_profiling_info()) * 1e-9; + ed_event.get_profiling_info()) * + 1e-9; std::cout << "elapsed: " << elapsed << std::endl; std::cout << "flops: " << N * N * N * 53.0 / elapsed * 1e-9 << std::endl; } } void printInfo(device &Device) { - std::cout << "Using OpenCL device {" << Device.get_info() << - "} from {" << Device.get_info() << "}" << std::endl; + std::cout << "Using OpenCL device {" << Device.get_info() << "} from {" + << Device.get_info() << "}" << std::endl; std::cout << "I'm a " << (Device.is_cpu() ? "CPU" : "GPU") << std::endl; auto dot_num_groups = Device.get_info(); @@ -196,12 +179,12 @@ int main() { // Host device is not for compute if (Device.is_host()) continue; - + const std::string DeviceName = Device.get_info(); std::cout << "Device [" << DeviceNumber << "]:" << DeviceName << std::endl; try { - if (Device.is_gpu() && (DeviceName.find("Gen11") != std::string::npos)) { + if (Device.is_gpu() && (DeviceName.find("Graphics") != std::string::npos)) { printInfo(Device); run27pt(Device); runSubgroups(Device); diff --git a/subgroup.cpp b/subgroup.cpp index 59e873d..f55a1f8 100644 --- a/subgroup.cpp +++ b/subgroup.cpp @@ -9,17 +9,18 @@ using namespace cl::sycl; -template -inline void dev_shl(ONEAPI::sub_group &SG, T &res, T l, T r, unsigned kn, unsigned cw, unsigned cid) { +template +inline void dev_shl(ONEAPI::sub_group &SG, T &res, T l, T r, unsigned kn, unsigned cw, + unsigned cid) { T l_tmp = SG.shuffle_down(l, cw - (kn)); T r_tmp = SG.shuffle_up(r, kn); - res = (cid) < kn? l_tmp : r_tmp; + res = (cid) < kn ? l_tmp : r_tmp; } void printInfo(device &Device) { - std::cout << "Using OpenCL " << (Device.is_cpu() ? "CPU" : "GPU") - << " device {" << Device.get_info() - << "} from {" << Device.get_info() << "}" << std::endl; + std::cout << "Using OpenCL " << (Device.is_cpu() ? "CPU" : "GPU") << " device {" + << Device.get_info() << "} from {" + << Device.get_info() << "}" << std::endl; auto dot_num_groups = Device.get_info(); auto dot_wgsize = Device.get_info(); @@ -67,18 +68,17 @@ int main() { auto b = B.get_access(cgh); auto c = C.get_access(cgh); - cgh.parallel_for( - nworkitem, [=](nd_item<1> WIid) { - ONEAPI::sub_group SG = WIid.get_sub_group(); - int sglid = SG.get_local_id().get(0); - uint32_t i = WIid.get_global_id(0); - int* cp = &c[0]; - const int* ap = &a[0]; - const int* bp = &b[0]; -// c[i] = a[i] + b[i]; - dev_shl(SG, cp[i], ap[i], bp[i], 2, 8, sglid & 7); - c[i] = WIid.get_group(0); - }); + cgh.parallel_for(nworkitem, [=](nd_item<1> WIid) { + ONEAPI::sub_group SG = WIid.get_sub_group(); + int sglid = SG.get_local_id().get(0); + uint32_t i = WIid.get_global_id(0); + int *cp = &c[0]; + const int *ap = &a[0]; + const int *bp = &b[0]; + // c[i] = a[i] + b[i]; + dev_shl(SG, cp[i], ap[i], bp[i], 2, 8, sglid & 7); + c[i] = WIid.get_group(0); + }); }; std::cout << "submitting" << std::endl; auto e = Queue.submit(kernel); @@ -87,8 +87,8 @@ int main() { std::cout << "waited" << std::endl; auto c_r = C.get_access(); -// for (int i = 0; i < N; ++i) -// std::cout << i << " + " << N - i << " = " << c[i] << std::endl; + // for (int i = 0; i < N; ++i) + // std::cout << i << " + " << N - i << " = " << c[i] << std::endl; return 0; } -- cgit v1.2.3-70-g09d2