diff options
author | Tuowen Zhao <ztuowen@gmail.com> | 2021-10-04 16:23:27 +0000 |
---|---|---|
committer | Tuowen Zhao <ztuowen@gmail.com> | 2021-10-04 16:23:27 +0000 |
commit | e38cf4dc8604634daddcc6a6b05ab1187ec83efc (patch) | |
tree | c465f0f95c5766fc339d02aa367186bc209a3ae5 | |
parent | c0be98c7e7f71c79c10566760d7c29974206f97d (diff) | |
download | sycltest-e38cf4dc8604634daddcc6a6b05ab1187ec83efc.tar.gz sycltest-e38cf4dc8604634daddcc6a6b05ab1187ec83efc.tar.bz2 sycltest-e38cf4dc8604634daddcc6a6b05ab1187ec83efc.zip |
-rw-r--r-- | .clang-format | 2 | ||||
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | main.cpp | 97 | ||||
-rw-r--r-- | subgroup.cpp | 40 |
4 files changed, 63 insertions, 78 deletions
diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..07d9df6 --- /dev/null +++ b/.clang-format @@ -0,0 +1,2 @@ +BasedOnStyle: LLVM +ColumnLimit: 100
\ No newline at end of file @@ -1,5 +1,5 @@ SYCL=/opt/intel/oneapi/compiler/latest/linux -CXX=$(SYCL)/bin/clang++ +CXX=$(SYCL)/bin/dpcpp all:sycltest subgroup @@ -1,12 +1,12 @@ #include <CL/sycl.hpp> // #include <omp.h> -#include <typeinfo> -#include <memory> #include <cxxabi.h> +#include <memory> +#include <typeinfo> #define N 256 #define GZ 16 -#define STRIDE (2*GZ + N) +#define STRIDE (2 * GZ + N) #define TILEK 4 #define TILEJ 4 #define TILEI 16 @@ -17,28 +17,24 @@ std::string demangle(const char *name) { int status = -4; // some arbitrary value to eliminate the compiler warning // enable c++11 by passing the flag -std=c++11 to g++ - std::unique_ptr<char, void (*)(void *)> res{ - abi::__cxa_demangle(name, NULL, NULL, &status), - std::free - }; + std::unique_ptr<char, void (*)(void *)> res{abi::__cxa_demangle(name, NULL, NULL, &status), + std::free}; return (status == 0) ? res.get() : name; } using namespace cl::sycl; -template<typename T> -class subgr; +template <typename T> class subgr; -template<typename T> -class SGfunctor { +template <typename T> class SGfunctor { public: SGfunctor(accessor<uint32_t, 1, access::mode::write, access::target::global_buffer> sg_sz, - accessor<T, 1, access::mode::write, access::target::global_buffer> sg_i) : sg_sz(sg_sz), sg_i(sg_i) {} + accessor<T, 1, access::mode::write, access::target::global_buffer> sg_i) + : sg_sz(sg_sz), sg_i(sg_i) {} - [[intel::reqd_sub_group_size(16)]] - void operator()(nd_item<1> NdItem) const { - ONEAPI::sub_group SG = NdItem.get_sub_group(); + [[intel::reqd_sub_group_size(16)]] void operator()(nd_item<1> NdItem) const { + ONEAPI::sub_group SG = NdItem.get_sub_group(); uint32_t wggid = NdItem.get_global_id(0); uint32_t sgid = SG.get_local_id().get(0); if (wggid == 0) @@ -52,8 +48,7 @@ private: accessor<uint32_t, 1, access::mode::write, access::target::global_buffer> sg_sz; }; -template<typename T> -void subkrnl(device &Device, size_t G = 256, size_t L = 64) { +template <typename T> void subkrnl(device &Device, size_t G = 256, size_t L = 64) { buffer<uint32_t> sg_buf{1}; buffer<T> sg_info{G}; queue Queue(Device); @@ -101,40 +96,26 @@ void run27pt(device &Device) { auto out = out_buf.get_access<access::mode::write>(cgh); auto c = c_buf.get_access<access::mode::read>(cgh); // Executing kernel - cgh.parallel_for<class FillBuffer>( - NumOfWorkItems, [=](nd_item<3> WIid) { - uint32_t i = WIid.get_global_id(0) + GZ; - uint32_t j = WIid.get_global_id(1) + GZ; - uint32_t k = WIid.get_global_id(2) + GZ; - // Fill buffer with indexes - out[id<3>(i, j, k)] = c[0] * in[id<3>(i - 1, j - 1, k - 1)] - + c[1] * in[id<3>(i, j - 1, k - 1)] - + c[2] * in[id<3>(i + 1, j - 1, k - 1)] - + c[3] * in[id<3>(i - 1, j, k - 1)] - + c[4] * in[id<3>(i, j, k - 1)] - + c[5] * in[id<3>(i + 1, j, k - 1)] - + c[6] * in[id<3>(i - 1, j + 1, k - 1)] - + c[7] * in[id<3>(i, j + 1, k - 1)] - + c[8] * in[id<3>(i + 1, j + 1, k - 1)] - + c[9] * in[id<3>(i - 1, j - 1, k)] - + c[10] * in[id<3>(i, j - 1, k)] - + c[11] * in[id<3>(i + 1, j - 1, k)] - + c[12] * in[id<3>(i - 1, j, k)] - + c[13] * in[id<3>(i, j, k)] - + c[14] * in[id<3>(i + 1, j, k)] - + c[15] * in[id<3>(i - 1, j + 1, k)] - + c[16] * in[id<3>(i, j + 1, k)] - + c[17] * in[id<3>(i + 1, j + 1, k)] - + c[18] * in[id<3>(i - 1, j - 1, k + 1)] - + c[19] * in[id<3>(i, j - 1, k + 1)] - + c[20] * in[id<3>(i + 1, j - 1, k + 1)] - + c[21] * in[id<3>(i - 1, j, k + 1)] - + c[22] * in[id<3>(i, j, k + 1)] - + c[23] * in[id<3>(i + 1, j, k + 1)] - + c[24] * in[id<3>(i - 1, j + 1, k + 1)] - + c[25] * in[id<3>(i, j + 1, k + 1)] - + c[26] * in[id<3>(i + 1, j + 1, k + 1)]; - }); + cgh.parallel_for<class FillBuffer>(NumOfWorkItems, [=](nd_item<3> WIid) { + uint32_t i = WIid.get_global_id(0) + GZ; + uint32_t j = WIid.get_global_id(1) + GZ; + uint32_t k = WIid.get_global_id(2) + GZ; + // Fill buffer with indexes + out[i][j][k] = c[0] * in[id<3>(i - 1, j - 1, k - 1)] + c[1] * in[id<3>(i, j - 1, k - 1)] + + c[2] * in[id<3>(i + 1, j - 1, k - 1)] + c[3] * in[id<3>(i - 1, j, k - 1)] + + c[4] * in[id<3>(i, j, k - 1)] + c[5] * in[id<3>(i + 1, j, k - 1)] + + c[6] * in[id<3>(i - 1, j + 1, k - 1)] + c[7] * in[id<3>(i, j + 1, k - 1)] + + c[8] * in[id<3>(i + 1, j + 1, k - 1)] + c[9] * in[id<3>(i - 1, j - 1, k)] + + c[10] * in[id<3>(i, j - 1, k)] + c[11] * in[id<3>(i + 1, j - 1, k)] + + c[12] * in[id<3>(i - 1, j, k)] + c[13] * in[id<3>(i, j, k)] + + c[14] * in[id<3>(i + 1, j, k)] + c[15] * in[id<3>(i - 1, j + 1, k)] + + c[16] * in[id<3>(i, j + 1, k)] + c[17] * in[id<3>(i + 1, j + 1, k)] + + c[18] * in[id<3>(i - 1, j - 1, k + 1)] + c[19] * in[id<3>(i, j - 1, k + 1)] + + c[20] * in[id<3>(i + 1, j - 1, k + 1)] + c[21] * in[id<3>(i - 1, j, k + 1)] + + c[22] * in[id<3>(i, j, k + 1)] + c[23] * in[id<3>(i + 1, j, k + 1)] + + c[24] * in[id<3>(i - 1, j + 1, k + 1)] + c[25] * in[id<3>(i, j + 1, k + 1)] + + c[26] * in[id<3>(i + 1, j + 1, k + 1)]; + }); }; auto st_event = Queue.submit(kernel); for (int i = 0; i < ITER - 2; ++i) { @@ -151,22 +132,24 @@ void run27pt(device &Device) { { // double ed = omp_get_wtime(); double elapsed = (ed_event.get_profiling_info<info::event_profiling::command_end>() - - st_event.get_profiling_info<info::event_profiling::command_start>()) * 1e-9; + st_event.get_profiling_info<info::event_profiling::command_start>()) * + 1e-9; std::cout << "elapsed: " << elapsed / ITER << std::endl; std::cout << "flops: " << N * N * N * 53.0 * ITER / elapsed * 1e-9 << std::endl; } { // double ed = omp_get_wtime(); double elapsed = (ed_event.get_profiling_info<info::event_profiling::command_end>() - - ed_event.get_profiling_info<info::event_profiling::command_start>()) * 1e-9; + ed_event.get_profiling_info<info::event_profiling::command_start>()) * + 1e-9; std::cout << "elapsed: " << elapsed << std::endl; std::cout << "flops: " << N * N * N * 53.0 / elapsed * 1e-9 << std::endl; } } void printInfo(device &Device) { - std::cout << "Using OpenCL device {" << Device.get_info<info::device::name>() << - "} from {" << Device.get_info<info::device::vendor>() << "}" << std::endl; + std::cout << "Using OpenCL device {" << Device.get_info<info::device::name>() << "} from {" + << Device.get_info<info::device::vendor>() << "}" << std::endl; std::cout << "I'm a " << (Device.is_cpu() ? "CPU" : "GPU") << std::endl; auto dot_num_groups = Device.get_info<info::device::max_compute_units>(); @@ -196,12 +179,12 @@ int main() { // Host device is not for compute if (Device.is_host()) continue; - + const std::string DeviceName = Device.get_info<info::device::name>(); std::cout << "Device [" << DeviceNumber << "]:" << DeviceName << std::endl; try { - if (Device.is_gpu() && (DeviceName.find("Gen11") != std::string::npos)) { + if (Device.is_gpu() && (DeviceName.find("Graphics") != std::string::npos)) { printInfo(Device); run27pt(Device); runSubgroups(Device); diff --git a/subgroup.cpp b/subgroup.cpp index 59e873d..f55a1f8 100644 --- a/subgroup.cpp +++ b/subgroup.cpp @@ -9,17 +9,18 @@ using namespace cl::sycl; -template<typename T> -inline void dev_shl(ONEAPI::sub_group &SG, T &res, T l, T r, unsigned kn, unsigned cw, unsigned cid) { +template <typename T> +inline void dev_shl(ONEAPI::sub_group &SG, T &res, T l, T r, unsigned kn, unsigned cw, + unsigned cid) { T l_tmp = SG.shuffle_down(l, cw - (kn)); T r_tmp = SG.shuffle_up(r, kn); - res = (cid) < kn? l_tmp : r_tmp; + res = (cid) < kn ? l_tmp : r_tmp; } void printInfo(device &Device) { - std::cout << "Using OpenCL " << (Device.is_cpu() ? "CPU" : "GPU") - << " device {" << Device.get_info<info::device::name>() - << "} from {" << Device.get_info<info::device::vendor>() << "}" << std::endl; + std::cout << "Using OpenCL " << (Device.is_cpu() ? "CPU" : "GPU") << " device {" + << Device.get_info<info::device::name>() << "} from {" + << Device.get_info<info::device::vendor>() << "}" << std::endl; auto dot_num_groups = Device.get_info<info::device::max_compute_units>(); auto dot_wgsize = Device.get_info<info::device::max_work_group_size>(); @@ -67,18 +68,17 @@ int main() { auto b = B.get_access<access::mode::read>(cgh); auto c = C.get_access<access::mode::write>(cgh); - cgh.parallel_for<class FillBuffer>( - nworkitem, [=](nd_item<1> WIid) { - ONEAPI::sub_group SG = WIid.get_sub_group(); - int sglid = SG.get_local_id().get(0); - uint32_t i = WIid.get_global_id(0); - int* cp = &c[0]; - const int* ap = &a[0]; - const int* bp = &b[0]; -// c[i] = a[i] + b[i]; - dev_shl(SG, cp[i], ap[i], bp[i], 2, 8, sglid & 7); - c[i] = WIid.get_group(0); - }); + cgh.parallel_for<class FillBuffer>(nworkitem, [=](nd_item<1> WIid) { + ONEAPI::sub_group SG = WIid.get_sub_group(); + int sglid = SG.get_local_id().get(0); + uint32_t i = WIid.get_global_id(0); + int *cp = &c[0]; + const int *ap = &a[0]; + const int *bp = &b[0]; + // c[i] = a[i] + b[i]; + dev_shl(SG, cp[i], ap[i], bp[i], 2, 8, sglid & 7); + c[i] = WIid.get_group(0); + }); }; std::cout << "submitting" << std::endl; auto e = Queue.submit(kernel); @@ -87,8 +87,8 @@ int main() { std::cout << "waited" << std::endl; auto c_r = C.get_access<access::mode::read>(); -// for (int i = 0; i < N; ++i) -// std::cout << i << " + " << N - i << " = " << c[i] << std::endl; + // for (int i = 0; i < N; ++i) + // std::cout << i << " + " << N - i << " = " << c[i] << std::endl; return 0; } |