diff options
author | Tuowen Zhao <ztuowen@gmail.com> | 2021-10-04 16:23:27 +0000 |
---|---|---|
committer | Tuowen Zhao <ztuowen@gmail.com> | 2021-10-04 16:23:27 +0000 |
commit | e38cf4dc8604634daddcc6a6b05ab1187ec83efc (patch) | |
tree | c465f0f95c5766fc339d02aa367186bc209a3ae5 /main.cpp | |
parent | c0be98c7e7f71c79c10566760d7c29974206f97d (diff) | |
download | sycltest-master.tar.gz sycltest-master.tar.bz2 sycltest-master.zip |
Diffstat (limited to 'main.cpp')
-rw-r--r-- | main.cpp | 97 |
1 files changed, 40 insertions, 57 deletions
@@ -1,12 +1,12 @@ #include <CL/sycl.hpp> // #include <omp.h> -#include <typeinfo> -#include <memory> #include <cxxabi.h> +#include <memory> +#include <typeinfo> #define N 256 #define GZ 16 -#define STRIDE (2*GZ + N) +#define STRIDE (2 * GZ + N) #define TILEK 4 #define TILEJ 4 #define TILEI 16 @@ -17,28 +17,24 @@ std::string demangle(const char *name) { int status = -4; // some arbitrary value to eliminate the compiler warning // enable c++11 by passing the flag -std=c++11 to g++ - std::unique_ptr<char, void (*)(void *)> res{ - abi::__cxa_demangle(name, NULL, NULL, &status), - std::free - }; + std::unique_ptr<char, void (*)(void *)> res{abi::__cxa_demangle(name, NULL, NULL, &status), + std::free}; return (status == 0) ? res.get() : name; } using namespace cl::sycl; -template<typename T> -class subgr; +template <typename T> class subgr; -template<typename T> -class SGfunctor { +template <typename T> class SGfunctor { public: SGfunctor(accessor<uint32_t, 1, access::mode::write, access::target::global_buffer> sg_sz, - accessor<T, 1, access::mode::write, access::target::global_buffer> sg_i) : sg_sz(sg_sz), sg_i(sg_i) {} + accessor<T, 1, access::mode::write, access::target::global_buffer> sg_i) + : sg_sz(sg_sz), sg_i(sg_i) {} - [[intel::reqd_sub_group_size(16)]] - void operator()(nd_item<1> NdItem) const { - ONEAPI::sub_group SG = NdItem.get_sub_group(); + [[intel::reqd_sub_group_size(16)]] void operator()(nd_item<1> NdItem) const { + ONEAPI::sub_group SG = NdItem.get_sub_group(); uint32_t wggid = NdItem.get_global_id(0); uint32_t sgid = SG.get_local_id().get(0); if (wggid == 0) @@ -52,8 +48,7 @@ private: accessor<uint32_t, 1, access::mode::write, access::target::global_buffer> sg_sz; }; -template<typename T> -void subkrnl(device &Device, size_t G = 256, size_t L = 64) { +template <typename T> void subkrnl(device &Device, size_t G = 256, size_t L = 64) { buffer<uint32_t> sg_buf{1}; buffer<T> sg_info{G}; queue Queue(Device); @@ -101,40 +96,26 @@ void run27pt(device &Device) { auto out = out_buf.get_access<access::mode::write>(cgh); auto c = c_buf.get_access<access::mode::read>(cgh); // Executing kernel - cgh.parallel_for<class FillBuffer>( - NumOfWorkItems, [=](nd_item<3> WIid) { - uint32_t i = WIid.get_global_id(0) + GZ; - uint32_t j = WIid.get_global_id(1) + GZ; - uint32_t k = WIid.get_global_id(2) + GZ; - // Fill buffer with indexes - out[id<3>(i, j, k)] = c[0] * in[id<3>(i - 1, j - 1, k - 1)] - + c[1] * in[id<3>(i, j - 1, k - 1)] - + c[2] * in[id<3>(i + 1, j - 1, k - 1)] - + c[3] * in[id<3>(i - 1, j, k - 1)] - + c[4] * in[id<3>(i, j, k - 1)] - + c[5] * in[id<3>(i + 1, j, k - 1)] - + c[6] * in[id<3>(i - 1, j + 1, k - 1)] - + c[7] * in[id<3>(i, j + 1, k - 1)] - + c[8] * in[id<3>(i + 1, j + 1, k - 1)] - + c[9] * in[id<3>(i - 1, j - 1, k)] - + c[10] * in[id<3>(i, j - 1, k)] - + c[11] * in[id<3>(i + 1, j - 1, k)] - + c[12] * in[id<3>(i - 1, j, k)] - + c[13] * in[id<3>(i, j, k)] - + c[14] * in[id<3>(i + 1, j, k)] - + c[15] * in[id<3>(i - 1, j + 1, k)] - + c[16] * in[id<3>(i, j + 1, k)] - + c[17] * in[id<3>(i + 1, j + 1, k)] - + c[18] * in[id<3>(i - 1, j - 1, k + 1)] - + c[19] * in[id<3>(i, j - 1, k + 1)] - + c[20] * in[id<3>(i + 1, j - 1, k + 1)] - + c[21] * in[id<3>(i - 1, j, k + 1)] - + c[22] * in[id<3>(i, j, k + 1)] - + c[23] * in[id<3>(i + 1, j, k + 1)] - + c[24] * in[id<3>(i - 1, j + 1, k + 1)] - + c[25] * in[id<3>(i, j + 1, k + 1)] - + c[26] * in[id<3>(i + 1, j + 1, k + 1)]; - }); + cgh.parallel_for<class FillBuffer>(NumOfWorkItems, [=](nd_item<3> WIid) { + uint32_t i = WIid.get_global_id(0) + GZ; + uint32_t j = WIid.get_global_id(1) + GZ; + uint32_t k = WIid.get_global_id(2) + GZ; + // Fill buffer with indexes + out[i][j][k] = c[0] * in[id<3>(i - 1, j - 1, k - 1)] + c[1] * in[id<3>(i, j - 1, k - 1)] + + c[2] * in[id<3>(i + 1, j - 1, k - 1)] + c[3] * in[id<3>(i - 1, j, k - 1)] + + c[4] * in[id<3>(i, j, k - 1)] + c[5] * in[id<3>(i + 1, j, k - 1)] + + c[6] * in[id<3>(i - 1, j + 1, k - 1)] + c[7] * in[id<3>(i, j + 1, k - 1)] + + c[8] * in[id<3>(i + 1, j + 1, k - 1)] + c[9] * in[id<3>(i - 1, j - 1, k)] + + c[10] * in[id<3>(i, j - 1, k)] + c[11] * in[id<3>(i + 1, j - 1, k)] + + c[12] * in[id<3>(i - 1, j, k)] + c[13] * in[id<3>(i, j, k)] + + c[14] * in[id<3>(i + 1, j, k)] + c[15] * in[id<3>(i - 1, j + 1, k)] + + c[16] * in[id<3>(i, j + 1, k)] + c[17] * in[id<3>(i + 1, j + 1, k)] + + c[18] * in[id<3>(i - 1, j - 1, k + 1)] + c[19] * in[id<3>(i, j - 1, k + 1)] + + c[20] * in[id<3>(i + 1, j - 1, k + 1)] + c[21] * in[id<3>(i - 1, j, k + 1)] + + c[22] * in[id<3>(i, j, k + 1)] + c[23] * in[id<3>(i + 1, j, k + 1)] + + c[24] * in[id<3>(i - 1, j + 1, k + 1)] + c[25] * in[id<3>(i, j + 1, k + 1)] + + c[26] * in[id<3>(i + 1, j + 1, k + 1)]; + }); }; auto st_event = Queue.submit(kernel); for (int i = 0; i < ITER - 2; ++i) { @@ -151,22 +132,24 @@ void run27pt(device &Device) { { // double ed = omp_get_wtime(); double elapsed = (ed_event.get_profiling_info<info::event_profiling::command_end>() - - st_event.get_profiling_info<info::event_profiling::command_start>()) * 1e-9; + st_event.get_profiling_info<info::event_profiling::command_start>()) * + 1e-9; std::cout << "elapsed: " << elapsed / ITER << std::endl; std::cout << "flops: " << N * N * N * 53.0 * ITER / elapsed * 1e-9 << std::endl; } { // double ed = omp_get_wtime(); double elapsed = (ed_event.get_profiling_info<info::event_profiling::command_end>() - - ed_event.get_profiling_info<info::event_profiling::command_start>()) * 1e-9; + ed_event.get_profiling_info<info::event_profiling::command_start>()) * + 1e-9; std::cout << "elapsed: " << elapsed << std::endl; std::cout << "flops: " << N * N * N * 53.0 / elapsed * 1e-9 << std::endl; } } void printInfo(device &Device) { - std::cout << "Using OpenCL device {" << Device.get_info<info::device::name>() << - "} from {" << Device.get_info<info::device::vendor>() << "}" << std::endl; + std::cout << "Using OpenCL device {" << Device.get_info<info::device::name>() << "} from {" + << Device.get_info<info::device::vendor>() << "}" << std::endl; std::cout << "I'm a " << (Device.is_cpu() ? "CPU" : "GPU") << std::endl; auto dot_num_groups = Device.get_info<info::device::max_compute_units>(); @@ -196,12 +179,12 @@ int main() { // Host device is not for compute if (Device.is_host()) continue; - + const std::string DeviceName = Device.get_info<info::device::name>(); std::cout << "Device [" << DeviceNumber << "]:" << DeviceName << std::endl; try { - if (Device.is_gpu() && (DeviceName.find("Gen11") != std::string::npos)) { + if (Device.is_gpu() && (DeviceName.find("Graphics") != std::string::npos)) { printInfo(Device); run27pt(Device); runSubgroups(Device); |