From fe22bcb209bde62cf333232c3765d9c4836c37dd Mon Sep 17 00:00:00 2001 From: Tuowen Zhao Date: Sun, 10 Nov 2019 23:13:48 -0700 Subject: Update for float --- CMakeLists.txt | 1 - main.cpp | 115 ++++++++++++++++++++++++++++++--------------------------- 2 files changed, 61 insertions(+), 55 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 70da54a..2df33fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,6 @@ set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_EXTENSIONS OFF) find_package(OpenMP REQUIRED) -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS} -fsycl") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS} -fsycl -march=native") get_filename_component(bin_dir "${CMAKE_CXX_COMPILER}" PATH) diff --git a/main.cpp b/main.cpp index c92e650..469ebac 100644 --- a/main.cpp +++ b/main.cpp @@ -4,7 +4,7 @@ #include #include -#define N 256 +#define N 64 #define GZ 16 #define STRIDE (2*GZ + N) #define TILEK 4 @@ -32,7 +32,7 @@ class subgr; template void subkrnl(device &Device, size_t G = 256, size_t L = 64) { - buffer sg_buf{1}; + buffer sg_buf{1}; buffer sg_info{G}; queue Queue(Device); nd_range<1> NumOfWorkItems(G, L); @@ -56,80 +56,87 @@ void subkrnl(device &Device, size_t G = 256, size_t L = 64) { auto sg_i = sg_info.template get_access(); std::cout << "Subgroup size for " << demangle(typeid(T).name()) << ": " << sg_sz[0] << std::endl; - for (int i = 0; i < sg_sz[0]; ++i) - std::cout << sg_i[i] << " "; - std::cout << std::endl; + // for (int i = 0; i < sg_sz[0]; ++i) + // std::cout << sg_i[i] << " "; + // std::cout << std::endl; } void runSubgroups(device &Device) { std::cout << "Running subgroup for different types" << std::endl; subkrnl(Device); subkrnl(Device); - subkrnl(Device); + subkrnl(Device); } void run27pt(device &Device) { // Creating buffer of 1024 ints to be used inside the kernel code - buffer in_buf{range<3>(STRIDE, STRIDE, STRIDE)}; - buffer out_buf{range<3>(STRIDE, STRIDE, STRIDE)}; - buffer c_buf{27}; + buffer in_buf{range<3>(STRIDE, STRIDE, STRIDE)}; + buffer out_buf{range<3>(STRIDE, STRIDE, STRIDE)}; + buffer c_buf{27}; // Creating SYCL queue - queue Queue(Device); + queue Queue(Device, {property::queue::enable_profiling()}); nd_range<3> NumOfWorkItems(range<3>(N, N, N), range<3>(TILEI, TILEJ, TILEK)); - double st = omp_get_wtime(); - for (int i = 0; i < ITER; ++i) + float st = omp_get_wtime(); + auto kernel = [&](handler &cgh) { + // Getting write only access to the buffer on a device + auto in = in_buf.get_access(cgh); + auto out = out_buf.get_access(cgh); + auto c = c_buf.get_access(cgh); + // Executing kernel + cgh.parallel_for( + NumOfWorkItems, [=](nd_item<3> WIid) { + uint32_t i = WIid.get_global_id(0) + GZ; + uint32_t j = WIid.get_global_id(1) + GZ; + uint32_t k = WIid.get_global_id(2) + GZ; + // Fill buffer with indexes + out[id<3>(i, j, k)] = c[0] * in[id<3>(i - 1, j - 1, k - 1)] + + c[1] * in[id<3>(i, j - 1, k - 1)] + + c[2] * in[id<3>(i + 1, j - 1, k - 1)] + + c[3] * in[id<3>(i - 1, j, k - 1)] + + c[4] * in[id<3>(i, j, k - 1)] + + c[5] * in[id<3>(i + 1, j, k - 1)] + + c[6] * in[id<3>(i - 1, j + 1, k - 1)] + + c[7] * in[id<3>(i, j + 1, k - 1)] + + c[8] * in[id<3>(i + 1, j + 1, k - 1)] + + c[9] * in[id<3>(i - 1, j - 1, k)] + + c[10] * in[id<3>(i, j - 1, k)] + + c[11] * in[id<3>(i + 1, j - 1, k)] + + c[12] * in[id<3>(i - 1, j, k)] + + c[13] * in[id<3>(i, j, k)] + + c[14] * in[id<3>(i + 1, j, k)] + + c[15] * in[id<3>(i - 1, j + 1, k)] + + c[16] * in[id<3>(i, j + 1, k)] + + c[17] * in[id<3>(i + 1, j + 1, k)] + + c[18] * in[id<3>(i - 1, j - 1, k + 1)] + + c[19] * in[id<3>(i, j - 1, k + 1)] + + c[20] * in[id<3>(i + 1, j - 1, k + 1)] + + c[21] * in[id<3>(i - 1, j, k + 1)] + + c[22] * in[id<3>(i, j, k + 1)] + + c[23] * in[id<3>(i + 1, j, k + 1)] + + c[24] * in[id<3>(i - 1, j + 1, k + 1)] + + c[25] * in[id<3>(i, j + 1, k + 1)] + + c[26] * in[id<3>(i + 1, j + 1, k + 1)]; + }); + }; + auto st_event = Queue.submit(kernel); + for (int i = 0; i < ITER - 2; ++i) { // Submitting command group(work) to queue - Queue.submit([&](handler &cgh) { - // Getting write only access to the buffer on a device - auto in = in_buf.get_access(cgh); - auto out = out_buf.get_access(cgh); - auto c = c_buf.get_access(cgh); - // Executing kernel - cgh.parallel_for( - NumOfWorkItems, [=](nd_item<3> WIid) { - uint32_t i = WIid.get_global_id(0) + GZ; - uint32_t j = WIid.get_global_id(1) + GZ; - uint32_t k = WIid.get_global_id(2) + GZ; - // Fill buffer with indexes - out[id<3>(i, j, k)] = c[0] * in[id<3>(i - 1, j - 1, k - 1)] - + c[1] * in[id<3>(i, j - 1, k - 1)] - + c[2] * in[id<3>(i + 1, j - 1, k - 1)] - + c[3] * in[id<3>(i - 1, j, k - 1)] - + c[4] * in[id<3>(i, j, k - 1)] - + c[5] * in[id<3>(i + 1, j, k - 1)] - + c[6] * in[id<3>(i - 1, j + 1, k - 1)] - + c[7] * in[id<3>(i, j + 1, k - 1)] - + c[8] * in[id<3>(i + 1, j + 1, k - 1)] - + c[9] * in[id<3>(i - 1, j - 1, k)] - + c[10] * in[id<3>(i, j - 1, k)] - + c[11] * in[id<3>(i + 1, j - 1, k)] - + c[12] * in[id<3>(i - 1, j, k)] - + c[13] * in[id<3>(i, j, k)] - + c[14] * in[id<3>(i + 1, j, k)] - + c[15] * in[id<3>(i - 1, j + 1, k)] - + c[16] * in[id<3>(i, j + 1, k)] - + c[17] * in[id<3>(i + 1, j + 1, k)] - + c[18] * in[id<3>(i - 1, j - 1, k + 1)] - + c[19] * in[id<3>(i, j - 1, k + 1)] - + c[20] * in[id<3>(i + 1, j - 1, k + 1)] - + c[21] * in[id<3>(i - 1, j, k + 1)] - + c[22] * in[id<3>(i, j, k + 1)] - + c[23] * in[id<3>(i + 1, j, k + 1)] - + c[24] * in[id<3>(i - 1, j + 1, k + 1)] - + c[25] * in[id<3>(i, j + 1, k + 1)] - + c[26] * in[id<3>(i + 1, j + 1, k + 1)]; - }); - }); + Queue.submit(kernel); + } + auto ed_event = Queue.submit(kernel); + ed_event.wait(); // Getting read only access to the buffer on the host. // Implicit barrier waiting for queue to complete the work. const auto out_h = out_buf.get_access(); const auto in_h = in_buf.get_access(); double ed = omp_get_wtime(); + double elapsed = (ed_event.get_profiling_info() - st_event.get_profiling_info()) * 1e-9; std::cout << "elapsed: " << (ed - st) / ITER << std::endl; - std::cout << "flops: " << N * N * N * 53.0 * ITER / (ed - st) * 1e-9 << std::endl; + std::cout << "elapsed: " << elapsed << std::endl; + std::cout << "flops: " << N * N * N * 53.0 * ITER / elapsed * 1e-9 << std::endl; } void printInfo(device &Device) { -- cgit v1.2.3-70-g09d2