diff options
author | Tuowen Zhao <ztuowen@gmail.com> | 2019-11-10 23:13:48 -0700 |
---|---|---|
committer | Tuowen Zhao <ztuowen@gmail.com> | 2019-11-10 23:13:48 -0700 |
commit | fe22bcb209bde62cf333232c3765d9c4836c37dd (patch) | |
tree | 8b7d032ac1217660eea37fd14a87ea05f2b8fb18 | |
parent | 0d9d17554280bf9d19ce2db84bd29b5c5743d454 (diff) | |
download | sycltest-fe22bcb209bde62cf333232c3765d9c4836c37dd.tar.gz sycltest-fe22bcb209bde62cf333232c3765d9c4836c37dd.tar.bz2 sycltest-fe22bcb209bde62cf333232c3765d9c4836c37dd.zip |
Update for float
-rw-r--r-- | CMakeLists.txt | 1 | ||||
-rw-r--r-- | main.cpp | 115 |
2 files changed, 61 insertions, 55 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 70da54a..2df33fe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,6 @@ set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_CXX_EXTENSIONS OFF)
 find_package(OpenMP REQUIRED)
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS} -fsycl")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS} -fsycl -march=native")
 get_filename_component(bin_dir "${CMAKE_CXX_COMPILER}" PATH)
@@ -4,7 +4,7 @@
 #include <memory>
 #include <cxxabi.h>
-#define N 256
+#define N 64
 #define GZ 16
 #define STRIDE (2*GZ + N)
 #define TILEK 4
@@ -32,7 +32,7 @@ class subgr;
 template<typename T>
 void subkrnl(device &Device, size_t G = 256, size_t L = 64) {
-  buffer<T> sg_buf{1};
+  buffer<uint32_t> sg_buf{1};
   buffer<T> sg_info{G};
   queue Queue(Device);
   nd_range<1> NumOfWorkItems(G, L);
@@ -56,80 +56,87 @@ void subkrnl(device &Device, size_t G = 256, size_t L = 64) {
   auto sg_i = sg_info.template get_access<access::mode::read>();
   std::cout << "Subgroup size for " << demangle(typeid(T).name()) << ": " << sg_sz[0] << std::endl;
-  for (int i = 0; i < sg_sz[0]; ++i)
-    std::cout << sg_i[i] << " ";
-  std::cout << std::endl;
+  // for (int i = 0; i < sg_sz[0]; ++i)
+  // std::cout << sg_i[i] << " ";
+  // std::cout << std::endl;
 }
 void runSubgroups(device &Device) {
   std::cout << "Running subgroup for different types" << std::endl;
   subkrnl<int>(Device);
   subkrnl<long>(Device);
-  subkrnl<double>(Device);
+  subkrnl<float>(Device);
 }
 void run27pt(device &Device) {
   // Creating buffer of 1024 ints to be used inside the kernel code
-  buffer<double, 3> in_buf{range<3>(STRIDE, STRIDE, STRIDE)};
-  buffer<double, 3> out_buf{range<3>(STRIDE, STRIDE, STRIDE)};
-  buffer<double, 1> c_buf{27};
+  buffer<float, 3> in_buf{range<3>(STRIDE, STRIDE, STRIDE)};
+  buffer<float, 3> out_buf{range<3>(STRIDE, STRIDE, STRIDE)};
+  buffer<float, 1> c_buf{27};
   // Creating SYCL queue
-  queue Queue(Device);
+  queue Queue(Device, {property::queue::enable_profiling()});
   nd_range<3> NumOfWorkItems(range<3>(N, N, N), range<3>(TILEI, TILEJ, TILEK));
-  double st = omp_get_wtime();
-  for (int i = 0; i < ITER; ++i)
+  float st = omp_get_wtime();
+  auto kernel = [&](handler &cgh) {
+    // Getting write only access to the buffer on a device
+    auto in = in_buf.get_access<access::mode::read>(cgh);
+    auto out = out_buf.get_access<access::mode::write>(cgh);
+    auto c = c_buf.get_access<access::mode::read>(cgh);
+    // Executing kernel
+    cgh.parallel_for<class FillBuffer>(
+        NumOfWorkItems, [=](nd_item<3> WIid) {
+          uint32_t i = WIid.get_global_id(0) + GZ;
+          uint32_t j = WIid.get_global_id(1) + GZ;
+          uint32_t k = WIid.get_global_id(2) + GZ;
+          // Fill buffer with indexes
+          out[id<3>(i, j, k)] = c[0] * in[id<3>(i - 1, j - 1, k - 1)]
+              + c[1] * in[id<3>(i, j - 1, k - 1)]
+              + c[2] * in[id<3>(i + 1, j - 1, k - 1)]
+              + c[3] * in[id<3>(i - 1, j, k - 1)]
+              + c[4] * in[id<3>(i, j, k - 1)]
+              + c[5] * in[id<3>(i + 1, j, k - 1)]
+              + c[6] * in[id<3>(i - 1, j + 1, k - 1)]
+              + c[7] * in[id<3>(i, j + 1, k - 1)]
+              + c[8] * in[id<3>(i + 1, j + 1, k - 1)]
+              + c[9] * in[id<3>(i - 1, j - 1, k)]
+              + c[10] * in[id<3>(i, j - 1, k)]
+              + c[11] * in[id<3>(i + 1, j - 1, k)]
+              + c[12] * in[id<3>(i - 1, j, k)]
+              + c[13] * in[id<3>(i, j, k)]
+              + c[14] * in[id<3>(i + 1, j, k)]
+              + c[15] * in[id<3>(i - 1, j + 1, k)]
+              + c[16] * in[id<3>(i, j + 1, k)]
+              + c[17] * in[id<3>(i + 1, j + 1, k)]
+              + c[18] * in[id<3>(i - 1, j - 1, k + 1)]
+              + c[19] * in[id<3>(i, j - 1, k + 1)]
+              + c[20] * in[id<3>(i + 1, j - 1, k + 1)]
+              + c[21] * in[id<3>(i - 1, j, k + 1)]
+              + c[22] * in[id<3>(i, j, k + 1)]
+              + c[23] * in[id<3>(i + 1, j, k + 1)]
+              + c[24] * in[id<3>(i - 1, j + 1, k + 1)]
+              + c[25] * in[id<3>(i, j + 1, k + 1)]
+              + c[26] * in[id<3>(i + 1, j + 1, k + 1)];
+        });
+  };
+  auto st_event = Queue.submit(kernel);
+  for (int i = 0; i < ITER - 2; ++i) {
     // Submitting command group(work) to queue
-    Queue.submit([&](handler &cgh) {
-      // Getting write only access to the buffer on a device
-      auto in = in_buf.get_access<access::mode::read>(cgh);
-      auto out = out_buf.get_access<access::mode::write>(cgh);
-      auto c = c_buf.get_access<access::mode::read>(cgh);
-      // Executing kernel
-      cgh.parallel_for<class FillBuffer>(
-          NumOfWorkItems, [=](nd_item<3> WIid) {
-            uint32_t i = WIid.get_global_id(0) + GZ;
-            uint32_t j = WIid.get_global_id(1) + GZ;
-            uint32_t k = WIid.get_global_id(2) + GZ;
-            // Fill buffer with indexes
-            out[id<3>(i, j, k)] = c[0] * in[id<3>(i - 1, j - 1, k - 1)]
-                + c[1] * in[id<3>(i, j - 1, k - 1)]
-                + c[2] * in[id<3>(i + 1, j - 1, k - 1)]
-                + c[3] * in[id<3>(i - 1, j, k - 1)]
-                + c[4] * in[id<3>(i, j, k - 1)]
-                + c[5] * in[id<3>(i + 1, j, k - 1)]
-                + c[6] * in[id<3>(i - 1, j + 1, k - 1)]
-                + c[7] * in[id<3>(i, j + 1, k - 1)]
-                + c[8] * in[id<3>(i + 1, j + 1, k - 1)]
-                + c[9] * in[id<3>(i - 1, j - 1, k)]
-                + c[10] * in[id<3>(i, j - 1, k)]
-                + c[11] * in[id<3>(i + 1, j - 1, k)]
-                + c[12] * in[id<3>(i - 1, j, k)]
-                + c[13] * in[id<3>(i, j, k)]
-                + c[14] * in[id<3>(i + 1, j, k)]
-                + c[15] * in[id<3>(i - 1, j + 1, k)]
-                + c[16] * in[id<3>(i, j + 1, k)]
-                + c[17] * in[id<3>(i + 1, j + 1, k)]
-                + c[18] * in[id<3>(i - 1, j - 1, k + 1)]
-                + c[19] * in[id<3>(i, j - 1, k + 1)]
-                + c[20] * in[id<3>(i + 1, j - 1, k + 1)]
-                + c[21] * in[id<3>(i - 1, j, k + 1)]
-                + c[22] * in[id<3>(i, j, k + 1)]
-                + c[23] * in[id<3>(i + 1, j, k + 1)]
-                + c[24] * in[id<3>(i - 1, j + 1, k + 1)]
-                + c[25] * in[id<3>(i, j + 1, k + 1)]
-                + c[26] * in[id<3>(i + 1, j + 1, k + 1)];
-          });
-    });
+    Queue.submit(kernel);
+  }
+  auto ed_event = Queue.submit(kernel);
+  ed_event.wait();
   // Getting read only access to the buffer on the host.
   // Implicit barrier waiting for queue to complete the work.
   const auto out_h = out_buf.get_access<access::mode::read>();
   const auto in_h = in_buf.get_access<access::mode::read>();
   double ed = omp_get_wtime();
+  double elapsed = (ed_event.get_profiling_info<info::event_profiling::command_end>() - st_event.get_profiling_info<info::event_profiling::command_start>()) * 1e-9;
   std::cout << "elapsed: " << (ed - st) / ITER << std::endl;
-  std::cout << "flops: " << N * N * N * 53.0 * ITER / (ed - st) * 1e-9 << std::endl;
+  std::cout << "elapsed: " << elapsed << std::endl;
+  std::cout << "flops: " << N * N * N * 53.0 * ITER / elapsed * 1e-9 << std::endl;
 }
 void printInfo(device &Device) {