summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Tuowen Zhao <ztuowen@gmail.com> 2019-11-10 23:13:48 -0700
committer Tuowen Zhao <ztuowen@gmail.com> 2019-11-10 23:13:48 -0700
commit fe22bcb209bde62cf333232c3765d9c4836c37dd (patch)
tree 8b7d032ac1217660eea37fd14a87ea05f2b8fb18
parent 0d9d17554280bf9d19ce2db84bd29b5c5743d454 (diff)
download sycltest-fe22bcb209bde62cf333232c3765d9c4836c37dd.tar.gz
sycltest-fe22bcb209bde62cf333232c3765d9c4836c37dd.tar.bz2
sycltest-fe22bcb209bde62cf333232c3765d9c4836c37dd.zip
Update for float
-rw-r--r-- CMakeLists.txt 1
-rw-r--r-- main.cpp 115
2 files changed, 61 insertions, 55 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 70da54a..2df33fe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,6 @@ set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_EXTENSIONS OFF)
find_package(OpenMP REQUIRED)
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS} -fsycl")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS} -fsycl -march=native")
get_filename_component(bin_dir "${CMAKE_CXX_COMPILER}" PATH)
diff --git a/main.cpp b/main.cpp
index c92e650..469ebac 100644
--- a/main.cpp
+++ b/main.cpp
@@ -4,7 +4,7 @@
#include <memory>
#include <cxxabi.h>
-#define N 256
+#define N 64
#define GZ 16
#define STRIDE (2*GZ + N)
#define TILEK 4
@@ -32,7 +32,7 @@ class subgr;
template<typename T>
void subkrnl(device &Device, size_t G = 256, size_t L = 64) {
- buffer<T> sg_buf{1};
+ buffer<uint32_t> sg_buf{1};
buffer<T> sg_info{G};
queue Queue(Device);
nd_range<1> NumOfWorkItems(G, L);
@@ -56,80 +56,87 @@ void subkrnl(device &Device, size_t G = 256, size_t L = 64) {
auto sg_i = sg_info.template get_access<access::mode::read>();
std::cout << "Subgroup size for " << demangle(typeid(T).name()) << ": " << sg_sz[0] << std::endl;
- for (int i = 0; i < sg_sz[0]; ++i)
- std::cout << sg_i[i] << " ";
- std::cout << std::endl;
+ // for (int i = 0; i < sg_sz[0]; ++i)
+ // std::cout << sg_i[i] << " ";
+ // std::cout << std::endl;
}
void runSubgroups(device &Device) {
std::cout << "Running subgroup for different types" << std::endl;
subkrnl<int>(Device);
subkrnl<long>(Device);
- subkrnl<double>(Device);
+ subkrnl<float>(Device);
}
void run27pt(device &Device) {
// Creating buffer of 1024 ints to be used inside the kernel code
- buffer<double, 3> in_buf{range<3>(STRIDE, STRIDE, STRIDE)};
- buffer<double, 3> out_buf{range<3>(STRIDE, STRIDE, STRIDE)};
- buffer<double, 1> c_buf{27};
+ buffer<float, 3> in_buf{range<3>(STRIDE, STRIDE, STRIDE)};
+ buffer<float, 3> out_buf{range<3>(STRIDE, STRIDE, STRIDE)};
+ buffer<float, 1> c_buf{27};
// Creating SYCL queue
- queue Queue(Device);
+ queue Queue(Device, {property::queue::enable_profiling()});
nd_range<3> NumOfWorkItems(range<3>(N, N, N), range<3>(TILEI, TILEJ, TILEK));
- double st = omp_get_wtime();
- for (int i = 0; i < ITER; ++i)
+ float st = omp_get_wtime();
+ auto kernel = [&](handler &cgh) {
+ // Getting write only access to the buffer on a device
+ auto in = in_buf.get_access<access::mode::read>(cgh);
+ auto out = out_buf.get_access<access::mode::write>(cgh);
+ auto c = c_buf.get_access<access::mode::read>(cgh);
+ // Executing kernel
+ cgh.parallel_for<class FillBuffer>(
+ NumOfWorkItems, [=](nd_item<3> WIid) {
+ uint32_t i = WIid.get_global_id(0) + GZ;
+ uint32_t j = WIid.get_global_id(1) + GZ;
+ uint32_t k = WIid.get_global_id(2) + GZ;
+ // Fill buffer with indexes
+ out[id<3>(i, j, k)] = c[0] * in[id<3>(i - 1, j - 1, k - 1)]
+ + c[1] * in[id<3>(i, j - 1, k - 1)]
+ + c[2] * in[id<3>(i + 1, j - 1, k - 1)]
+ + c[3] * in[id<3>(i - 1, j, k - 1)]
+ + c[4] * in[id<3>(i, j, k - 1)]
+ + c[5] * in[id<3>(i + 1, j, k - 1)]
+ + c[6] * in[id<3>(i - 1, j + 1, k - 1)]
+ + c[7] * in[id<3>(i, j + 1, k - 1)]
+ + c[8] * in[id<3>(i + 1, j + 1, k - 1)]
+ + c[9] * in[id<3>(i - 1, j - 1, k)]
+ + c[10] * in[id<3>(i, j - 1, k)]
+ + c[11] * in[id<3>(i + 1, j - 1, k)]
+ + c[12] * in[id<3>(i - 1, j, k)]
+ + c[13] * in[id<3>(i, j, k)]
+ + c[14] * in[id<3>(i + 1, j, k)]
+ + c[15] * in[id<3>(i - 1, j + 1, k)]
+ + c[16] * in[id<3>(i, j + 1, k)]
+ + c[17] * in[id<3>(i + 1, j + 1, k)]
+ + c[18] * in[id<3>(i - 1, j - 1, k + 1)]
+ + c[19] * in[id<3>(i, j - 1, k + 1)]
+ + c[20] * in[id<3>(i + 1, j - 1, k + 1)]
+ + c[21] * in[id<3>(i - 1, j, k + 1)]
+ + c[22] * in[id<3>(i, j, k + 1)]
+ + c[23] * in[id<3>(i + 1, j, k + 1)]
+ + c[24] * in[id<3>(i - 1, j + 1, k + 1)]
+ + c[25] * in[id<3>(i, j + 1, k + 1)]
+ + c[26] * in[id<3>(i + 1, j + 1, k + 1)];
+ });
+ };
+ auto st_event = Queue.submit(kernel);
+ for (int i = 0; i < ITER - 2; ++i) {
// Submitting command group(work) to queue
- Queue.submit([&](handler &cgh) {
- // Getting write only access to the buffer on a device
- auto in = in_buf.get_access<access::mode::read>(cgh);
- auto out = out_buf.get_access<access::mode::write>(cgh);
- auto c = c_buf.get_access<access::mode::read>(cgh);
- // Executing kernel
- cgh.parallel_for<class FillBuffer>(
- NumOfWorkItems, [=](nd_item<3> WIid) {
- uint32_t i = WIid.get_global_id(0) + GZ;
- uint32_t j = WIid.get_global_id(1) + GZ;
- uint32_t k = WIid.get_global_id(2) + GZ;
- // Fill buffer with indexes
- out[id<3>(i, j, k)] = c[0] * in[id<3>(i - 1, j - 1, k - 1)]
- + c[1] * in[id<3>(i, j - 1, k - 1)]
- + c[2] * in[id<3>(i + 1, j - 1, k - 1)]
- + c[3] * in[id<3>(i - 1, j, k - 1)]
- + c[4] * in[id<3>(i, j, k - 1)]
- + c[5] * in[id<3>(i + 1, j, k - 1)]
- + c[6] * in[id<3>(i - 1, j + 1, k - 1)]
- + c[7] * in[id<3>(i, j + 1, k - 1)]
- + c[8] * in[id<3>(i + 1, j + 1, k - 1)]
- + c[9] * in[id<3>(i - 1, j - 1, k)]
- + c[10] * in[id<3>(i, j - 1, k)]
- + c[11] * in[id<3>(i + 1, j - 1, k)]
- + c[12] * in[id<3>(i - 1, j, k)]
- + c[13] * in[id<3>(i, j, k)]
- + c[14] * in[id<3>(i + 1, j, k)]
- + c[15] * in[id<3>(i - 1, j + 1, k)]
- + c[16] * in[id<3>(i, j + 1, k)]
- + c[17] * in[id<3>(i + 1, j + 1, k)]
- + c[18] * in[id<3>(i - 1, j - 1, k + 1)]
- + c[19] * in[id<3>(i, j - 1, k + 1)]
- + c[20] * in[id<3>(i + 1, j - 1, k + 1)]
- + c[21] * in[id<3>(i - 1, j, k + 1)]
- + c[22] * in[id<3>(i, j, k + 1)]
- + c[23] * in[id<3>(i + 1, j, k + 1)]
- + c[24] * in[id<3>(i - 1, j + 1, k + 1)]
- + c[25] * in[id<3>(i, j + 1, k + 1)]
- + c[26] * in[id<3>(i + 1, j + 1, k + 1)];
- });
- });
+ Queue.submit(kernel);
+ }
+ auto ed_event = Queue.submit(kernel);
+ ed_event.wait();
// Getting read only access to the buffer on the host.
// Implicit barrier waiting for queue to complete the work.
const auto out_h = out_buf.get_access<access::mode::read>();
const auto in_h = in_buf.get_access<access::mode::read>();
double ed = omp_get_wtime();
+ double elapsed = (ed_event.get_profiling_info<info::event_profiling::command_end>() - st_event.get_profiling_info<info::event_profiling::command_start>()) * 1e-9;
std::cout << "elapsed: " << (ed - st) / ITER << std::endl;
- std::cout << "flops: " << N * N * N * 53.0 * ITER / (ed - st) * 1e-9 << std::endl;
+ std::cout << "elapsed: " << elapsed << std::endl;
+ std::cout << "flops: " << N * N * N * 53.0 * ITER / elapsed * 1e-9 << std::endl;
}
void printInfo(device &Device) {