From fe22bcb209bde62cf333232c3765d9c4836c37dd Mon Sep 17 00:00:00 2001
From: Tuowen Zhao <ztuowen@gmail.com>
Date: Sun, 10 Nov 2019 23:13:48 -0700
Subject: Switch stencil benchmark to float and time kernels via SYCL event profiling

---
 CMakeLists.txt |   1 -
 main.cpp       | 115 ++++++++++++++++++++++++++++++---------------------------
 2 files changed, 61 insertions(+), 55 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 70da54a..2df33fe 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,6 @@ set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_CXX_EXTENSIONS OFF)
 find_package(OpenMP REQUIRED)
 
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS} -fsycl")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS} -fsycl -march=native")
 
 get_filename_component(bin_dir "${CMAKE_CXX_COMPILER}" PATH)
diff --git a/main.cpp b/main.cpp
index c92e650..469ebac 100644
--- a/main.cpp
+++ b/main.cpp
@@ -4,7 +4,7 @@
 #include <memory>
 #include <cxxabi.h>
 
-#define N 256
+#define N 64
 #define GZ 16
 #define STRIDE (2*GZ + N)
 #define TILEK 4
@@ -32,7 +32,7 @@ class subgr;
 
 template<typename T>
 void subkrnl(device &Device, size_t G = 256, size_t L = 64) {
-  buffer<T> sg_buf{1};
+  buffer<uint32_t> sg_buf{1};
   buffer<T> sg_info{G};
   queue Queue(Device);
   nd_range<1> NumOfWorkItems(G, L);
@@ -56,80 +56,87 @@ void subkrnl(device &Device, size_t G = 256, size_t L = 64) {
   auto sg_i = sg_info.template get_access<access::mode::read>();
 
   std::cout << "Subgroup size for " << demangle(typeid(T).name()) << ": " << sg_sz[0] << std::endl;
-  for (int i = 0; i < sg_sz[0]; ++i)
-    std::cout << sg_i[i] << " ";
-  std::cout << std::endl;
+  // for (int i = 0; i < sg_sz[0]; ++i)
+  //   std::cout << sg_i[i] << " ";
+  // std::cout << std::endl;
 }
 
 void runSubgroups(device &Device) {
   std::cout << "Running subgroup for different types" << std::endl;
   subkrnl<int>(Device);
   subkrnl<long>(Device);
-  subkrnl<double>(Device);
+  subkrnl<float>(Device);
 }
 
 void run27pt(device &Device) {
   // Creating buffer of 1024 ints to be used inside the kernel code
-  buffer<double, 3> in_buf{range<3>(STRIDE, STRIDE, STRIDE)};
-  buffer<double, 3> out_buf{range<3>(STRIDE, STRIDE, STRIDE)};
-  buffer<double, 1> c_buf{27};
+  buffer<float, 3> in_buf{range<3>(STRIDE, STRIDE, STRIDE)};
+  buffer<float, 3> out_buf{range<3>(STRIDE, STRIDE, STRIDE)};
+  buffer<float, 1> c_buf{27};
 
   // Creating SYCL queue
-  queue Queue(Device);
+  queue Queue(Device, {property::queue::enable_profiling()});
   nd_range<3> NumOfWorkItems(range<3>(N, N, N), range<3>(TILEI, TILEJ, TILEK));
 
-  double st = omp_get_wtime();
-  for (int i = 0; i < ITER; ++i)
+  double st = omp_get_wtime();  // keep double: omp_get_wtime() returns double and a float would truncate the timestamp
+  auto kernel = [&](handler &cgh) {  // command group reused by every Queue.submit(kernel) below
+    // Read access to the input and coefficient buffers, write access to the output buffer
+    auto in = in_buf.get_access<access::mode::read>(cgh);
+    auto out = out_buf.get_access<access::mode::write>(cgh);
+    auto c = c_buf.get_access<access::mode::read>(cgh);
+    // Launch over NumOfWorkItems; each work-item writes one element of out
+    cgh.parallel_for<class FillBuffer>(
+        NumOfWorkItems, [=](nd_item<3> WIid) {
+          uint32_t i = WIid.get_global_id(0) + GZ;  // offset by GZ so the +/-1 neighbour reads stay inside the STRIDE extent
+          uint32_t j = WIid.get_global_id(1) + GZ;
+          uint32_t k = WIid.get_global_id(2) + GZ;
+          // 27-point stencil: sum of c[n] * in over the 3x3x3 neighbourhood of (i, j, k); n walks i fastest, then j, then k
+          out[id<3>(i, j, k)] = c[0] * in[id<3>(i - 1, j - 1, k - 1)]
+                                + c[1] * in[id<3>(i, j - 1, k - 1)]
+                                + c[2] * in[id<3>(i + 1, j - 1, k - 1)]
+                                + c[3] * in[id<3>(i - 1, j, k - 1)]
+                                + c[4] * in[id<3>(i, j, k - 1)]
+                                + c[5] * in[id<3>(i + 1, j, k - 1)]
+                                + c[6] * in[id<3>(i - 1, j + 1, k - 1)]
+                                + c[7] * in[id<3>(i, j + 1, k - 1)]
+                                + c[8] * in[id<3>(i + 1, j + 1, k - 1)]
+                                + c[9] * in[id<3>(i - 1, j - 1, k)]
+                                + c[10] * in[id<3>(i, j - 1, k)]
+                                + c[11] * in[id<3>(i + 1, j - 1, k)]
+                                + c[12] * in[id<3>(i - 1, j, k)]
+                                + c[13] * in[id<3>(i, j, k)]
+                                + c[14] * in[id<3>(i + 1, j, k)]
+                                + c[15] * in[id<3>(i - 1, j + 1, k)]
+                                + c[16] * in[id<3>(i, j + 1, k)]
+                                + c[17] * in[id<3>(i + 1, j + 1, k)]
+                                + c[18] * in[id<3>(i - 1, j - 1, k + 1)]
+                                + c[19] * in[id<3>(i, j - 1, k + 1)]
+                                + c[20] * in[id<3>(i + 1, j - 1, k + 1)]
+                                + c[21] * in[id<3>(i - 1, j, k + 1)]
+                                + c[22] * in[id<3>(i, j, k + 1)]
+                                + c[23] * in[id<3>(i + 1, j, k + 1)]
+                                + c[24] * in[id<3>(i - 1, j + 1, k + 1)]
+                                + c[25] * in[id<3>(i, j + 1, k + 1)]
+                                + c[26] * in[id<3>(i + 1, j + 1, k + 1)];
+        });
+  };
+  auto st_event = Queue.submit(kernel);
+  for (int i = 0; i < ITER - 2; ++i) {
     // Submitting command group(work) to queue
-    Queue.submit([&](handler &cgh) {
-      // Getting write only access to the buffer on a device
-      auto in = in_buf.get_access<access::mode::read>(cgh);
-      auto out = out_buf.get_access<access::mode::write>(cgh);
-      auto c = c_buf.get_access<access::mode::read>(cgh);
-      // Executing kernel
-      cgh.parallel_for<class FillBuffer>(
-          NumOfWorkItems, [=](nd_item<3> WIid) {
-            uint32_t i = WIid.get_global_id(0) + GZ;
-            uint32_t j = WIid.get_global_id(1) + GZ;
-            uint32_t k = WIid.get_global_id(2) + GZ;
-            // Fill buffer with indexes
-            out[id<3>(i, j, k)] = c[0] * in[id<3>(i - 1, j - 1, k - 1)]
-                                  + c[1] * in[id<3>(i, j - 1, k - 1)]
-                                  + c[2] * in[id<3>(i + 1, j - 1, k - 1)]
-                                  + c[3] * in[id<3>(i - 1, j, k - 1)]
-                                  + c[4] * in[id<3>(i, j, k - 1)]
-                                  + c[5] * in[id<3>(i + 1, j, k - 1)]
-                                  + c[6] * in[id<3>(i - 1, j + 1, k - 1)]
-                                  + c[7] * in[id<3>(i, j + 1, k - 1)]
-                                  + c[8] * in[id<3>(i + 1, j + 1, k - 1)]
-                                  + c[9] * in[id<3>(i - 1, j - 1, k)]
-                                  + c[10] * in[id<3>(i, j - 1, k)]
-                                  + c[11] * in[id<3>(i + 1, j - 1, k)]
-                                  + c[12] * in[id<3>(i - 1, j, k)]
-                                  + c[13] * in[id<3>(i, j, k)]
-                                  + c[14] * in[id<3>(i + 1, j, k)]
-                                  + c[15] * in[id<3>(i - 1, j + 1, k)]
-                                  + c[16] * in[id<3>(i, j + 1, k)]
-                                  + c[17] * in[id<3>(i + 1, j + 1, k)]
-                                  + c[18] * in[id<3>(i - 1, j - 1, k + 1)]
-                                  + c[19] * in[id<3>(i, j - 1, k + 1)]
-                                  + c[20] * in[id<3>(i + 1, j - 1, k + 1)]
-                                  + c[21] * in[id<3>(i - 1, j, k + 1)]
-                                  + c[22] * in[id<3>(i, j, k + 1)]
-                                  + c[23] * in[id<3>(i + 1, j, k + 1)]
-                                  + c[24] * in[id<3>(i - 1, j + 1, k + 1)]
-                                  + c[25] * in[id<3>(i, j + 1, k + 1)]
-                                  + c[26] * in[id<3>(i + 1, j + 1, k + 1)];
-          });
-    });
+    Queue.submit(kernel);
+  }
+  auto ed_event = Queue.submit(kernel);
+  ed_event.wait();
 
   // Getting read only access to the buffer on the host.
   // Implicit barrier waiting for queue to complete the work.
   const auto out_h = out_buf.get_access<access::mode::read>();
   const auto in_h = in_buf.get_access<access::mode::read>();
   double ed = omp_get_wtime();
+  double elapsed = (ed_event.get_profiling_info<info::event_profiling::command_end>() - st_event.get_profiling_info<info::event_profiling::command_start>()) * 1e-9;
   std::cout << "elapsed: " << (ed - st) / ITER << std::endl;
-  std::cout << "flops: " << N * N * N * 53.0 * ITER / (ed - st) * 1e-9 << std::endl;
+  std::cout << "elapsed: " << elapsed << std::endl;
+  std::cout << "flops: " << N * N * N * 53.0 * ITER / elapsed * 1e-9 << std::endl;
 }
 
 void printInfo(device &Device) {
-- 
cgit v1.2.3-70-g09d2