summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTuowen Zhao <ztuowen@gmail.com>2019-04-21 17:07:07 -0600
committerTuowen Zhao <ztuowen@gmail.com>2019-04-21 17:07:07 -0600
commit0d9d17554280bf9d19ce2db84bd29b5c5743d454 (patch)
tree49f78b1e7dc4937483245c13053f03fc584ac83c
downloadsycltest-0d9d17554280bf9d19ce2db84bd29b5c5743d454.tar.gz
sycltest-0d9d17554280bf9d19ce2db84bd29b5c5743d454.tar.bz2
sycltest-0d9d17554280bf9d19ce2db84bd29b5c5743d454.zip
Initial commit
-rw-r--r--.gitignore15
-rw-r--r--CMakeLists.txt22
-rw-r--r--Makefile13
-rw-r--r--main.cpp184
4 files changed, 234 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7285b8b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,15 @@
+# CLion stuff
+.idea
+cmake-build-*
+
+# Vim stuff
+*.swp
+
+# My CMake build
+build
+
+# Test artifacts
+sycltest
+*.o
+*.s
+*.ll
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..70da54a
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,22 @@
+cmake_minimum_required(VERSION 3.14)
+project(sycltest)
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_EXTENSIONS OFF)
+find_package(OpenMP REQUIRED)
+
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS} -fsycl")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS} -fsycl -march=native")
+
+get_filename_component(bin_dir "${CMAKE_CXX_COMPILER}" PATH)
+get_filename_component(install_dir "${bin_dir}" PATH)
+
+message(STATUS "Setting include/lib path according to compiler path: ${CMAKE_CXX_COMPILER}")
+message(STATUS "SYCL compiler install dir: ${install_dir}")
+
+set(CMAKE_BUILD_RPATH "${install_dir}/lib")
+
+add_executable(sycltest main.cpp)
+target_include_directories(sycltest PUBLIC "${install_dir}/include")
+target_link_directories(sycltest PUBLIC "${install_dir}/lib")
+target_link_libraries(sycltest stdc++ sycl OpenCL)
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..ec76229
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,13 @@
+SYCL=/home/ztuowen/sycl/install
+CXX=$(SYCL)/bin/clang++
+LIBDIR=$(SYCL)/lib
+
+all:sycltest
+
+sycltest:main.cpp
+ $(CXX) -O3 -std=c++14 -lstdc++ -fopenmp -fsycl -o $@ $^ -lsycl -lOpenCL -Wl,--rpath=$(LIBDIR)
+
+clean:
+ -rm sycltest
+
+.PHONY:all clean
diff --git a/main.cpp b/main.cpp
new file mode 100644
index 0000000..c92e650
--- /dev/null
+++ b/main.cpp
@@ -0,0 +1,184 @@
+#include <CL/sycl.hpp>
+#include <omp.h>
+#include <typeinfo>
+#include <memory>
+#include <cxxabi.h>
+
+#define N 256
+#define GZ 16
+#define STRIDE (2*GZ + N)
+#define TILEK 4
+#define TILEJ 4
+#define TILEI 16
+#define ITER 10
+
+std::string demangle(const char* name) {
+
+ int status = -4; // some arbitrary value to eliminate the compiler warning
+
+ // enable c++11 by passing the flag -std=c++11 to g++
+ std::unique_ptr<char, void(*)(void*)> res {
+ abi::__cxa_demangle(name, NULL, NULL, &status),
+ std::free
+ };
+
+ return (status==0) ? res.get() : name ;
+}
+
+using namespace cl::sycl;
+
+template<typename T>
+class subgr;
+
+template<typename T>
+void subkrnl(device &Device, size_t G = 256, size_t L = 64) {
+ buffer<T> sg_buf{1};
+ buffer<T> sg_info{G};
+ queue Queue(Device);
+ nd_range<1> NumOfWorkItems(G, L);
+ Queue.submit([&](handler &cgh) {
+ auto sg_sz = sg_buf.template get_access<access::mode::write>(cgh);
+ auto sg_i = sg_info.template get_access<access::mode::write>(cgh);
+
+ cgh.parallel_for<subgr<T>>(NumOfWorkItems, [=](nd_item<1> NdItem) {
+ intel::sub_group SG = NdItem.get_sub_group();
+ uint32_t wggid = NdItem.get_global_id(0);
+ uint32_t sgid = SG.get_local_id().get(0);
+ if (wggid == 0)
+ sg_sz[0] = SG.get_max_local_range()[0];
+ sgid = SG.shuffle_up(sgid, 2);
+ sg_i[wggid] = sgid;
+ });
+ });
+
+ Queue.wait();
+ auto sg_sz = sg_buf.template get_access<access::mode::read>();
+ auto sg_i = sg_info.template get_access<access::mode::read>();
+
+ std::cout << "Subgroup size for " << demangle(typeid(T).name()) << ": " << sg_sz[0] << std::endl;
+ for (int i = 0; i < sg_sz[0]; ++i)
+ std::cout << sg_i[i] << " ";
+ std::cout << std::endl;
+}
+
+void runSubgroups(device &Device) {
+ std::cout << "Running subgroup for different types" << std::endl;
+ subkrnl<int>(Device);
+ subkrnl<long>(Device);
+ subkrnl<double>(Device);
+}
+
+void run27pt(device &Device) {
+ // Creating buffer of 1024 ints to be used inside the kernel code
+ buffer<double, 3> in_buf{range<3>(STRIDE, STRIDE, STRIDE)};
+ buffer<double, 3> out_buf{range<3>(STRIDE, STRIDE, STRIDE)};
+ buffer<double, 1> c_buf{27};
+
+ // Creating SYCL queue
+ queue Queue(Device);
+ nd_range<3> NumOfWorkItems(range<3>(N, N, N), range<3>(TILEI, TILEJ, TILEK));
+
+ double st = omp_get_wtime();
+ for (int i = 0; i < ITER; ++i)
+ // Submitting command group(work) to queue
+ Queue.submit([&](handler &cgh) {
+ // Getting write only access to the buffer on a device
+ auto in = in_buf.get_access<access::mode::read>(cgh);
+ auto out = out_buf.get_access<access::mode::write>(cgh);
+ auto c = c_buf.get_access<access::mode::read>(cgh);
+ // Executing kernel
+ cgh.parallel_for<class FillBuffer>(
+ NumOfWorkItems, [=](nd_item<3> WIid) {
+ uint32_t i = WIid.get_global_id(0) + GZ;
+ uint32_t j = WIid.get_global_id(1) + GZ;
+ uint32_t k = WIid.get_global_id(2) + GZ;
+ // Fill buffer with indexes
+ out[id<3>(i, j, k)] = c[0] * in[id<3>(i - 1, j - 1, k - 1)]
+ + c[1] * in[id<3>(i, j - 1, k - 1)]
+ + c[2] * in[id<3>(i + 1, j - 1, k - 1)]
+ + c[3] * in[id<3>(i - 1, j, k - 1)]
+ + c[4] * in[id<3>(i, j, k - 1)]
+ + c[5] * in[id<3>(i + 1, j, k - 1)]
+ + c[6] * in[id<3>(i - 1, j + 1, k - 1)]
+ + c[7] * in[id<3>(i, j + 1, k - 1)]
+ + c[8] * in[id<3>(i + 1, j + 1, k - 1)]
+ + c[9] * in[id<3>(i - 1, j - 1, k)]
+ + c[10] * in[id<3>(i, j - 1, k)]
+ + c[11] * in[id<3>(i + 1, j - 1, k)]
+ + c[12] * in[id<3>(i - 1, j, k)]
+ + c[13] * in[id<3>(i, j, k)]
+ + c[14] * in[id<3>(i + 1, j, k)]
+ + c[15] * in[id<3>(i - 1, j + 1, k)]
+ + c[16] * in[id<3>(i, j + 1, k)]
+ + c[17] * in[id<3>(i + 1, j + 1, k)]
+ + c[18] * in[id<3>(i - 1, j - 1, k + 1)]
+ + c[19] * in[id<3>(i, j - 1, k + 1)]
+ + c[20] * in[id<3>(i + 1, j - 1, k + 1)]
+ + c[21] * in[id<3>(i - 1, j, k + 1)]
+ + c[22] * in[id<3>(i, j, k + 1)]
+ + c[23] * in[id<3>(i + 1, j, k + 1)]
+ + c[24] * in[id<3>(i - 1, j + 1, k + 1)]
+ + c[25] * in[id<3>(i, j + 1, k + 1)]
+ + c[26] * in[id<3>(i + 1, j + 1, k + 1)];
+ });
+ });
+
+ // Getting read only access to the buffer on the host.
+ // Implicit barrier waiting for queue to complete the work.
+ const auto out_h = out_buf.get_access<access::mode::read>();
+ const auto in_h = in_buf.get_access<access::mode::read>();
+ double ed = omp_get_wtime();
+ std::cout << "elapsed: " << (ed - st) / ITER << std::endl;
+ std::cout << "flops: " << N * N * N * 53.0 * ITER / (ed - st) * 1e-9 << std::endl;
+}
+
+void printInfo(device &Device) {
+ std::cout << "Using OpenCL device {" << Device.get_info<info::device::name>() <<
+ "} from {" << Device.get_info<info::device::vendor>() << "}" << std::endl;
+ std::cout << "I'm a " << (Device.is_cpu() ? "CPU" : "GPU") << std::endl;
+
+ auto dot_num_groups = Device.get_info<info::device::max_compute_units>();
+ auto dot_wgsize = Device.get_info<info::device::max_work_group_size>();
+ auto max_num_sg = Device.get_info<info::device::max_num_sub_groups>();
+
+ std::cout << "Compute units: " << dot_num_groups << std::endl;
+ std::cout << "Workgroup size: " << dot_wgsize << std::endl;
+ std::cout << "Maximum subgroup size: " << max_num_sg << std::endl;
+}
+
+int main() {
+ // This is an example of using a device_selector
+ // When instantiate a device use device(NEOGPUDeviceSelector)
+ class NEOGPUDeviceSelector : public device_selector {
+ public:
+ int operator()(const device &Device) const override {
+ const std::string DeviceName = Device.get_info<info::device::name>();
+ const std::string DeviceVendor = Device.get_info<info::device::vendor>();
+
+ return Device.is_gpu() && DeviceName.find("HD Graphics NEO") ? 1 : -1;
+ }
+ };
+
+ int DeviceNumber = 0;
+ for (device &Device : device::get_devices()) {
+
+ // Host device is not for compute
+ if (Device.is_host())
+ continue;
+
+ std::cout << "Device [" << DeviceNumber << "]:" << std::endl;
+ try {
+ printInfo(Device);
+ if (Device.is_gpu()) {
+ run27pt(Device);
+ runSubgroups(Device);
+ } else {
+ std::cout << "CPU devices possibly not supported" << std::endl;
+ }
+ } catch (invalid_parameter_error &E) {
+ std::cout << E.what() << std::endl;
+ }
+ std::cout << "==================" << std::endl;
+ ++DeviceNumber;
+ }
+}