diff options
| -rw-r--r-- | .gitignore | 15 | ||||
| -rw-r--r-- | CMakeLists.txt | 22 | ||||
| -rw-r--r-- | Makefile | 13 | ||||
| -rw-r--r-- | main.cpp | 184 | 
4 files changed, 234 insertions, 0 deletions
| diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7285b8b --- /dev/null +++ b/.gitignore @@ -0,0 +1,15 @@ +# CLion stuff +.idea +cmake-build-* + +# Vim stuff +*.swp + +# My CMake build +build + +# Test artifacts +sycltest +*.o +*.s +*.ll diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..70da54a --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,22 @@ +cmake_minimum_required(VERSION 3.14) +project(sycltest) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_EXTENSIONS OFF) +find_package(OpenMP REQUIRED) + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS} -fsycl") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS} -fsycl -march=native") + +get_filename_component(bin_dir "${CMAKE_CXX_COMPILER}" PATH) +get_filename_component(install_dir "${bin_dir}" PATH) + +message(STATUS "Setting include/lib path according to compiler path: ${CMAKE_CXX_COMPILER}") +message(STATUS "SYCL compiler install dir: ${install_dir}") + +set(CMAKE_BUILD_RPATH "${install_dir}/lib") + +add_executable(sycltest main.cpp) +target_include_directories(sycltest PUBLIC "${install_dir}/include") +target_link_directories(sycltest PUBLIC "${install_dir}/lib") +target_link_libraries(sycltest stdc++ sycl OpenCL) diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..ec76229 --- /dev/null +++ b/Makefile @@ -0,0 +1,13 @@ +SYCL=/home/ztuowen/sycl/install +CXX=$(SYCL)/bin/clang++ +LIBDIR=$(SYCL)/lib + +all:sycltest + +sycltest:main.cpp +	$(CXX) -O3 -std=c++14 -lstdc++ -fopenmp -fsycl -o $@ $^ -lsycl -lOpenCL -Wl,--rpath=$(LIBDIR) + +clean: +	-rm sycltest + +.PHONY:all clean diff --git a/main.cpp b/main.cpp new file mode 100644 index 0000000..c92e650 --- /dev/null +++ b/main.cpp @@ -0,0 +1,184 @@ +#include <CL/sycl.hpp> +#include <omp.h> +#include <typeinfo> +#include <memory> +#include <cxxabi.h> + +#define N 256 +#define GZ 16 +#define STRIDE (2*GZ + N) +#define TILEK 4 +#define TILEJ 4 +#define TILEI 16 +#define ITER 10 + +std::string demangle(const char* name) { + +  int status = -4; // some arbitrary value to eliminate the compiler warning + +  // enable c++11 by passing the flag -std=c++11 to g++ +  std::unique_ptr<char, void(*)(void*)> res { +      abi::__cxa_demangle(name, NULL, NULL, &status), +      std::free +  }; + +  return (status==0) ? res.get() : name ; +} + +using namespace cl::sycl; + +template<typename T> +class subgr; + +template<typename T> +void subkrnl(device &Device, size_t G = 256, size_t L = 64) { +  buffer<T> sg_buf{1}; +  buffer<T> sg_info{G}; +  queue Queue(Device); +  nd_range<1> NumOfWorkItems(G, L); +  Queue.submit([&](handler &cgh) { +    auto sg_sz = sg_buf.template get_access<access::mode::write>(cgh); +    auto sg_i = sg_info.template get_access<access::mode::write>(cgh); + +    cgh.parallel_for<subgr<T>>(NumOfWorkItems, [=](nd_item<1> NdItem) { +      intel::sub_group SG = NdItem.get_sub_group(); +      uint32_t wggid = NdItem.get_global_id(0); +      uint32_t sgid = SG.get_local_id().get(0); +      if (wggid == 0) +        sg_sz[0] = SG.get_max_local_range()[0]; +      sgid = SG.shuffle_up(sgid, 2); +      sg_i[wggid] = sgid; +    }); +  }); + +  Queue.wait(); +  auto sg_sz = sg_buf.template get_access<access::mode::read>(); +  auto sg_i = sg_info.template get_access<access::mode::read>(); + +  std::cout << "Subgroup size for " << demangle(typeid(T).name()) << ": " << sg_sz[0] << std::endl; +  for (int i = 0; i < sg_sz[0]; ++i) +    std::cout << sg_i[i] << " "; +  std::cout << std::endl; +} + +void runSubgroups(device &Device) { +  std::cout << "Running subgroup for different types" << std::endl; +  subkrnl<int>(Device); +  subkrnl<long>(Device); +  subkrnl<double>(Device); +} + +void run27pt(device &Device) { +  // Creating buffer of 1024 ints to be used inside the kernel code +  buffer<double, 3> in_buf{range<3>(STRIDE, STRIDE, STRIDE)}; +  buffer<double, 3> out_buf{range<3>(STRIDE, STRIDE, STRIDE)}; +  buffer<double, 1> c_buf{27}; + +  // Creating SYCL queue +  queue Queue(Device); +  nd_range<3> NumOfWorkItems(range<3>(N, N, N), range<3>(TILEI, TILEJ, TILEK)); + +  double st = omp_get_wtime(); +  for (int i = 0; i < ITER; ++i) +    // Submitting command group(work) to queue +    Queue.submit([&](handler &cgh) { +      // Getting write only access to the buffer on a device +      auto in = in_buf.get_access<access::mode::read>(cgh); +      auto out = out_buf.get_access<access::mode::write>(cgh); +      auto c = c_buf.get_access<access::mode::read>(cgh); +      // Executing kernel +      cgh.parallel_for<class FillBuffer>( +          NumOfWorkItems, [=](nd_item<3> WIid) { +            uint32_t i = WIid.get_global_id(0) + GZ; +            uint32_t j = WIid.get_global_id(1) + GZ; +            uint32_t k = WIid.get_global_id(2) + GZ; +            // Fill buffer with indexes +            out[id<3>(i, j, k)] = c[0] * in[id<3>(i - 1, j - 1, k - 1)] +                                  + c[1] * in[id<3>(i, j - 1, k - 1)] +                                  + c[2] * in[id<3>(i + 1, j - 1, k - 1)] +                                  + c[3] * in[id<3>(i - 1, j, k - 1)] +                                  + c[4] * in[id<3>(i, j, k - 1)] +                                  + c[5] * in[id<3>(i + 1, j, k - 1)] +                                  + c[6] * in[id<3>(i - 1, j + 1, k - 1)] +                                  + c[7] * in[id<3>(i, j + 1, k - 1)] +                                  + c[8] * in[id<3>(i + 1, j + 1, k - 1)] +                                  + c[9] * in[id<3>(i - 1, j - 1, k)] +                                  + c[10] * in[id<3>(i, j - 1, k)] +                                  + c[11] * in[id<3>(i + 1, j - 1, k)] +                                  + c[12] * in[id<3>(i - 1, j, k)] +                                  + c[13] * in[id<3>(i, j, k)] +                                  + c[14] * in[id<3>(i + 1, j, k)] +                                  + c[15] * in[id<3>(i - 1, j + 1, k)] +                                  + c[16] * in[id<3>(i, j + 1, k)] +                                  + c[17] * in[id<3>(i + 1, j + 1, k)] +                                  + c[18] * in[id<3>(i - 1, j - 1, k + 1)] +                                  + c[19] * in[id<3>(i, j - 1, k + 1)] +                                  + c[20] * in[id<3>(i + 1, j - 1, k + 1)] +                                  + c[21] * in[id<3>(i - 1, j, k + 1)] +                                  + c[22] * in[id<3>(i, j, k + 1)] +                                  + c[23] * in[id<3>(i + 1, j, k + 1)] +                                  + c[24] * in[id<3>(i - 1, j + 1, k + 1)] +                                  + c[25] * in[id<3>(i, j + 1, k + 1)] +                                  + c[26] * in[id<3>(i + 1, j + 1, k + 1)]; +          }); +    }); + +  // Getting read only access to the buffer on the host. +  // Implicit barrier waiting for queue to complete the work. +  const auto out_h = out_buf.get_access<access::mode::read>(); +  const auto in_h = in_buf.get_access<access::mode::read>(); +  double ed = omp_get_wtime(); +  std::cout << "elapsed: " << (ed - st) / ITER << std::endl; +  std::cout << "flops: " << N * N * N * 53.0 * ITER / (ed - st) * 1e-9 << std::endl; +} + +void printInfo(device &Device) { +  std::cout << "Using OpenCL device {" << Device.get_info<info::device::name>() << +            "} from {" << Device.get_info<info::device::vendor>() << "}" << std::endl; +  std::cout << "I'm a " << (Device.is_cpu() ? "CPU" : "GPU") << std::endl; + +  auto dot_num_groups = Device.get_info<info::device::max_compute_units>(); +  auto dot_wgsize = Device.get_info<info::device::max_work_group_size>(); +  auto max_num_sg = Device.get_info<info::device::max_num_sub_groups>(); + +  std::cout << "Compute units: " << dot_num_groups << std::endl; +  std::cout << "Workgroup size: " << dot_wgsize << std::endl; +  std::cout << "Maximum subgroup size: " << max_num_sg << std::endl; +} + +int main() { +  // This is an example of using a device_selector +  // When instantiate a device use device(NEOGPUDeviceSelector) +  class NEOGPUDeviceSelector : public device_selector { +  public: +    int operator()(const device &Device) const override { +      const std::string DeviceName = Device.get_info<info::device::name>(); +      const std::string DeviceVendor = Device.get_info<info::device::vendor>(); + +      return Device.is_gpu() && DeviceName.find("HD Graphics NEO") ? 1 : -1; +    } +  }; + +  int DeviceNumber = 0; +  for (device &Device : device::get_devices()) { + +    // Host device is not for compute +    if (Device.is_host()) +      continue; + +    std::cout << "Device [" << DeviceNumber << "]:" << std::endl; +    try { +      printInfo(Device); +      if (Device.is_gpu()) { +        run27pt(Device); +        runSubgroups(Device); +      } else { +        std::cout << "CPU devices possibly not supported" << std::endl; +      } +    } catch (invalid_parameter_error &E) { +      std::cout << E.what() << std::endl; +    } +    std::cout << "==================" << std::endl; +    ++DeviceNumber; +  } +} | 
