diff options
author | Tuowen Zhao <ztuowen@gmail.com> | 2020-10-22 20:19:37 -0600 |
---|---|---|
committer | Tuowen Zhao <ztuowen@gmail.com> | 2020-10-22 20:19:37 -0600 |
commit | c699af920419025f86f284917385f8de5efd8fd3 (patch) | |
tree | 4c920ca9c8ee063913921623a7f8a4647d30bdbc | |
parent | 5ae0da8484744859e09fad869b44dccdb5f66f2f (diff) | |
download | sycltest-c699af920419025f86f284917385f8de5efd8fd3.tar.gz sycltest-c699af920419025f86f284917385f8de5efd8fd3.tar.bz2 sycltest-c699af920419025f86f284917385f8de5efd8fd3.zip |
update
-rw-r--r-- | .gitignore | 4 | ||||
-rw-r--r-- | CMakeLists.txt | 15 | ||||
-rw-r--r-- | Makefile | 11 | ||||
-rw-r--r-- | main.cpp | 37 | ||||
-rw-r--r-- | subgroup.cpp | 94 |
5 files changed, 141 insertions, 20 deletions
@@ -5,11 +5,15 @@ cmake-build-* # Vim stuff *.swp +# vscode +.vscode + # My CMake build build # Test artifacts sycltest +subgroup *.o *.s *.ll diff --git a/CMakeLists.txt b/CMakeLists.txt index 2df33fe..61bb869 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,16 @@ project(sycltest) set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_EXTENSIONS OFF) + +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag(-fsycl HAS_SYCL) + +if (NOT HAS_SYCL) + message(FATAL_ERROR "Compiler has no sycl support") +endif () + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + find_package(OpenMP REQUIRED) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS} -fsycl -march=native") @@ -16,6 +26,7 @@ message(STATUS "SYCL compiler install dir: ${install_dir}") set(CMAKE_BUILD_RPATH "${install_dir}/lib") add_executable(sycltest main.cpp) -target_include_directories(sycltest PUBLIC "${install_dir}/include") -target_link_directories(sycltest PUBLIC "${install_dir}/lib") target_link_libraries(sycltest stdc++ sycl OpenCL) + +add_executable(subgroup subgroup.cpp) +target_link_libraries(subgroup stdc++ sycl OpenCL) @@ -1,13 +1,16 @@ -SYCL=/home/ztuowen/sycl/install +SYCL=/opt/intel/oneapi/compiler/latest/linux CXX=$(SYCL)/bin/clang++ -LIBDIR=$(SYCL)/lib -all:sycltest +all:sycltest subgroup sycltest:main.cpp - $(CXX) -O3 -std=c++14 -lstdc++ -fopenmp -fsycl -o $@ $^ -lsycl -lOpenCL -Wl,--rpath=$(LIBDIR) + $(CXX) -O3 -std=c++14 -lstdc++ -fsycl -o $@ $^ -lsycl -lOpenCL + +subgroup:subgroup.cpp + $(CXX) -O3 -std=c++14 -lstdc++ -fsycl -o $@ $^ -lsycl -lOpenCL clean: -rm sycltest + -rm subgroup .PHONY:all clean @@ -1,16 +1,16 @@ #include <CL/sycl.hpp> -#include <omp.h> +// #include <omp.h> #include <typeinfo> #include <memory> #include <cxxabi.h> -#define N 64 +#define N 256 #define GZ 16 #define STRIDE (2*GZ + N) #define TILEK 4 #define TILEJ 4 #define TILEI 16 -#define ITER 10 +#define ITER 1000 std::string demangle(const char *name) { @@ -36,8 +36,8 @@ public: SGfunctor(accessor<uint32_t, 1, access::mode::write, access::target::global_buffer> sg_sz, accessor<T, 1, access::mode::write, access::target::global_buffer> sg_i) : sg_sz(sg_sz), sg_i(sg_i) {} - [[cl::intel_reqd_sub_group_size(16)]] - void operator()(nd_item<1> NdItem) { + [[intel::reqd_sub_group_size(16)]] + void operator()(nd_item<1> NdItem) const { intel::sub_group SG = NdItem.get_sub_group(); uint32_t wggid = NdItem.get_global_id(0); uint32_t sgid = SG.get_local_id().get(0); @@ -94,7 +94,7 @@ void run27pt(device &Device) { queue Queue(Device, {property::queue::enable_profiling()}); nd_range<3> NumOfWorkItems(range<3>(N, N, N), range<3>(TILEI, TILEJ, TILEK)); - float st = omp_get_wtime(); + // float st = omp_get_wtime(); auto kernel = [&](handler &cgh) { // Getting write only access to the buffer on a device auto in = in_buf.get_access<access::mode::read>(cgh); @@ -148,12 +148,20 @@ void run27pt(device &Device) { // Implicit barrier waiting for queue to complete the work. const auto out_h = out_buf.get_access<access::mode::read>(); const auto in_h = in_buf.get_access<access::mode::read>(); - double ed = omp_get_wtime(); - double elapsed = (ed_event.get_profiling_info<info::event_profiling::command_end>() - - st_event.get_profiling_info<info::event_profiling::command_start>()) * 1e-9; - std::cout << "elapsed: " << (ed - st) / ITER << std::endl; - std::cout << "elapsed: " << elapsed << std::endl; - std::cout << "flops: " << N * N * N * 53.0 * ITER / elapsed * 1e-9 << std::endl; + { + // double ed = omp_get_wtime(); + double elapsed = (ed_event.get_profiling_info<info::event_profiling::command_end>() - + st_event.get_profiling_info<info::event_profiling::command_start>()) * 1e-9; + std::cout << "elapsed: " << elapsed / ITER << std::endl; + std::cout << "flops: " << N * N * N * 53.0 * ITER / elapsed * 1e-9 << std::endl; + } + { + // double ed = omp_get_wtime(); + double elapsed = (ed_event.get_profiling_info<info::event_profiling::command_end>() - + ed_event.get_profiling_info<info::event_profiling::command_start>()) * 1e-9; + std::cout << "elapsed: " << elapsed << std::endl; + std::cout << "flops: " << N * N * N * 53.0 / elapsed * 1e-9 << std::endl; + } } void printInfo(device &Device) { @@ -167,7 +175,6 @@ void printInfo(device &Device) { std::cout << "Compute units: " << dot_num_groups << std::endl; std::cout << "Workgroup size: " << dot_wgsize << std::endl; - std::cout << "Maximum subgroup size: " << max_num_sg << std::endl; } int main() { @@ -189,11 +196,13 @@ int main() { // Host device is not for compute if (Device.is_host()) continue; + + const std::string DeviceName = Device.get_info<info::device::name>(); std::cout << "Device [" << DeviceNumber << "]:" << std::endl; try { printInfo(Device); - if (Device.is_gpu()) { + if (Device.is_gpu() && (DeviceName.find("HD Graphics NEO") != std::string::npos)) { run27pt(Device); runSubgroups(Device); } else { diff --git a/subgroup.cpp b/subgroup.cpp new file mode 100644 index 0000000..1e4bc11 --- /dev/null +++ b/subgroup.cpp @@ -0,0 +1,94 @@ +// +// Created by Tuowen Zhao on 11/11/19. +// + +#include <CL/sycl.hpp> +#include <vector> + +#define N 1024 + +using namespace cl::sycl; + +template<typename T> +inline void dev_shl(intel::sub_group &SG, T &res, T l, T r, unsigned kn, unsigned cw, unsigned cid) { + T l_tmp = SG.shuffle_down(l, cw - (kn)); + T r_tmp = SG.shuffle_up(r, kn); + res = (cid) < kn? l_tmp : r_tmp; +} + +void printInfo(device &Device) { + std::cout << "Using OpenCL " << (Device.is_cpu() ? "CPU" : "GPU") + << " device {" << Device.get_info<info::device::name>() + << "} from {" << Device.get_info<info::device::vendor>() << "}" << std::endl; + + auto dot_num_groups = Device.get_info<info::device::max_compute_units>(); + auto dot_wgsize = Device.get_info<info::device::max_work_group_size>(); + auto max_num_sg = Device.get_info<info::device::max_num_sub_groups>(); + + std::cout << "Compute units: " << dot_num_groups << std::endl; + std::cout << "Workgroup size: " << dot_wgsize << std::endl; +} + +int main() { + class NEOGPUDeviceSelector : public device_selector { + public: + int operator()(const device &Device) const override { + const std::string DeviceName = Device.get_info<info::device::name>(); + const std::string DeviceVendor = Device.get_info<info::device::vendor>(); + + return Device.is_gpu() && DeviceName.find("HD Graphics NEO") ? 1 : -1; + } + }; + + device dev = device(NEOGPUDeviceSelector()); + printInfo(dev); + std::vector<int> a(N); + std::vector<int> b(N); + std::vector<int> c(N); + + buffer<int, 1> A((int *)a.data(), range<1>(N), {property::buffer::use_host_ptr()}); + buffer<int, 1> B((int *)b.data(), range<1>(N), {property::buffer::use_host_ptr()}); + buffer<int, 1> C((int *)c.data(), range<1>(N), {property::buffer::use_host_ptr()}); + + for (int i = 0; i < N; ++i) { + a[i] = i; + b[i] = N - i; + } + + queue Queue(dev, {property::queue::enable_profiling()}); + + nd_range<1> nworkitem(range<1>(N), range<1>(64)); + range<1> nwork{N}; + int len = 1024; + + auto kernel = [&](handler &cgh) { + // Getting write only access to the buffer on a device + auto a = A.get_access<access::mode::read>(cgh); + auto b = B.get_access<access::mode::read>(cgh); + auto c = C.get_access<access::mode::write>(cgh); + + cgh.parallel_for<class FillBuffer>( + nworkitem, [=](nd_item<1> WIid) { + intel::sub_group SG = WIid.get_sub_group(); + int sglid = SG.get_local_id().get(0); + uint32_t i = WIid.get_global_id(0); + int* cp = &c[0]; + const int* ap = &a[0]; + const int* bp = &b[0]; +// c[i] = a[i] + b[i]; + dev_shl(SG, cp[i], ap[i], bp[i], 2, 8, sglid & 7); + c[i] = WIid.get_group(0); + }); + }; + std::cout << "submitting" << std::endl; + auto e = Queue.submit(kernel); + std::cout << "waiting" << std::endl; + e.wait(); + std::cout << "waited" << std::endl; + auto c_r = C.get_access<access::mode::read>(); + +// for (int i = 0; i < N; ++i) +// std::cout << i << " + " << N - i << " = " << c[i] << std::endl; + + return 0; +} |