// NOTE: the original include list was garbled during extraction; these are the
// headers the code below actually requires.
#include <CL/sycl.hpp>
#include <cxxabi.h>
#include <omp.h>
#include <iostream>
#include <memory>

#define N 64
#define GZ 16
#define STRIDE (2 * GZ + N)
#define TILEK 4
#define TILEJ 4
#define TILEI 16
#define ITER 10

std::string demangle(const char *name) {
  int status = -4; // some arbitrary value to eliminate the compiler warning
  // enable c++11 by passing the flag -std=c++11 to g++
  std::unique_ptr<char, void (*)(void *)> res{
      abi::__cxa_demangle(name, NULL, NULL, &status), std::free};
  return (status == 0) ? res.get() : name;
}

using namespace cl::sycl;

template <typename T> class subgr;

template <typename T>
void subkrnl(device &Device, size_t G = 256, size_t L = 64) {
  buffer<size_t> sg_buf{1};
  buffer<T> sg_info{G};
  queue Queue(Device);
  nd_range<1> NumOfWorkItems(G, L);
  Queue.submit([&](handler &cgh) {
    auto sg_sz = sg_buf.template get_access<access::mode::write>(cgh);
    auto sg_i = sg_info.template get_access<access::mode::write>(cgh);
    cgh.parallel_for<subgr<T>>(NumOfWorkItems, [=](nd_item<1> NdItem) {
      intel::sub_group SG = NdItem.get_sub_group();
      uint32_t wggid = NdItem.get_global_id(0);
      uint32_t sgid = SG.get_local_id().get(0);
      if (wggid == 0)
        sg_sz[0] = SG.get_max_local_range()[0];
      sgid = SG.shuffle_up(sgid, 2);
      sg_i[wggid] = sgid;
    });
  });
  Queue.wait();
  auto sg_sz = sg_buf.template get_access<access::mode::read>();
  auto sg_i = sg_info.template get_access<access::mode::read>();
  std::cout << "Subgroup size for " << demangle(typeid(T).name()) << ": "
            << sg_sz[0] << std::endl;
  // for (int i = 0; i < sg_sz[0]; ++i)
  //   std::cout << sg_i[i] << " ";
  // std::cout << std::endl;
}

void runSubgroups(device &Device) {
  std::cout << "Running subgroup for different types" << std::endl;
  // The concrete element types below are assumed; the original template
  // arguments were lost during extraction.
  subkrnl<int>(Device);
  subkrnl<float>(Device);
  subkrnl<double>(Device);
}

void run27pt(device &Device) {
  // Input/output grids with GZ-wide ghost zones plus the 27 stencil
  // coefficients (float element type assumed).
  buffer<float, 3> in_buf{range<3>(STRIDE, STRIDE, STRIDE)};
  buffer<float, 3> out_buf{range<3>(STRIDE, STRIDE, STRIDE)};
  buffer<float> c_buf{27};
  // Creating a SYCL queue with profiling enabled
  queue Queue(Device, {property::queue::enable_profiling()});
  nd_range<3> NumOfWorkItems(range<3>(N, N, N), range<3>(TILEI, TILEJ, TILEK));
  double st = omp_get_wtime();
  auto kernel = [&](handler &cgh) {
    // Read access to the input grid and coefficients, write access to the
    // output grid on the device
    auto in = in_buf.get_access<access::mode::read>(cgh);
    auto out = out_buf.get_access<access::mode::write>(cgh);
    auto c = c_buf.get_access<access::mode::read>(cgh);
    // Executing kernel ("class pt27" is a placeholder kernel name; the
    // original name type was lost)
    cgh.parallel_for<class pt27>(NumOfWorkItems, [=](nd_item<3> WIid) {
      uint32_t i = WIid.get_global_id(0) + GZ;
      uint32_t j = WIid.get_global_id(1) + GZ;
      uint32_t k = WIid.get_global_id(2) + GZ;
      // Apply the 27-point stencil
      out[id<3>(i, j, k)] =
          c[0] * in[id<3>(i - 1, j - 1, k - 1)] +
          c[1] * in[id<3>(i, j - 1, k - 1)] +
          c[2] * in[id<3>(i + 1, j - 1, k - 1)] +
          c[3] * in[id<3>(i - 1, j, k - 1)] +
          c[4] * in[id<3>(i, j, k - 1)] +
          c[5] * in[id<3>(i + 1, j, k - 1)] +
          c[6] * in[id<3>(i - 1, j + 1, k - 1)] +
          c[7] * in[id<3>(i, j + 1, k - 1)] +
          c[8] * in[id<3>(i + 1, j + 1, k - 1)] +
          c[9] * in[id<3>(i - 1, j - 1, k)] +
          c[10] * in[id<3>(i, j - 1, k)] +
          c[11] * in[id<3>(i + 1, j - 1, k)] +
          c[12] * in[id<3>(i - 1, j, k)] +
          c[13] * in[id<3>(i, j, k)] +
          c[14] * in[id<3>(i + 1, j, k)] +
          c[15] * in[id<3>(i - 1, j + 1, k)] +
          c[16] * in[id<3>(i, j + 1, k)] +
          c[17] * in[id<3>(i + 1, j + 1, k)] +
          c[18] * in[id<3>(i - 1, j - 1, k + 1)] +
          c[19] * in[id<3>(i, j - 1, k + 1)] +
          c[20] * in[id<3>(i + 1, j - 1, k + 1)] +
          c[21] * in[id<3>(i - 1, j, k + 1)] +
          c[22] * in[id<3>(i, j, k + 1)] +
          c[23] * in[id<3>(i + 1, j, k + 1)] +
          c[24] * in[id<3>(i - 1, j + 1, k + 1)] +
          c[25] * in[id<3>(i, j + 1, k + 1)] +
          c[26] * in[id<3>(i + 1, j + 1, k + 1)];
    });
  };
  auto st_event = Queue.submit(kernel);
  for (int i = 0; i < ITER - 2; ++i) {
    // Submitting command group (work) to the queue
    Queue.submit(kernel);
  }
  auto ed_event =
      Queue.submit(kernel);
  ed_event.wait();
  // Getting read-only access to the buffers on the host.
  // Implicit barrier waiting for the queue to complete the work.
  const auto out_h = out_buf.get_access<access::mode::read>();
  const auto in_h = in_buf.get_access<access::mode::read>();
  double ed = omp_get_wtime();
  // Device time from the start of the first submission to the end of the last
  // one, converted from nanoseconds to seconds
  double elapsed =
      (ed_event.get_profiling_info<info::event_profiling::command_end>() -
       st_event.get_profiling_info<info::event_profiling::command_start>()) *
      1e-9;
  std::cout << "elapsed (wall clock, per iteration): " << (ed - st) / ITER
            << std::endl;
  std::cout << "elapsed (device, all iterations): " << elapsed << std::endl;
  // 27 multiplies + 26 adds per grid point, reported in GFLOP/s
  std::cout << "flops: " << N * N * N * 53.0 * ITER / elapsed * 1e-9
            << std::endl;
}

void printInfo(device &Device) {
  std::cout << "Using OpenCL device {" << Device.get_info<info::device::name>()
            << "} from {" << Device.get_info<info::device::vendor>() << "}"
            << std::endl;
  std::cout << "I'm a " << (Device.is_cpu() ? "CPU" : "GPU") << std::endl;
  auto dot_num_groups = Device.get_info<info::device::max_compute_units>();
  auto dot_wgsize = Device.get_info<info::device::max_work_group_size>();
  // The sub-group info descriptor is assumed; the original was lost
  auto max_num_sg = Device.get_info<info::device::max_num_sub_groups>();
  std::cout << "Compute units: " << dot_num_groups << std::endl;
  std::cout << "Workgroup size: " << dot_wgsize << std::endl;
  std::cout << "Maximum subgroup size: " << max_num_sg << std::endl;
}

int main() {
  // This is an example of a device_selector.
  // To pick a device with it, construct one as device(NEOGPUDeviceSelector{}).
  class NEOGPUDeviceSelector : public device_selector {
  public:
    int operator()(const device &Device) const override {
      const std::string DeviceName = Device.get_info<info::device::name>();
      const std::string DeviceVendor = Device.get_info<info::device::vendor>();
      // Prefer a GPU whose name mentions "HD Graphics NEO"
      return (Device.is_gpu() &&
              DeviceName.find("HD Graphics NEO") != std::string::npos)
                 ? 1
                 : -1;
    }
  };

  int DeviceNumber = 0;
  for (device &Device : device::get_devices()) {
    // The host device is not used for compute
    if (Device.is_host())
      continue;
    std::cout << "Device [" << DeviceNumber << "]:" << std::endl;
    try {
      printInfo(Device);
      if (Device.is_gpu()) {
        run27pt(Device);
        runSubgroups(Device);
      } else {
        std::cout << "CPU devices possibly not supported" << std::endl;
      }
    } catch (invalid_parameter_error &E) {
      std::cout << E.what() << std::endl;
    }
    std::cout << "==================" << std::endl;
    ++DeviceNumber;
  }
}