// // Created by Tuowen Zhao on 11/11/19. // #include #include #define N 1024 using namespace cl::sycl; template inline void dev_shl(intel::sub_group &SG, T &res, T l, T r, unsigned kn, unsigned cw, unsigned cid) { T l_tmp = SG.shuffle_down(l, cw - (kn)); T r_tmp = SG.shuffle_up(r, kn); res = (cid) < kn? l_tmp : r_tmp; } void printInfo(device &Device) { std::cout << "Using OpenCL " << (Device.is_cpu() ? "CPU" : "GPU") << " device {" << Device.get_info() << "} from {" << Device.get_info() << "}" << std::endl; auto dot_num_groups = Device.get_info(); auto dot_wgsize = Device.get_info(); auto max_num_sg = Device.get_info(); std::cout << "Compute units: " << dot_num_groups << std::endl; std::cout << "Workgroup size: " << dot_wgsize << std::endl; } int main() { class NEOGPUDeviceSelector : public device_selector { public: int operator()(const device &Device) const override { const std::string DeviceName = Device.get_info(); const std::string DeviceVendor = Device.get_info(); return Device.is_gpu() && DeviceName.find("HD Graphics NEO") ? 1 : -1; } }; device dev = device(NEOGPUDeviceSelector()); printInfo(dev); std::vector a(N); std::vector b(N); std::vector c(N); buffer A((int *)a.data(), range<1>(N), {property::buffer::use_host_ptr()}); buffer B((int *)b.data(), range<1>(N), {property::buffer::use_host_ptr()}); buffer C((int *)c.data(), range<1>(N), {property::buffer::use_host_ptr()}); for (int i = 0; i < N; ++i) { a[i] = i; b[i] = N - i; } queue Queue(dev, {property::queue::enable_profiling()}); nd_range<1> nworkitem(range<1>(N), range<1>(64)); range<1> nwork{N}; int len = 1024; auto kernel = [&](handler &cgh) { // Getting write only access to the buffer on a device auto a = A.get_access(cgh); auto b = B.get_access(cgh); auto c = C.get_access(cgh); cgh.parallel_for( nworkitem, [=](nd_item<1> WIid) { intel::sub_group SG = WIid.get_sub_group(); int sglid = SG.get_local_id().get(0); uint32_t i = WIid.get_global_id(0); int* cp = &c[0]; const int* ap = &a[0]; const int* bp = &b[0]; // c[i] = a[i] + b[i]; dev_shl(SG, cp[i], ap[i], bp[i], 2, 8, sglid & 7); c[i] = WIid.get_group(0); }); }; std::cout << "submitting" << std::endl; auto e = Queue.submit(kernel); std::cout << "waiting" << std::endl; e.wait(); std::cout << "waited" << std::endl; auto c_r = C.get_access(); // for (int i = 0; i < N; ++i) // std::cout << i << " + " << N - i << " = " << c[i] << std::endl; return 0; }