diff options
| author | Tuowen Zhao <ztuowen@gmail.com> | 2021-10-04 16:23:27 +0000 | 
|---|---|---|
| committer | Tuowen Zhao <ztuowen@gmail.com> | 2021-10-04 16:23:27 +0000 | 
| commit | e38cf4dc8604634daddcc6a6b05ab1187ec83efc (patch) | |
| tree | c465f0f95c5766fc339d02aa367186bc209a3ae5 | |
| parent | c0be98c7e7f71c79c10566760d7c29974206f97d (diff) | |
| download | sycltest-master.tar.gz sycltest-master.tar.bz2 sycltest-master.zip | |
| -rw-r--r-- | .clang-format | 2 | ||||
| -rw-r--r-- | Makefile | 2 | ||||
| -rw-r--r-- | main.cpp | 97 | ||||
| -rw-r--r-- | subgroup.cpp | 40 | 
4 files changed, 63 insertions, 78 deletions
| diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..07d9df6 --- /dev/null +++ b/.clang-format @@ -0,0 +1,2 @@ +BasedOnStyle: LLVM +ColumnLimit: 100
\ No newline at end of file @@ -1,5 +1,5 @@  SYCL=/opt/intel/oneapi/compiler/latest/linux -CXX=$(SYCL)/bin/clang++ +CXX=$(SYCL)/bin/dpcpp  all:sycltest subgroup @@ -1,12 +1,12 @@  #include <CL/sycl.hpp>  // #include <omp.h> -#include <typeinfo> -#include <memory>  #include <cxxabi.h> +#include <memory> +#include <typeinfo>  #define N 256  #define GZ 16 -#define STRIDE (2*GZ + N) +#define STRIDE (2 * GZ + N)  #define TILEK 4  #define TILEJ 4  #define TILEI 16 @@ -17,28 +17,24 @@ std::string demangle(const char *name) {    int status = -4; // some arbitrary value to eliminate the compiler warning    // enable c++11 by passing the flag -std=c++11 to g++ -  std::unique_ptr<char, void (*)(void *)> res{ -      abi::__cxa_demangle(name, NULL, NULL, &status), -      std::free -  }; +  std::unique_ptr<char, void (*)(void *)> res{abi::__cxa_demangle(name, NULL, NULL, &status), +                                              std::free};    return (status == 0) ? res.get() : name;  }  using namespace cl::sycl; -template<typename T> -class subgr; +template <typename T> class subgr; -template<typename T> -class SGfunctor { +template <typename T> class SGfunctor {  public:    SGfunctor(accessor<uint32_t, 1, access::mode::write, access::target::global_buffer> sg_sz, -            accessor<T, 1, access::mode::write, access::target::global_buffer> sg_i) : sg_sz(sg_sz), sg_i(sg_i) {} +            accessor<T, 1, access::mode::write, access::target::global_buffer> sg_i) +      : sg_sz(sg_sz), sg_i(sg_i) {} -  [[intel::reqd_sub_group_size(16)]] -  void operator()(nd_item<1> NdItem) const { -	  ONEAPI::sub_group SG = NdItem.get_sub_group(); +  [[intel::reqd_sub_group_size(16)]] void operator()(nd_item<1> NdItem) const { +    ONEAPI::sub_group SG = NdItem.get_sub_group();      uint32_t wggid = NdItem.get_global_id(0);      uint32_t sgid = SG.get_local_id().get(0);      if (wggid == 0) @@ -52,8 +48,7 @@ private:    accessor<uint32_t, 1, access::mode::write, access::target::global_buffer> sg_sz;  }; -template<typename T> -void subkrnl(device &Device, size_t G = 256, size_t L = 64) { +template <typename T> void subkrnl(device &Device, size_t G = 256, size_t L = 64) {    buffer<uint32_t> sg_buf{1};    buffer<T> sg_info{G};    queue Queue(Device); @@ -101,40 +96,26 @@ void run27pt(device &Device) {      auto out = out_buf.get_access<access::mode::write>(cgh);      auto c = c_buf.get_access<access::mode::read>(cgh);      // Executing kernel -    cgh.parallel_for<class FillBuffer>( -        NumOfWorkItems, [=](nd_item<3> WIid) { -          uint32_t i = WIid.get_global_id(0) + GZ; -          uint32_t j = WIid.get_global_id(1) + GZ; -          uint32_t k = WIid.get_global_id(2) + GZ; -          // Fill buffer with indexes -          out[id<3>(i, j, k)] = c[0] * in[id<3>(i - 1, j - 1, k - 1)] -                                + c[1] * in[id<3>(i, j - 1, k - 1)] -                                + c[2] * in[id<3>(i + 1, j - 1, k - 1)] -                                + c[3] * in[id<3>(i - 1, j, k - 1)] -                                + c[4] * in[id<3>(i, j, k - 1)] -                                + c[5] * in[id<3>(i + 1, j, k - 1)] -                                + c[6] * in[id<3>(i - 1, j + 1, k - 1)] -                                + c[7] * in[id<3>(i, j + 1, k - 1)] -                                + c[8] * in[id<3>(i + 1, j + 1, k - 1)] -                                + c[9] * in[id<3>(i - 1, j - 1, k)] -                                + c[10] * in[id<3>(i, j - 1, k)] -                                + c[11] * in[id<3>(i + 1, j - 1, k)] -                                + c[12] * in[id<3>(i - 1, j, k)] -                                + c[13] * in[id<3>(i, j, k)] -                                + c[14] * in[id<3>(i + 1, j, k)] -                                + c[15] * in[id<3>(i - 1, j + 1, k)] -                                + c[16] * in[id<3>(i, j + 1, k)] -                                + c[17] * in[id<3>(i + 1, j + 1, k)] -                                + c[18] * in[id<3>(i - 1, j - 1, k + 1)] -                                + c[19] * in[id<3>(i, j - 1, k + 1)] -                                + c[20] * in[id<3>(i + 1, j - 1, k + 1)] -                                + c[21] * in[id<3>(i - 1, j, k + 1)] -                                + c[22] * in[id<3>(i, j, k + 1)] -                                + c[23] * in[id<3>(i + 1, j, k + 1)] -                                + c[24] * in[id<3>(i - 1, j + 1, k + 1)] -                                + c[25] * in[id<3>(i, j + 1, k + 1)] -                                + c[26] * in[id<3>(i + 1, j + 1, k + 1)]; -        }); +    cgh.parallel_for<class FillBuffer>(NumOfWorkItems, [=](nd_item<3> WIid) { +      uint32_t i = WIid.get_global_id(0) + GZ; +      uint32_t j = WIid.get_global_id(1) + GZ; +      uint32_t k = WIid.get_global_id(2) + GZ; +      // Fill buffer with indexes +      out[i][j][k] = c[0] * in[id<3>(i - 1, j - 1, k - 1)] + c[1] * in[id<3>(i, j - 1, k - 1)] + +                     c[2] * in[id<3>(i + 1, j - 1, k - 1)] + c[3] * in[id<3>(i - 1, j, k - 1)] + +                     c[4] * in[id<3>(i, j, k - 1)] + c[5] * in[id<3>(i + 1, j, k - 1)] + +                     c[6] * in[id<3>(i - 1, j + 1, k - 1)] + c[7] * in[id<3>(i, j + 1, k - 1)] + +                     c[8] * in[id<3>(i + 1, j + 1, k - 1)] + c[9] * in[id<3>(i - 1, j - 1, k)] + +                     c[10] * in[id<3>(i, j - 1, k)] + c[11] * in[id<3>(i + 1, j - 1, k)] + +                     c[12] * in[id<3>(i - 1, j, k)] + c[13] * in[id<3>(i, j, k)] + +                     c[14] * in[id<3>(i + 1, j, k)] + c[15] * in[id<3>(i - 1, j + 1, k)] + +                     c[16] * in[id<3>(i, j + 1, k)] + c[17] * in[id<3>(i + 1, j + 1, k)] + +                     c[18] * in[id<3>(i - 1, j - 1, k + 1)] + c[19] * in[id<3>(i, j - 1, k + 1)] + +                     c[20] * in[id<3>(i + 1, j - 1, k + 1)] + c[21] * in[id<3>(i - 1, j, k + 1)] + +                     c[22] * in[id<3>(i, j, k + 1)] + c[23] * in[id<3>(i + 1, j, k + 1)] + +                     c[24] * in[id<3>(i - 1, j + 1, k + 1)] + c[25] * in[id<3>(i, j + 1, k + 1)] + +                     c[26] * in[id<3>(i + 1, j + 1, k + 1)]; +    });    };    auto st_event = Queue.submit(kernel);    for (int i = 0; i < ITER - 2; ++i) { @@ -151,22 +132,24 @@ void run27pt(device &Device) {    {      // double ed = omp_get_wtime();      double elapsed = (ed_event.get_profiling_info<info::event_profiling::command_end>() - -                      st_event.get_profiling_info<info::event_profiling::command_start>()) * 1e-9; +                      st_event.get_profiling_info<info::event_profiling::command_start>()) * +                     1e-9;      std::cout << "elapsed: " << elapsed / ITER << std::endl;      std::cout << "flops: " << N * N * N * 53.0 * ITER / elapsed * 1e-9 << std::endl;    }    {      // double ed = omp_get_wtime();      double elapsed = (ed_event.get_profiling_info<info::event_profiling::command_end>() - -                      ed_event.get_profiling_info<info::event_profiling::command_start>()) * 1e-9; +                      ed_event.get_profiling_info<info::event_profiling::command_start>()) * +                     1e-9;      std::cout << "elapsed: " << elapsed << std::endl;      std::cout << "flops: " << N * N * N * 53.0 / elapsed * 1e-9 << std::endl;    }  }  void printInfo(device &Device) { -  std::cout << "Using OpenCL device {" << Device.get_info<info::device::name>() << -            "} from {" << Device.get_info<info::device::vendor>() << "}" << std::endl; +  std::cout << "Using OpenCL device {" << Device.get_info<info::device::name>() << "} from {" +            << Device.get_info<info::device::vendor>() << "}" << std::endl;    std::cout << "I'm a " << (Device.is_cpu() ? "CPU" : "GPU") << std::endl;    auto dot_num_groups = Device.get_info<info::device::max_compute_units>(); @@ -196,12 +179,12 @@ int main() {      // Host device is not for compute      if (Device.is_host())        continue; -     +      const std::string DeviceName = Device.get_info<info::device::name>();      std::cout << "Device [" << DeviceNumber << "]:" << DeviceName << std::endl;      try { -      if (Device.is_gpu() && (DeviceName.find("Gen11") != std::string::npos)) { +      if (Device.is_gpu() && (DeviceName.find("Graphics") != std::string::npos)) {          printInfo(Device);          run27pt(Device);          runSubgroups(Device); diff --git a/subgroup.cpp b/subgroup.cpp index 59e873d..f55a1f8 100644 --- a/subgroup.cpp +++ b/subgroup.cpp @@ -9,17 +9,18 @@  using namespace cl::sycl; -template<typename T> -inline void dev_shl(ONEAPI::sub_group &SG, T &res, T l, T r, unsigned kn, unsigned cw, unsigned cid) { +template <typename T> +inline void dev_shl(ONEAPI::sub_group &SG, T &res, T l, T r, unsigned kn, unsigned cw, +                    unsigned cid) {    T l_tmp = SG.shuffle_down(l, cw - (kn));    T r_tmp = SG.shuffle_up(r, kn); -  res = (cid) < kn? l_tmp : r_tmp; +  res = (cid) < kn ? l_tmp : r_tmp;  }  void printInfo(device &Device) { -  std::cout << "Using OpenCL " << (Device.is_cpu() ? "CPU" : "GPU") -            << " device {" << Device.get_info<info::device::name>() -            << "} from {" << Device.get_info<info::device::vendor>() << "}" << std::endl; +  std::cout << "Using OpenCL " << (Device.is_cpu() ? "CPU" : "GPU") << " device {" +            << Device.get_info<info::device::name>() << "} from {" +            << Device.get_info<info::device::vendor>() << "}" << std::endl;    auto dot_num_groups = Device.get_info<info::device::max_compute_units>();    auto dot_wgsize = Device.get_info<info::device::max_work_group_size>(); @@ -67,18 +68,17 @@ int main() {      auto b = B.get_access<access::mode::read>(cgh);      auto c = C.get_access<access::mode::write>(cgh); -    cgh.parallel_for<class FillBuffer>( -        nworkitem, [=](nd_item<1> WIid) { -          ONEAPI::sub_group SG = WIid.get_sub_group(); -          int sglid = SG.get_local_id().get(0); -          uint32_t i = WIid.get_global_id(0); -          int* cp = &c[0]; -          const int* ap = &a[0]; -          const int* bp = &b[0]; -//          c[i] = a[i] + b[i]; -          dev_shl(SG, cp[i], ap[i], bp[i], 2, 8, sglid & 7); -          c[i] = WIid.get_group(0); -        }); +    cgh.parallel_for<class FillBuffer>(nworkitem, [=](nd_item<1> WIid) { +      ONEAPI::sub_group SG = WIid.get_sub_group(); +      int sglid = SG.get_local_id().get(0); +      uint32_t i = WIid.get_global_id(0); +      int *cp = &c[0]; +      const int *ap = &a[0]; +      const int *bp = &b[0]; +      //          c[i] = a[i] + b[i]; +      dev_shl(SG, cp[i], ap[i], bp[i], 2, 8, sglid & 7); +      c[i] = WIid.get_group(0); +    });    };    std::cout << "submitting" << std::endl;    auto e = Queue.submit(kernel); @@ -87,8 +87,8 @@ int main() {    std::cout << "waited" << std::endl;    auto c_r = C.get_access<access::mode::read>(); -//  for (int i = 0; i < N; ++i) -//    std::cout << i << " + " << N - i << " = " << c[i] << std::endl; +  //  for (int i = 0; i < N; ++i) +  //    std::cout << i << " + " << N - i << " = " << c[i] << std::endl;    return 0;  } | 
