summaryrefslogtreecommitdiff
path: root/main.cu
diff options
context:
space:
mode:
Diffstat (limited to 'main.cu')
-rw-r--r--main.cu123
1 files changed, 123 insertions, 0 deletions
diff --git a/main.cu b/main.cu
new file mode 100644
index 0000000..427a7d2
--- /dev/null
+++ b/main.cu
@@ -0,0 +1,123 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

#include <sys/mman.h>
#include <unistd.h>

#include <omp.h>

#include <cuda.h>
#include <cuda_runtime.h>
+
// Build-time toggle: when defined, the GPU (UVM / system-allocated memory)
// half of the benchmark is compiled in alongside the CPU baseline.
#define UVM

// cudaCheck(x): evaluate the CUDA call x and, in debug builds, abort with a
// diagnostic if it did not return cudaSuccess.
//
// NDEBUG is defined only in RELEASE builds, so the checking expansion must
// live in the #else (NDEBUG-defined ⇒ pass-through) branch. The original had
// the two branches inverted: checks were compiled out of debug builds and
// left in release builds.
#ifdef NDEBUG
#define cudaCheck(x) x
#else
#include <cstdio>
#define cudaCheck(x) _cudaCheck(x, #x ,__FILE__, __LINE__)
#endif
+
// Abort with a diagnostic when a CUDA API call did not return cudaSuccess.
//
// Invoked through the cudaCheck() macro, which supplies (in order) the call's
// result, the stringified expression, the source file, and the line number.
// On failure the message names the expression, its location, and the decoded
// CUDA error string, then the process exits with EXIT_FAILURE.
template<typename T>
void _cudaCheck(T status, const char *expr, const char *file, const int line) {
    if (status == cudaSuccess)
        return;
    printf("\"%s\" at %d in %s\n\treturned %d\n-> %s\n",
           expr, line, file, (int) status, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
}
+
// 1 GiB in bytes; the `ul` suffix keeps all derived arithmetic in 64 bits.
#define GIG (1024ul*1024*1024)
// Total size of each benchmark buffer (src and dst), in bytes.
#define VOLUME (2ul*GIG)

// Upper bound on the number of mmap regions used to build src.
// The real per-process limit is typically ~65536 (sysctl vm.max_map_count);
// only use half of it to be safe.
#define MAXMMAPS 32768
+
// Copy `len` longs from src to dst.
//
// Grid-stride loop: thread (blockIdx.x, threadIdx.x) handles elements
// i, i + stride, i + 2*stride, ... so ANY <<<grid, block>>> configuration
// covers all `len` elements. The original partitioned the range by block
// (st = len / gridDim.x, integer division), which silently skipped the last
// len % gridDim.x elements whenever gridDim.x did not divide len.
__global__ void
cudacpy(long *src, long *dst, long len) {
    // Widen to long before multiplying: gridDim/blockDim are 32-bit and the
    // product can exceed INT_MAX for large launches.
    long stride = (long) gridDim.x * blockDim.x;
    for (long i = (long) blockIdx.x * blockDim.x + threadIdx.x; i < len; i += stride)
        dst[i] = src[i];
}
+
int main(int argc, char **argv) {
    // Bandwidth benchmark: copy VOLUME bytes from src to dst, first with an
    // OpenMP loop on the CPU, then (under UVM) with a kernel operating
    // directly on the same system-allocated buffers.
    //
    // src is assembled from `nmaps` back-to-back anonymous mmap regions at a
    // fixed virtual address, so the effect of the VMA count on GPU access can
    // be measured (argv[1] overrides nmaps); dst is one malloc'd region.
    auto page_size = sysconf(_SC_PAGESIZE);
    std::cout << "The system have a page size of " << page_size << std::endl;

    long nmaps = 1024;
    if (argc > 1) {
        nmaps = std::stol(argv[1]);  // stol: nmaps is long (stoi would narrow)
        // Respect the per-process mapping budget (MAXMMAPS was previously
        // defined but never enforced).
        if (nmaps <= 0 || nmaps > MAXMMAPS) {
            std::cout << "nmaps out of range, quit" << std::endl;
            return 0;
        }
        // Every region must span the same whole number of pages.
        if (VOLUME / page_size % nmaps != 0) {
            std::cout << "nmaps is not perfect multiple, quit" << std::endl;
            return 0;
        }
    }

    long mmap_sz = VOLUME / nmaps;
    std::cout << "Each mapped region is of size(pages) " << mmap_sz / page_size << std::endl;

    auto dst = (long *) malloc(VOLUME);
    if (dst == nullptr) {
        printf("malloc failed\n");
        return EXIT_FAILURE;
    }

    // Place src just below 0x600000000000 (a hint away from the heap and the
    // default mmap area) and fill it with nmaps contiguous mappings.
    uint8_t *hint = (uint8_t *) 0x600000000000UL;
    hint -= VOLUME;
    auto src = (long *) hint;

    for (long i = 0; i < nmaps; ++i) {
        auto r = mmap(hint, mmap_sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
        // A mapping that lands anywhere but the hinted address breaks the
        // contiguity of src, so it is as fatal as an outright failure
        // (the original only printed and then faulted in the init loop).
        if (r == MAP_FAILED || r != hint) {
            printf("MMAP failed somehow\n");
            return EXIT_FAILURE;
        }
        hint += mmap_sz;
    }

    // Touch every page and give src/dst distinguishable contents.
#pragma omp parallel for
    for (long i = 0; i < VOLUME / sizeof(long); ++i) {
        src[i] = i;
        dst[i] = 0;
    }

    int ITER = 100;
    double st = omp_get_wtime();

    // CPU baseline: ITER full copies, timed as a whole.
    for (int t = 0; t < ITER; ++t) {
#pragma omp parallel for
        for (long i = 0; i < VOLUME / sizeof(long); ++i)
            dst[i] = src[i];
    }

    st = (omp_get_wtime() - st) / ITER;

    printf("CPU\nAverage time(s) %f\n", st);
    // x2: each copy reads VOLUME bytes and writes VOLUME bytes.
    printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9);

#ifdef UVM
    CUdevice device = 0;
    CUcontext pctx;
    cudaCheck((cudaError_t) cudaSetDevice(device));
    cudaCheck((cudaError_t) cuCtxCreate(&pctx, CU_CTX_SCHED_AUTO | CU_CTX_MAP_HOST, device));

    // Best effort: migrate both buffers to the GPU up front. The prefetches
    // are deliberately unchecked — on systems without full HMM support they
    // may fail, and the kernel will fault the pages over on demand instead.
    cudaCheck(cudaMemAdvise(src, VOLUME, cudaMemAdviseSetPreferredLocation, device));
    cudaMemPrefetchAsync(src, VOLUME, device);

    cudaCheck(cudaMemAdvise(dst, VOLUME, cudaMemAdviseSetPreferredLocation, device));
    cudaMemPrefetchAsync(dst, VOLUME, device);

    float elapsed;
    cudaEvent_t c_0, c_1;
    cudaCheck(cudaEventCreate(&c_0));
    cudaCheck(cudaEventCreate(&c_1));

    cudaCheck(cudaEventRecord(c_0));

    ITER = 1000;
    for (int t = 0; t < ITER; ++t)
        // len is the ELEMENT count: the buffers hold VOLUME / sizeof(long)
        // longs. The original passed VOLUME / sizeof(float) — twice the
        // element count — making the kernel read and write a full VOLUME
        // bytes past the end of both buffers.
        cudacpy<<<1024, 256>>>(src, dst, VOLUME / sizeof(long));
    cudaCheck(cudaGetLastError());  // launches return no status; check explicitly

    cudaCheck(cudaEventRecord(c_1));
    cudaCheck(cudaEventSynchronize(c_1));
    cudaCheck(cudaEventElapsedTime(&elapsed, c_0, c_1));
    st = elapsed / 1000 / ITER;  // ms -> s, then per iteration
    printf("GPU\nAverage time(s) %f\n", st);
    printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9);

    cudaCheck(cudaEventDestroy(c_0));
    cudaCheck(cudaEventDestroy(c_1));
#endif

    // Release both buffers (the nmaps regions are contiguous, so one munmap
    // over the whole range suffices).
    munmap((void *) src, VOLUME);
    free(dst);
    return 0;
}