#include <iostream>
#include <unistd.h>
#include <sys/mman.h>
#include <string>
#include <omp.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <cstdint>
#define UVM
#ifdef NDEBUG
// Release builds: evaluate the call without checking its result
#define cudaCheck(x) x
#else
// Debug builds: verify every wrapped CUDA call and abort on failure
#define cudaCheck(x) _cudaCheck(x, #x, __FILE__, __LINE__)
#endif
// Abort with a readable message when a wrapped CUDA call fails
template<typename T>
void _cudaCheck(T e, const char *expr, const char *file, const int line) {
    if (e != cudaSuccess) {
        printf("\"%s\" at %d in %s\n\treturned %d\n-> %s\n", expr, line, file, (int) e, cudaGetErrorString(e));
        exit(EXIT_FAILURE);
    }
}
#define GIG (1024ul*1024*1024)
#define VOLUME (2ul*GIG)
// The real limit is around 65536 (see `sysctl vm.max_map_count`); only use half of it to be safe
#define MAXMMAPS 32768
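// Block-partitioned copy kernel: each block handles a contiguous slice of `len`
// elements and its threads stride through that slice. Note this assumes len is a
// multiple of gridDim.x, which holds for the sizes launched below.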
__global__ void
cudacpy(long *src, long *dst, long len) {
    long st = len / gridDim.x;
    long ed = min(st * (blockIdx.x + 1), len);
    st = st * blockIdx.x;
    st += threadIdx.x;
    for (; st < ed; st += blockDim.x)
        dst[st] = src[st];
}
int main(int argc, char **argv) {
    auto page_size = sysconf(_SC_PAGESIZE);
    std::cout << "The system has a page size of " << page_size << std::endl;
    // src is built from anonymous mappings; the region count is configurable
    long nmaps = 1024;
    if (argc > 1) {
        nmaps = std::stoi(argv[1]);
        if (VOLUME / page_size % nmaps != 0) {
            std::cout << "nmaps does not evenly divide the number of pages, quitting" << std::endl;
            return 0;
        }
        if (nmaps > MAXMMAPS) {
            std::cout << "nmaps exceeds MAXMMAPS, quitting" << std::endl;
            return 0;
        }
    }
    long mmap_sz = VOLUME / nmaps;
    std::cout << "Each mapped region is of size(pages) " << mmap_sz / page_size << std::endl;
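    // dst is ordinary heap memory; src is assembled from nmaps separate anonymous
    // mappings laid out back to back just below the 0x600000000000 hint, so the
    // same 2 GiB buffer is spread across many VMAs.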
    auto dst = (long *) malloc(VOLUME);
    uint8_t *hint = (uint8_t *) 0x600000000000UL;
    hint -= VOLUME;
    auto src = (long *) hint;
    for (long i = 0; i < nmaps; ++i) {
        auto r = mmap(hint, mmap_sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
        if (r == MAP_FAILED || r != hint) {
            // The kernel did not honor the address hint; the buffer would not be contiguous
            printf("mmap failed or ignored the hint at %p\n", (void *) hint);
            exit(EXIT_FAILURE);
        }
        hint += mmap_sz;
    }
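    // Touch every element once so all mappings are faulted in and both buffers
    // are fully populated before timing.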
    #pragma omp parallel for
    for (long i = 0; i < VOLUME / sizeof(long); ++i) {
        src[i] = i;
        dst[i] = 0;
    }
    // CPU baseline: average a parallel element-wise copy over ITER iterations;
    // throughput counts both the bytes read and the bytes written.
    int ITER = 100;
    double st = omp_get_wtime();
    for (int t = 0; t < ITER; ++t) {
        #pragma omp parallel for
        for (long i = 0; i < VOLUME / sizeof(long); ++i)
            dst[i] = src[i];
    }
    st = (omp_get_wtime() - st) / ITER;
    printf("CPU\nAverage time(s) %f\n", st);
    printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9);
#ifdef UVM
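    // GPU side: run the same copy through the CUDA kernel. Advising/prefetching
    // plain malloc'd or mmap'd memory to a device is only expected to succeed on
    // systems where the GPU can address pageable system memory (e.g. HMM/ATS-capable
    // setups); elsewhere these calls report an error.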
    CUdevice device = 0;
    CUcontext pctx;
    cudaCheck(cudaSetDevice(device));
    // Mixing the driver and runtime APIs: initialize the driver before creating a
    // context. The CUresult values are cast so the runtime-API check macro can be reused.
    cudaCheck((cudaError_t) cuInit(0));
    cudaCheck((cudaError_t) cuCtxCreate(&pctx, CU_CTX_SCHED_AUTO | CU_CTX_MAP_HOST, device));
    cudaCheck(cudaMemAdvise(src, VOLUME, cudaMemAdviseSetPreferredLocation, device));
    cudaMemPrefetchAsync(src, VOLUME, device);
    cudaCheck(cudaMemAdvise(dst, VOLUME, cudaMemAdviseSetPreferredLocation, device));
    cudaMemPrefetchAsync(dst, VOLUME, device);
    float elapsed;
    cudaEvent_t c_0, c_1;
    cudaEventCreate(&c_0);
    cudaEventCreate(&c_1);
    cudaEventRecord(c_0);
    ITER = 1000;
    for (int t = 0; t < ITER; ++t)
        cudacpy<<<1024, 256>>>(src, dst, VOLUME / sizeof(long));
    cudaEventRecord(c_1);
    cudaEventSynchronize(c_1);
    cudaEventElapsedTime(&elapsed, c_0, c_1);
    // cudaEventElapsedTime reports milliseconds; convert to seconds per iteration
    st = elapsed / 1000 / ITER;
    printf("GPU\nAverage time(s) %f\n", st);
    printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9);
#endif
    return 0;
}
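// Build/run sketch (file name hypothetical; assumes nvcc plus an OpenMP-capable host compiler):
//   nvcc -O3 -std=c++14 -Xcompiler -fopenmp uvm_mmap_copy.cu -o uvm_mmap_copy -lcuda
//   ./uvm_mmap_copy 4096   # optional argument: number of mmap'd regions backing src
// -lcuda links the CUDA driver API (cuInit/cuCtxCreate); nvcc links the runtime library itself.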