summaryrefslogtreecommitdiff
path: root/main.cu
diff options
context:
space:
mode:
Diffstat (limited to 'main.cu')
-rw-r--r--main.cu123
1 files changed, 123 insertions, 0 deletions
diff --git a/main.cu b/main.cu
new file mode 100644
index 0000000..427a7d2
--- /dev/null
+++ b/main.cu
@@ -0,0 +1,123 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

#include <sys/mman.h>
#include <unistd.h>

#include <omp.h>

#include <cuda.h>
#include <cuda_runtime.h>
+
// Build-time toggle: when defined, the GPU (UVM / system-allocated memory)
// half of the benchmark is compiled in alongside the CPU baseline.
#define UVM

// cudaCheck(x): evaluate the CUDA call x and, in debug builds, abort with a
// diagnostic if it did not return cudaSuccess.
//
// NDEBUG is defined only in RELEASE builds, so the checking expansion must
// live in the #else (NDEBUG-defined ⇒ pass-through) branch. The original had
// the two branches inverted: checks were compiled out of debug builds and
// left in release builds.
#ifdef NDEBUG
#define cudaCheck(x) x
#else
#include <cstdio>
#define cudaCheck(x) _cudaCheck(x, #x ,__FILE__, __LINE__)
#endif
+
// Abort with a diagnostic when a CUDA API call did not return cudaSuccess.
//
// Invoked through the cudaCheck() macro, which supplies (in order) the call's
// result, the stringified expression, the source file, and the line number.
// On failure the message names the expression, its location, and the decoded
// CUDA error string, then the process exits with EXIT_FAILURE.
template<typename T>
void _cudaCheck(T status, const char *expr, const char *file, const int line) {
    if (status == cudaSuccess)
        return;
    printf("\"%s\" at %d in %s\n\treturned %d\n-> %s\n",
           expr, line, file, (int) status, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
}
+
// 1 GiB in bytes; the `ul` suffix keeps all derived arithmetic in 64 bits.
#define GIG (1024ul*1024*1024)
// Total size of each benchmark buffer (src and dst), in bytes.
#define VOLUME (2ul*GIG)

// Upper bound on the number of mmap regions used to build src.
// The real per-process limit is typically ~65536 (sysctl vm.max_map_count);
// only use half of it to be safe.
#define MAXMMAPS 32768
+
// Copy `len` longs from src to dst.
//
// Grid-stride loop: thread (blockIdx.x, threadIdx.x) handles elements
// i, i + stride, i + 2*stride, ... so ANY <<<grid, block>>> configuration
// covers all `len` elements. The original partitioned the range by block
// (st = len / gridDim.x, integer division), which silently skipped the last
// len % gridDim.x elements whenever gridDim.x did not divide len.
__global__ void
cudacpy(long *src, long *dst, long len) {
    // Widen to long before multiplying: gridDim/blockDim are 32-bit and the
    // product can exceed INT_MAX for large launches.
    long stride = (long) gridDim.x * blockDim.x;
    for (long i = (long) blockIdx.x * blockDim.x + threadIdx.x; i < len; i += stride)
        dst[i] = src[i];
}
+
int main(int argc, char **argv) {
    // Bandwidth benchmark: copy VOLUME bytes from src to dst, first with an
    // OpenMP loop on the CPU, then (under UVM) with a kernel operating
    // directly on the same system-allocated buffers.
    //
    // src is assembled from `nmaps` back-to-back anonymous mmap regions at a
    // fixed virtual address, so the effect of the VMA count on GPU access can
    // be measured (argv[1] overrides nmaps); dst is one malloc'd region.
    auto page_size = sysconf(_SC_PAGESIZE);
    std::cout << "The system have a page size of " << page_size << std::endl;

    long nmaps = 1024;
    if (argc > 1) {
        nmaps = std::stol(argv[1]);  // stol: nmaps is long (stoi would narrow)
        // Respect the per-process mapping budget (MAXMMAPS was previously
        // defined but never enforced).
        if (nmaps <= 0 || nmaps > MAXMMAPS) {
            std::cout << "nmaps out of range, quit" << std::endl;
            return 0;
        }
        // Every region must span the same whole number of pages.
        if (VOLUME / page_size % nmaps != 0) {
            std::cout << "nmaps is not perfect multiple, quit" << std::endl;
            return 0;
        }
    }

    long mmap_sz = VOLUME / nmaps;
    std::cout << "Each mapped region is of size(pages) " << mmap_sz / page_size << std::endl;

    auto dst = (long *) malloc(VOLUME);
    if (dst == nullptr) {
        printf("malloc failed\n");
        return EXIT_FAILURE;
    }

    // Place src just below 0x600000000000 (a hint away from the heap and the
    // default mmap area) and fill it with nmaps contiguous mappings.
    uint8_t *hint = (uint8_t *) 0x600000000000UL;
    hint -= VOLUME;
    auto src = (long *) hint;

    for (long i = 0; i < nmaps; ++i) {
        auto r = mmap(hint, mmap_sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
        // A mapping that lands anywhere but the hinted address breaks the
        // contiguity of src, so it is as fatal as an outright failure
        // (the original only printed and then faulted in the init loop).
        if (r == MAP_FAILED || r != hint) {
            printf("MMAP failed somehow\n");
            return EXIT_FAILURE;
        }
        hint += mmap_sz;
    }

    // Touch every page and give src/dst distinguishable contents.
#pragma omp parallel for
    for (long i = 0; i < VOLUME / sizeof(long); ++i) {
        src[i] = i;
        dst[i] = 0;
    }

    int ITER = 100;
    double st = omp_get_wtime();

    // CPU baseline: ITER full copies, timed as a whole.
    for (int t = 0; t < ITER; ++t) {
#pragma omp parallel for
        for (long i = 0; i < VOLUME / sizeof(long); ++i)
            dst[i] = src[i];
    }

    st = (omp_get_wtime() - st) / ITER;

    printf("CPU\nAverage time(s) %f\n", st);
    // x2: each copy reads VOLUME bytes and writes VOLUME bytes.
    printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9);

#ifdef UVM
    CUdevice device = 0;
    CUcontext pctx;
    cudaCheck((cudaError_t) cudaSetDevice(device));
    cudaCheck((cudaError_t) cuCtxCreate(&pctx, CU_CTX_SCHED_AUTO | CU_CTX_MAP_HOST, device));

    // Best effort: migrate both buffers to the GPU up front. The prefetches
    // are deliberately unchecked — on systems without full HMM support they
    // may fail, and the kernel will fault the pages over on demand instead.
    cudaCheck(cudaMemAdvise(src, VOLUME, cudaMemAdviseSetPreferredLocation, device));
    cudaMemPrefetchAsync(src, VOLUME, device);

    cudaCheck(cudaMemAdvise(dst, VOLUME, cudaMemAdviseSetPreferredLocation, device));
    cudaMemPrefetchAsync(dst, VOLUME, device);

    float elapsed;
    cudaEvent_t c_0, c_1;
    cudaCheck(cudaEventCreate(&c_0));
    cudaCheck(cudaEventCreate(&c_1));

    cudaCheck(cudaEventRecord(c_0));

    ITER = 1000;
    for (int t = 0; t < ITER; ++t)
        // len is the ELEMENT count: the buffers hold VOLUME / sizeof(long)
        // longs. The original passed VOLUME / sizeof(float) — twice the
        // element count — making the kernel read and write a full VOLUME
        // bytes past the end of both buffers.
        cudacpy<<<1024, 256>>>(src, dst, VOLUME / sizeof(long));
    cudaCheck(cudaGetLastError());  // launches return no status; check explicitly

    cudaCheck(cudaEventRecord(c_1));
    cudaCheck(cudaEventSynchronize(c_1));
    cudaCheck(cudaEventElapsedTime(&elapsed, c_0, c_1));
    st = elapsed / 1000 / ITER;  // ms -> s, then per iteration
    printf("GPU\nAverage time(s) %f\n", st);
    printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9);

    cudaCheck(cudaEventDestroy(c_0));
    cudaCheck(cudaEventDestroy(c_1));
#endif

    // Release both buffers (the nmaps regions are contiguous, so one munmap
    // over the whole range suffices).
    munmap((void *) src, VOLUME);
    free(dst);
    return 0;
}