summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTuowen Zhao <ztuowen@gmail.com>2019-12-04 21:19:19 -0700
committerTuowen Zhao <ztuowen@gmail.com>2019-12-04 21:19:19 -0700
commitae42285f61744d34e036e92644164482b22590f3 (patch)
tree2ae323a29d9162b7b37a778f9e7c3725230d714f
parentc0b34b3f764e12d296f6126eba3b930559bbcd42 (diff)
downloadatsmmap-ae42285f61744d34e036e92644164482b22590f3.tar.gz
atsmmap-ae42285f61744d34e036e92644164482b22590f3.tar.bz2
atsmmap-ae42285f61744d34e036e92644164482b22590f3.zip
GPU bench
-rw-r--r--CMakeLists.txt13
-rw-r--r--main.cpp64
-rw-r--r--main.cu123
3 files changed, 132 insertions, 68 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b24a5e4..4c02b00 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,10 +1,15 @@
cmake_minimum_required(VERSION 3.15)
-project(atsmmap)
+project(atsmmap LANGUAGES CXX CUDA)
-set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CUDA_STANDARD 11)
+set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
find_package(OpenMP REQUIRED)
-set(CMAKE_C_FLAGS ${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS})
+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS} -march=native")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"${OpenMP_CXX_FLAGS} -march=native\"")
-add_executable(atsmmap main.cpp) \ No newline at end of file
+add_executable(atsmmap main.cu)
+target_link_options(atsmmap PRIVATE "${OpenMP_CXX_FLAGS}")
+target_link_libraries(atsmmap rt cuda) \ No newline at end of file
diff --git a/main.cpp b/main.cpp
deleted file mode 100644
index 0cb47fa..0000000
--- a/main.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-#include <iostream>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <string>
-#include <omp.h>
-
-#define GIG (1024ul*1024*1024)
-#define VOLUME (1024ul*1024*1024)
-
-// Real values is in the ~65536, only use half of it to be safe
-// sysctl vm.max_map_count
-#define MAXMMAPS 32768
-
-int main(int argc, char **argv) {
- auto page_size = sysconf(_SC_PAGESIZE);
- std::cout << "The system have a page size of " << page_size << std::endl;
- // Src is using anonymous mapping
- long nmaps = 1024;
- if (argc > 1) {
- nmaps = std::stoi(argv[1]);
- if (VOLUME / page_size % nmaps != 0) {
- std::cout << "nmaps is not perfect multiple, quit" << std::endl;
- return 0;
- }
- }
-
- long mmap_sz = VOLUME / nmaps;
- std::cout << "Each mapped region is of size(pages) " << mmap_sz/page_size << std::endl;
-
- auto dst = (long*)malloc(VOLUME);
-
- uint8_t *hint = (uint8_t*)0x600000000000UL;
- hint -= VOLUME;
- auto src = (long*)hint;
- for (long i = 0; i < nmaps; ++i) {
- auto r = mmap(hint, mmap_sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
- if (r == MAP_FAILED || r != hint)
- printf("MMAP failed somehow\n");
- hint += mmap_sz;
- }
-
-#pragma omp parallel for
- for (long i = 0; i < VOLUME/sizeof(long); ++i) {
- src[i] = i;
- dst[i] = 0;
- }
-
- int ITER = 100;
- double st = omp_get_wtime();
-
- for (int t = 0; t < ITER; ++t) {
-#pragma omp parallel for
- for (long i = 0; i < VOLUME/sizeof(long); ++i)
- dst[i] = src[i];
- }
-
- st = (omp_get_wtime() - st) / ITER;
-
- printf("Average time(s) %f\n", st);
-// std::cout << "Average time(s) " << st << std::endl;
- printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9);
-// std::cout << "Average throughput(GB/s) " << VOLUME * 2 / st * 1e-9 << std::endl;
- return 0;
-}
diff --git a/main.cu b/main.cu
new file mode 100644
index 0000000..427a7d2
--- /dev/null
+++ b/main.cu
@@ -0,0 +1,123 @@
+#include <iostream>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <string>
+#include <omp.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#define UVM
+
+#ifndef NDEBUG
+#define cudaCheck(x) x
+#else
+#include <cstdio>
+#define cudaCheck(x) _cudaCheck(x, #x ,__FILE__, __LINE__)
+#endif
+
+template<typename T>
+void _cudaCheck(T e, const char *func, const char *call, const int line) {
+ if (e != cudaSuccess) {
+ printf("\"%s\" at %d in %s\n\treturned %d\n-> %s\n", func, line, call, (int) e, cudaGetErrorString(e));
+ exit(EXIT_FAILURE);
+ }
+}
+
+#define GIG (1024ul*1024*1024)
+#define VOLUME (2ul*GIG)
+
+// Real values is in the ~65536, only use half of it to be safe
+// sysctl vm.max_map_count
+#define MAXMMAPS 32768
+
+__global__ void
+cudacpy(long *src, long *dst, long len) {
+ long st = len / gridDim.x;
+ long ed = min(st * (blockIdx.x + 1), len);
+ st = st * blockIdx.x;
+ st += threadIdx.x;
+ for (; st < ed; st += blockDim.x)
+ dst[st] = src[st];
+}
+
+int main(int argc, char **argv) {
+ auto page_size = sysconf(_SC_PAGESIZE);
+ std::cout << "The system have a page size of " << page_size << std::endl;
+ // Src is using anonymous mapping
+ long nmaps = 1024;
+ if (argc > 1) {
+ nmaps = std::stoi(argv[1]);
+ if (VOLUME / page_size % nmaps != 0) {
+ std::cout << "nmaps is not perfect multiple, quit" << std::endl;
+ return 0;
+ }
+ }
+
+ long mmap_sz = VOLUME / nmaps;
+ std::cout << "Each mapped region is of size(pages) " << mmap_sz / page_size << std::endl;
+
+ auto dst = (long *) malloc(VOLUME);
+
+ uint8_t *hint = (uint8_t *) 0x600000000000UL;
+ hint -= VOLUME;
+ auto src = (long *) hint;
+
+ for (long i = 0; i < nmaps; ++i) {
+ auto r = mmap(hint, mmap_sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
+ if (r == MAP_FAILED || r != hint)
+ printf("MMAP failed somehow\n");
+ hint += mmap_sz;
+ }
+
+#pragma omp parallel for
+ for (long i = 0; i < VOLUME / sizeof(long); ++i) {
+ src[i] = i;
+ dst[i] = 0;
+ }
+
+ int ITER = 100;
+ double st = omp_get_wtime();
+
+ for (int t = 0; t < ITER; ++t) {
+#pragma omp parallel for
+ for (long i = 0; i < VOLUME / sizeof(long); ++i)
+ dst[i] = src[i];
+ }
+
+ st = (omp_get_wtime() - st) / ITER;
+
+ printf("CPU\nAverage time(s) %f\n", st);
+ printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9);
+
+#ifdef UVM
+ CUdevice device = 0;
+ CUcontext pctx;
+ cudaCheck((cudaError_t) cudaSetDevice(device));
+ cudaCheck((cudaError_t) cuCtxCreate(&pctx, CU_CTX_SCHED_AUTO | CU_CTX_MAP_HOST, device));
+
+ cudaCheck(cudaMemAdvise(src, VOLUME, cudaMemAdviseSetPreferredLocation, device));
+ cudaMemPrefetchAsync(src, VOLUME, device);
+
+ cudaCheck(cudaMemAdvise(dst, VOLUME, cudaMemAdviseSetPreferredLocation, device));
+ cudaMemPrefetchAsync(dst, VOLUME, device);
+
+ float elapsed;
+ cudaEvent_t c_0, c_1;
+ cudaEventCreate(&c_0);
+ cudaEventCreate(&c_1);
+
+ cudaEventRecord(c_0);
+
+ ITER = 1000;
+ for (int t = 0; t < ITER; ++t)
+ cudacpy << < 1024, 256 >> > (src, dst, VOLUME / sizeof(float));
+
+ cudaEventRecord(c_1);
+ cudaEventSynchronize(c_1);
+ cudaEventElapsedTime(&elapsed, c_0, c_1);
+ st = elapsed / 1000 / ITER;
+ printf("GPU\nAverage time(s) %f\n", st);
+ printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9);
+#endif
+ return 0;
+}