diff options
author | Tuowen Zhao <ztuowen@gmail.com> | 2019-12-04 21:19:19 -0700 |
---|---|---|
committer | Tuowen Zhao <ztuowen@gmail.com> | 2019-12-04 21:19:19 -0700 |
commit | ae42285f61744d34e036e92644164482b22590f3 (patch) | |
tree | 2ae323a29d9162b7b37a778f9e7c3725230d714f | |
parent | c0b34b3f764e12d296f6126eba3b930559bbcd42 (diff) | |
download | atsmmap-ae42285f61744d34e036e92644164482b22590f3.tar.gz atsmmap-ae42285f61744d34e036e92644164482b22590f3.tar.bz2 atsmmap-ae42285f61744d34e036e92644164482b22590f3.zip |
GPU bench
-rw-r--r-- | CMakeLists.txt | 13 | ||||
-rw-r--r-- | main.cpp | 64 | ||||
-rw-r--r-- | main.cu | 123 |
3 files changed, 132 insertions, 68 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index b24a5e4..4c02b00 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,15 @@ cmake_minimum_required(VERSION 3.15) -project(atsmmap) +project(atsmmap LANGUAGES CXX CUDA) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CUDA_STANDARD 11) +set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) find_package(OpenMP REQUIRED) -set(CMAKE_C_FLAGS ${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS} -march=native") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"${OpenMP_CXX_FLAGS} -march=native\"") -add_executable(atsmmap main.cpp)
\ No newline at end of file +add_executable(atsmmap main.cu) +target_link_options(atsmmap PRIVATE "${OpenMP_CXX_FLAGS}") +target_link_libraries(atsmmap rt cuda)
\ No newline at end of file diff --git a/main.cpp b/main.cpp deleted file mode 100644 index 0cb47fa..0000000 --- a/main.cpp +++ /dev/null @@ -1,64 +0,0 @@ -#include <iostream> -#include <unistd.h> -#include <sys/mman.h> -#include <string> -#include <omp.h> - -#define GIG (1024ul*1024*1024) -#define VOLUME (1024ul*1024*1024) - -// Real values is in the ~65536, only use half of it to be safe -// sysctl vm.max_map_count -#define MAXMMAPS 32768 - -int main(int argc, char **argv) { - auto page_size = sysconf(_SC_PAGESIZE); - std::cout << "The system have a page size of " << page_size << std::endl; - // Src is using anonymous mapping - long nmaps = 1024; - if (argc > 1) { - nmaps = std::stoi(argv[1]); - if (VOLUME / page_size % nmaps != 0) { - std::cout << "nmaps is not perfect multiple, quit" << std::endl; - return 0; - } - } - - long mmap_sz = VOLUME / nmaps; - std::cout << "Each mapped region is of size(pages) " << mmap_sz/page_size << std::endl; - - auto dst = (long*)malloc(VOLUME); - - uint8_t *hint = (uint8_t*)0x600000000000UL; - hint -= VOLUME; - auto src = (long*)hint; - for (long i = 0; i < nmaps; ++i) { - auto r = mmap(hint, mmap_sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); - if (r == MAP_FAILED || r != hint) - printf("MMAP failed somehow\n"); - hint += mmap_sz; - } - -#pragma omp parallel for - for (long i = 0; i < VOLUME/sizeof(long); ++i) { - src[i] = i; - dst[i] = 0; - } - - int ITER = 100; - double st = omp_get_wtime(); - - for (int t = 0; t < ITER; ++t) { -#pragma omp parallel for - for (long i = 0; i < VOLUME/sizeof(long); ++i) - dst[i] = src[i]; - } - - st = (omp_get_wtime() - st) / ITER; - - printf("Average time(s) %f\n", st); -// std::cout << "Average time(s) " << st << std::endl; - printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9); -// std::cout << "Average throughput(GB/s) " << VOLUME * 2 / st * 1e-9 << std::endl; - return 0; -} @@ -0,0 +1,123 @@ +#include <iostream> +#include <unistd.h> +#include <sys/mman.h> +#include <string> +#include <omp.h> +#include <cuda.h> +#include <cuda_runtime.h> + +#define UVM + +#ifndef NDEBUG +#define cudaCheck(x) x +#else +#include <cstdio> +#define cudaCheck(x) _cudaCheck(x, #x ,__FILE__, __LINE__) +#endif + +template<typename T> +void _cudaCheck(T e, const char *func, const char *call, const int line) { + if (e != cudaSuccess) { + printf("\"%s\" at %d in %s\n\treturned %d\n-> %s\n", func, line, call, (int) e, cudaGetErrorString(e)); + exit(EXIT_FAILURE); + } +} + +#define GIG (1024ul*1024*1024) +#define VOLUME (2ul*GIG) + +// Real values is in the ~65536, only use half of it to be safe +// sysctl vm.max_map_count +#define MAXMMAPS 32768 + +__global__ void +cudacpy(long *src, long *dst, long len) { + long st = len / gridDim.x; + long ed = min(st * (blockIdx.x + 1), len); + st = st * blockIdx.x; + st += threadIdx.x; + for (; st < ed; st += blockDim.x) + dst[st] = src[st]; +} + +int main(int argc, char **argv) { + auto page_size = sysconf(_SC_PAGESIZE); + std::cout << "The system have a page size of " << page_size << std::endl; + // Src is using anonymous mapping + long nmaps = 1024; + if (argc > 1) { + nmaps = std::stoi(argv[1]); + if (VOLUME / page_size % nmaps != 0) { + std::cout << "nmaps is not perfect multiple, quit" << std::endl; + return 0; + } + } + + long mmap_sz = VOLUME / nmaps; + std::cout << "Each mapped region is of size(pages) " << mmap_sz / page_size << std::endl; + + auto dst = (long *) malloc(VOLUME); + + uint8_t *hint = (uint8_t *) 0x600000000000UL; + hint -= VOLUME; + auto src = (long *) hint; + + for (long i = 0; i < nmaps; ++i) { + auto r = mmap(hint, mmap_sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + if (r == MAP_FAILED || r != hint) + printf("MMAP failed somehow\n"); + hint += mmap_sz; + } + +#pragma omp parallel for + for (long i = 0; i < VOLUME / sizeof(long); ++i) { + src[i] = i; + dst[i] = 0; + } + + int ITER = 100; + double st = omp_get_wtime(); + + for (int t = 0; t < ITER; ++t) { +#pragma omp parallel for + for (long i = 0; i < VOLUME / sizeof(long); ++i) + dst[i] = src[i]; + } + + st = (omp_get_wtime() - st) / ITER; + + printf("CPU\nAverage time(s) %f\n", st); + printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9); + +#ifdef UVM + CUdevice device = 0; + CUcontext pctx; + cudaCheck((cudaError_t) cudaSetDevice(device)); + cudaCheck((cudaError_t) cuCtxCreate(&pctx, CU_CTX_SCHED_AUTO | CU_CTX_MAP_HOST, device)); + + cudaCheck(cudaMemAdvise(src, VOLUME, cudaMemAdviseSetPreferredLocation, device)); + cudaMemPrefetchAsync(src, VOLUME, device); + + cudaCheck(cudaMemAdvise(dst, VOLUME, cudaMemAdviseSetPreferredLocation, device)); + cudaMemPrefetchAsync(dst, VOLUME, device); + + float elapsed; + cudaEvent_t c_0, c_1; + cudaEventCreate(&c_0); + cudaEventCreate(&c_1); + + cudaEventRecord(c_0); + + ITER = 1000; + for (int t = 0; t < ITER; ++t) + cudacpy << < 1024, 256 >> > (src, dst, VOLUME / sizeof(float)); + + cudaEventRecord(c_1); + cudaEventSynchronize(c_1); + cudaEventElapsedTime(&elapsed, c_0, c_1); + st = elapsed / 1000 / ITER; + printf("GPU\nAverage time(s) %f\n", st); + printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9); +#endif + return 0; +} |