From c0b34b3f764e12d296f6126eba3b930559bbcd42 Mon Sep 17 00:00:00 2001 From: Tuowen Zhao Date: Wed, 4 Dec 2019 14:50:16 -0700 Subject: CPU bench --- CMakeLists.txt | 4 ++++ README.md | 10 ++++++++++ main.cpp | 56 ++++++++++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 60 insertions(+), 10 deletions(-) create mode 100644 README.md diff --git a/CMakeLists.txt b/CMakeLists.txt index 58e7603..b24a5e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,4 +3,8 @@ project(atsmmap) set(CMAKE_CXX_STANDARD 14) +find_package(OpenMP REQUIRED) +set(CMAKE_C_FLAGS ${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS} -march=native") + add_executable(atsmmap main.cpp) \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..0a59bc0 --- /dev/null +++ b/README.md @@ -0,0 +1,10 @@ +# Profiling the performance relationship between mmap and unified memory with host allocator + +## Unified memory with host allocator + +This can be realized using the Address Translation Service (ATS) on Power9 or Heterogeneous Memory Management (HMM). + +## Problem description + +Every mmap region can add a normally imperceptible latency to address resolution. This problem is less visible on + CPU due to xxx. However, on a GPU, this can contribute to visible latency. 
\ No newline at end of file diff --git a/main.cpp b/main.cpp index 43aba7f..0cb47fa 100644 --- a/main.cpp +++ b/main.cpp @@ -2,9 +2,10 @@ #include #include #include +#include -#define GIG (1024*1024*1024ul) -#define VOLUME (2ul*GIG) +#define GIG (1024ul*1024*1024) +#define VOLUME (1024ul*1024*1024) // Real values is in the ~65536, only use half of it to be safe // sysctl vm.max_map_count @@ -13,16 +14,51 @@ int main(int argc, char **argv) { auto page_size = sysconf(_SC_PAGESIZE); std::cout << "The system have a page size of " << page_size << std::endl; - auto dst = (long*)malloc(VOLUME); // Src is using anonymous mapping - long nmaps = std::stoi(argv[1]); - if (GIG % (page_size * nmaps) != 0) { - std::cout << "nmaps is not perfect multiple" << std::endl; - nmaps = GIG / (page_size * nmaps); + long nmaps = 1024; + if (argc > 1) { + nmaps = std::stoi(argv[1]); + if (VOLUME / page_size % nmaps != 0) { + std::cout << "nmaps is not perfect multiple, quit" << std::endl; + return 0; + } } - std::cout << argv[1] << std::endl; + + long mmap_sz = VOLUME / nmaps; + std::cout << "Each mapped region is of size(pages) " << mmap_sz/page_size << std::endl; + + auto dst = (long*)malloc(VOLUME); + uint8_t *hint = (uint8_t*)0x600000000000UL; - mmap(hint); - auto src = + hint -= VOLUME; + auto src = (long*)hint; + for (long i = 0; i < nmaps; ++i) { + auto r = mmap(hint, mmap_sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); + if (r == MAP_FAILED || r != hint) + printf("MMAP failed somehow\n"); + hint += mmap_sz; + } + +#pragma omp parallel for + for (long i = 0; i < VOLUME/sizeof(long); ++i) { + src[i] = i; + dst[i] = 0; + } + + int ITER = 100; + double st = omp_get_wtime(); + + for (int t = 0; t < ITER; ++t) { +#pragma omp parallel for + for (long i = 0; i < VOLUME/sizeof(long); ++i) + dst[i] = src[i]; + } + + st = (omp_get_wtime() - st) / ITER; + + printf("Average time(s) %f\n", st); +// std::cout << "Average time(s) " << st << std::endl; + printf("Average throughput(GB/s) 
%f\n", VOLUME * 2 / st * 1e-9); +// std::cout << "Average throughput(GB/s) " << VOLUME * 2 / st * 1e-9 << std::endl; return 0; } -- cgit v1.2.3-70-g09d2