summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTuowen Zhao <ztuowen@gmail.com>2019-12-04 14:50:16 -0700
committerTuowen Zhao <ztuowen@gmail.com>2019-12-04 14:50:16 -0700
commitc0b34b3f764e12d296f6126eba3b930559bbcd42 (patch)
treed2d5a4a39b58bb6f654394f9db465584cd288de8
parent79ba7c05e35b18aa2dc24da57399cfbb94a07d0e (diff)
downloadatsmmap-c0b34b3f764e12d296f6126eba3b930559bbcd42.tar.gz
atsmmap-c0b34b3f764e12d296f6126eba3b930559bbcd42.tar.bz2
atsmmap-c0b34b3f764e12d296f6126eba3b930559bbcd42.zip
CPU bench
-rw-r--r--CMakeLists.txt4
-rw-r--r--README.md10
-rw-r--r--main.cpp56
3 files changed, 60 insertions, 10 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 58e7603..b24a5e4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,4 +3,8 @@ project(atsmmap)
set(CMAKE_CXX_STANDARD 14)
+find_package(OpenMP REQUIRED)
+set(CMAKE_C_FLAGS ${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS})
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS} -march=native")
+
add_executable(atsmmap main.cpp) \ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0a59bc0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,10 @@
+# Profiling performance relations between mmap and unified memory with host allocator
+
+## Unified memory with host allocator
+
+This can be realized using the Address Translation Services (ATS) on Power9 or Heterogeneous Memory Management (HMM).
+
+## Problem description
+
+Every mmap region can add a normally imperceptible latency to address resolution. This problem is less visible on
+ CPU due to xxx. However, on GPU, this can contribute to visible latency. \ No newline at end of file
diff --git a/main.cpp b/main.cpp
index 43aba7f..0cb47fa 100644
--- a/main.cpp
+++ b/main.cpp
@@ -2,9 +2,10 @@
#include <unistd.h>
#include <sys/mman.h>
#include <string>
+#include <omp.h>
-#define GIG (1024*1024*1024ul)
-#define VOLUME (2ul*GIG)
+#define GIG (1024ul*1024*1024)
+#define VOLUME (1024ul*1024*1024)
// The real limit is around 65536; only use half of it to be safe
// sysctl vm.max_map_count
@@ -13,16 +14,51 @@
int main(int argc, char **argv) {
auto page_size = sysconf(_SC_PAGESIZE);
std::cout << "The system have a page size of " << page_size << std::endl;
- auto dst = (long*)malloc(VOLUME);
// Src is using anonymous mapping
- long nmaps = std::stoi(argv[1]);
- if (GIG % (page_size * nmaps) != 0) {
- std::cout << "nmaps is not perfect multiple" << std::endl;
- nmaps = GIG / (page_size * nmaps);
+ long nmaps = 1024;
+ if (argc > 1) {
+ nmaps = std::stoi(argv[1]);
+ if (VOLUME / page_size % nmaps != 0) {
+ std::cout << "nmaps is not perfect multiple, quit" << std::endl;
+ return 0;
+ }
}
- std::cout << argv[1] << std::endl;
+
+ long mmap_sz = VOLUME / nmaps;
+ std::cout << "Each mapped region is of size(pages) " << mmap_sz/page_size << std::endl;
+
+ auto dst = (long*)malloc(VOLUME);
+
uint8_t *hint = (uint8_t*)0x600000000000UL;
- mmap(hint);
- auto src =
+ hint -= VOLUME;
+ auto src = (long*)hint;
+ for (long i = 0; i < nmaps; ++i) {
+ auto r = mmap(hint, mmap_sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
+ if (r == MAP_FAILED || r != hint)
+ printf("MMAP failed somehow\n");
+ hint += mmap_sz;
+ }
+
+#pragma omp parallel for
+ for (long i = 0; i < VOLUME/sizeof(long); ++i) {
+ src[i] = i;
+ dst[i] = 0;
+ }
+
+ int ITER = 100;
+ double st = omp_get_wtime();
+
+ for (int t = 0; t < ITER; ++t) {
+#pragma omp parallel for
+ for (long i = 0; i < VOLUME/sizeof(long); ++i)
+ dst[i] = src[i];
+ }
+
+ st = (omp_get_wtime() - st) / ITER;
+
+ printf("Average time(s) %f\n", st);
+// std::cout << "Average time(s) " << st << std::endl;
+ printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9);
+// std::cout << "Average throughput(GB/s) " << VOLUME * 2 / st * 1e-9 << std::endl;
return 0;
}