author    | Tuowen Zhao <ztuowen@gmail.com> | 2019-12-05 11:30:04 -0700
committer | Tuowen Zhao <ztuowen@gmail.com> | 2019-12-05 11:30:04 -0700
commit    | f79a4c0166f9bd3ad2e57c7dc2b2583b6edc1a63 (patch)
tree      | 8a2df70df4d6ba95464001135b938865785d24c1 /main.cu
parent    | ae42285f61744d34e036e92644164482b22590f3 (diff)
Diffstat (limited to 'main.cu')
-rw-r--r-- | main.cu | 79
1 file changed, 54 insertions, 25 deletions
@@ -75,19 +75,51 @@ int main(int argc, char **argv) {
     dst[i] = 0;
   }
 
-  int ITER = 100;
-  double st = omp_get_wtime();
-
-  for (int t = 0; t < ITER; ++t) {
+  auto hostcpy = [&](long *src, long *dst) -> void {
 #pragma omp parallel for
     for (long i = 0; i < VOLUME / sizeof(long); ++i)
       dst[i] = src[i];
-  }
+  };
+
+  auto hostbench = [&](int ITER, long *src, long *dst, std::string desc) -> void {
+    double st = omp_get_wtime();
+
+    hostcpy(src, dst);
+    for (int t = 0; t < ITER; ++t)
+      hostcpy(src, dst);
 
-  st = (omp_get_wtime() - st) / ITER;
+    st = (omp_get_wtime() - st) / ITER;
 
-  printf("CPU\nAverage time(s) %f\n", st);
-  printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9);
+    printf("%s (host)\nAverage time(s) %f\n", desc.c_str(), st);
+    printf("Average throughput(GB/s) %f\n\n", VOLUME * 2 / st * 1e-9);
+  };
+
+  auto devcpy = [&](long *src, long *dst) -> void {
+    cudacpy << < 1024, 256 >> > (src, dst, VOLUME / sizeof(float));
+  };
+  auto devbench = [&](int ITER, long *src, long *dst, std::string desc) -> void {
+    float elapsed;
+    cudaEvent_t c_0, c_1;
+    cudaEventCreate(&c_0);
+    cudaEventCreate(&c_1);
+
+    devcpy(src, dst);
+    cudaEventRecord(c_0);
+
+    for (int t = 0; t < ITER; ++t)
+      devcpy(src, dst);
+
+    cudaEventRecord(c_1);
+    cudaEventSynchronize(c_1);
+    cudaEventElapsedTime(&elapsed, c_0, c_1);
+    double st = elapsed / 1000 / ITER;
+    printf("%s (device)\nAverage time(s) %f\n", desc.c_str(), st);
+    printf("Average throughput(GB/s) %f\n\n", VOLUME * 2 / st * 1e-9);
+    cudaEventDestroy(c_0);
+    cudaEventDestroy(c_1);
+  };
+
+  hostbench(100, src, dst, "CPU -> CPU");
 
 #ifdef UVM
   CUdevice device = 0;
@@ -95,29 +127,26 @@ int main(int argc, char **argv) {
   cudaCheck((cudaError_t) cudaSetDevice(device));
   cudaCheck((cudaError_t) cuCtxCreate(&pctx, CU_CTX_SCHED_AUTO | CU_CTX_MAP_HOST, device));
 
-  cudaCheck(cudaMemAdvise(src, VOLUME, cudaMemAdviseSetPreferredLocation, device));
-  cudaMemPrefetchAsync(src, VOLUME, device);
-  cudaCheck(cudaMemAdvise(dst, VOLUME, cudaMemAdviseSetPreferredLocation, device));
   cudaMemPrefetchAsync(dst, VOLUME, device);
-  float elapsed;
-  cudaEvent_t c_0, c_1;
-  cudaEventCreate(&c_0);
-  cudaEventCreate(&c_1);
+  cudaCheck(cudaMemAdvise(dst, VOLUME, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
+  cudaMemPrefetchAsync(dst, VOLUME, cudaCpuDeviceId);
 
-  cudaEventRecord(c_0);
+  devbench(300, src, dst, "GPU -> CPU");
 
-  ITER = 1000;
-  for (int t = 0; t < ITER; ++t)
-    cudacpy << < 1024, 256 >> > (src, dst, VOLUME / sizeof(float));
+  hostbench(100, src, dst, "GPU -> CPU");
+
+  devbench(300, dst, src, "CPU -> GPU");
+
+  hostbench(100, dst, src, "CPU -> GPU");
+
+  cudaCheck(cudaMemAdvise(src, VOLUME, cudaMemAdviseSetPreferredLocation, device));
+  cudaMemPrefetchAsync(src, VOLUME, device);
+
+  devbench(1000, src, dst, "GPU -> GPU");
 
-  cudaEventRecord(c_1);
-  cudaEventSynchronize(c_1);
-  cudaEventElapsedTime(&elapsed, c_0, c_1);
-  st = elapsed / 1000 / ITER;
-  printf("GPU\nAverage time(s) %f\n", st);
-  printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9);
+  hostbench(100, src, dst, "GPU -> GPU");
 #endif
   return 0;
 }
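Note: the `cudacpy` kernel launched by `devcpy` is defined elsewhere in main.cu and does not appear in this hunk. For reference, a grid-stride copy kernel compatible with the `<<<1024, 256>>>` launch and the event-timed loop above might look like the following minimal sketch; the element type, count semantics, and buffer size are assumptions, not the file's actual definition.

#include <cuda_runtime.h>
#include <cstdio>

// Hypothetical grid-stride copy kernel standing in for cudacpy (assumption:
// long elements and an element count, since the real body is not in this diff).
__global__ void cudacpy(long *src, long *dst, long n) {
  long stride = (long)gridDim.x * blockDim.x;
  for (long i = (long)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride)
    dst[i] = src[i];
}

int main() {
  const long n = 1l << 24;                 // placeholder element count
  const int ITER = 100;
  long *src, *dst;
  cudaMallocManaged(&src, n * sizeof(long));
  cudaMallocManaged(&dst, n * sizeof(long));

  // Event-timed launch loop, mirroring the devbench lambda above.
  cudaEvent_t c_0, c_1;
  cudaEventCreate(&c_0);
  cudaEventCreate(&c_1);

  cudacpy<<<1024, 256>>>(src, dst, n);     // warm-up launch outside the timed region
  cudaEventRecord(c_0);
  for (int t = 0; t < ITER; ++t)
    cudacpy<<<1024, 256>>>(src, dst, n);
  cudaEventRecord(c_1);
  cudaEventSynchronize(c_1);

  float elapsed;
  cudaEventElapsedTime(&elapsed, c_0, c_1);             // milliseconds
  double st = elapsed / 1000 / ITER;                    // seconds per iteration
  printf("Average throughput(GB/s) %f\n", 2.0 * n * sizeof(long) / st * 1e-9);

  cudaEventDestroy(c_0);
  cudaEventDestroy(c_1);
  cudaFree(src);
  cudaFree(dst);
  return 0;
}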
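For context, the `#ifdef UVM` branch steers each unified-memory buffer to a preferred location before timing, so the same copy loop measures CPU-to-CPU, GPU-to-CPU, CPU-to-GPU, or GPU-to-GPU traffic depending on where the pages sit. A minimal sketch of that placement pattern, assuming the buffers come from cudaMallocManaged (as cudaMemAdvise/cudaMemPrefetchAsync require); VOLUME and the device id here are placeholders, not the program's actual values.

#include <cuda_runtime.h>

// Sketch of the advise/prefetch placement used before each benchmark above.
int main() {
  const size_t VOLUME = 1ul << 30;   // assumed buffer size, 1 GiB
  int device = 0;                    // assumed GPU ordinal
  long *src, *dst;
  cudaMallocManaged(&src, VOLUME);
  cudaMallocManaged(&dst, VOLUME);

  // Prefer src on the GPU: faults and prefetches migrate its pages to the device.
  cudaMemAdvise(src, VOLUME, cudaMemAdviseSetPreferredLocation, device);
  cudaMemPrefetchAsync(src, VOLUME, device);

  // Keep dst resident on the host, so device-side writes cross the interconnect.
  cudaMemAdvise(dst, VOLUME, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
  cudaMemPrefetchAsync(dst, VOLUME, cudaCpuDeviceId);

  cudaDeviceSynchronize();           // ensure prefetches complete before timing starts
  // ... run the copy benchmarks here ...

  cudaFree(src);
  cudaFree(dst);
  return 0;
}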