author	Tuowen Zhao <ztuowen@gmail.com>	2019-12-05 11:30:04 -0700
committer	Tuowen Zhao <ztuowen@gmail.com>	2019-12-05 11:30:04 -0700
commit	f79a4c0166f9bd3ad2e57c7dc2b2583b6edc1a63 (patch)
tree	8a2df70df4d6ba95464001135b938865785d24c1 /main.cu
parent	ae42285f61744d34e036e92644164482b22590f3 (diff)
download	atsmmap-f79a4c0166f9bd3ad2e57c7dc2b2583b6edc1a63.tar.gz
	atsmmap-f79a4c0166f9bd3ad2e57c7dc2b2583b6edc1a63.tar.bz2
	atsmmap-f79a4c0166f9bd3ad2e57c7dc2b2583b6edc1a63.zip
Add multiple copy path (HEAD, master)
Diffstat (limited to 'main.cu')
-rw-r--r--	main.cu	79
1 file changed, 54 insertions(+), 25 deletions(-)
diff --git a/main.cu b/main.cu
index 427a7d2..da8de7b 100644
--- a/main.cu
+++ b/main.cu
@@ -75,19 +75,51 @@ int main(int argc, char **argv) {
dst[i] = 0;
}
- int ITER = 100;
- double st = omp_get_wtime();
-
- for (int t = 0; t < ITER; ++t) {
+ auto hostcpy = [&](long *src, long *dst) -> void {
#pragma omp parallel for
for (long i = 0; i < VOLUME / sizeof(long); ++i)
dst[i] = src[i];
- }
+ };
+
+ auto hostbench = [&](int ITER, long *src, long *dst, std::string desc) -> void {
+ double st = omp_get_wtime();
+
+ hostcpy(src, dst);
+ for (int t = 0; t < ITER; ++t)
+ hostcpy(src, dst);
- st = (omp_get_wtime() - st) / ITER;
+ st = (omp_get_wtime() - st) / ITER;
- printf("CPU\nAverage time(s) %f\n", st);
- printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9);
+ printf("%s (host)\nAverage time(s) %f\n", desc.c_str(), st);
+ printf("Average throughput(GB/s) %f\n\n", VOLUME * 2 / st * 1e-9);
+ };
+
+ auto devcpy = [&](long *src, long *dst) -> void {
+ cudacpy << < 1024, 256 >> > (src, dst, VOLUME / sizeof(float));
+ };
+ auto devbench = [&](int ITER, long *src, long *dst, std::string desc) -> void {
+ float elapsed;
+ cudaEvent_t c_0, c_1;
+ cudaEventCreate(&c_0);
+ cudaEventCreate(&c_1);
+
+ devcpy(src, dst);
+ cudaEventRecord(c_0);
+
+ for (int t = 0; t < ITER; ++t)
+ devcpy(src, dst);
+
+ cudaEventRecord(c_1);
+ cudaEventSynchronize(c_1);
+ cudaEventElapsedTime(&elapsed, c_0, c_1);
+ double st = elapsed / 1000 / ITER;
+ printf("%s (device)\nAverage time(s) %f\n", desc.c_str(), st);
+ printf("Average throughput(GB/s) %f\n\n", VOLUME * 2 / st * 1e-9);
+ cudaEventDestroy(c_0);
+ cudaEventDestroy(c_1);
+ };
+
+ hostbench(100, src, dst, "CPU -> CPU");
#ifdef UVM
CUdevice device = 0;
@@ -95,29 +127,26 @@ int main(int argc, char **argv) {
cudaCheck((cudaError_t) cudaSetDevice(device));
cudaCheck((cudaError_t) cuCtxCreate(&pctx, CU_CTX_SCHED_AUTO | CU_CTX_MAP_HOST, device));
- cudaCheck(cudaMemAdvise(src, VOLUME, cudaMemAdviseSetPreferredLocation, device));
- cudaMemPrefetchAsync(src, VOLUME, device);
-
cudaCheck(cudaMemAdvise(dst, VOLUME, cudaMemAdviseSetPreferredLocation, device));
cudaMemPrefetchAsync(dst, VOLUME, device);
- float elapsed;
- cudaEvent_t c_0, c_1;
- cudaEventCreate(&c_0);
- cudaEventCreate(&c_1);
+ cudaCheck(cudaMemAdvise(dst, VOLUME, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
+ cudaMemPrefetchAsync(dst, VOLUME, cudaCpuDeviceId);
- cudaEventRecord(c_0);
+ devbench(300, src, dst, "GPU -> CPU");
- ITER = 1000;
- for (int t = 0; t < ITER; ++t)
- cudacpy << < 1024, 256 >> > (src, dst, VOLUME / sizeof(float));
+ hostbench(100, src, dst, "GPU -> CPU");
+
+ devbench(300, dst, src, "CPU -> GPU");
+
+ hostbench(100, dst, src, "CPU -> GPU");
+
+ cudaCheck(cudaMemAdvise(src, VOLUME, cudaMemAdviseSetPreferredLocation, device));
+ cudaMemPrefetchAsync(src, VOLUME, device);
+
+ devbench(1000, src, dst, "GPU -> GPU");
- cudaEventRecord(c_1);
- cudaEventSynchronize(c_1);
- cudaEventElapsedTime(&elapsed, c_0, c_1);
- st = elapsed / 1000 / ITER;
- printf("GPU\nAverage time(s) %f\n", st);
- printf("Average throughput(GB/s) %f\n", VOLUME * 2 / st * 1e-9);
+ hostbench(100, src, dst, "GPU -> GPU");
#endif
return 0;
}
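
Note: the cudacpy kernel launched by devcpy is defined outside this hunk, so its body is not shown here. A minimal sketch of what such a kernel could look like, assuming a plain grid-stride element-wise copy that mirrors hostcpy (the kernel name and launch configuration come from the patch; the body and the long element type are assumptions):

// Hypothetical grid-stride copy kernel; only cudacpy's name and its
// <<<1024, 256>>> launch appear in the patch, the body is a sketch.
__global__ void cudacpy(long *src, long *dst, size_t len) {
  // Each thread copies every (gridDim.x * blockDim.x)-th element,
  // so any grid size covers all len elements.
  for (size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x; i < len;
       i += (size_t)gridDim.x * blockDim.x)
    dst[i] = src[i];
}

The patch launches it as cudacpy<<<1024, 256>>>(src, dst, VOLUME / sizeof(float)), so the element count passed depends on whatever element type the real kernel uses; the long copy above only mirrors the host-side loop for illustration.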