Minimap2onGPU · gsitaram · Oct 1, 2022
diff --git a/pcie_bw/LICENSE.txt b/pcie_bw/LICENSE.txt
@@ -0,0 +1,26 @@
+The MIT License
+
+Copyright (c) 2018-     Dana-Farber Cancer Institute
+              2017-2018 Broad Institute, Inc.
+              2021-2022 Advanced Micro Devices, Inc. All rights reserved.
+
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/pcie_bw/Makefile b/pcie_bw/Makefile
@@ -0,0 +1,21 @@
+SOURCES = test.cpp 
+OBJECTS = $(SOURCES:.cpp=.o) 
+EXECUTABLE = test
+
+CXX=hipcc
+LDFLAGS=
+CXXFLAGS= -std=c++17 -Wno-unused-result -fopenmp -O2 #-O0 -g #--save-temps
+
+all: ${EXECUTABLE} 
+
+%.o: %.cpp
+	$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+$(EXECUTABLE): $(OBJECTS) 	
+	$(CXX) $(CXXFLAGS) $(OBJECTS) -o $@ $(LDFLAGS) 
+
+clean: 	
+	rm -f $(EXECUTABLE) 	
+	rm -f $(OBJECTS)
+	rm -f *amdgcn* *x86_64* results*
+
diff --git a/pcie_bw/README.md b/pcie_bw/README.md
@@ -0,0 +1,25 @@
+# pcie_bw
+This standalone helps evaluate the following:
+- Is transferring one large buffer faster than transferring 16 small buffers 
+  from host to device?
+- Is it better to have host buffers in pinned memory even for hipMemcpy calls?
+- Is it less efficient if multiple threads launched those small buffer transfers?
+
+## How to build and run
+
+```
+make
+OMP_NUM_THREADS=<nthread> ./test <nbuf> <bufsize_mb> <pinned> <iter>
+```
+where,
+- nbuf = number of buffers to copy from HtoD (Default: 1)
+- bufsize_mb = size of each buffer in MB (Default: 16)
+- pinned =0 use pageable host buffer, =1 use pinned host buffer (Default: 1)
+- niter  = number of iterations of timing loop (Default: 100)
+- async_copy = 0 for hipMemcpy for transfers, 1 for hipMemcpyAsync (Default: 0)
+
+A script is also included to test a range of buffer sizes. To run the script, 
+```
+./run.sh
+```
+One could augment the script to vary any or all input arguments as needed.
diff --git a/pcie_bw/run.sh b/pcie_bw/run.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+niter=20
+nthread=4
+async=1
+pinned=1
+
+#for pinned in {0,1}
+#do
+    nbuf=1
+    for bufsize in {64,128,256,512,1024,2048,4096,8192,16000,32000,48000,60000}
+    do
+        cmd="OMP_NUM_THREADS=$nthread ./test $nbuf $bufsize $pinned $niter $async"
+        eval $cmd
+    done
+
+    nbuf=16
+    for bufsize in {1,2,4,8,10,12,14,16,32,48,64,128,256,512,1024,2048}
+    do
+        cmd="OMP_NUM_THREADS=$nthread ./test $nbuf $bufsize $pinned $niter $async"
+        eval $cmd
+    done
+#done
+
diff --git a/pcie_bw/test.cpp b/pcie_bw/test.cpp
@@ -0,0 +1,169 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/sysinfo.h>
+#include <omp.h>
+#include <hip/hip_runtime.h>
+
+#define ELAPSED(t1,t2) (t2.tv_sec-t1.tv_sec + (t2.tv_usec-t1.tv_usec)*1E-6)
+
+#define HIPCHECK(cmd) \
+{\
+    hipError_t error  = cmd;\
+    if (error != hipSuccess) { \
+        fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \
+        exit(EXIT_FAILURE);\
+      }\
+}
+
+int main (int argc, char *argv[])
+{
+    int ret, nthread=1;
+    int nbuf=1, bufsize_mb=16, niter=100, pinned=1, async_copy=0;
+    float avail_mem;
+    float *buf, *smbuf;
+    float **d_buf;
+    size_t bufsize_bytes, nelem;
+    struct timeval t1, t2;
+    struct sysinfo s;
+    hipDeviceProp_t prop;
+    hipStream_t stream;
+
+    // Get number of threads
+#pragma omp parallel 
+    {
+        nthread = omp_get_num_threads();
+    }
+    // ---------------------------------------------------------------
+    // Parse arguments
+    // ---------------------------------------------------------------
+    if (argc < 5) {
+        printf ("Usage: %s <nbuf> <bufsize_mb> <pinned> <niter> <async_copy>\n"
+                "where,\n"
+                "    nbuf       = number of buffers to copy from HtoD (Default: 1)\n"
+                "    bufsize_mb = size of each buffer in MB (Default: 16)\n"
+                "    pinned     = 0 for pageable host buffer, 1 forpinned host buffer (Default: 1)\n"
+                "    niter      = number of iterations of timing loop (Default: 100)\n"
+                "    async_copy = 0 for hipMemcpy for transfers, 1 for hipMemcpyAsync (Default: 0)\n",
+                argv[0]);
+        return 0;
+    } 
+    nbuf = atoi (argv[1]);
+    bufsize_mb = atoi (argv[2]);
+    pinned = atoi (argv[3]);
+    niter = atoi (argv[4]);
+    async_copy = atoi (argv[5]);
+
+    // ---------------------------------------------------------------
+    // Error-check arguments 
+    // ---------------------------------------------------------------
+    if (nbuf<1) {
+        printf ("Expecting at least 1 buffer\n");
+        return -1;
+    }
+
+    // Check buffer size against limits on host and GPU
+    // Need to fit 1 buffer in host memory
+    ret = sysinfo (&s);
+    avail_mem = (float)s.freeram*(float)s.mem_unit/(1048576.f);
+    if (bufsize_mb > avail_mem) {
+        printf ("Buffer size is too high. Available memory on host = %.1f MB," 
+                "requesting %d MB\n", avail_mem, bufsize_mb);
+        return -1;
+    }
+    // Need to fit nbuf buffers in GPU memory
+    HIPCHECK (hipGetDeviceProperties(&prop, 0));
+    size_t gpumem_avail = prop.totalGlobalMem;
+    size_t gpumem_needed = (size_t)nbuf * (size_t)bufsize_mb * (size_t)1048576;
+    if (gpumem_avail < gpumem_needed) {
+        printf ("gpumem=%zu, needed=%zu\n", gpumem_avail, gpumem_needed);
+        printf ("Cannot fit %d buffers of size %d MB in GPU memory\n", 
+                nbuf, bufsize_mb);
+        return -1;
+    }
+
+    if (pinned < 0 || pinned > 1) {
+        printf ("pinned must be either 0 or 1\n");
+        return -1;
+    }
+
+    if (niter<1) {
+        printf ("Expecting at least 1 iteration\n");
+        return -1;
+    }
+
+    if (async_copy < 0 || async_copy > 1) {
+        printf ("async_copy must be either 0 or 1\n");
+        return -1;
+    }
+
+    // ---------------------------------------------------------------
+    // Allocate a host buffer to copy from, nbuf buffers on the device
+    // ---------------------------------------------------------------
+    bufsize_bytes = (size_t) bufsize_mb * 1048576;
+    nelem = bufsize_bytes/sizeof(float);
+    if (async_copy) {
+        HIPCHECK (hipStreamCreate (&stream));
+    }
+
+    // Allocate and initialize host buffer
+    if (!pinned) {
+        buf = (float *) malloc (bufsize_bytes);
+    } else {
+        HIPCHECK (hipHostMalloc (&buf, bufsize_bytes, hipHostMallocDefault));
+    }
+
+    // Initialize host buffer
+    for (int i=0; i<nelem; i++) {
+        buf[i] = (float)i;
+    }
+
+    // Allocate device buffer(s)
+    d_buf = (float **) malloc (nbuf * sizeof (float *));
+    for (int i=0; i<nbuf; i++) {
+        HIPCHECK (hipMalloc (&d_buf[i], bufsize_bytes));
+    }
+
+    // ---------------------------------------------------------------
+    // Time memcpy of multiple small  buffers
+    // ---------------------------------------------------------------
+    // Amount of data transferred 
+    float gbs = ((float)niter*nbuf*(float)bufsize_bytes)/(1024.f*1024.f*1024.f);
+
+    gettimeofday (&t1, NULL);
+    for (int iter=0; iter<niter; iter++) {
+        #pragma omp parallel for shared(d_buf, buf, bufsize_bytes) num_threads(nthread)
+        for (int i=0; i<nbuf; i++) {
+            if (!async_copy) {
+                HIPCHECK (hipMemcpy (d_buf[i], buf, bufsize_bytes, hipMemcpyHostToDevice));
+            } else {
+                HIPCHECK (hipMemcpyAsync (d_buf[i], buf, bufsize_bytes, hipMemcpyHostToDevice, stream));
+            }
+        }
+    }
+    if (async_copy) {
+        hipStreamSynchronize (stream);
+    }
+    gettimeofday (&t2, NULL);
+    printf ("nthread=%d pinned=%d niter=%d async_copy=%d nbuf=%d bufsize=%d MB, bw=%.3f GB/s\n", 
+             nthread, pinned, niter, async_copy, nbuf, bufsize_mb, gbs/ELAPSED(t1,t2));
+
+    // ---------------------------------------------------------------
+    // Cleanup
+    // ---------------------------------------------------------------
+    for (int i=0; i<nbuf; i++) {
+        HIPCHECK (hipFree (d_buf[i]));
+    }
+    free (d_buf);
+
+    if (!pinned) {
+        free(buf);
+    } else {
+        HIPCHECK (hipHostFree (buf));
+    }
+    if (async_copy) {
+        HIPCHECK (hipStreamDestroy (stream));
+    }
+
+    return 0;
+}