diff --git a/pcie_bw/LICENSE.txt b/pcie_bw/LICENSE.txt new file mode 100644 index 0000000..a82be38 --- /dev/null +++ b/pcie_bw/LICENSE.txt @@ -0,0 +1,26 @@ +The MIT License + +Copyright (c) 2018- Dana-Farber Cancer Institute + 2017-2018 Broad Institute, Inc. + 2021-2022 Advanced Micro Devices, Inc. All rights reserved. + + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/pcie_bw/Makefile b/pcie_bw/Makefile new file mode 100644 index 0000000..ebe5abb --- /dev/null +++ b/pcie_bw/Makefile @@ -0,0 +1,21 @@ +SOURCES = test.cpp +OBJECTS = $(SOURCES:.cpp=.o) +EXECUTABLE = test + +CXX=hipcc +LDFLAGS= +CXXFLAGS= -std=c++17 -Wno-unused-result -fopenmp -O2 #-O0 -g #--save-temps + +all: ${EXECUTABLE} + +%.o: %.cpp + $(CXX) $(CXXFLAGS) -o $@ -c $< + +$(EXECUTABLE): $(OBJECTS) + $(CXX) $(CXXFLAGS) $(OBJECTS) -o $@ $(LDFLAGS) + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f *amdgcn* *x86_64* results* + diff --git a/pcie_bw/README.md b/pcie_bw/README.md new file mode 100644 index 0000000..2f068da --- /dev/null +++ b/pcie_bw/README.md @@ -0,0 +1,25 @@ +# pcie_bw +This standalone helps evaluate the following: +- Is transferring one large buffer faster than transferring 16 small buffers + from host to device? +- Is it better to have host buffers in pinned memory even for hipMemcpy calls? +- Is it less efficient if multiple threads launched those small buffer transfers? + +## How to build and run + +``` +make +OMP_NUM_THREADS= ./test +``` +where, +- nbuf = number of buffers to copy from HtoD (Default: 1) +- bufsize_mb = size of each buffer in MB (Default: 16) +- pinned =0 use pageable host buffer, =1 use pinned host buffer (Default: 1) +- niter = number of iterations of timing loop (Default: 100) +- async_copy = 0 for hipMemcpy for transfers, 1 for hipMemcpyAsync (Default: 0) + +A script is also included to test a range of buffer sizes. To run the script, +``` +./run.sh +``` +One could augment the script to vary any or all input arguments as needed. diff --git a/pcie_bw/run.sh b/pcie_bw/run.sh new file mode 100755 index 0000000..43f443d --- /dev/null +++ b/pcie_bw/run.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +niter=20 +nthread=4 +async=1 +pinned=1 + +#for pinned in {0,1} +#do + nbuf=1 + for bufsize in {64,128,256,512,1024,2048,4096,8192,16000,32000,48000,60000} + do + cmd="OMP_NUM_THREADS=$nthread ./test $nbuf $bufsize $pinned $niter $async" + eval $cmd + done + + nbuf=16 + for bufsize in {1,2,4,8,10,12,14,16,32,48,64,128,256,512,1024,2048} + do + cmd="OMP_NUM_THREADS=$nthread ./test $nbuf $bufsize $pinned $niter $async" + eval $cmd + done +#done + diff --git a/pcie_bw/test.cpp b/pcie_bw/test.cpp new file mode 100644 index 0000000..aea75fb --- /dev/null +++ b/pcie_bw/test.cpp @@ -0,0 +1,169 @@ +#include +#include +#include +#include +#include +#include + +#define ELAPSED(t1,t2) (t2.tv_sec-t1.tv_sec + (t2.tv_usec-t1.tv_usec)*1E-6) + +#define HIPCHECK(cmd) \ +{\ + hipError_t error = cmd;\ + if (error != hipSuccess) { \ + fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), error,__FILE__, __LINE__); \ + exit(EXIT_FAILURE);\ + }\ +} + +int main (int argc, char *argv[]) +{ + int ret, nthread=1; + int nbuf=1, bufsize_mb=16, niter=100, pinned=1, async_copy=0; + float avail_mem; + float *buf, *smbuf; + float **d_buf; + size_t bufsize_bytes, nelem; + struct timeval t1, t2; + struct sysinfo s; + hipDeviceProp_t prop; + hipStream_t stream; + + // Get number of threads +#pragma omp parallel + { + nthread = omp_get_num_threads(); + } + // --------------------------------------------------------------- + // Parse arguments + // --------------------------------------------------------------- + if (argc < 5) { + printf ("Usage: %s \n" + "where,\n" + " nbuf = number of buffers to copy from HtoD (Default: 1)\n" + " bufsize_mb = size of each buffer in MB (Default: 16)\n" + " pinned = 0 for pageable host buffer, 1 forpinned host buffer (Default: 1)\n" + " niter = number of iterations of timing loop (Default: 100)\n" + " async_copy = 0 for hipMemcpy for transfers, 1 for hipMemcpyAsync (Default: 0)\n", + argv[0]); + return 0; + } + nbuf = atoi (argv[1]); + bufsize_mb = atoi (argv[2]); + pinned = atoi (argv[3]); + niter = atoi (argv[4]); + async_copy = atoi (argv[5]); + + // --------------------------------------------------------------- + // Error-check arguments + // --------------------------------------------------------------- + if (nbuf<1) { + printf ("Expecting at least 1 buffer\n"); + return -1; + } + + // Check buffer size against limits on host and GPU + // Need to fit 1 buffer in host memory + ret = sysinfo (&s); + avail_mem = (float)s.freeram*(float)s.mem_unit/(1048576.f); + if (bufsize_mb > avail_mem) { + printf ("Buffer size is too high. Available memory on host = %.1f MB," + "requesting %d MB\n", avail_mem, bufsize_mb); + return -1; + } + // Need to fit nbuf buffers in GPU memory + HIPCHECK (hipGetDeviceProperties(&prop, 0)); + size_t gpumem_avail = prop.totalGlobalMem; + size_t gpumem_needed = (size_t)nbuf * (size_t)bufsize_mb * (size_t)1048576; + if (gpumem_avail < gpumem_needed) { + printf ("gpumem=%zu, needed=%zu\n", gpumem_avail, gpumem_needed); + printf ("Cannot fit %d buffers of size %d MB in GPU memory\n", + nbuf, bufsize_mb); + return -1; + } + + if (pinned < 0 || pinned > 1) { + printf ("pinned must be either 0 or 1\n"); + return -1; + } + + if (niter<1) { + printf ("Expecting at least 1 iteration\n"); + return -1; + } + + if (async_copy < 0 || async_copy > 1) { + printf ("async_copy must be either 0 or 1\n"); + return -1; + } + + // --------------------------------------------------------------- + // Allocate a host buffer to copy from, nbuf buffers on the device + // --------------------------------------------------------------- + bufsize_bytes = (size_t) bufsize_mb * 1048576; + nelem = bufsize_bytes/sizeof(float); + if (async_copy) { + HIPCHECK (hipStreamCreate (&stream)); + } + + // Allocate and initialize host buffer + if (!pinned) { + buf = (float *) malloc (bufsize_bytes); + } else { + HIPCHECK (hipHostMalloc (&buf, bufsize_bytes, hipHostMallocDefault)); + } + + // Initialize host buffer + for (int i=0; i