From 88304262394fe5e23ff5bad72679df5a286b1532 Mon Sep 17 00:00:00 2001 From: Duc Vu Nguyen Date: Wed, 6 Mar 2024 17:38:06 -0700 Subject: [PATCH] Fix incorrect max when the number of elements is not evenly divisible by 10 or 2 --- maxgpu-2point.cu | 17 ++++++++++------- maxgpu.cu | 30 ++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/maxgpu-2point.cu b/maxgpu-2point.cu index 47b6e12..e8d37d2 100644 --- a/maxgpu-2point.cu +++ b/maxgpu-2point.cu @@ -25,7 +25,7 @@ int main(int argc, char *argv[]) size = atol(argv[1]); //calculates number of blocks - unsigned int NUM_BLOCKS = size/THREADS_PER_BLOCK; + unsigned int NUM_BLOCKS = (size + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK; numbers = (unsigned int *)malloc(size * sizeof(unsigned int)); if( !numbers ) @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) printf("element in %d: %u\n", i, numbers[i]); } } - printf("The max integer in the array is: %d\n", numbers[0]); + printf("The max integer in the array found by the GPU is: %d\n", numbers[0]); //free device matrices cudaFree(d_numbers); free(numbers); @@ -67,18 +67,21 @@ __global__ void get_max(unsigned int* num, unsigned int size){ unsigned int nTotalThreads = size; while(nTotalThreads > 1){ - unsigned int halfPoint = nTotalThreads / 2; // divide by two + unsigned int halfPoint = (nTotalThreads + 1) / 2; // divide by two // only the first half of the threads will be active. if (index < halfPoint){ - temp = num[ index + halfPoint ]; - if (temp > num[ index ]) { - num[index] = temp; + if (index + halfPoint < size) + { + temp = num[ index + halfPoint ]; + if (temp > num[ index ]) { + num[index] = temp; + } } } __syncthreads(); - nTotalThreads = nTotalThreads / 2; // divide by two. 
} } diff --git a/maxgpu.cu b/maxgpu.cu index d43741e..2bc2292 100644 --- a/maxgpu.cu +++ b/maxgpu.cu @@ -42,10 +42,11 @@ int main(int argc, char *argv[]) unsigned int sizea = size; while(sizea > 1){ getmaxcu<<>>(d_numbers, sizea); - sizea = (sizea) / 10; + sizea = (sizea + 9) / 10; } cudaMemcpy(numbers, d_numbers, size * sizeof(unsigned int), cudaMemcpyDeviceToHost); - printf("The max integer in the array is: %d\n", numbers[0]); + printf("The max integer in the array found by the GPU is: %d\n", numbers[0]); + printf("While the max integer in the array found by the CPU is: %d\n", getmax(numbers, size)); //free device matrices cudaFree(d_numbers); free(numbers); @@ -57,13 +58,26 @@ __global__ void getmaxcu(unsigned int* num, unsigned int size){ unsigned int index = threadIdx.x + (blockDim.x * blockIdx.x); unsigned int nTotalThreads = size; unsigned int i; - unsigned int tenPoint = nTotalThreads / 10; // divide by ten + unsigned int tenPoint = (nTotalThreads + 9) / 10; // divide by ten if(index < tenPoint){ - for(i = 1; i < 10; i++){ - temp = num[index + tenPoint*i]; - //compare to "0" index - if(temp > num[index]){ - num[index] = temp; + if (index + tenPoint * 9 >= size) + { + for(i = 1; i < 9; i++){ + temp = num[index + tenPoint*i]; + //compare to "0" index + if(temp > num[index]){ + num[index] = temp; + } + } + } + else + { + for(i = 1; i < 10; i++){ + temp = num[index + tenPoint*i]; + //compare to "0" index + if(temp > num[index]){ + num[index] = temp; + } } } }