From 88304262394fe5e23ff5bad72679df5a286b1532 Mon Sep 17 00:00:00 2001
From: Duc Vu Nguyen <nguyenvuduc@gmail.com>
Date: Wed, 6 Mar 2024 17:38:06 -0700
Subject: [PATCH] Fix incorrect results when the number of elements is not
 evenly divisible by 10 or 2

---
 maxgpu-2point.cu | 17 ++++++++++-------
 maxgpu.cu        | 30 ++++++++++++++++++++++--------
 2 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/maxgpu-2point.cu b/maxgpu-2point.cu
index 47b6e12..e8d37d2 100644
--- a/maxgpu-2point.cu
+++ b/maxgpu-2point.cu
@@ -25,7 +25,7 @@ int main(int argc, char *argv[])
     size = atol(argv[1]);
 
     //calculates number of blocks
-    unsigned int NUM_BLOCKS = size/THREADS_PER_BLOCK;
+    unsigned int NUM_BLOCKS = (size + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
 
     numbers = (unsigned int *)malloc(size * sizeof(unsigned int));
     if( !numbers )
@@ -54,7 +54,7 @@ int main(int argc, char *argv[])
         printf("element in %d: %u\n", i, numbers[i]);
       }
     }
-     printf("The max integer in the array is: %d\n", numbers[0]);
+     printf("The max integer in the array found by the GPU is: %d\n", numbers[0]);
     //free device matrices
     cudaFree(d_numbers);
     free(numbers);
@@ -67,18 +67,21 @@ __global__ void get_max(unsigned int* num, unsigned int size){
   unsigned int nTotalThreads = size;
 
   while(nTotalThreads > 1){
-    unsigned int halfPoint = nTotalThreads / 2;	// divide by two
+    unsigned int halfPoint = (nTotalThreads + 1) / 2;	// divide by two
     // only the first half of the threads will be active.
     if (index < halfPoint){
-      temp = num[ index + halfPoint ];
-      if (temp > num[ index ]) {
-        num[index] = temp;
+      if (index + halfPoint < size)
+      {
+        temp = num[ index + halfPoint ];
+        if (temp > num[ index ]) {
+          num[index] = temp;
+        }
       }
     }
     __syncthreads();
 
 
-    nTotalThreads = nTotalThreads / 2;	// divide by two.
+    nTotalThreads = (nTotalThreads + 1) / 2;	// divide by two.
   }
 }
 
diff --git a/maxgpu.cu b/maxgpu.cu
index d43741e..2bc2292 100644
--- a/maxgpu.cu
+++ b/maxgpu.cu
@@ -42,10 +42,11 @@ int main(int argc, char *argv[])
     unsigned int sizea = size;
     while(sizea > 1){
       getmaxcu<<<NUM_BLOCKS, THREADS_PER_BLOCK>>>(d_numbers, sizea);
-      sizea = (sizea) / 10;
+      sizea = (sizea + 9) / 10;
     }
     cudaMemcpy(numbers, d_numbers, size * sizeof(unsigned int), cudaMemcpyDeviceToHost);
-     printf("The max integer in the array is: %d\n", numbers[0]);
+     printf("The max integer in the array found by the GPU is: %d\n", numbers[0]);
+     printf("While the max integer in the array found by the CPU is: %d\n", getmax(numbers, size));
     //free device matrices
     cudaFree(d_numbers);
     free(numbers);
@@ -57,13 +58,26 @@ __global__ void getmaxcu(unsigned int* num, unsigned int size){
   unsigned int index = threadIdx.x + (blockDim.x * blockIdx.x);
   unsigned int nTotalThreads = size;
   unsigned int i;
-    unsigned int tenPoint = nTotalThreads / 10;	// divide by ten
+    unsigned int tenPoint = (nTotalThreads + 9) / 10;	// divide by ten
     if(index < tenPoint){
-      for(i = 1; i < 10; i++){
-        temp = num[index + tenPoint*i];
-        //compare to "0" index
-        if(temp > num[index]){
-          num[index] = temp;
+      if (index + tenPoint * 9 >= size)
+      {
+        for(i = 1; i < 9; i++){
+          temp = num[index + tenPoint*i];
+          //compare to "0" index
+          if(temp > num[index]){
+            num[index] = temp;
+          }
+        }
+      }
+      else
+      {
+        for(i = 1; i < 10; i++){
+          temp = num[index + tenPoint*i];
+          //compare to "0" index
+          if(temp > num[index]){
+            num[index] = temp;
+          }
         }
       }
     }