GPU-comcot/GPUOutput.cu at master · ypbing80/GPU-comcot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#include "GPUHeader.h"
#include "GPUConfig.h"

extern "C" void maxamp_launch_(void);
// __global__ void maximum_recorder_kernel(float *, const float * __restrict__ , const uint32_t);
__global__ void maximum_recorder_kernel_tex(float *);
extern "C" void cuda_getz_(float *);
extern "C" void cuda_getmn_(float *,float *);
extern "C" void cuda_getzmax_(float *);


extern "C" void maxamp_launch_(void){
    clock_t st, fi;
    cudaError_t err;

    // st = clock();
    // maximum_recorder_kernel <<< GridMaxAmp, MAXAMP_BLOCK >>> (Zmax_hst, Zout_hst, size_hst[2]);
    // cudaDeviceSynchronize();
    // err = cudaGetLastError();
    // cudaERROR(err);
    // fi = clock();

    st = clock();
    maximum_recorder_kernel_tex <<< DimGridMaxAmp, DimBlockMaxAmp >>> (Zmax_hst);
    cudaDeviceSynchronize();
    err = cudaGetLastError();
    cudaERROR(err);
    fi = clock();


    #ifdef DEBUG_MAXAMP_KERNEL
        printf("GPU MAX_AMP\n" );
        printf("TIME SPENT ON GPU %f\n",(float)(fi-st)/CLOCKS_PER_SEC);
    #endif
}


// __global__ void maximum_recorder_kernel(float *max_array, const float* __restrict__ array, const uint32_t array_size){
//     uint32_t id=blockIdx.x*blockDim.x + threadIdx.x;
//     if (id < array_size) {
//         float current_max_value = max_array[id];
//         if (current_max_value >= array[id]) return;
//         else max_array[id] = array[id];
//     }
// }

__global__ void maximum_recorder_kernel_tex(float *max_array){
    uint32_t x=blockIdx.x*blockDim.x + threadIdx.x;
    uint32_t y=blockIdx.y*blockDim.y + threadIdx.y;
    float eta_now;
    float eta_max;
    if (x < size_dev[0] && y < size_dev[1]) {
        // if (tex2D<float2>(ZHtext, x, y).y > 0.0f) {
            eta_now = tex2D<float2>(ZHtext, x, y).x;
            eta_max = max_array[ID(x,y)];
            if (eta_max < eta_now) {
                max_array[ID(x,y)] = eta_now;
            }
        // }
    }
}

extern "C" void cuda_getz_(float *Z_f) {
    // cudaCHK( cudaMemcpy(Z_f, Zout_hst, size_hst[3], cudaMemcpyDeviceToHost) );
    cudaCHK( cudaMemcpy2D(ZHcontainer, size_hst[0]*sizeof(float2), ZHout_pitchedMEM_hst, LayerPitch, size_hst[0]*sizeof(float2), size_hst[1], cudaMemcpyDeviceToHost) );
    for (size_t i = 0; i < size_hst[1]; i++) {
        for (size_t j = 0; j < size_hst[0]; j++) {
            Z_f[i*size_hst[0] + j] = ZHcontainer[i*size_hst[0] + j].x;
        }
    }
}

extern "C" void cuda_getmn_(float *M_f, float *N_f) {
    cudaCHK( cudaMemcpy2D(MNcontainer, size_hst[0]*sizeof(float2), MNout_pitchedMEM_hst, LayerPitch, size_hst[0]*sizeof(float2), size_hst[1], cudaMemcpyDeviceToHost) );
    for (size_t j = 0; j < size_hst[1]; j++) {
        for (size_t i = 0; i < size_hst[0]; i++) {
            M_f[j*size_hst[0]+i] = MNcontainer[j*size_hst[0]+i].x;
            N_f[j*size_hst[0]+i] = MNcontainer[j*size_hst[0]+i].y;
        }
    }
    // cudaCHK( cudaMemcpy(M_f, MNout_hst, size_hst[3], cudaMemcpyDeviceToHost) );
    // cudaCHK( cudaMemcpy(N_f, MNout_hst+size_hst[2], size_hst[3], cudaMemcpyDeviceToHost) );
}

extern "C" void cuda_getzmax_(float *Zmax_f) {
    cudaCHK( cudaMemcpy(Zmax_f, Zmax_hst, size_hst[3], cudaMemcpyDeviceToHost) );
}

// void cuda_getmnmax_(float *Mmax_f, float *Nmax_f) {
//     cudaCHK( cudaMemcpy(Mmax_f, MNmax_hst, size_hst[3], cudaMemcpyDeviceToHost) );
//     cudaCHK( cudaMemcpy(Nmax_f, MNmax_hst+size_hst[2], size_hst[3], cudaMemcpyDeviceToHost) );
// }