-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodularity.cu
More file actions
executable file
·142 lines (111 loc) · 4.1 KB
/
modularity.cu
File metadata and controls
executable file
·142 lines (111 loc) · 4.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#include "louvain_cuda.cuh"
// #include "louvain_cuda_cpp_interface.hpp"
#include "GpuGraph.cuh"
#include <cstring>
#include <sstream>
#include <sys/time.h>
#include <time.h>
#include <cooperative_groups.h>
// #define PRINT_HYBRID
// #define PRINT_TIMEDS
#define L_THREADBLOCK_SIZE 1024
#define M_THREADBLOCK_SIZE 512
#define S_THREADBLOCK_SIZE 256
namespace cg = cooperative_groups;
using namespace CuVite;
__device__ GraphWeight f_weight_reduce(
cg::thread_group g, GraphWeight *x, GraphWeight val)
{
int lane = g.thread_rank();
// Each iteration halves the number of active threads
// Each thread adds its partial sum[i] to sum[lane+i]
for (int i = g.size() / 2; i > 0; i /= 2) {
x[lane] = val; g.sync();
val += x[lane+i]; g.sync();
}
return val; // note: only thread 0 will return full sum
}
template<int tile_sz>
__global__
void compute_modularity(
const GraphElem nv,
GraphWeight* localCinfo_degree,
GraphWeight* clusterWeight,
GraphWeight* sumDegree,
GraphWeight* sumWeight
)
{
__shared__ GraphWeight shared_localCinfo_degree[L_THREADBLOCK_SIZE];
__shared__ GraphWeight shared_clusterWeight[L_THREADBLOCK_SIZE];
int ii = threadIdx.x;
GraphElem i = blockIdx.x * blockDim.x + threadIdx.x ;
shared_localCinfo_degree[ii] = 0.0;
shared_clusterWeight[ii] = 0.0;
/// Create cooperative groups
auto thb_g = cg::this_thread_block();
// auto tileIdx = thb_g.thread_rank()/tile_sz;
#if __cuda_arch__ >= 700
auto tile = cg::partition<tile_sz>(cg::this_thread_block());
#else
auto tile = cg::tiled_partition<tile_sz>(cg::this_thread_block());
#endif
auto tileIdx = thb_g.thread_rank()/tile.size();
unsigned ti = tileIdx*tile.size();
if(i < nv) {
shared_localCinfo_degree[ii] = localCinfo_degree[i];
shared_clusterWeight[ii] = clusterWeight[i];
}
shared_localCinfo_degree[ii] *= shared_localCinfo_degree[ii];
thb_g.sync();
GraphWeight block_sum_weight = 0.0;
GraphWeight block_sum_degree = 0.0;
block_sum_weight = f_weight_reduce(thb_g, shared_clusterWeight,
shared_clusterWeight[thb_g.thread_rank()]);
block_sum_degree = f_weight_reduce(thb_g, shared_localCinfo_degree,
shared_localCinfo_degree[thb_g.thread_rank()]);
if(thb_g.thread_rank() == 0) {
#if __cuda_arch__ >= 600
atomicAdd(&sumDegree[0], block_sum_degree);
atomicAdd(&sumWeight[0], block_sum_weight);
#else
#ifndef USE_32_BIT_GRAPH
my_func_atomicAdd(&sumDegree[0], block_sum_degree);
my_func_atomicAdd(&sumWeight[0], block_sum_weight);
#else
atomicAdd(&sumDegree[0], block_sum_degree);
atomicAdd(&sumWeight[0], block_sum_weight);
#endif
#endif
}
}
void gpu_reduce_mod(CommVector &localCinfo, const GraphWeightVector &clusterWeight,
GpuGraph &gpu_graph, GraphWeight le_la_xx[], const GraphElem nv) {
GraphWeight* dev_clusterWeight = gpu_graph.get_clusterWeight();
GraphWeight* dev_localCinfo_degree = gpu_graph.get_ModlocalCinfo_degree();
gpu_graph.cpyVecTodev(clusterWeight, dev_clusterWeight);
GraphWeight* temp_ModlocalCinfo_degree = gpu_graph.getPinned_ModlocalCinfo_degree();
GraphWeight* dev_sumDegree = gpu_graph.get_sum_degree();
GraphWeight* dev_sumWeight = gpu_graph.get_sum_weight();
CUDA_SAFE(cudaMemset(dev_sumDegree, 0, sizeof(GraphWeight)));
CUDA_SAFE(cudaMemset(dev_sumWeight, 0, sizeof(GraphWeight)));
#pragma omp parallel default(none), \
shared(localCinfo, temp_ModlocalCinfo_degree)
#pragma omp for schedule(guided)
for(int ii=0; ii<localCinfo.size(); ii++) {
temp_ModlocalCinfo_degree[ii] = localCinfo[ii].degree;
}
gpu_graph.cpyArrayTodev(temp_ModlocalCinfo_degree,
dev_localCinfo_degree, localCinfo.size());
dim3 numBlocks01( (nv-1) / M_THREADBLOCK_SIZE + 1);
dim3 Block_dim01(M_THREADBLOCK_SIZE);
compute_modularity<PHY_WRP_SZ><<<numBlocks01,Block_dim01>>>(
nv,
dev_localCinfo_degree,
dev_clusterWeight,
dev_sumDegree,
dev_sumWeight);
CUDA_SAFE(cudaMemcpy(&le_la_xx[0], dev_sumWeight,
sizeof(GraphWeight), cudaMemcpyDeviceToHost));
CUDA_SAFE(cudaMemcpy(&le_la_xx[1], dev_sumDegree,
sizeof(GraphWeight), cudaMemcpyDeviceToHost));
}