-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmatmul.cpp
More file actions
70 lines (64 loc) · 2.67 KB
/
matmul.cpp
File metadata and controls
70 lines (64 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <Python.h>
#include<iostream>
#include<omp.h>
namespace py = pybind11;
// py:array_t is to work with a Numpy array
py::array_t<int32_t> matmul(py::array_t<int32_t> A, py::array_t<int32_t> B, std::size_t chunk) {
// I must work with a buffer_object containing all the information of the array
py::buffer_info A_buf= A.request();
py::buffer_info B_buf = B.request();
std::size_t cols = A_buf.shape[1];
// I create the result array
py::array_t<int32_t> res = py::array_t<int32_t>(chunk*cols);
py::buffer_info res_buf = res.request();
// I get the point32_ters to the data
int32_t *A_vals = (int32_t*) A_buf.ptr, *B_vals = (int32_t*) B_buf.ptr, *res_vals = (int32_t*) res_buf.ptr;
for (std::size_t i = 0; i < chunk; i++) {
for (std::size_t j = 0; j < cols; j++) {
res_vals[i*cols + j] = 0;
for (std::size_t k = 0; k < cols; k++) {
res_vals[i*cols + j] += A_vals[i*cols+ k] * B_vals[k*cols + j];
}
}
}
// I resize the result array to fit the correct shape
res.resize({chunk, cols});
return res;
}
// py:array_t is to work with a Numpy array
py::array_t<int32_t> matmul_omp(py::array_t<int32_t> A, py::array_t<int32_t> B, std::size_t chunk) {
// I must work with a buffer_object containing all the information of the array
py::buffer_info A_buf= A.request();
py::buffer_info B_buf = B.request();
std::size_t cols = A_buf.shape[1];
// I create the result array
py::array_t<int32_t> res = py::array_t<int32_t>(chunk*cols);
py::buffer_info res_buf = res.request();
// I get the point32_ters to the data
int32_t *A_vals = (int32_t*) A_buf.ptr, *B_vals = (int32_t*) B_buf.ptr, *res_vals = (int32_t*) res_buf.ptr;
#pragma omp parallel
{
#pragma omp single nowait
std::cerr << "Number of OMP threads: " << omp_get_num_threads() << "\n";
#pragma omp for schedule(static)
for (std::size_t i = 0; i < chunk; i++) {
for (std::size_t j = 0; j < cols; j++) {
res_vals[i*cols + j] = 0;
for (std::size_t k = 0; k < cols; k++) {
res_vals[i*cols + j] += A_vals[i*cols+ k] * B_vals[k*cols + j];
}
}
}
}
// I resize the result array to fit the correct shape
res.resize({chunk, cols});
return res;
}
PYBIND11_MODULE(matmul, m) {
m.doc() = "pybind11 matrix multiplication plugin";
m.def("matmul", &matmul, "Multiply 2 square matrix NxN");
m.def("matmul_omp", &matmul_omp, "Multiply 2 square matrix NxN with OpenMP parallelization");
}