Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/scorecard-scanner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ jobs:
- if: github.event.inputs.debug == true || runner.debug == true
name: Upload results as artifacts to the workflow Summary page
# yamllint disable rule:line-length
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
with:
name: Scorecard SARIF file
path: scorecard-results.sarif
Expand Down
52 changes: 26 additions & 26 deletions lib/simulator_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -344,13 +344,13 @@ class SimulatorCUDA final {

unsigned k = 5 + G;
unsigned n = num_qubits > k ? num_qubits - k : 0;
uint64_t size = uint64_t{1} << n;
unsigned size = unsigned{1} << n;
unsigned threads = 64U;
uint64_t blocks = std::max(uint64_t{1}, size / 2);
unsigned blocks = std::max(1U, size / 2);

IndicesH<G> d_i(d_ws);

ApplyGateH_Kernel<G><<<CreateGrid(blocks), threads>>>(
ApplyGateH_Kernel<G><<<blocks, threads>>>(
(fp_type*) d_ws, d_i.xss, d_i.ms, state.get());
}

Expand All @@ -368,13 +368,13 @@ class SimulatorCUDA final {

unsigned k = 5 + num_effective_qs;
unsigned n = num_qubits > k ? num_qubits - k : 0;
uint64_t size = uint64_t{1} << n;
unsigned size = unsigned{1} << n;
unsigned threads = 32;
uint64_t blocks = size;
unsigned blocks = size;

IndicesL<G> d_i(d_ws);

ApplyGateL_Kernel<G><<<CreateGrid(blocks), threads>>>(
ApplyGateL_Kernel<G><<<blocks, threads>>>(
(fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
1 << num_effective_qs, state.get());
}
Expand All @@ -401,13 +401,13 @@ class SimulatorCUDA final {

unsigned k = 5 + G + cqs.size();
unsigned n = num_qubits > k ? num_qubits - k : 0;
uint64_t size = uint64_t{1} << n;
unsigned size = unsigned{1} << n;
unsigned threads = 64U;
uint64_t blocks = std::max(uint64_t{1}, size / 2);
unsigned blocks = std::max(1U, size / 2);

IndicesH<G> d_i(d_ws);

ApplyControlledGateH_Kernel<G><<<CreateGrid(blocks), threads>>>(
ApplyControlledGateH_Kernel<G><<<blocks, threads>>>(
(fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get());
}

Expand All @@ -426,13 +426,13 @@ class SimulatorCUDA final {

unsigned k = 5 + G + cqs.size();
unsigned n = num_qubits > k ? num_qubits - k : 0;
uint64_t size = uint64_t{1} << n;
unsigned size = unsigned{1} << n;
unsigned threads = 32;
uint64_t blocks = size;
unsigned blocks = size;

IndicesL<G> d_i(d_ws);

ApplyControlledGateLH_Kernel<G><<<CreateGrid(blocks), threads>>>(
ApplyControlledGateLH_Kernel<G><<<blocks, threads>>>(
(fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get());
}
Expand All @@ -452,13 +452,13 @@ class SimulatorCUDA final {

unsigned k = 5 + G + cqs.size();
unsigned n = num_qubits > k ? num_qubits - k : 0;
uint64_t size = uint64_t{1} << n;
unsigned size = unsigned{1} << n;
unsigned threads = 32;
uint64_t blocks = size;
unsigned blocks = size;

IndicesLC<G> d_i(d_ws);

ApplyControlledGateL_Kernel<G><<<CreateGrid(blocks), threads>>>(
ApplyControlledGateL_Kernel<G><<<blocks, threads>>>(
(fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis,
d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs,
1 << (5 - d.remaining_low_cqs), state.get());
Expand All @@ -479,11 +479,11 @@ class SimulatorCUDA final {

unsigned k = 5 + G;
unsigned n = num_qubits > k ? num_qubits - k : 0;
uint64_t size = uint64_t{1} << n;
unsigned size = unsigned{1} << n;

unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U);
unsigned threads = 64U;
uint64_t blocks = std::max(uint64_t{1}, (size / 2) >> s);
unsigned blocks = std::max(1U, (size / 2) >> s);
unsigned num_iterations_per_block = 1 << s;

constexpr unsigned m = 16;
Expand All @@ -493,7 +493,7 @@ class SimulatorCUDA final {

IndicesH<G> d_i(d_ws);

ExpectationValueH_Kernel<G><<<CreateGrid(blocks), threads>>>(
ExpectationValueH_Kernel<G><<<blocks, threads>>>(
(fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block,
state.get(), Plus<double>(), d_res1);

Expand All @@ -517,11 +517,11 @@ class SimulatorCUDA final {

unsigned k = 5 + num_effective_qs;
unsigned n = num_qubits > k ? num_qubits - k : 0;
uint64_t size = uint64_t{1} << n;
unsigned size = unsigned{1} << n;

unsigned s = std::min(n >= 13 ? n - 13 : 0, 5U);
unsigned threads = 32;
uint64_t blocks = size >> s;
unsigned blocks = size >> s;
unsigned num_iterations_per_block = 1 << s;

constexpr unsigned m = 16;
Expand All @@ -531,7 +531,7 @@ class SimulatorCUDA final {

IndicesL<G> d_i(d_ws);

ExpectationValueL_Kernel<G><<<CreateGrid(blocks), threads>>>(
ExpectationValueL_Kernel<G><<<blocks, threads>>>(
(fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
num_iterations_per_block, state.get(), Plus<double>(), d_res1);

Expand All @@ -542,18 +542,18 @@ class SimulatorCUDA final {

template <unsigned m>
std::complex<double> ExpectationValueReduceFinal(
uint64_t blocks, double mul,
unsigned blocks, double mul,
const Complex* d_res1, Complex* d_res2) const {
Complex res2[m];

if (blocks <= 16) {
ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex),
cudaMemcpyDeviceToHost));
} else {
unsigned threads2 = std::min(uint64_t{1024}, blocks);
uint64_t blocks2 = std::min(uint64_t{m}, blocks / threads2);
unsigned threads2 = std::min(1024U, blocks);
unsigned blocks2 = std::min(m, blocks / threads2);

unsigned dblocks = std::max(uint64_t{1}, blocks / (blocks2 * threads2));
unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2));
unsigned bytes = threads2 * sizeof(Complex);

Reduce2Kernel<Complex><<<blocks2, threads2, bytes>>>(
Expand All @@ -568,7 +568,7 @@ class SimulatorCUDA final {
double re = 0;
double im = 0;

for (uint64_t i = 0; i < blocks; ++i) {
for (unsigned i = 0; i < blocks; ++i) {
re += res2[i].re;
im += res2[i].im;
}
Expand Down
40 changes: 13 additions & 27 deletions lib/simulator_cuda_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@
#ifdef __NVCC__
#include <cuda.h>
#include <cuda_runtime.h>

#include "util_cuda.h"
#elif __HIP__
#include <hip/hip_runtime.h>
#include "cuda2hip.h"
#endif

#include "util_cuda.h"

namespace qsim {

template <unsigned G, typename fp_type, typename idx_type>
Expand All @@ -33,8 +33,6 @@ __global__ void ApplyGateH_Kernel(
const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) {
// blockDim.x must be equal to 64.

uint64_t blockId = GetBlockId();

static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");

constexpr unsigned gsize = 1 << G;
Expand Down Expand Up @@ -63,7 +61,7 @@ __global__ void ApplyGateH_Kernel(

__syncthreads();

idx_type i = (64 * idx_type{blockId} + threadIdx.x) & 0xffffffffffe0;
idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0;
idx_type ii = i & mss[0];
for (unsigned j = 1; j <= G; ++j) {
i *= 2;
Expand Down Expand Up @@ -117,8 +115,6 @@ __global__ void ApplyGateL_Kernel(
fp_type* __restrict__ rstate) {
// blockDim.x must be equal to 32.

uint64_t blockId = GetBlockId();

static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");

constexpr unsigned gsize = 1 << G;
Expand All @@ -141,7 +137,7 @@ __global__ void ApplyGateL_Kernel(
}
}

idx_type i = 32 * idx_type{blockId};
idx_type i = 32 * idx_type{blockIdx.x};
idx_type ii = i & mss[0];
for (unsigned j = 1; j <= G; ++j) {
i *= 2;
Expand Down Expand Up @@ -208,8 +204,6 @@ __global__ void ApplyControlledGateH_Kernel(
fp_type* __restrict__ rstate) {
// blockDim.x must be equal to 64.

uint64_t blockId = GetBlockId();

static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");

constexpr unsigned gsize = 1 << G;
Expand Down Expand Up @@ -238,7 +232,7 @@ __global__ void ApplyControlledGateH_Kernel(

__syncthreads();

idx_type i = (64 * idx_type{blockId} + threadIdx.x) & 0xffffffffffe0;
idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0;
idx_type ii = i & mss[0];
for (unsigned j = 1; j < num_mss; ++j) {
i *= 2;
Expand Down Expand Up @@ -294,8 +288,6 @@ __global__ void ApplyControlledGateLH_Kernel(
unsigned esize, fp_type* __restrict__ rstate) {
// blockDim.x must be equal to 32.

uint64_t blockId = GetBlockId();

static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");

constexpr unsigned gsize = 1 << G;
Expand All @@ -308,7 +300,7 @@ __global__ void ApplyControlledGateLH_Kernel(
__shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
__shared__ fp_type v[2 * gsize * rows];

idx_type i = 32 * idx_type{blockId};
idx_type i = 32 * idx_type{blockIdx.x};
idx_type ii = i & mss[0];
for (unsigned j = 1; j < num_mss; ++j) {
i *= 2;
Expand Down Expand Up @@ -389,8 +381,6 @@ __global__ void ApplyControlledGateL_Kernel(
fp_type* __restrict__ rstate) {
// blockDim.x must be equal to 32.

uint64_t blockId = GetBlockId();

static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");

constexpr unsigned gsize = 1 << G;
Expand All @@ -403,7 +393,7 @@ __global__ void ApplyControlledGateL_Kernel(
__shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
__shared__ fp_type v[2 * gsize * rows];

idx_type i = 32 * idx_type{blockId};
idx_type i = 32 * idx_type{blockIdx.x};
idx_type ii = i & mss[0];
for (unsigned j = 1; j < num_mss; ++j) {
i *= 2;
Expand Down Expand Up @@ -487,8 +477,6 @@ __global__ void ExpectationValueH_Kernel(
const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) {
// blockDim.x must be equal to 64.

uint64_t blockId = GetBlockId();

static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");

constexpr unsigned gsize = 1 << G;
Expand Down Expand Up @@ -520,7 +508,7 @@ __global__ void ExpectationValueH_Kernel(
double im = 0;

for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) {
idx_type b = num_iterations_per_block * idx_type{blockId} + iter;
idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter;

idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0;
idx_type ii = i & mss[0];
Expand Down Expand Up @@ -585,8 +573,8 @@ __global__ void ExpectationValueH_Kernel(
__syncthreads();

if (threadIdx.x == 0) {
result[blockId].re = partial2[0].re + partial2[1].re;
result[blockId].im = partial2[0].im + partial2[1].im;
result[blockIdx.x].re = partial2[0].re + partial2[1].re;
result[blockIdx.x].im = partial2[0].im + partial2[1].im;
}
}

Expand All @@ -599,8 +587,6 @@ __global__ void ExpectationValueL_Kernel(
const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) {
// blockDim.x must be equal to 32.

uint64_t blockId = GetBlockId();

static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");

constexpr unsigned gsize = 1 << G;
Expand All @@ -626,7 +612,7 @@ __global__ void ExpectationValueL_Kernel(
double im = 0;

for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) {
idx_type i = 32 * (num_iterations_per_block * idx_type{blockId} + iter);
idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter);
idx_type ii = i & mss[0];
for (unsigned j = 1; j <= G; ++j) {
i *= 2;
Expand Down Expand Up @@ -687,8 +673,8 @@ __global__ void ExpectationValueL_Kernel(
auto val = WarpReduce(partial[threadIdx.x], op);

if (threadIdx.x == 0) {
result[blockId].re = val.re;
result[blockId].im = val.im;
result[blockIdx.x].re = val.re;
result[blockIdx.x].im = val.im;
}
}

Expand Down
Loading
Loading