From 003de6b30f26000c0249de1f2b1d4c270fdb3c36 Mon Sep 17 00:00:00 2001 From: mhucka <1450019+mhucka@users.noreply.github.com> Date: Thu, 26 Mar 2026 04:16:26 +0000 Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=A7=AA=20Add=20error=20test=20for=20a?= =?UTF-8?q?dd=5Fop=5Fto=5Fcircuit=20unsupported=20gate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added a test case for the ValueError raised when an unsupported gate is passed to add_op_to_circuit. Also improved the library code by adding an explicit check for unsupported gates early in the function. --- qsimcirq/qsim_circuit.py | 2 ++ qsimcirq_tests/qsimcirq_test.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/qsimcirq/qsim_circuit.py b/qsimcirq/qsim_circuit.py index a5b4ae277..cd4c62cde 100644 --- a/qsimcirq/qsim_circuit.py +++ b/qsimcirq/qsim_circuit.py @@ -271,6 +271,8 @@ def add_op_to_circuit( """Adds an operation to a noisy or noiseless circuit.""" qsim_gate = qsim_op.gate gate_kind = _cirq_gate_kind(qsim_gate) + if gate_kind is None: + raise ValueError("{!r} is not a supported gate.".format(qsim_gate)) qubits = [qubit_to_index_dict[q] for q in qsim_op.qubits] qsim_qubits = qubits diff --git a/qsimcirq_tests/qsimcirq_test.py b/qsimcirq_tests/qsimcirq_test.py index 0b5cdbc8e..cd3f1ce3a 100644 --- a/qsimcirq_tests/qsimcirq_test.py +++ b/qsimcirq_tests/qsimcirq_test.py @@ -2196,3 +2196,23 @@ def test_1d_representation(): want = np.array([0.0 - 0.5j, 0.0 + 0.5j, 0.0 - 0.5j, 0.0 + 0.5j]) _, res, _ = qsim_sim.simulate_into_1d_array(c) np.testing.assert_allclose(res, np.array(want, dtype=np.complex64)) + + +def test_add_op_to_circuit_unsupported_gate(): + class UnsupportedGate(cirq.Gate): + def _num_qubits_(self) -> int: + return 1 + + def _unitary_(self): + return np.eye(2) + + def __repr__(self): + return "UnsupportedGate()" + + q0 = cirq.LineQubit(0) + op = UnsupportedGate().on(q0) + circuit = qsimcirq.qsim.Circuit() + qubit_to_index = {q0: 0} + + 
with pytest.raises(ValueError, match="UnsupportedGate\(\) is not a supported gate."): + qsimcirq.add_op_to_circuit(op, 0, qubit_to_index, circuit) From 079a43780f08372ee5d98c59234dbf5542fbc625 Mon Sep 17 00:00:00 2001 From: mhucka <1450019+mhucka@users.noreply.github.com> Date: Thu, 26 Mar 2026 04:23:43 +0000 Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=A7=AA=20[testing=20improvement]=20Ad?= =?UTF-8?q?d=20error=20test=20for=20add=5Fop=5Fto=5Fcircuit=20unsupported?= =?UTF-8?q?=20gate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added a test case for the ValueError raised when an unsupported gate is passed to add_op_to_circuit. Refactored the ValueError message to use an f-string as requested. Improved the library code by adding an explicit check for unsupported gates early in the function. Corrected formatting using black to comply with project guidelines. --- .github/workflows/scorecard-scanner.yaml | 2 +- lib/simulator_cuda.h | 52 +++++++++--------- lib/simulator_cuda_kernels.h | 40 +++++--------- lib/statespace_cuda.h | 68 ++++++++++++------------ lib/statespace_cuda_kernels.h | 30 +++++------ lib/util_cuda.h | 19 ------- lib/vectorspace_cuda.h | 5 +- pyproject.toml | 21 +++----- qsimcirq/qsim_circuit.py | 2 +- qsimcirq_tests/qsimcirq_test.py | 4 +- 10 files changed, 103 insertions(+), 140 deletions(-) diff --git a/.github/workflows/scorecard-scanner.yaml b/.github/workflows/scorecard-scanner.yaml index b8deea7c4..3f40dd45e 100644 --- a/.github/workflows/scorecard-scanner.yaml +++ b/.github/workflows/scorecard-scanner.yaml @@ -80,7 +80,7 @@ jobs: - if: github.event.inputs.debug == true || runner.debug == true name: Upload results as artifacts to the workflow Summary page # yamllint disable rule:line-length - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 with: name: Scorecard SARIF file path: 
scorecard-results.sarif diff --git a/lib/simulator_cuda.h b/lib/simulator_cuda.h index 2a5e651e4..5743bea8b 100644 --- a/lib/simulator_cuda.h +++ b/lib/simulator_cuda.h @@ -344,13 +344,13 @@ class SimulatorCUDA final { unsigned k = 5 + G; unsigned n = num_qubits > k ? num_qubits - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned size = unsigned{1} << n; unsigned threads = 64U; - uint64_t blocks = std::max(uint64_t{1}, size / 2); + unsigned blocks = std::max(1U, size / 2); IndicesH d_i(d_ws); - ApplyGateH_Kernel<<>>( + ApplyGateH_Kernel<<>>( (fp_type*) d_ws, d_i.xss, d_i.ms, state.get()); } @@ -368,13 +368,13 @@ class SimulatorCUDA final { unsigned k = 5 + num_effective_qs; unsigned n = num_qubits > k ? num_qubits - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned size = unsigned{1} << n; unsigned threads = 32; - uint64_t blocks = size; + unsigned blocks = size; IndicesL d_i(d_ws); - ApplyGateL_Kernel<<>>( + ApplyGateL_Kernel<<>>( (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, 1 << num_effective_qs, state.get()); } @@ -401,13 +401,13 @@ class SimulatorCUDA final { unsigned k = 5 + G + cqs.size(); unsigned n = num_qubits > k ? num_qubits - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned size = unsigned{1} << n; unsigned threads = 64U; - uint64_t blocks = std::max(uint64_t{1}, size / 2); + unsigned blocks = std::max(1U, size / 2); IndicesH d_i(d_ws); - ApplyControlledGateH_Kernel<<>>( + ApplyControlledGateH_Kernel<<>>( (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); } @@ -426,13 +426,13 @@ class SimulatorCUDA final { unsigned k = 5 + G + cqs.size(); unsigned n = num_qubits > k ? 
num_qubits - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned size = unsigned{1} << n; unsigned threads = 32; - uint64_t blocks = size; + unsigned blocks = size; IndicesL d_i(d_ws); - ApplyControlledGateLH_Kernel<<>>( + ApplyControlledGateLH_Kernel<<>>( (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get()); } @@ -452,13 +452,13 @@ class SimulatorCUDA final { unsigned k = 5 + G + cqs.size(); unsigned n = num_qubits > k ? num_qubits - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned size = unsigned{1} << n; unsigned threads = 32; - uint64_t blocks = size; + unsigned blocks = size; IndicesLC d_i(d_ws); - ApplyControlledGateL_Kernel<<>>( + ApplyControlledGateL_Kernel<<>>( (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, 1 << (5 - d.remaining_low_cqs), state.get()); @@ -479,11 +479,11 @@ class SimulatorCUDA final { unsigned k = 5 + G; unsigned n = num_qubits > k ? num_qubits - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned size = unsigned{1} << n; unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U); unsigned threads = 64U; - uint64_t blocks = std::max(uint64_t{1}, (size / 2) >> s); + unsigned blocks = std::max(1U, (size / 2) >> s); unsigned num_iterations_per_block = 1 << s; constexpr unsigned m = 16; @@ -493,7 +493,7 @@ class SimulatorCUDA final { IndicesH d_i(d_ws); - ExpectationValueH_Kernel<<>>( + ExpectationValueH_Kernel<<>>( (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block, state.get(), Plus(), d_res1); @@ -517,11 +517,11 @@ class SimulatorCUDA final { unsigned k = 5 + num_effective_qs; unsigned n = num_qubits > k ? num_qubits - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned size = unsigned{1} << n; unsigned s = std::min(n >= 13 ? 
n - 13 : 0, 5U); unsigned threads = 32; - uint64_t blocks = size >> s; + unsigned blocks = size >> s; unsigned num_iterations_per_block = 1 << s; constexpr unsigned m = 16; @@ -531,7 +531,7 @@ class SimulatorCUDA final { IndicesL d_i(d_ws); - ExpectationValueL_Kernel<<>>( + ExpectationValueL_Kernel<<>>( (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, num_iterations_per_block, state.get(), Plus(), d_res1); @@ -542,7 +542,7 @@ class SimulatorCUDA final { template std::complex ExpectationValueReduceFinal( - uint64_t blocks, double mul, + unsigned blocks, double mul, const Complex* d_res1, Complex* d_res2) const { Complex res2[m]; @@ -550,10 +550,10 @@ class SimulatorCUDA final { ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex), cudaMemcpyDeviceToHost)); } else { - unsigned threads2 = std::min(uint64_t{1024}, blocks); - uint64_t blocks2 = std::min(uint64_t{m}, blocks / threads2); + unsigned threads2 = std::min(1024U, blocks); + unsigned blocks2 = std::min(m, blocks / threads2); - unsigned dblocks = std::max(uint64_t{1}, blocks / (blocks2 * threads2)); + unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2)); unsigned bytes = threads2 * sizeof(Complex); Reduce2Kernel<<>>( @@ -568,7 +568,7 @@ class SimulatorCUDA final { double re = 0; double im = 0; - for (uint64_t i = 0; i < blocks; ++i) { + for (unsigned i = 0; i < blocks; ++i) { re += res2[i].re; im += res2[i].im; } diff --git a/lib/simulator_cuda_kernels.h b/lib/simulator_cuda_kernels.h index d39661c56..e21a9d62e 100644 --- a/lib/simulator_cuda_kernels.h +++ b/lib/simulator_cuda_kernels.h @@ -18,13 +18,13 @@ #ifdef __NVCC__ #include #include + + #include "util_cuda.h" #elif __HIP__ #include #include "cuda2hip.h" #endif -#include "util_cuda.h" - namespace qsim { template @@ -33,8 +33,6 @@ __global__ void ApplyGateH_Kernel( const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) { // blockDim.x must be equal to 64. 
- uint64_t blockId = GetBlockId(); - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -63,7 +61,7 @@ __global__ void ApplyGateH_Kernel( __syncthreads(); - idx_type i = (64 * idx_type{blockId} + threadIdx.x) & 0xffffffffffe0; + idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; idx_type ii = i & mss[0]; for (unsigned j = 1; j <= G; ++j) { i *= 2; @@ -117,8 +115,6 @@ __global__ void ApplyGateL_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. - uint64_t blockId = GetBlockId(); - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -141,7 +137,7 @@ __global__ void ApplyGateL_Kernel( } } - idx_type i = 32 * idx_type{blockId}; + idx_type i = 32 * idx_type{blockIdx.x}; idx_type ii = i & mss[0]; for (unsigned j = 1; j <= G; ++j) { i *= 2; @@ -208,8 +204,6 @@ __global__ void ApplyControlledGateH_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 64. - uint64_t blockId = GetBlockId(); - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -238,7 +232,7 @@ __global__ void ApplyControlledGateH_Kernel( __syncthreads(); - idx_type i = (64 * idx_type{blockId} + threadIdx.x) & 0xffffffffffe0; + idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; idx_type ii = i & mss[0]; for (unsigned j = 1; j < num_mss; ++j) { i *= 2; @@ -294,8 +288,6 @@ __global__ void ApplyControlledGateLH_Kernel( unsigned esize, fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. 
- uint64_t blockId = GetBlockId(); - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -308,7 +300,7 @@ __global__ void ApplyControlledGateLH_Kernel( __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; __shared__ fp_type v[2 * gsize * rows]; - idx_type i = 32 * idx_type{blockId}; + idx_type i = 32 * idx_type{blockIdx.x}; idx_type ii = i & mss[0]; for (unsigned j = 1; j < num_mss; ++j) { i *= 2; @@ -389,8 +381,6 @@ __global__ void ApplyControlledGateL_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. - uint64_t blockId = GetBlockId(); - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -403,7 +393,7 @@ __global__ void ApplyControlledGateL_Kernel( __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; __shared__ fp_type v[2 * gsize * rows]; - idx_type i = 32 * idx_type{blockId}; + idx_type i = 32 * idx_type{blockIdx.x}; idx_type ii = i & mss[0]; for (unsigned j = 1; j < num_mss; ++j) { i *= 2; @@ -487,8 +477,6 @@ __global__ void ExpectationValueH_Kernel( const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { // blockDim.x must be equal to 64. 
- uint64_t blockId = GetBlockId(); - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -520,7 +508,7 @@ __global__ void ExpectationValueH_Kernel( double im = 0; for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) { - idx_type b = num_iterations_per_block * idx_type{blockId} + iter; + idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter; idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0; idx_type ii = i & mss[0]; @@ -585,8 +573,8 @@ __global__ void ExpectationValueH_Kernel( __syncthreads(); if (threadIdx.x == 0) { - result[blockId].re = partial2[0].re + partial2[1].re; - result[blockId].im = partial2[0].im + partial2[1].im; + result[blockIdx.x].re = partial2[0].re + partial2[1].re; + result[blockIdx.x].im = partial2[0].im + partial2[1].im; } } @@ -599,8 +587,6 @@ __global__ void ExpectationValueL_Kernel( const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { // blockDim.x must be equal to 32. 
- uint64_t blockId = GetBlockId(); - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -626,7 +612,7 @@ __global__ void ExpectationValueL_Kernel( double im = 0; for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) { - idx_type i = 32 * (num_iterations_per_block * idx_type{blockId} + iter); + idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter); idx_type ii = i & mss[0]; for (unsigned j = 1; j <= G; ++j) { i *= 2; @@ -687,8 +673,8 @@ __global__ void ExpectationValueL_Kernel( auto val = WarpReduce(partial[threadIdx.x], op); if (threadIdx.x == 0) { - result[blockId].re = val.re; - result[blockId].im = val.im; + result[blockIdx.x].re = val.re; + result[blockIdx.x].im = val.im; } } diff --git a/lib/statespace_cuda.h b/lib/statespace_cuda.h index e0f3cfe03..660db074c 100644 --- a/lib/statespace_cuda.h +++ b/lib/statespace_cuda.h @@ -49,7 +49,7 @@ class StateSpaceCUDA : struct Grid { unsigned threads; unsigned dblocks; - dim3 blocks; + unsigned blocks; }; public: @@ -86,10 +86,10 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, uint64_t{param_.num_threads}); - uint64_t blocks = size / threads; + unsigned blocks = size / threads; unsigned bytes = 2 * threads * sizeof(fp_type); - InternalToNormalOrderKernel<<>>(state.get()); + InternalToNormalOrderKernel<<>>(state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -98,10 +98,10 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, uint64_t{param_.num_threads}); - uint64_t blocks = size / threads; + unsigned blocks = size / threads; unsigned bytes = 2 * threads * sizeof(fp_type); - NormalToInternalOrderKernel<<>>(state.get()); + NormalToInternalOrderKernel<<>>(state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -117,11 +117,11 @@ class 
StateSpaceCUDA : uint64_t hsize = uint64_t{1} << state.num_qubits(); unsigned threads = std::min(size, uint64_t{param_.num_threads}); - uint64_t blocks = size / threads; + unsigned blocks = size / threads; fp_type v = double{1} / std::sqrt(hsize); - SetStateUniformKernel<<>>(v, hsize, state.get()); + SetStateUniformKernel<<>>(v, hsize, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -178,9 +178,9 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, uint64_t{param_.num_threads}); - uint64_t blocks = size / threads; + unsigned blocks = size / threads; - BulkSetAmplKernel<<>>( + BulkSetAmplKernel<<>>( mask, bits, re, im, exclude, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -195,9 +195,9 @@ class StateSpaceCUDA : uint64_t size = MinSize(src.num_qubits()); unsigned threads = std::min(size, uint64_t{param_.num_threads}); - uint64_t blocks = size / threads; + unsigned blocks = size / threads; - AddKernel<<>>(src.get(), dest.get()); + AddKernel<<>>(src.get(), dest.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -209,9 +209,9 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()); unsigned threads = std::min(size, uint64_t{param_.num_threads}); - uint64_t blocks = size / threads; + unsigned blocks = size / threads; - MultiplyKernel<<>>(a, state.get()); + MultiplyKernel<<>>(a, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -248,17 +248,15 @@ class StateSpaceCUDA : if (num_samples > 0) { Grid g1 = GetGrid1(MinSize(state.num_qubits()) / 2); unsigned bytes = g1.threads * sizeof(double); - uint64_t num_blocks1 = - MinSize(state.num_qubits()) / 2 / (g1.threads * g1.dblocks); - unsigned scratch_size = (num_blocks1 + 1) * sizeof(double) + unsigned scratch_size = (g1.blocks + 1) * sizeof(double) + num_samples * (sizeof(uint64_t) + 
sizeof(DistrRealType)); void* scratch = AllocScratch(scratch_size); double* d_res2 = (double*) scratch; double* d_res1 = d_res2 + 1; - uint64_t* d_bitstrings = (uint64_t*) (d_res1 + num_blocks1); + uint64_t* d_bitstrings = (uint64_t*) (d_res1 + g1.blocks); DistrRealType* d_rs = (DistrRealType *) (d_bitstrings + num_samples); auto op1 = RealProduct(); @@ -271,17 +269,17 @@ class StateSpaceCUDA : double norm; - if (num_blocks1 == 1) { + if (g1.blocks == 1) { ErrorCheck( cudaMemcpy(&norm, d_res1, sizeof(double), cudaMemcpyDeviceToHost)); } else { - Grid g2 = GetGrid2(num_blocks1); + Grid g2 = GetGrid2(g1.blocks); unsigned bytes = g2.threads * sizeof(double); auto op3 = Plus(); Reduce2Kernel<<>>( - g2.dblocks, num_blocks1, op3, op3, d_res1, d_res2); + g2.dblocks, g1.blocks, op3, op3, d_res1, d_res2); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -296,7 +294,7 @@ class StateSpaceCUDA : num_samples * sizeof(DistrRealType), cudaMemcpyHostToDevice)); - SampleKernel<<<1, g1.threads>>>(num_blocks1, g1.dblocks, num_samples, + SampleKernel<<<1, g1.threads>>>(g1.blocks, g1.dblocks, num_samples, d_rs, d_res1, state.get(), d_bitstrings); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -321,19 +319,17 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, uint64_t{param_.num_threads}); - uint64_t blocks = size / threads; + unsigned blocks = size / threads; - CollapseKernel<<>>(mr.mask, mr.bits, renorm, state.get()); + CollapseKernel<<>>(mr.mask, mr.bits, renorm, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } std::vector PartialNorms(const State& state) const { Grid g = GetGrid1(MinSize(state.num_qubits()) / 2); - uint64_t num_blocks = - MinSize(state.num_qubits()) / 2 / (g.threads * g.dblocks); - unsigned scratch_size = num_blocks * sizeof(double); + unsigned scratch_size = g.blocks * sizeof(double); unsigned bytes = g.threads * 
sizeof(double); double* d_res = (double*) AllocScratch(scratch_size); @@ -346,7 +342,7 @@ class StateSpaceCUDA : ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); - std::vector norms(num_blocks); + std::vector norms(g.blocks); ErrorCheck( cudaMemcpy(norms.data(), d_res, scratch_size, cudaMemcpyDeviceToHost)); @@ -391,18 +387,21 @@ class StateSpaceCUDA : Grid GetGrid1(uint64_t size) const { Grid grid; + grid.threads = std::min(size, uint64_t{param_.num_threads}); grid.dblocks = std::min(size / grid.threads, uint64_t{param_.num_dblocks}); - uint64_t num_blocks = size / (grid.threads * grid.dblocks); - grid.blocks = CreateGrid(num_blocks); + grid.blocks = size / (grid.threads * grid.dblocks); + return grid; } Grid GetGrid2(unsigned size) const { Grid grid; + grid.threads = std::min(param_.num_threads, std::max(32U, size)); grid.dblocks = std::max(1U, size / grid.threads); - grid.blocks = dim3(1, 1, 1); + grid.blocks = 1; + return grid; } @@ -418,9 +417,8 @@ class StateSpaceCUDA : Grid g1 = GetGrid1(size); unsigned bytes = g1.threads * sizeof(FP1); - uint64_t num_blocks1 = size / (g1.threads * g1.dblocks); - FP2* d_res2 = (FP2*) AllocScratch((num_blocks1 + 1) * sizeof(FP2)); + FP2* d_res2 = (FP2*) AllocScratch((g1.blocks + 1) * sizeof(FP2)); FP2* d_res1 = d_res2 + 1; auto op1 = Op(); @@ -440,18 +438,18 @@ class StateSpaceCUDA : FP2 result; - if (num_blocks1 == 1) { + if (g1.blocks == 1) { ErrorCheck( cudaMemcpy(&result, d_res1, sizeof(FP2), cudaMemcpyDeviceToHost)); } else { - Grid g2 = GetGrid2(num_blocks1); + Grid g2 = GetGrid2(g1.blocks); unsigned bytes = g2.threads * sizeof(FP2); auto op2 = Plus(); auto op3 = Plus::type>(); Reduce2Kernel<<>>( - g2.dblocks, num_blocks1, op2, op3, d_res1, d_res2); + g2.dblocks, g1.blocks, op2, op3, d_res1, d_res2); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); diff --git a/lib/statespace_cuda_kernels.h b/lib/statespace_cuda_kernels.h index c762dfbb0..0bc4ba706 100644 --- 
a/lib/statespace_cuda_kernels.h +++ b/lib/statespace_cuda_kernels.h @@ -39,7 +39,7 @@ __device__ __forceinline__ FP1 BlockReduce1( unsigned warp = threadIdx.x / warp_size; unsigned lane = threadIdx.x % warp_size; - uint64_t k0 = 2 * n * GetBlockId() * blockDim.x + 2 * tid - lane; + uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane; uint64_t k1 = k0 + 2 * n * blockDim.x; FP1 r; @@ -88,7 +88,7 @@ __device__ __forceinline__ FP1 BlockReduce1Masked( unsigned warp = threadIdx.x / warp_size; unsigned lane = threadIdx.x % warp_size; - uint64_t k0 = 2 * n * GetBlockId() * blockDim.x + 2 * tid - lane; + uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane; uint64_t k1 = k0 + 2 * n * blockDim.x; FP1 r = 0; @@ -137,7 +137,7 @@ __device__ __forceinline__ FP1 BlockReduce2( FP1* partial1 = (FP1*) shared; unsigned tid = threadIdx.x; - uint64_t k0 = n * GetBlockId() * blockDim.x + tid; + uint64_t k0 = n * blockIdx.x * blockDim.x + tid; uint64_t k1 = k0 + n * blockDim.x; FP1 r = 0; @@ -185,7 +185,7 @@ __global__ void Reduce1Kernel(uint64_t n, Op1 op1, Op2 op2, Op3 op3, FP1 sum = detail::BlockReduce1(n, op1, op2, op3, s1, s2); if (threadIdx.x == 0) { - result[GetBlockId()] = sum; + result[blockIdx.x] = sum; } } @@ -198,7 +198,7 @@ __global__ void Reduce1MaskedKernel(uint64_t n, uint64_t mask, uint64_t bits, detail::BlockReduce1Masked(n, mask, bits, op1, op2, op3, s1, s2); if (threadIdx.x == 0) { - result[GetBlockId()] = sum; + result[blockIdx.x] = sum; } } @@ -209,7 +209,7 @@ __global__ void Reduce2Kernel( FP1 sum = detail::BlockReduce2(n, size, op2, op3, s); if (threadIdx.x == 0) { - result[GetBlockId()] = sum; + result[blockIdx.x] = sum; } } @@ -217,7 +217,7 @@ template __global__ void InternalToNormalOrderKernel(FP* state) { unsigned lane = threadIdx.x % warp_size; unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 2 * GetBlockId() * blockDim.x + l; + uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l; extern __shared__ float shared[]; FP* buf = (FP*) 
shared; @@ -235,7 +235,7 @@ template __global__ void NormalToInternalOrderKernel(FP* state) { unsigned lane = threadIdx.x % warp_size; unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 2 * GetBlockId() * blockDim.x + l; + uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l; extern __shared__ float shared[]; FP* buf = (FP*) shared; @@ -252,7 +252,7 @@ __global__ void NormalToInternalOrderKernel(FP* state) { template __global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { unsigned lane = threadIdx.x % warp_size; - uint64_t k = 2 * (GetBlockId() * blockDim.x + threadIdx.x) - lane; + uint64_t k = 2 * (uint64_t{blockIdx.x} * blockDim.x + threadIdx.x) - lane; state[k] = lane < size ? v : 0; state[k + warp_size] = 0; @@ -260,19 +260,19 @@ __global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { template __global__ void AddKernel(const FP* state1, FP* state2) { - uint64_t k = GetBlockId() * blockDim.x + threadIdx.x; + uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; state2[k] += state1[k]; } template __global__ void MultiplyKernel(FP a, FP* state) { - uint64_t k = GetBlockId() * blockDim.x + threadIdx.x; + uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; state[k] *= a; } template __global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { - uint64_t k1 = GetBlockId() * blockDim.x + threadIdx.x; + uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; if ((k1 & mask) == bits) { @@ -287,7 +287,7 @@ __global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { template __global__ void BulkSetAmplKernel( uint64_t mask, uint64_t bits, FP re, FP im, bool exclude, FP* state) { - uint64_t k1 = GetBlockId() * blockDim.x + threadIdx.x; + uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; bool set = ((k1 & mask) == bits) ^ exclude; @@ -299,7 +299,7 @@ __global__ void 
BulkSetAmplKernel( } template -__global__ void SampleKernel(uint64_t num_blocks, +__global__ void SampleKernel(unsigned num_blocks, uint64_t n, uint64_t num_samples, const FP1* rs, const FP2* ps, const FP3* state, uint64_t *bitstrings) { @@ -308,7 +308,7 @@ __global__ void SampleKernel(uint64_t num_blocks, uint64_t m = 0; double csum = 0; - for (uint64_t block_id = 0; block_id < num_blocks; ++block_id) { + for (unsigned block_id = 0; block_id < num_blocks; ++block_id) { uint64_t km = n * blockDim.x; uint64_t k0 = block_id * km; diff --git a/lib/util_cuda.h b/lib/util_cuda.h index 004ddc11b..b34292753 100644 --- a/lib/util_cuda.h +++ b/lib/util_cuda.h @@ -130,25 +130,6 @@ __device__ __forceinline__ FP1 WarpReduce(FP1 val, Op op) { return val; } -__device__ __forceinline__ uint64_t GetBlockId() { - return (uint64_t{blockIdx.z} * gridDim.y + uint64_t{blockIdx.y}) - * gridDim.x + blockIdx.x; -} - -inline dim3 CreateGrid(uint64_t blocks) { - if (blocks <= 65536) { - return dim3((uint32_t) blocks); - } - uint32_t x = 65536; - uint64_t rem = blocks / x; - if (rem <= 32768) { - return dim3(x, (uint32_t) rem); - } - uint32_t y = 32768; - uint32_t z = (uint32_t) (rem / y); - return dim3(x, y, z); -} - template __device__ __forceinline__ Complex WarpReduce(Complex val, Op op) { for (unsigned i = warp_size / 2; i > 0; i /= 2) { diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h index f1efdd51d..5cfd4e834 100644 --- a/lib/vectorspace_cuda.h +++ b/lib/vectorspace_cuda.h @@ -86,8 +86,9 @@ class VectorSpaceCUDA { static Vector Create(unsigned num_qubits) { fp_type* p; - uint64_t size = sizeof(fp_type) * Impl::MinSize(num_qubits); + auto size = sizeof(fp_type) * Impl::MinSize(num_qubits); auto rc = cudaMalloc(&p, size); + if (rc == cudaSuccess) { return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits}; } else { @@ -168,4 +169,4 @@ class VectorSpaceCUDA { } // namespace qsim -#endif // VECTORSPACE_CUDA_H_ \ No newline at end of file +#endif // VECTORSPACE_CUDA_H_ 
diff --git a/pyproject.toml b/pyproject.toml index 9b1926371..b3d397cbb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,8 +104,8 @@ dev = [ "cibuildwheel", # Linters, formatters, and test utilities. - "black~=26.1.0", - "isort[colors]~=8.0.1", + "black~=25.9.0", + "isort[colors]~=6.0.1", "py-cpuinfo", "pylint~=4.0.2", "pytest", @@ -128,17 +128,12 @@ build = "cp310-* cp311-* cp312-* cp313-* cp314-*" dependency-versions = "latest" enable = ["cpython-prerelease"] environment.PIP_PREFER_BINARY = "1" -before-test = "pip install --group dev" -# import-mode=importlib prevents local source shadowing when importing qsimcirq. -test-command = "pytest --import-mode=importlib -n auto -s -v {package}/qsimcirq_tests/qsimcirq_test.py" - -[[tool.cibuildwheel.overrides]] -# Help increase the chances that pip will find binary wheels for NumPy. -# See https://cibuildwheel.pypa.io/en/stable/faq/#building-with-numpy -select = ["cp314*"] -inherit.environment = "append" -environment.PIP_EXTRA_INDEX_URL = "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/" -environment.PIP_PRERELEASE = "allow" +# Due to package & module name conflict, temporarily move it away to run tests: +before-test = "pip install --group dev && mv {package}/qsimcirq /tmp" +test-command = """ +pytest -n auto -s -v {package}/qsimcirq_tests/qsimcirq_test.py && +mv /tmp/qsimcirq {package} +""" [tool.cibuildwheel.macos] before-build = """ diff --git a/qsimcirq/qsim_circuit.py b/qsimcirq/qsim_circuit.py index cd4c62cde..9d2d4b9ed 100644 --- a/qsimcirq/qsim_circuit.py +++ b/qsimcirq/qsim_circuit.py @@ -272,7 +272,7 @@ def add_op_to_circuit( qsim_gate = qsim_op.gate gate_kind = _cirq_gate_kind(qsim_gate) if gate_kind is None: - raise ValueError("{!r} is not a supported gate.".format(qsim_gate)) + raise ValueError(f"{qsim_gate!r} is not a supported gate.") qubits = [qubit_to_index_dict[q] for q in qsim_op.qubits] qsim_qubits = qubits diff --git a/qsimcirq_tests/qsimcirq_test.py 
b/qsimcirq_tests/qsimcirq_test.py index cd3f1ce3a..4fef48ca9 100644 --- a/qsimcirq_tests/qsimcirq_test.py +++ b/qsimcirq_tests/qsimcirq_test.py @@ -2214,5 +2214,7 @@ def __repr__(self): circuit = qsimcirq.qsim.Circuit() qubit_to_index = {q0: 0} - with pytest.raises(ValueError, match="UnsupportedGate\(\) is not a supported gate."): + with pytest.raises( + ValueError, match=r"UnsupportedGate\(\) is not a supported gate." + ): qsimcirq.add_op_to_circuit(op, 0, qubit_to_index, circuit)