From 003de6b30f26000c0249de1f2b1d4c270fdb3c36 Mon Sep 17 00:00:00 2001 From: mhucka <1450019+mhucka@users.noreply.github.com> Date: Thu, 26 Mar 2026 04:16:26 +0000 Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=A7=AA=20Add=20error=20test=20for=20a?= =?UTF-8?q?dd=5Fop=5Fto=5Fcircuit=20unsupported=20gate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added a test case for the ValueError raised when an unsupported gate is passed to add_op_to_circuit. Also improved the library code by adding an explicit check for unsupported gates early in the function. --- qsimcirq/qsim_circuit.py | 2 ++ qsimcirq_tests/qsimcirq_test.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/qsimcirq/qsim_circuit.py b/qsimcirq/qsim_circuit.py index a5b4ae277..cd4c62cde 100644 --- a/qsimcirq/qsim_circuit.py +++ b/qsimcirq/qsim_circuit.py @@ -271,6 +271,8 @@ def add_op_to_circuit( """Adds an operation to a noisy or noiseless circuit.""" qsim_gate = qsim_op.gate gate_kind = _cirq_gate_kind(qsim_gate) + if gate_kind is None: + raise ValueError("{!r} is not a supported gate.".format(qsim_gate)) qubits = [qubit_to_index_dict[q] for q in qsim_op.qubits] qsim_qubits = qubits diff --git a/qsimcirq_tests/qsimcirq_test.py b/qsimcirq_tests/qsimcirq_test.py index 0b5cdbc8e..cd3f1ce3a 100644 --- a/qsimcirq_tests/qsimcirq_test.py +++ b/qsimcirq_tests/qsimcirq_test.py @@ -2196,3 +2196,23 @@ def test_1d_representation(): want = np.array([0.0 - 0.5j, 0.0 + 0.5j, 0.0 - 0.5j, 0.0 + 0.5j]) _, res, _ = qsim_sim.simulate_into_1d_array(c) np.testing.assert_allclose(res, np.array(want, dtype=np.complex64)) + + +def test_add_op_to_circuit_unsupported_gate(): + class UnsupportedGate(cirq.Gate): + def _num_qubits_(self) -> int: + return 1 + + def _unitary_(self): + return np.eye(2) + + def __repr__(self): + return "UnsupportedGate()" + + q0 = cirq.LineQubit(0) + op = UnsupportedGate().on(q0) + circuit = qsimcirq.qsim.Circuit() + qubit_to_index = {q0: 0} + + 
with pytest.raises(ValueError, match="UnsupportedGate\(\) is not a supported gate."): + qsimcirq.add_op_to_circuit(op, 0, qubit_to_index, circuit) From 079a43780f08372ee5d98c59234dbf5542fbc625 Mon Sep 17 00:00:00 2001 From: mhucka <1450019+mhucka@users.noreply.github.com> Date: Thu, 26 Mar 2026 04:23:43 +0000 Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=A7=AA=20[testing=20improvement]=20Ad?= =?UTF-8?q?d=20error=20test=20for=20add=5Fop=5Fto=5Fcircuit=20unsupported?= =?UTF-8?q?=20gate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added a test case for the ValueError raised when an unsupported gate is passed to add_op_to_circuit. Refactored the ValueError message to use an f-string as requested. Improved the library code by adding an explicit check for unsupported gates early in the function. Corrected formatting using black to comply with project guidelines. --- .github/workflows/scorecard-scanner.yaml | 2 +- lib/simulator_cuda.h | 52 +++++++++--------- lib/simulator_cuda_kernels.h | 40 +++++--------- lib/statespace_cuda.h | 68 ++++++++++++------------ lib/statespace_cuda_kernels.h | 30 +++++------ lib/util_cuda.h | 19 ------- lib/vectorspace_cuda.h | 5 +- pyproject.toml | 21 +++----- qsimcirq/qsim_circuit.py | 2 +- qsimcirq_tests/qsimcirq_test.py | 4 +- 10 files changed, 103 insertions(+), 140 deletions(-) diff --git a/.github/workflows/scorecard-scanner.yaml b/.github/workflows/scorecard-scanner.yaml index b8deea7c4..3f40dd45e 100644 --- a/.github/workflows/scorecard-scanner.yaml +++ b/.github/workflows/scorecard-scanner.yaml @@ -80,7 +80,7 @@ jobs: - if: github.event.inputs.debug == true || runner.debug == true name: Upload results as artifacts to the workflow Summary page # yamllint disable rule:line-length - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 with: name: Scorecard SARIF file path: 
scorecard-results.sarif diff --git a/lib/simulator_cuda.h b/lib/simulator_cuda.h index 2a5e651e4..5743bea8b 100644 --- a/lib/simulator_cuda.h +++ b/lib/simulator_cuda.h @@ -344,13 +344,13 @@ class SimulatorCUDA final { unsigned k = 5 + G; unsigned n = num_qubits > k ? num_qubits - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned size = unsigned{1} << n; unsigned threads = 64U; - uint64_t blocks = std::max(uint64_t{1}, size / 2); + unsigned blocks = std::max(1U, size / 2); IndicesH d_i(d_ws); - ApplyGateH_Kernel<<>>( + ApplyGateH_Kernel<<>>( (fp_type*) d_ws, d_i.xss, d_i.ms, state.get()); } @@ -368,13 +368,13 @@ class SimulatorCUDA final { unsigned k = 5 + num_effective_qs; unsigned n = num_qubits > k ? num_qubits - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned size = unsigned{1} << n; unsigned threads = 32; - uint64_t blocks = size; + unsigned blocks = size; IndicesL d_i(d_ws); - ApplyGateL_Kernel<<>>( + ApplyGateL_Kernel<<>>( (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, 1 << num_effective_qs, state.get()); } @@ -401,13 +401,13 @@ class SimulatorCUDA final { unsigned k = 5 + G + cqs.size(); unsigned n = num_qubits > k ? num_qubits - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned size = unsigned{1} << n; unsigned threads = 64U; - uint64_t blocks = std::max(uint64_t{1}, size / 2); + unsigned blocks = std::max(1U, size / 2); IndicesH d_i(d_ws); - ApplyControlledGateH_Kernel<<>>( + ApplyControlledGateH_Kernel<<>>( (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); } @@ -426,13 +426,13 @@ class SimulatorCUDA final { unsigned k = 5 + G + cqs.size(); unsigned n = num_qubits > k ? 
num_qubits - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned size = unsigned{1} << n; unsigned threads = 32; - uint64_t blocks = size; + unsigned blocks = size; IndicesL d_i(d_ws); - ApplyControlledGateLH_Kernel<<>>( + ApplyControlledGateLH_Kernel<<>>( (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get()); } @@ -452,13 +452,13 @@ class SimulatorCUDA final { unsigned k = 5 + G + cqs.size(); unsigned n = num_qubits > k ? num_qubits - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned size = unsigned{1} << n; unsigned threads = 32; - uint64_t blocks = size; + unsigned blocks = size; IndicesLC d_i(d_ws); - ApplyControlledGateL_Kernel<<>>( + ApplyControlledGateL_Kernel<<>>( (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, 1 << (5 - d.remaining_low_cqs), state.get()); @@ -479,11 +479,11 @@ class SimulatorCUDA final { unsigned k = 5 + G; unsigned n = num_qubits > k ? num_qubits - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned size = unsigned{1} << n; unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U); unsigned threads = 64U; - uint64_t blocks = std::max(uint64_t{1}, (size / 2) >> s); + unsigned blocks = std::max(1U, (size / 2) >> s); unsigned num_iterations_per_block = 1 << s; constexpr unsigned m = 16; @@ -493,7 +493,7 @@ class SimulatorCUDA final { IndicesH d_i(d_ws); - ExpectationValueH_Kernel<<>>( + ExpectationValueH_Kernel<<>>( (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block, state.get(), Plus(), d_res1); @@ -517,11 +517,11 @@ class SimulatorCUDA final { unsigned k = 5 + num_effective_qs; unsigned n = num_qubits > k ? num_qubits - k : 0; - uint64_t size = uint64_t{1} << n; + unsigned size = unsigned{1} << n; unsigned s = std::min(n >= 13 ? 
n - 13 : 0, 5U); unsigned threads = 32; - uint64_t blocks = size >> s; + unsigned blocks = size >> s; unsigned num_iterations_per_block = 1 << s; constexpr unsigned m = 16; @@ -531,7 +531,7 @@ class SimulatorCUDA final { IndicesL d_i(d_ws); - ExpectationValueL_Kernel<<>>( + ExpectationValueL_Kernel<<>>( (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, num_iterations_per_block, state.get(), Plus(), d_res1); @@ -542,7 +542,7 @@ class SimulatorCUDA final { template std::complex ExpectationValueReduceFinal( - uint64_t blocks, double mul, + unsigned blocks, double mul, const Complex* d_res1, Complex* d_res2) const { Complex res2[m]; @@ -550,10 +550,10 @@ class SimulatorCUDA final { ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex), cudaMemcpyDeviceToHost)); } else { - unsigned threads2 = std::min(uint64_t{1024}, blocks); - uint64_t blocks2 = std::min(uint64_t{m}, blocks / threads2); + unsigned threads2 = std::min(1024U, blocks); + unsigned blocks2 = std::min(m, blocks / threads2); - unsigned dblocks = std::max(uint64_t{1}, blocks / (blocks2 * threads2)); + unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2)); unsigned bytes = threads2 * sizeof(Complex); Reduce2Kernel<<>>( @@ -568,7 +568,7 @@ class SimulatorCUDA final { double re = 0; double im = 0; - for (uint64_t i = 0; i < blocks; ++i) { + for (unsigned i = 0; i < blocks; ++i) { re += res2[i].re; im += res2[i].im; } diff --git a/lib/simulator_cuda_kernels.h b/lib/simulator_cuda_kernels.h index d39661c56..e21a9d62e 100644 --- a/lib/simulator_cuda_kernels.h +++ b/lib/simulator_cuda_kernels.h @@ -18,13 +18,13 @@ #ifdef __NVCC__ #include #include + + #include "util_cuda.h" #elif __HIP__ #include #include "cuda2hip.h" #endif -#include "util_cuda.h" - namespace qsim { template @@ -33,8 +33,6 @@ __global__ void ApplyGateH_Kernel( const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) { // blockDim.x must be equal to 64. 
- uint64_t blockId = GetBlockId(); - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -63,7 +61,7 @@ __global__ void ApplyGateH_Kernel( __syncthreads(); - idx_type i = (64 * idx_type{blockId} + threadIdx.x) & 0xffffffffffe0; + idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; idx_type ii = i & mss[0]; for (unsigned j = 1; j <= G; ++j) { i *= 2; @@ -117,8 +115,6 @@ __global__ void ApplyGateL_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. - uint64_t blockId = GetBlockId(); - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -141,7 +137,7 @@ __global__ void ApplyGateL_Kernel( } } - idx_type i = 32 * idx_type{blockId}; + idx_type i = 32 * idx_type{blockIdx.x}; idx_type ii = i & mss[0]; for (unsigned j = 1; j <= G; ++j) { i *= 2; @@ -208,8 +204,6 @@ __global__ void ApplyControlledGateH_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 64. - uint64_t blockId = GetBlockId(); - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -238,7 +232,7 @@ __global__ void ApplyControlledGateH_Kernel( __syncthreads(); - idx_type i = (64 * idx_type{blockId} + threadIdx.x) & 0xffffffffffe0; + idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; idx_type ii = i & mss[0]; for (unsigned j = 1; j < num_mss; ++j) { i *= 2; @@ -294,8 +288,6 @@ __global__ void ApplyControlledGateLH_Kernel( unsigned esize, fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. 
- uint64_t blockId = GetBlockId(); - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -308,7 +300,7 @@ __global__ void ApplyControlledGateLH_Kernel( __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; __shared__ fp_type v[2 * gsize * rows]; - idx_type i = 32 * idx_type{blockId}; + idx_type i = 32 * idx_type{blockIdx.x}; idx_type ii = i & mss[0]; for (unsigned j = 1; j < num_mss; ++j) { i *= 2; @@ -389,8 +381,6 @@ __global__ void ApplyControlledGateL_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. - uint64_t blockId = GetBlockId(); - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -403,7 +393,7 @@ __global__ void ApplyControlledGateL_Kernel( __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; __shared__ fp_type v[2 * gsize * rows]; - idx_type i = 32 * idx_type{blockId}; + idx_type i = 32 * idx_type{blockIdx.x}; idx_type ii = i & mss[0]; for (unsigned j = 1; j < num_mss; ++j) { i *= 2; @@ -487,8 +477,6 @@ __global__ void ExpectationValueH_Kernel( const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { // blockDim.x must be equal to 64. 
- uint64_t blockId = GetBlockId(); - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -520,7 +508,7 @@ __global__ void ExpectationValueH_Kernel( double im = 0; for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) { - idx_type b = num_iterations_per_block * idx_type{blockId} + iter; + idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter; idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0; idx_type ii = i & mss[0]; @@ -585,8 +573,8 @@ __global__ void ExpectationValueH_Kernel( __syncthreads(); if (threadIdx.x == 0) { - result[blockId].re = partial2[0].re + partial2[1].re; - result[blockId].im = partial2[0].im + partial2[1].im; + result[blockIdx.x].re = partial2[0].re + partial2[1].re; + result[blockIdx.x].im = partial2[0].im + partial2[1].im; } } @@ -599,8 +587,6 @@ __global__ void ExpectationValueL_Kernel( const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { // blockDim.x must be equal to 32. 
- uint64_t blockId = GetBlockId(); - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -626,7 +612,7 @@ __global__ void ExpectationValueL_Kernel( double im = 0; for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) { - idx_type i = 32 * (num_iterations_per_block * idx_type{blockId} + iter); + idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter); idx_type ii = i & mss[0]; for (unsigned j = 1; j <= G; ++j) { i *= 2; @@ -687,8 +673,8 @@ __global__ void ExpectationValueL_Kernel( auto val = WarpReduce(partial[threadIdx.x], op); if (threadIdx.x == 0) { - result[blockId].re = val.re; - result[blockId].im = val.im; + result[blockIdx.x].re = val.re; + result[blockIdx.x].im = val.im; } } diff --git a/lib/statespace_cuda.h b/lib/statespace_cuda.h index e0f3cfe03..660db074c 100644 --- a/lib/statespace_cuda.h +++ b/lib/statespace_cuda.h @@ -49,7 +49,7 @@ class StateSpaceCUDA : struct Grid { unsigned threads; unsigned dblocks; - dim3 blocks; + unsigned blocks; }; public: @@ -86,10 +86,10 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, uint64_t{param_.num_threads}); - uint64_t blocks = size / threads; + unsigned blocks = size / threads; unsigned bytes = 2 * threads * sizeof(fp_type); - InternalToNormalOrderKernel<<>>(state.get()); + InternalToNormalOrderKernel<<>>(state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -98,10 +98,10 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, uint64_t{param_.num_threads}); - uint64_t blocks = size / threads; + unsigned blocks = size / threads; unsigned bytes = 2 * threads * sizeof(fp_type); - NormalToInternalOrderKernel<<>>(state.get()); + NormalToInternalOrderKernel<<>>(state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -117,11 +117,11 @@ class 
StateSpaceCUDA : uint64_t hsize = uint64_t{1} << state.num_qubits(); unsigned threads = std::min(size, uint64_t{param_.num_threads}); - uint64_t blocks = size / threads; + unsigned blocks = size / threads; fp_type v = double{1} / std::sqrt(hsize); - SetStateUniformKernel<<>>(v, hsize, state.get()); + SetStateUniformKernel<<>>(v, hsize, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -178,9 +178,9 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, uint64_t{param_.num_threads}); - uint64_t blocks = size / threads; + unsigned blocks = size / threads; - BulkSetAmplKernel<<>>( + BulkSetAmplKernel<<>>( mask, bits, re, im, exclude, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -195,9 +195,9 @@ class StateSpaceCUDA : uint64_t size = MinSize(src.num_qubits()); unsigned threads = std::min(size, uint64_t{param_.num_threads}); - uint64_t blocks = size / threads; + unsigned blocks = size / threads; - AddKernel<<>>(src.get(), dest.get()); + AddKernel<<>>(src.get(), dest.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -209,9 +209,9 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()); unsigned threads = std::min(size, uint64_t{param_.num_threads}); - uint64_t blocks = size / threads; + unsigned blocks = size / threads; - MultiplyKernel<<>>(a, state.get()); + MultiplyKernel<<>>(a, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -248,17 +248,15 @@ class StateSpaceCUDA : if (num_samples > 0) { Grid g1 = GetGrid1(MinSize(state.num_qubits()) / 2); unsigned bytes = g1.threads * sizeof(double); - uint64_t num_blocks1 = - MinSize(state.num_qubits()) / 2 / (g1.threads * g1.dblocks); - unsigned scratch_size = (num_blocks1 + 1) * sizeof(double) + unsigned scratch_size = (g1.blocks + 1) * sizeof(double) + num_samples * (sizeof(uint64_t) + 
sizeof(DistrRealType)); void* scratch = AllocScratch(scratch_size); double* d_res2 = (double*) scratch; double* d_res1 = d_res2 + 1; - uint64_t* d_bitstrings = (uint64_t*) (d_res1 + num_blocks1); + uint64_t* d_bitstrings = (uint64_t*) (d_res1 + g1.blocks); DistrRealType* d_rs = (DistrRealType *) (d_bitstrings + num_samples); auto op1 = RealProduct(); @@ -271,17 +269,17 @@ class StateSpaceCUDA : double norm; - if (num_blocks1 == 1) { + if (g1.blocks == 1) { ErrorCheck( cudaMemcpy(&norm, d_res1, sizeof(double), cudaMemcpyDeviceToHost)); } else { - Grid g2 = GetGrid2(num_blocks1); + Grid g2 = GetGrid2(g1.blocks); unsigned bytes = g2.threads * sizeof(double); auto op3 = Plus(); Reduce2Kernel<<>>( - g2.dblocks, num_blocks1, op3, op3, d_res1, d_res2); + g2.dblocks, g1.blocks, op3, op3, d_res1, d_res2); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -296,7 +294,7 @@ class StateSpaceCUDA : num_samples * sizeof(DistrRealType), cudaMemcpyHostToDevice)); - SampleKernel<<<1, g1.threads>>>(num_blocks1, g1.dblocks, num_samples, + SampleKernel<<<1, g1.threads>>>(g1.blocks, g1.dblocks, num_samples, d_rs, d_res1, state.get(), d_bitstrings); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -321,19 +319,17 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, uint64_t{param_.num_threads}); - uint64_t blocks = size / threads; + unsigned blocks = size / threads; - CollapseKernel<<>>(mr.mask, mr.bits, renorm, state.get()); + CollapseKernel<<>>(mr.mask, mr.bits, renorm, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } std::vector PartialNorms(const State& state) const { Grid g = GetGrid1(MinSize(state.num_qubits()) / 2); - uint64_t num_blocks = - MinSize(state.num_qubits()) / 2 / (g.threads * g.dblocks); - unsigned scratch_size = num_blocks * sizeof(double); + unsigned scratch_size = g.blocks * sizeof(double); unsigned bytes = g.threads * 
sizeof(double); double* d_res = (double*) AllocScratch(scratch_size); @@ -346,7 +342,7 @@ class StateSpaceCUDA : ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); - std::vector norms(num_blocks); + std::vector norms(g.blocks); ErrorCheck( cudaMemcpy(norms.data(), d_res, scratch_size, cudaMemcpyDeviceToHost)); @@ -391,18 +387,21 @@ class StateSpaceCUDA : Grid GetGrid1(uint64_t size) const { Grid grid; + grid.threads = std::min(size, uint64_t{param_.num_threads}); grid.dblocks = std::min(size / grid.threads, uint64_t{param_.num_dblocks}); - uint64_t num_blocks = size / (grid.threads * grid.dblocks); - grid.blocks = CreateGrid(num_blocks); + grid.blocks = size / (grid.threads * grid.dblocks); + return grid; } Grid GetGrid2(unsigned size) const { Grid grid; + grid.threads = std::min(param_.num_threads, std::max(32U, size)); grid.dblocks = std::max(1U, size / grid.threads); - grid.blocks = dim3(1, 1, 1); + grid.blocks = 1; + return grid; } @@ -418,9 +417,8 @@ class StateSpaceCUDA : Grid g1 = GetGrid1(size); unsigned bytes = g1.threads * sizeof(FP1); - uint64_t num_blocks1 = size / (g1.threads * g1.dblocks); - FP2* d_res2 = (FP2*) AllocScratch((num_blocks1 + 1) * sizeof(FP2)); + FP2* d_res2 = (FP2*) AllocScratch((g1.blocks + 1) * sizeof(FP2)); FP2* d_res1 = d_res2 + 1; auto op1 = Op(); @@ -440,18 +438,18 @@ class StateSpaceCUDA : FP2 result; - if (num_blocks1 == 1) { + if (g1.blocks == 1) { ErrorCheck( cudaMemcpy(&result, d_res1, sizeof(FP2), cudaMemcpyDeviceToHost)); } else { - Grid g2 = GetGrid2(num_blocks1); + Grid g2 = GetGrid2(g1.blocks); unsigned bytes = g2.threads * sizeof(FP2); auto op2 = Plus(); auto op3 = Plus::type>(); Reduce2Kernel<<>>( - g2.dblocks, num_blocks1, op2, op3, d_res1, d_res2); + g2.dblocks, g1.blocks, op2, op3, d_res1, d_res2); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); diff --git a/lib/statespace_cuda_kernels.h b/lib/statespace_cuda_kernels.h index c762dfbb0..0bc4ba706 100644 --- 
a/lib/statespace_cuda_kernels.h +++ b/lib/statespace_cuda_kernels.h @@ -39,7 +39,7 @@ __device__ __forceinline__ FP1 BlockReduce1( unsigned warp = threadIdx.x / warp_size; unsigned lane = threadIdx.x % warp_size; - uint64_t k0 = 2 * n * GetBlockId() * blockDim.x + 2 * tid - lane; + uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane; uint64_t k1 = k0 + 2 * n * blockDim.x; FP1 r; @@ -88,7 +88,7 @@ __device__ __forceinline__ FP1 BlockReduce1Masked( unsigned warp = threadIdx.x / warp_size; unsigned lane = threadIdx.x % warp_size; - uint64_t k0 = 2 * n * GetBlockId() * blockDim.x + 2 * tid - lane; + uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane; uint64_t k1 = k0 + 2 * n * blockDim.x; FP1 r = 0; @@ -137,7 +137,7 @@ __device__ __forceinline__ FP1 BlockReduce2( FP1* partial1 = (FP1*) shared; unsigned tid = threadIdx.x; - uint64_t k0 = n * GetBlockId() * blockDim.x + tid; + uint64_t k0 = n * blockIdx.x * blockDim.x + tid; uint64_t k1 = k0 + n * blockDim.x; FP1 r = 0; @@ -185,7 +185,7 @@ __global__ void Reduce1Kernel(uint64_t n, Op1 op1, Op2 op2, Op3 op3, FP1 sum = detail::BlockReduce1(n, op1, op2, op3, s1, s2); if (threadIdx.x == 0) { - result[GetBlockId()] = sum; + result[blockIdx.x] = sum; } } @@ -198,7 +198,7 @@ __global__ void Reduce1MaskedKernel(uint64_t n, uint64_t mask, uint64_t bits, detail::BlockReduce1Masked(n, mask, bits, op1, op2, op3, s1, s2); if (threadIdx.x == 0) { - result[GetBlockId()] = sum; + result[blockIdx.x] = sum; } } @@ -209,7 +209,7 @@ __global__ void Reduce2Kernel( FP1 sum = detail::BlockReduce2(n, size, op2, op3, s); if (threadIdx.x == 0) { - result[GetBlockId()] = sum; + result[blockIdx.x] = sum; } } @@ -217,7 +217,7 @@ template __global__ void InternalToNormalOrderKernel(FP* state) { unsigned lane = threadIdx.x % warp_size; unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 2 * GetBlockId() * blockDim.x + l; + uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l; extern __shared__ float shared[]; FP* buf = (FP*) 
shared; @@ -235,7 +235,7 @@ template __global__ void NormalToInternalOrderKernel(FP* state) { unsigned lane = threadIdx.x % warp_size; unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 2 * GetBlockId() * blockDim.x + l; + uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l; extern __shared__ float shared[]; FP* buf = (FP*) shared; @@ -252,7 +252,7 @@ __global__ void NormalToInternalOrderKernel(FP* state) { template __global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { unsigned lane = threadIdx.x % warp_size; - uint64_t k = 2 * (GetBlockId() * blockDim.x + threadIdx.x) - lane; + uint64_t k = 2 * (uint64_t{blockIdx.x} * blockDim.x + threadIdx.x) - lane; state[k] = lane < size ? v : 0; state[k + warp_size] = 0; @@ -260,19 +260,19 @@ __global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { template __global__ void AddKernel(const FP* state1, FP* state2) { - uint64_t k = GetBlockId() * blockDim.x + threadIdx.x; + uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; state2[k] += state1[k]; } template __global__ void MultiplyKernel(FP a, FP* state) { - uint64_t k = GetBlockId() * blockDim.x + threadIdx.x; + uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; state[k] *= a; } template __global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { - uint64_t k1 = GetBlockId() * blockDim.x + threadIdx.x; + uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; if ((k1 & mask) == bits) { @@ -287,7 +287,7 @@ __global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { template __global__ void BulkSetAmplKernel( uint64_t mask, uint64_t bits, FP re, FP im, bool exclude, FP* state) { - uint64_t k1 = GetBlockId() * blockDim.x + threadIdx.x; + uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; bool set = ((k1 & mask) == bits) ^ exclude; @@ -299,7 +299,7 @@ __global__ void 
BulkSetAmplKernel( } template -__global__ void SampleKernel(uint64_t num_blocks, +__global__ void SampleKernel(unsigned num_blocks, uint64_t n, uint64_t num_samples, const FP1* rs, const FP2* ps, const FP3* state, uint64_t *bitstrings) { @@ -308,7 +308,7 @@ __global__ void SampleKernel(uint64_t num_blocks, uint64_t m = 0; double csum = 0; - for (uint64_t block_id = 0; block_id < num_blocks; ++block_id) { + for (unsigned block_id = 0; block_id < num_blocks; ++block_id) { uint64_t km = n * blockDim.x; uint64_t k0 = block_id * km; diff --git a/lib/util_cuda.h b/lib/util_cuda.h index 004ddc11b..b34292753 100644 --- a/lib/util_cuda.h +++ b/lib/util_cuda.h @@ -130,25 +130,6 @@ __device__ __forceinline__ FP1 WarpReduce(FP1 val, Op op) { return val; } -__device__ __forceinline__ uint64_t GetBlockId() { - return (uint64_t{blockIdx.z} * gridDim.y + uint64_t{blockIdx.y}) - * gridDim.x + blockIdx.x; -} - -inline dim3 CreateGrid(uint64_t blocks) { - if (blocks <= 65536) { - return dim3((uint32_t) blocks); - } - uint32_t x = 65536; - uint64_t rem = blocks / x; - if (rem <= 32768) { - return dim3(x, (uint32_t) rem); - } - uint32_t y = 32768; - uint32_t z = (uint32_t) (rem / y); - return dim3(x, y, z); -} - template __device__ __forceinline__ Complex WarpReduce(Complex val, Op op) { for (unsigned i = warp_size / 2; i > 0; i /= 2) { diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h index f1efdd51d..5cfd4e834 100644 --- a/lib/vectorspace_cuda.h +++ b/lib/vectorspace_cuda.h @@ -86,8 +86,9 @@ class VectorSpaceCUDA { static Vector Create(unsigned num_qubits) { fp_type* p; - uint64_t size = sizeof(fp_type) * Impl::MinSize(num_qubits); + auto size = sizeof(fp_type) * Impl::MinSize(num_qubits); auto rc = cudaMalloc(&p, size); + if (rc == cudaSuccess) { return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits}; } else { @@ -168,4 +169,4 @@ class VectorSpaceCUDA { } // namespace qsim -#endif // VECTORSPACE_CUDA_H_ \ No newline at end of file +#endif // VECTORSPACE_CUDA_H_ 
diff --git a/pyproject.toml b/pyproject.toml index 9b1926371..b3d397cbb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,8 +104,8 @@ dev = [ "cibuildwheel", # Linters, formatters, and test utilities. - "black~=26.1.0", - "isort[colors]~=8.0.1", + "black~=25.9.0", + "isort[colors]~=6.0.1", "py-cpuinfo", "pylint~=4.0.2", "pytest", @@ -128,17 +128,12 @@ build = "cp310-* cp311-* cp312-* cp313-* cp314-*" dependency-versions = "latest" enable = ["cpython-prerelease"] environment.PIP_PREFER_BINARY = "1" -before-test = "pip install --group dev" -# import-mode=importlib prevents local source shadowing when importing qsimcirq. -test-command = "pytest --import-mode=importlib -n auto -s -v {package}/qsimcirq_tests/qsimcirq_test.py" - -[[tool.cibuildwheel.overrides]] -# Help increase the chances that pip will find binary wheels for NumPy. -# See https://cibuildwheel.pypa.io/en/stable/faq/#building-with-numpy -select = ["cp314*"] -inherit.environment = "append" -environment.PIP_EXTRA_INDEX_URL = "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple/" -environment.PIP_PRERELEASE = "allow" +# Due to package & module name conflict, temporarily move it away to run tests: +before-test = "pip install --group dev && mv {package}/qsimcirq /tmp" +test-command = """ +pytest -n auto -s -v {package}/qsimcirq_tests/qsimcirq_test.py && +mv /tmp/qsimcirq {package} +""" [tool.cibuildwheel.macos] before-build = """ diff --git a/qsimcirq/qsim_circuit.py b/qsimcirq/qsim_circuit.py index cd4c62cde..9d2d4b9ed 100644 --- a/qsimcirq/qsim_circuit.py +++ b/qsimcirq/qsim_circuit.py @@ -272,7 +272,7 @@ def add_op_to_circuit( qsim_gate = qsim_op.gate gate_kind = _cirq_gate_kind(qsim_gate) if gate_kind is None: - raise ValueError("{!r} is not a supported gate.".format(qsim_gate)) + raise ValueError(f"{qsim_gate!r} is not a supported gate.") qubits = [qubit_to_index_dict[q] for q in qsim_op.qubits] qsim_qubits = qubits diff --git a/qsimcirq_tests/qsimcirq_test.py 
b/qsimcirq_tests/qsimcirq_test.py index cd3f1ce3a..4fef48ca9 100644 --- a/qsimcirq_tests/qsimcirq_test.py +++ b/qsimcirq_tests/qsimcirq_test.py @@ -2214,5 +2214,7 @@ def __repr__(self): circuit = qsimcirq.qsim.Circuit() qubit_to_index = {q0: 0} - with pytest.raises(ValueError, match="UnsupportedGate\(\) is not a supported gate."): + with pytest.raises( + ValueError, match=r"UnsupportedGate\(\) is not a supported gate." + ): qsimcirq.add_op_to_circuit(op, 0, qubit_to_index, circuit)