Skip to content

Commit 4c272a1

Browse files
committed
fix: resolve #993 - remove formatting noise and isolate MI300X scaling logic
1 parent b9b2812 commit 4c272a1

6 files changed

Lines changed: 349 additions & 69 deletions

File tree

apps/qsimh_base_cuda.cu

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
// Copyright 2019 Google LLC. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// https://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include <complex>
16+
#include <iomanip>
17+
#include <limits>
18+
#include <sstream>
19+
#include <string>
20+
#include <unistd.h>
21+
#include <vector>
22+
23+
#include "../lib/bitstring.h"
24+
#include "../lib/circuit_qsim_parser.h"
25+
#include "../lib/formux.h"
26+
#include "../lib/fuser_basic.h"
27+
#include "../lib/gates_qsim.h"
28+
#include "../lib/io_file.h"
29+
#include "../lib/run_qsimh.h"
30+
#include "../lib/simmux.h"
31+
#include "../lib/simulator_cuda.h"
32+
#include "../lib/util.h"
33+
#include "../lib/util_cpu.h"
34+
35+
// Command-line synopsis; printed by GetOptions on an unrecognized option and
// by ValidateOptions when no circuit file was given.
// NOTE(review): the executable name below says "hip" although this file is
// the CUDA application (qsimh_base_cuda.cu) - confirm the intended name.
constexpr char usage[] =
    "usage:\n"
    " ./qsimh_base_hip.x"
    " -c circuit_file"
    " -d maximum_time"
    " -k part1_qubits"
    " -w prefix"
    " -p num_prefix_gates"
    " -r num_root_gates"
    " -t num_threads"
    " -n num_dblocks"
    " -v verbosity"
    " -z\n";
40+
41+
// Command-line options of the application. Each field is filled in by
// GetOptions from the flag named in its comment; fields keep the defaults
// below when the corresponding flag is absent.
struct Options {
  std::string circuit_file;      // -c
  std::vector<unsigned> part1;   // -k (comma-separated list)
  // -w. Explicitly zero-initialized: main() copies this value into the runner
  // parameters unconditionally, so leaving it uninitialized would read an
  // indeterminate value whenever -w is omitted.
  uint64_t prefix = 0;
  unsigned maxtime = std::numeric_limits<unsigned>::max();  // -d
  unsigned num_prefix_gatexs = 0;                           // -p
  unsigned num_root_gatexs = 0;                             // -r
  unsigned num_threads = 256;                               // -t
  unsigned num_dblocks = 16;                                // -n
  unsigned verbosity = 0;                                   // -v
  bool denormals_are_zeros = false;                         // -z
};
53+
54+
// Parses the command line with getopt() and returns the collected Options.
// An unrecognized option prints the usage text and terminates the process
// with exit code 1. Numeric arguments are converted with std::stoul /
// std::stoull, so malformed numbers surface as the exceptions those throw.
Options GetOptions(int argc, char* argv[]) {
  static constexpr char kOptString[] = "c:d:k:w:p:r:t:n:v:z";

  Options parsed;

  // Per-token converter handed to qsim::SplitString for the -k list.
  auto parse_unsigned = [](const std::string& token) -> unsigned {
    return std::stoul(token);
  };

  for (int flag = getopt(argc, argv, kOptString); flag != -1;
       flag = getopt(argc, argv, kOptString)) {
    switch (flag) {
      case 'c': parsed.circuit_file = optarg; break;
      case 'd': parsed.maxtime = std::stoul(optarg); break;
      case 'k':
        qsim::SplitString(optarg, ',', parse_unsigned, parsed.part1);
        break;
      case 'w': parsed.prefix = std::stoull(optarg); break;
      case 'p': parsed.num_prefix_gatexs = std::stoul(optarg); break;
      case 'r': parsed.num_root_gatexs = std::stoul(optarg); break;
      case 't': parsed.num_threads = std::stoul(optarg); break;
      case 'n': parsed.num_dblocks = std::stoul(optarg); break;
      case 'v': parsed.verbosity = std::stoul(optarg); break;
      case 'z': parsed.denormals_are_zeros = true; break;
      default:
        qsim::IO::errorf(usage);
        exit(1);
    }
  }

  return parsed;
}
103+
104+
bool ValidateOptions(const Options& opt) {
105+
if (opt.circuit_file.empty()) {
106+
qsim::IO::errorf("circuit file is not provided.\n");
107+
qsim::IO::errorf(usage);
108+
return false;
109+
}
110+
111+
return true;
112+
}
113+
114+
bool ValidatePart1(unsigned num_qubits, const std::vector<unsigned>& part1) {
115+
for (std::size_t i = 0; i < part1.size(); ++i) {
116+
if (part1[i] >= num_qubits) {
117+
qsim::IO::errorf("part 1 qubit indices are too large.\n");
118+
return false;
119+
}
120+
}
121+
122+
return true;
123+
}
124+
125+
// Builds the per-qubit part assignment: a vector of num_qubits entries that
// is 1 for every qubit index listed in part1 and 0 for all others.
//
// Indices that are not below num_qubits are ignored instead of being written
// out of bounds; main() rejects such input beforehand via ValidatePart1, so
// this only matters if the function is called without prior validation.
std::vector<unsigned> GetParts(
    unsigned num_qubits, const std::vector<unsigned>& part1) {
  std::vector<unsigned> parts(num_qubits, 0);

  for (const unsigned qubit : part1) {
    if (qubit < num_qubits) {
      parts[qubit] = 1;
    }
  }

  return parts;
}
135+
136+
int main(int argc, char* argv[]) {
137+
using namespace qsim;
138+
139+
auto opt = GetOptions(argc, argv);
140+
if (!ValidateOptions(opt)) {
141+
return 1;
142+
}
143+
144+
Circuit<GateQSim<float>> circuit;
145+
if (!CircuitQsimParser<IOFile>::FromFile(
146+
opt.maxtime, opt.circuit_file, circuit)) {
147+
return 1;
148+
}
149+
150+
if (!ValidatePart1(circuit.num_qubits, opt.part1)) {
151+
return 1;
152+
}
153+
auto parts = GetParts(circuit.num_qubits, opt.part1);
154+
155+
if (opt.denormals_are_zeros) {
156+
SetFlushToZeroAndDenormalsAreZeros();
157+
}
158+
159+
uint64_t num_bitstrings =
160+
std::min(uint64_t{8}, uint64_t{1} << circuit.num_qubits);
161+
162+
std::vector<Bitstring> bitstrings;
163+
bitstrings.reserve(num_bitstrings);
164+
for (std::size_t i = 0; i < num_bitstrings; ++i) {
165+
bitstrings.push_back(i);
166+
}
167+
168+
struct Factory {
169+
using Simulator = qsim::SimulatorCUDA<float>;
170+
using StateSpace = Simulator::StateSpace;
171+
using fp_type = Simulator::fp_type;
172+
173+
Factory(const StateSpace::Parameter& param) : param(param) {}
174+
175+
StateSpace CreateStateSpace() const { return StateSpace(param); }
176+
177+
Simulator CreateSimulator() const { return Simulator(); }
178+
179+
const StateSpace::Parameter& param;
180+
};
181+
182+
using HybridSimulator =
183+
HybridSimulator<IO, GateQSim<float>, BasicGateFuser, For>;
184+
using Runner = QSimHRunner<IO, HybridSimulator>;
185+
186+
Runner::Parameter param;
187+
param.prefix = opt.prefix;
188+
param.num_prefix_gatexs = opt.num_prefix_gatexs;
189+
param.num_root_gatexs = opt.num_root_gatexs;
190+
param.num_threads =
191+
opt.num_threads; // This is reused for StateSpaceCUDA params implicitly
192+
// if not careful, but here we separate.
193+
param.verbosity = opt.verbosity;
194+
195+
std::vector<std::complex<Factory::fp_type>> results(num_bitstrings, 0);
196+
197+
// Setup CUDA parameters
198+
Factory::StateSpace::Parameter cuda_param;
199+
cuda_param.num_threads = opt.num_threads;
200+
cuda_param.num_dblocks = opt.num_dblocks;
201+
202+
Factory factory(cuda_param);
203+
204+
if (Runner::Run(param, factory, circuit, parts, bitstrings, results)) {
205+
static constexpr char const* bits[8] = {
206+
"000", "001", "010", "011", "100", "101", "110", "111",
207+
};
208+
209+
unsigned s = 3 - std::min(unsigned{3}, circuit.num_qubits);
210+
211+
for (std::size_t i = 0; i < num_bitstrings; ++i) {
212+
const auto& a = results[i];
213+
qsim::IO::messagef(
214+
"%s:%16.8g%16.8g%16.8g\n", bits[i] + s, std::real(a), std::imag(a),
215+
std::norm(a));
216+
}
217+
}
218+
219+
return 0;
220+
}

lib/simulator_cuda.h

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -350,8 +350,8 @@ class SimulatorCUDA final {
350350

351351
IndicesH<G> d_i(d_ws);
352352

353-
ApplyGateH_Kernel<G><<<blocks, threads>>>(
354-
(fp_type*) d_ws, d_i.xss, d_i.ms, state.get());
353+
ApplyGateH_Kernel<G><<<CreateGrid(blocks), threads>>>(
354+
(fp_type*)d_ws, d_i.xss, d_i.ms, state.get());
355355
}
356356

357357
template <unsigned G>
@@ -374,8 +374,8 @@ class SimulatorCUDA final {
374374

375375
IndicesL<G> d_i(d_ws);
376376

377-
ApplyGateL_Kernel<G><<<blocks, threads>>>(
378-
(fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
377+
ApplyGateL_Kernel<G><<<CreateGrid(blocks), threads>>>(
378+
(fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
379379
1 << num_effective_qs, state.get());
380380
}
381381

@@ -407,8 +407,8 @@ class SimulatorCUDA final {
407407

408408
IndicesH<G> d_i(d_ws);
409409

410-
ApplyControlledGateH_Kernel<G><<<blocks, threads>>>(
411-
(fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get());
410+
ApplyControlledGateH_Kernel<G><<<CreateGrid(blocks), threads>>>(
411+
(fp_type*)d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get());
412412
}
413413

414414
template <unsigned G>
@@ -432,9 +432,9 @@ class SimulatorCUDA final {
432432

433433
IndicesL<G> d_i(d_ws);
434434

435-
ApplyControlledGateLH_Kernel<G><<<blocks, threads>>>(
436-
(fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
437-
d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get());
435+
ApplyControlledGateLH_Kernel<G><<<CreateGrid(blocks), threads>>>(
436+
(fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d.num_aqs + 1,
437+
d.cvalsh, 1 << d.num_effective_qs, state.get());
438438
}
439439

440440
template <unsigned G>
@@ -458,8 +458,8 @@ class SimulatorCUDA final {
458458

459459
IndicesLC<G> d_i(d_ws);
460460

461-
ApplyControlledGateL_Kernel<G><<<blocks, threads>>>(
462-
(fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis,
461+
ApplyControlledGateL_Kernel<G><<<CreateGrid(blocks), threads>>>(
462+
(fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis,
463463
d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs,
464464
1 << (5 - d.remaining_low_cqs), state.get());
465465
}
@@ -493,9 +493,9 @@ class SimulatorCUDA final {
493493

494494
IndicesH<G> d_i(d_ws);
495495

496-
ExpectationValueH_Kernel<G><<<blocks, threads>>>(
497-
(fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block,
498-
state.get(), Plus<double>(), d_res1);
496+
ExpectationValueH_Kernel<G><<<CreateGrid(blocks), threads>>>(
497+
(fp_type*)d_ws, d_i.xss, d_i.ms, num_iterations_per_block, state.get(),
498+
Plus<double>(), d_res1);
499499

500500
double mul = size == 1 ? 0.5 : 1.0;
501501

@@ -531,8 +531,8 @@ class SimulatorCUDA final {
531531

532532
IndicesL<G> d_i(d_ws);
533533

534-
ExpectationValueL_Kernel<G><<<blocks, threads>>>(
535-
(fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
534+
ExpectationValueL_Kernel<G><<<CreateGrid(blocks), threads>>>(
535+
(fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
536536
num_iterations_per_block, state.get(), Plus<double>(), d_res1);
537537

538538
double mul = double(1 << (5 + num_effective_qs - G)) / 32;
@@ -895,6 +895,12 @@ class SimulatorCUDA final {
895895
return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs};
896896
}
897897

898+
// Converts a linear block count into kernel launch grid dimensions.
// Up to 65535 blocks yield a one-dimensional grid of exactly `blocks` blocks;
// larger counts yield a two-dimensional grid with x = 65535 and
// y = ceil(blocks / 65535).
// NOTE(review): in the two-dimensional case x * y can exceed `blocks` when
// `blocks` is not a multiple of 65535, so the kernels presumably have to
// linearize the block index and skip the surplus blocks - confirm against
// the kernel code. The results are also narrowed to 32 bits.
static dim3 CreateGrid(uint64_t blocks) {
  constexpr uint32_t kMaxGridX = 65535;

  if (blocks > kMaxGridX) {
    const uint32_t grid_y = (blocks + kMaxGridX - 1) / kMaxGridX;
    return dim3(kMaxGridX, grid_y);
  }

  return dim3(blocks);
}
898904

899905
void* AllocScratch(uint64_t size) const {
900906
if (size > scratch_size_) {

0 commit comments

Comments
 (0)