diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py
index 3fd6db3ff..b22e322da 100644
--- a/graph_net_bench/torch/eval_backend_perf.py
+++ b/graph_net_bench/torch/eval_backend_perf.py
@@ -15,6 +15,7 @@
 from contextlib import redirect_stdout, redirect_stderr
 from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend
 from graph_net_bench import test_compiler_util
+from .util.timing import measure_performance
 
 
 def register_op_lib(op_lib):
@@ -129,69 +130,6 @@ def get_input_dict(args):
     }
 
 
-def measure_performance(model_call, args, compiler):
-    stats = {}
-    outs = model_call()
-
-    # Warmup runs
-    for _ in range(args.warmup):
-        model_call()
-    compiler.synchronize()
-
-    print(
-        f"[Profiling] Warm up {args.warmup}, Trials {args.trials}",
-        file=sys.stderr,
-        flush=True,
-    )
-
-    if "cuda" in args.device:
-        torch.cuda.empty_cache()
-        e2e_times = []
-        gpu_times = []
-
-        for i in range(args.trials):
-            # End-to-end timing (naive_timer)
-            duration_box = test_compiler_util.DurationBox(-1)
-            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
-                # GPU-only timing (CUDA Events)
-                start_event = torch.cuda.Event(enable_timing=True)
-                end_event = torch.cuda.Event(enable_timing=True)
-                start_event.record()
-
-                model_call()
-
-                end_event.record()
-                compiler.synchronize()
-
-            gpu_time_ms = start_event.elapsed_time(end_event)
-            e2e_times.append(duration_box.value)
-            gpu_times.append(gpu_time_ms)
-            print(
-                f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms",
-                file=sys.stderr,
-                flush=True,
-            )
-
-        stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times)
-        stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times)
-
-    else:  # CPU or other devices
-        e2e_times = []
-        for i in range(args.trials):
-            duration_box = test_compiler_util.DurationBox(-1)
-            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
-                model_call()
-            print(
-                f"Trial {i + 1}: e2e={duration_box.value:.5f} ms",
-                file=sys.stderr,
-                flush=True,
-            )
-            e2e_times.append(duration_box.value)
-        stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times)
-
-    return outs, stats
-
-
 def eval_single_model_with_single_backend(args):
     check_and_complete_args(args)
     set_seed(args.seed)
diff --git a/graph_net_bench/torch/util/timing.py b/graph_net_bench/torch/util/timing.py
new file mode 100644
index 000000000..e9c42cfa7
--- /dev/null
+++ b/graph_net_bench/torch/util/timing.py
@@ -0,0 +1,90 @@
+import torch
+import sys
+from graph_net_bench import test_compiler_util
+
+
+def measure_performance(model_call, args, compiler):
+    stats = {}
+    outs = model_call()
+    # Warmup runs
+    for _ in range(args.warmup):
+        model_call()
+    compiler.synchronize()
+
+    print(
+        f"[Profiling] Warm up {args.warmup}, Trials {args.trials}",
+        file=sys.stderr,
+        flush=True,
+    )
+
+    if "cuda" in args.device:
+        torch.cuda.empty_cache()
+        executor = CUDATrialExecutor(model_call, compiler)
+    else:
+        executor = NoneCUDATrialExecutor(model_call, compiler)
+
+    timings = run_benchmark(args.trials, executor)
+
+    stats = {
+        name: test_compiler_util.get_timing_stats(values)
+        for name, values in timings.items()
+    }
+
+    return outs, stats
+
+
+def run_benchmark(trials, executor):
+    results = {}
+
+    for i in range(trials):
+        timings = executor.run_one_trial()
+
+        for k, v in timings.items():
+            results.setdefault(k, []).append(v)
+
+        log_trial(i + 1, timings)
+
+    return results
+
+
+def log_trial(idx, timings):
+    msg = ", ".join(f"{k}={v:.5f} ms" for k, v in timings.items())
+    print(f"Trial {idx}: {msg}", file=sys.stderr, flush=True)
+
+
+class BaseTrialExecutor:
+    def __init__(self, model_call, compiler):
+        self.model_call = model_call
+        self.compiler = compiler
+
+    def run_one_trial(self):
+        raise NotImplementedError
+
+
+class NoneCUDATrialExecutor(BaseTrialExecutor):
+    def run_one_trial(self):
+        duration_box = test_compiler_util.DurationBox(-1)
+        with test_compiler_util.naive_timer(duration_box, self.compiler.synchronize):
+            self.model_call()
+        return {"e2e": duration_box.value}
+
+
+class CUDATrialExecutor(BaseTrialExecutor):
+    def run_one_trial(self):
+        duration_box = test_compiler_util.DurationBox(-1)
+
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+
+        with test_compiler_util.naive_timer(duration_box, self.compiler.synchronize):
+            start_event.record()
+            self.model_call()
+            end_event.record()
+            self.compiler.synchronize()
+
+        gpu_time = start_event.elapsed_time(end_event)
+
+        return {
+            "e2e": duration_box.value,
+            "gpu": gpu_time,
+        }