From 2692fc4c272f5712ebb9ad7c117f0d1118753d83 Mon Sep 17 00:00:00 2001 From: roll-away <220250881@seu.edu.cn> Date: Tue, 27 Jan 2026 08:40:53 +0000 Subject: [PATCH 1/6] split eval_backend_perf --- graph_net_bench/torch/eval_backend_diff.py | 2 +- graph_net_bench/torch/util/comparison.py | 221 +++++++++++++++++++++ graph_net_bench/torch/util/timing.py | 74 +++++++ 3 files changed, 296 insertions(+), 1 deletion(-) create mode 100644 graph_net_bench/torch/util/comparison.py create mode 100644 graph_net_bench/torch/util/timing.py diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index cfa171dc6..d2d9cd417 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -9,7 +9,7 @@ import types from graph_net_bench import test_compiler_util from graph_net_bench import path_utils -from .eval_backend_perf import eval_single_model_with_single_backend +from .util.comparison import eval_single_model_with_single_backend def compare_correctness(expected_out, compiled_out, args): diff --git a/graph_net_bench/torch/util/comparison.py b/graph_net_bench/torch/util/comparison.py new file mode 100644 index 000000000..25c93aaca --- /dev/null +++ b/graph_net_bench/torch/util/comparison.py @@ -0,0 +1,221 @@ +from .. import utils +import argparse +import importlib.util +import torch +from pathlib import Path +from typing import Type +import sys +import os +import traceback +import json +import random +import numpy as np +import types +from contextlib import redirect_stdout, redirect_stderr +from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend +from graph_net_bench import test_compiler_util +from .timing import get_hardward_name, get_compiler_version, measure_performance + + +def register_op_lib(op_lib): + if op_lib == "flaggems": + import flag_gems + + flag_gems.enable() + else: + pass + + +def set_seed(random_seed): + random.seed(random_seed) + np.random.seed(random_seed) + torch.manual_seed(random_seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(random_seed) + torch.cuda.manual_seed_all(random_seed) + + +def load_class_from_file( + model_path: str, class_name: str, device: str +) -> Type[torch.nn.Module]: + file_path = f"{model_path}/model.py" + file = Path(file_path).resolve() + module_name = file.stem + + with open(file_path, "r", encoding="utf-8") as f: + model_code = f.read() + model_code = utils.modify_code_by_device(model_code, device) + spec = importlib.util.spec_from_loader(module_name, loader=None) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + compiled_code = compile(model_code, filename=file, mode="exec") + exec(compiled_code, module.__dict__) + + model_class = getattr(module, class_name, None) + setattr(model_class, "__graph_net_file_path__", file_path) + setattr(model_class, "__graph_net_device__", device) + return model_class + + +def get_compiler_backend(args) -> GraphCompilerBackend: + """ + Dynamically load backend class based on args.compiler + """ + compiler_name = args.compiler.lower() + module_name = f"graph_net_bench.torch.backend.{compiler_name}_backend" + + try: + module = __import__(module_name, fromlist=[f"{compiler_name.title()}Backend"]) + + class_name = ( + f"{''.join(part.title() for part in compiler_name.split('_'))}Backend" + ) + + backend_class = None + if hasattr(module, class_name): + backend_class = getattr(module, class_name) + else: + raise ImportError(f"No valid backend class found in 
{module_name}") + + except ImportError as e: + raise ImportError(f"Failed to import backend module for '{compiler_name}': {e}") + + backend_config = ( + test_compiler_util.convert_to_dict(args.backend_config) + if args.backend_config is not None + else {} + ) + return backend_class(backend_config) + + +def get_model(args): + device = "xla" if args.compiler == "xla" else args.device + + # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') + model_class = load_class_from_file( + args.model_path, class_name="GraphModule", device=device + ) + model = model_class().to(torch.device(args.device)) + return model + + +def get_input_dict(args): + inputs_params = utils.load_converted_from_text(f"{args.model_path}") + params = inputs_params["weight_info"] + for tensor_meta in params.values(): + if "device" in tensor_meta["info"]: + tensor_meta["info"]["device"] = args.device + return { + k: utils.replay_tensor(v).to(torch.device(args.device)) + for k, v in params.items() + } + + +def eval_single_model_with_single_backend(args): + check_and_complete_args(args) + set_seed(args.seed) + torch.set_default_device(args.device) + os.makedirs(args.output_path, exist_ok=True) + + log_path = utils.get_log_path(args.output_path, args.model_path) + output_dump_path = utils.get_output_path(args.output_path, args.model_path) + + with open(log_path, "w", encoding="utf-8") as log_f: + with redirect_stdout(log_f), redirect_stderr(log_f): + compiler = get_compiler_backend(args) + input_dict = get_input_dict(args) + model = get_model(args) + model.eval() + + test_compiler_util.print_config( + args, + get_hardward_name(args.device), + get_compiler_version(args.compiler), + ) + + success = False + time_stats = {} + try: + compiled_model = compiler(model) + + def model_call(): + return compiled_model(**input_dict) + + # 运行并测量 + outputs = model_call() + time_stats = measure_performance(model_call, args, compiler) + success = True + except Exception as e: + print( + f"Run model failed: {str(e)}\n{traceback.format_exc()}", + file=sys.stderr, + flush=True, + ) + + test_compiler_util.print_running_status(args, success) + if success: + torch.save(outputs, str(output_dump_path)) + + test_compiler_util.print_with_log_prompt( + "[Performance][eager]:", json.dumps(time_stats), args.log_prompt + ) + + with open(log_path, "r", encoding="utf-8") as f: + print(f.read(), file=sys.stderr, flush=True) + + +def check_and_complete_args(args): + """ + Ensure all required arguments are present with default values if missing + """ + defaults = { + "model_path": None, # Model path + "output_path": None, # Log and output directory + "seed": 123, # Random seed + "compiler": "inductor", # Compiler name + "device": "cuda", # Device for testing the compiler (e.g., 'cpu' or 'cuda') + "op_lib": None, # Operator library + "warmup": 3, # Number of warmup steps + "trials": 5, # Number of timing trials + "log_prompt": "graph-net-bench-log", # Log prompt for performance log filtering + "model_path_prefix": None, # Prefix path to model path in args.model-path + "backend_config": None, # backend configuration json + } + + for key, default in defaults.items(): + if not hasattr(args, key): + setattr(args, key, default) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Single Backend Performance Evaluation" + ) + parser.add_argument( + "--model-path", + type=str, + required=False, + default=None, + help="Path to model file(s), each subdirectory containing graph_net.json will be 
regarded as a model", + ) + parser.add_argument( + "--output-path", + type=str, + required=False, + default="/tmp/test_save", + help="Path to save outputs", + ) + parser.add_argument( + "--config", + type=str, + required=False, + default=None, + help="base64 encode configuration json.", + ) + args = parser.parse_args() + mut_args = types.SimpleNamespace( + model_path=args.model_path, + output_path=args.output_path, + **test_compiler_util.convert_to_dict(args.config), + ) + eval_single_model_with_single_backend(mut_args) diff --git a/graph_net_bench/torch/util/timing.py b/graph_net_bench/torch/util/timing.py new file mode 100644 index 000000000..fd1940cc6 --- /dev/null +++ b/graph_net_bench/torch/util/timing.py @@ -0,0 +1,74 @@ +import torch +import platform +import sys +from graph_net_bench import test_compiler_util + + +def get_hardward_name(device): + hardware_name = "unknown" + if "cuda" in device: + hardware_name = torch.cuda.get_device_name(device) + elif device == "cpu": + hardware_name = platform.processor() + return hardware_name + + +def get_compiler_version(compiler): + if compiler in ["inductor", "nope", "unstable_to_stable"]: + return torch.__version__ + # 兼容处理具有 version 属性的对象或字符串 + return getattr(compiler, "version", "unknown") + + +def measure_performance(model_call, args, compiler): + stats = {} + # 预热 + for _ in range(args.warmup): + model_call() + compiler.synchronize() + + print( + f"[Profiling] Warm up {args.warmup}, Trials {args.trials}", + file=sys.stderr, + flush=True, + ) + + if "cuda" in args.device: + torch.cuda.empty_cache() + e2e_times, gpu_times = [], [] + for i in range(args.trials): + duration_box = test_compiler_util.DurationBox(-1) + with test_compiler_util.naive_timer(duration_box, compiler.synchronize): + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + model_call() + end_event.record() + compiler.synchronize() + + gpu_time_ms = start_event.elapsed_time(end_event) + e2e_times.append(duration_box.value) + gpu_times.append(gpu_time_ms) + print( + f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", + file=sys.stderr, + flush=True, + ) + + stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) + stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times) + else: + e2e_times = [] + for i in range(args.trials): + duration_box = test_compiler_util.DurationBox(-1) + with test_compiler_util.naive_timer(duration_box, compiler.synchronize): + model_call() + e2e_times.append(duration_box.value) + print( + f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", + file=sys.stderr, + flush=True, + ) + stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) + + return stats From 49ea3b7b889359b24647977c000b1c05a772a6fe Mon Sep 17 00:00:00 2001 From: roll-away <220250881@seu.edu.cn> Date: Tue, 27 Jan 2026 08:58:35 +0000 Subject: [PATCH 2/6] split eval_backend_perf --- graph_net_bench/torch/util/comparison.py | 7 +++---- graph_net_bench/torch/util/timing.py | 11 +++++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/graph_net_bench/torch/util/comparison.py b/graph_net_bench/torch/util/comparison.py index 25c93aaca..7be2dac95 100644 --- a/graph_net_bench/torch/util/comparison.py +++ b/graph_net_bench/torch/util/comparison.py @@ -141,9 +141,7 @@ def eval_single_model_with_single_backend(args): def model_call(): return compiled_model(**input_dict) - # 运行并测量 - outputs = model_call() - time_stats = measure_performance(model_call, args, compiler) + 
outputs, time_stats = measure_performance(model_call, args, compiler) success = True except Exception as e: print( @@ -161,7 +159,8 @@ def model_call(): ) with open(log_path, "r", encoding="utf-8") as f: - print(f.read(), file=sys.stderr, flush=True) + content = f.read() + print(content, file=sys.stderr, flush=True) def check_and_complete_args(args): diff --git a/graph_net_bench/torch/util/timing.py b/graph_net_bench/torch/util/timing.py index fd1940cc6..8c98c467a 100644 --- a/graph_net_bench/torch/util/timing.py +++ b/graph_net_bench/torch/util/timing.py @@ -16,13 +16,16 @@ def get_hardward_name(device): def get_compiler_version(compiler): if compiler in ["inductor", "nope", "unstable_to_stable"]: return torch.__version__ - # 兼容处理具有 version 属性的对象或字符串 - return getattr(compiler, "version", "unknown") + elif compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: + # Assuming compiler object has a version attribute + return f"{compiler.capitalize()} {compiler.version}" + return "unknown" def measure_performance(model_call, args, compiler): stats = {} - # 预热 + outs = model_call() + # Warmup runs for _ in range(args.warmup): model_call() compiler.synchronize() @@ -71,4 +74,4 @@ def measure_performance(model_call, args, compiler): ) stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) - return stats + return outs, stats From dc5336da526e8c87092078e1b496708e5ad45508 Mon Sep 17 00:00:00 2001 From: roll-away <220250881@seu.edu.cn> Date: Wed, 28 Jan 2026 02:08:22 +0000 Subject: [PATCH 3/6] move some features from eval_backend_diff.py into timing.py --- graph_net_bench/torch/eval_backend_diff.py | 2 +- .../{comparison.py => eval_backend_perf.py} | 21 ++++++++++++++++++- graph_net_bench/torch/util/timing.py | 19 ----------------- 3 files changed, 21 insertions(+), 21 deletions(-) rename graph_net_bench/torch/util/{comparison.py => eval_backend_perf.py} (91%) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index d2d9cd417..49780971a 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -9,7 +9,7 @@ import types from graph_net_bench import test_compiler_util from graph_net_bench import path_utils -from .util.comparison import eval_single_model_with_single_backend +from .util.eval_backend_perf import eval_single_model_with_single_backend def compare_correctness(expected_out, compiled_out, args): diff --git a/graph_net_bench/torch/util/comparison.py b/graph_net_bench/torch/util/eval_backend_perf.py similarity index 91% rename from graph_net_bench/torch/util/comparison.py rename to graph_net_bench/torch/util/eval_backend_perf.py index 7be2dac95..8aa64f867 100644 --- a/graph_net_bench/torch/util/comparison.py +++ b/graph_net_bench/torch/util/eval_backend_perf.py @@ -10,11 +10,12 @@ import json import random import numpy as np +import platform import types from contextlib import redirect_stdout, redirect_stderr from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend from graph_net_bench import test_compiler_util -from .timing import get_hardward_name, get_compiler_version, measure_performance +from .timing import measure_performance def register_op_lib(op_lib): @@ -35,6 +36,24 @@ def set_seed(random_seed): torch.cuda.manual_seed_all(random_seed) +def get_hardward_name(device): + hardware_name = "unknown" + if "cuda" in device: + hardware_name = torch.cuda.get_device_name(device) + elif device == "cpu": + hardware_name = platform.processor() + return hardware_name + + 
+def get_compiler_version(compiler): + if compiler in ["inductor", "nope", "unstable_to_stable"]: + return torch.__version__ + elif compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: + # Assuming compiler object has a version attribute + return f"{compiler.capitalize()} {compiler.version}" + return "unknown" + + def load_class_from_file( model_path: str, class_name: str, device: str ) -> Type[torch.nn.Module]: diff --git a/graph_net_bench/torch/util/timing.py b/graph_net_bench/torch/util/timing.py index 8c98c467a..67286e07f 100644 --- a/graph_net_bench/torch/util/timing.py +++ b/graph_net_bench/torch/util/timing.py @@ -1,27 +1,8 @@ import torch -import platform import sys from graph_net_bench import test_compiler_util -def get_hardward_name(device): - hardware_name = "unknown" - if "cuda" in device: - hardware_name = torch.cuda.get_device_name(device) - elif device == "cpu": - hardware_name = platform.processor() - return hardware_name - - -def get_compiler_version(compiler): - if compiler in ["inductor", "nope", "unstable_to_stable"]: - return torch.__version__ - elif compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: - # Assuming compiler object has a version attribute - return f"{compiler.capitalize()} {compiler.version}" - return "unknown" - - def measure_performance(model_call, args, compiler): stats = {} outs = model_call() From 61dd8e077772d3480ccb044b89e8b48b2a0f3658 Mon Sep 17 00:00:00 2001 From: roll-away <220250881@seu.edu.cn> Date: Wed, 28 Jan 2026 05:15:07 +0000 Subject: [PATCH 4/6] pull some features from graph_net_bench/torch/eval_backend_perf.py to graph_net_bench/torch/util/timing.py --- graph_net_bench/torch/eval_backend_diff.py | 2 +- graph_net_bench/torch/eval_backend_perf.py | 64 +---- .../torch/util/eval_backend_perf.py | 239 ------------------ 3 files changed, 2 insertions(+), 303 deletions(-) delete mode 100644 graph_net_bench/torch/util/eval_backend_perf.py diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index 49780971a..cfa171dc6 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -9,7 +9,7 @@ import types from graph_net_bench import test_compiler_util from graph_net_bench import path_utils -from .util.eval_backend_perf import eval_single_model_with_single_backend +from .eval_backend_perf import eval_single_model_with_single_backend def compare_correctness(expected_out, compiled_out, args): diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index 3fd6db3ff..b22e322da 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -15,6 +15,7 @@ from contextlib import redirect_stdout, redirect_stderr from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend from graph_net_bench import test_compiler_util +from .util.timing import measure_performance def register_op_lib(op_lib): @@ -129,69 +130,6 @@ def get_input_dict(args): } -def measure_performance(model_call, args, compiler): - stats = {} - outs = model_call() - - # Warmup runs - for _ in range(args.warmup): - model_call() - compiler.synchronize() - - print( - f"[Profiling] Warm up {args.warmup}, Trials {args.trials}", - file=sys.stderr, - flush=True, - ) - - if "cuda" in args.device: - torch.cuda.empty_cache() - e2e_times = [] - gpu_times = [] - - for i in range(args.trials): - # End-to-end timing (naive_timer) - duration_box = test_compiler_util.DurationBox(-1) - with 
test_compiler_util.naive_timer(duration_box, compiler.synchronize): - # GPU-only timing (CUDA Events) - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - start_event.record() - - model_call() - - end_event.record() - compiler.synchronize() - - gpu_time_ms = start_event.elapsed_time(end_event) - e2e_times.append(duration_box.value) - gpu_times.append(gpu_time_ms) - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", - file=sys.stderr, - flush=True, - ) - - stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) - stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times) - - else: # CPU or other devices - e2e_times = [] - for i in range(args.trials): - duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - model_call() - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", - file=sys.stderr, - flush=True, - ) - e2e_times.append(duration_box.value) - stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) - - return outs, stats - - def eval_single_model_with_single_backend(args): check_and_complete_args(args) set_seed(args.seed) diff --git a/graph_net_bench/torch/util/eval_backend_perf.py b/graph_net_bench/torch/util/eval_backend_perf.py deleted file mode 100644 index 8aa64f867..000000000 --- a/graph_net_bench/torch/util/eval_backend_perf.py +++ /dev/null @@ -1,239 +0,0 @@ -from .. import utils -import argparse -import importlib.util -import torch -from pathlib import Path -from typing import Type -import sys -import os -import traceback -import json -import random -import numpy as np -import platform -import types -from contextlib import redirect_stdout, redirect_stderr -from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend -from graph_net_bench import test_compiler_util -from .timing import measure_performance - - -def register_op_lib(op_lib): - if op_lib == "flaggems": - import flag_gems - - flag_gems.enable() - else: - pass - - -def set_seed(random_seed): - random.seed(random_seed) - np.random.seed(random_seed) - torch.manual_seed(random_seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed(random_seed) - torch.cuda.manual_seed_all(random_seed) - - -def get_hardward_name(device): - hardware_name = "unknown" - if "cuda" in device: - hardware_name = torch.cuda.get_device_name(device) - elif device == "cpu": - hardware_name = platform.processor() - return hardware_name - - -def get_compiler_version(compiler): - if compiler in ["inductor", "nope", "unstable_to_stable"]: - return torch.__version__ - elif compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: - # Assuming compiler object has a version attribute - return f"{compiler.capitalize()} {compiler.version}" - return "unknown" - - -def load_class_from_file( - model_path: str, class_name: str, device: str -) -> Type[torch.nn.Module]: - file_path = f"{model_path}/model.py" - file = Path(file_path).resolve() - module_name = file.stem - - with open(file_path, "r", encoding="utf-8") as f: - model_code = f.read() - model_code = utils.modify_code_by_device(model_code, device) - spec = importlib.util.spec_from_loader(module_name, loader=None) - module = importlib.util.module_from_spec(spec) - sys.modules[module_name] = module - compiled_code = compile(model_code, filename=file, mode="exec") - exec(compiled_code, module.__dict__) - - model_class = getattr(module, class_name, None) - setattr(model_class, 
"__graph_net_file_path__", file_path) - setattr(model_class, "__graph_net_device__", device) - return model_class - - -def get_compiler_backend(args) -> GraphCompilerBackend: - """ - Dynamically load backend class based on args.compiler - """ - compiler_name = args.compiler.lower() - module_name = f"graph_net_bench.torch.backend.{compiler_name}_backend" - - try: - module = __import__(module_name, fromlist=[f"{compiler_name.title()}Backend"]) - - class_name = ( - f"{''.join(part.title() for part in compiler_name.split('_'))}Backend" - ) - - backend_class = None - if hasattr(module, class_name): - backend_class = getattr(module, class_name) - else: - raise ImportError(f"No valid backend class found in {module_name}") - - except ImportError as e: - raise ImportError(f"Failed to import backend module for '{compiler_name}': {e}") - - backend_config = ( - test_compiler_util.convert_to_dict(args.backend_config) - if args.backend_config is not None - else {} - ) - return backend_class(backend_config) - - -def get_model(args): - device = "xla" if args.compiler == "xla" else args.device - - # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') - model_class = load_class_from_file( - args.model_path, class_name="GraphModule", device=device - ) - model = model_class().to(torch.device(args.device)) - return model - - -def get_input_dict(args): - inputs_params = utils.load_converted_from_text(f"{args.model_path}") - params = inputs_params["weight_info"] - for tensor_meta in params.values(): - if "device" in tensor_meta["info"]: - tensor_meta["info"]["device"] = args.device - return { - k: utils.replay_tensor(v).to(torch.device(args.device)) - for k, v in params.items() - } - - -def eval_single_model_with_single_backend(args): - check_and_complete_args(args) - set_seed(args.seed) - torch.set_default_device(args.device) - os.makedirs(args.output_path, exist_ok=True) - - log_path = utils.get_log_path(args.output_path, args.model_path) - output_dump_path = utils.get_output_path(args.output_path, args.model_path) - - with open(log_path, "w", encoding="utf-8") as log_f: - with redirect_stdout(log_f), redirect_stderr(log_f): - compiler = get_compiler_backend(args) - input_dict = get_input_dict(args) - model = get_model(args) - model.eval() - - test_compiler_util.print_config( - args, - get_hardward_name(args.device), - get_compiler_version(args.compiler), - ) - - success = False - time_stats = {} - try: - compiled_model = compiler(model) - - def model_call(): - return compiled_model(**input_dict) - - outputs, time_stats = measure_performance(model_call, args, compiler) - success = True - except Exception as e: - print( - f"Run model failed: {str(e)}\n{traceback.format_exc()}", - file=sys.stderr, - flush=True, - ) - - test_compiler_util.print_running_status(args, success) - if success: - torch.save(outputs, str(output_dump_path)) - - test_compiler_util.print_with_log_prompt( - "[Performance][eager]:", json.dumps(time_stats), args.log_prompt - ) - - with open(log_path, "r", encoding="utf-8") as f: - content = f.read() - print(content, file=sys.stderr, flush=True) - - -def check_and_complete_args(args): - """ - Ensure all required arguments are present with default values if missing - """ - defaults = { - "model_path": None, # Model path - "output_path": None, # Log and output directory - "seed": 123, # Random seed - "compiler": "inductor", # Compiler name - "device": "cuda", # Device for testing the compiler (e.g., 'cpu' or 'cuda') - "op_lib": None, # Operator 
library - "warmup": 3, # Number of warmup steps - "trials": 5, # Number of timing trials - "log_prompt": "graph-net-bench-log", # Log prompt for performance log filtering - "model_path_prefix": None, # Prefix path to model path in args.model-path - "backend_config": None, # backend configuration json - } - - for key, default in defaults.items(): - if not hasattr(args, key): - setattr(args, key, default) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Single Backend Performance Evaluation" - ) - parser.add_argument( - "--model-path", - type=str, - required=False, - default=None, - help="Path to model file(s), each subdirectory containing graph_net.json will be regarded as a model", - ) - parser.add_argument( - "--output-path", - type=str, - required=False, - default="/tmp/test_save", - help="Path to save outputs", - ) - parser.add_argument( - "--config", - type=str, - required=False, - default=None, - help="base64 encode configuration json.", - ) - args = parser.parse_args() - mut_args = types.SimpleNamespace( - model_path=args.model_path, - output_path=args.output_path, - **test_compiler_util.convert_to_dict(args.config), - ) - eval_single_model_with_single_backend(mut_args) From 6c8f081f4c89927395b67c389b839cffd03f77b2 Mon Sep 17 00:00:00 2001 From: roll-away <220250881@seu.edu.cn> Date: Wed, 28 Jan 2026 06:04:49 +0000 Subject: [PATCH 5/6] Refactor code structure of graph_net_bench/torch/util/timing.py --- graph_net_bench/torch/util/timing.py | 73 ++++++++++++++++------------ 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/graph_net_bench/torch/util/timing.py b/graph_net_bench/torch/util/timing.py index 67286e07f..ced60e13d 100644 --- a/graph_net_bench/torch/util/timing.py +++ b/graph_net_bench/torch/util/timing.py @@ -19,40 +19,51 @@ def measure_performance(model_call, args, compiler): if "cuda" in args.device: torch.cuda.empty_cache() - e2e_times, gpu_times = [], [] - for i in range(args.trials): - duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - start_event.record() - model_call() - end_event.record() - compiler.synchronize() - - gpu_time_ms = start_event.elapsed_time(end_event) - e2e_times.append(duration_box.value) - gpu_times.append(gpu_time_ms) - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", - file=sys.stderr, - flush=True, - ) - + e2e_times, gpu_times = run_cuda_benchmark_timer( + model_call, args.trials, compiler + ) stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times) else: - e2e_times = [] - for i in range(args.trials): - duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - model_call() - e2e_times.append(duration_box.value) - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", - file=sys.stderr, - flush=True, - ) + e2e_times = run_non_cuda_benchmark_timer(model_call, args.trials, compiler) stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) return outs, stats + + +def run_cuda_benchmark_timer(model_call, trials, compiler): + e2e_times, gpu_times = [], [] + for i in range(trials): + duration_box = test_compiler_util.DurationBox(-1) + with test_compiler_util.naive_timer(duration_box, compiler.synchronize): + start_event = 
torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + model_call() + end_event.record() + compiler.synchronize() + + gpu_time_ms = start_event.elapsed_time(end_event) + e2e_times.append(duration_box.value) + gpu_times.append(gpu_time_ms) + print( + f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", + file=sys.stderr, + flush=True, + ) + return e2e_times, gpu_times + + +def run_non_cuda_benchmark_timer(model_call, trials, compiler): + e2e_times = [] + for i in range(trials): + duration_box = test_compiler_util.DurationBox(-1) + with test_compiler_util.naive_timer(duration_box, compiler.synchronize): + model_call() + e2e_times.append(duration_box.value) + print( + f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", + file=sys.stderr, + flush=True, + ) + return e2e_times From 1d7522bc7f3232aec7f3914ecdd54b14deac3348 Mon Sep 17 00:00:00 2001 From: roll-away <220250881@seu.edu.cn> Date: Wed, 28 Jan 2026 08:48:23 +0000 Subject: [PATCH 6/6] refactor graph_net_bench/torch/util/timing.py --- graph_net_bench/torch/util/timing.py | 95 +++++++++++++++++----------- 1 file changed, 58 insertions(+), 37 deletions(-) diff --git a/graph_net_bench/torch/util/timing.py b/graph_net_bench/torch/util/timing.py index ced60e13d..e9c42cfa7 100644 --- a/graph_net_bench/torch/util/timing.py +++ b/graph_net_bench/torch/util/timing.py @@ -19,51 +19,72 @@ def measure_performance(model_call, args, compiler): if "cuda" in args.device: torch.cuda.empty_cache() - e2e_times, gpu_times = run_cuda_benchmark_timer( - model_call, args.trials, compiler - ) - stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) - stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times) + executor = CUDATrialExecutor(model_call, compiler) else: - e2e_times = run_non_cuda_benchmark_timer(model_call, args.trials, compiler) - stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) + executor = NoneCUDATrialExecutor(model_call, compiler) + + timings = run_benchmark(args.trials, executor) + + stats = { + name: test_compiler_util.get_timing_stats(values) + for name, values in timings.items() + } return outs, stats -def run_cuda_benchmark_timer(model_call, trials, compiler): - e2e_times, gpu_times = [], [] +def run_benchmark(trials, executor): + results = {} + for i in range(trials): + timings = executor.run_one_trial() + + for k, v in timings.items(): + results.setdefault(k, []).append(v) + + log_trial(i + 1, timings) + + return results + + +def log_trial(idx, timings): + msg = ", ".join(f"{k}={v:.5f} ms" for k, v in timings.items()) + print(f"Trial {idx}: {msg}", file=sys.stderr, flush=True) + + +class BaseTrialExecutor: + def __init__(self, model_call, compiler): + self.model_call = model_call + self.compiler = compiler + + def run_one_trial(self): + raise NotImplementedError + + +class NoneCUDATrialExecutor(BaseTrialExecutor): + def run_one_trial(self): + duration_box = test_compiler_util.DurationBox(-1) + with test_compiler_util.naive_timer(duration_box, self.compiler.synchronize): + self.model_call() + return {"e2e": duration_box.value} + + +class CUDATrialExecutor(BaseTrialExecutor): + def run_one_trial(self): duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + 
with test_compiler_util.naive_timer(duration_box, self.compiler.synchronize): start_event.record() - model_call() + self.model_call() end_event.record() - compiler.synchronize() - - gpu_time_ms = start_event.elapsed_time(end_event) - e2e_times.append(duration_box.value) - gpu_times.append(gpu_time_ms) - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", - file=sys.stderr, - flush=True, - ) - return e2e_times, gpu_times + self.compiler.synchronize() + gpu_time = start_event.elapsed_time(end_event) -def run_non_cuda_benchmark_timer(model_call, trials, compiler): - e2e_times = [] - for i in range(trials): - duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - model_call() - e2e_times.append(duration_box.value) - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", - file=sys.stderr, - flush=True, - ) - return e2e_times + return { + "e2e": duration_box.value, + "gpu": gpu_time, + }
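
For quick local verification of the relocated timing helper, a minimal sketch along the following lines should work. The stub backend, dummy model, example input, and argument namespace below are illustrative assumptions, not part of this series; only measure_performance and its (model_call, args, compiler) signature, the use of args.device/args.warmup/args.trials, and the compiler.synchronize() requirement come from the patches above.

import types
import torch

from graph_net_bench.torch.util.timing import measure_performance


class StubBackend:
    # Stands in for a GraphCompilerBackend; measure_performance only needs synchronize().
    def synchronize(self):
        pass  # nothing to flush on CPU


model = torch.nn.Linear(8, 8)
example_input = torch.randn(4, 8)


def model_call():
    # Zero-argument callable, mirroring the closure built in eval_single_model_with_single_backend.
    return model(example_input)


args = types.SimpleNamespace(device="cpu", warmup=3, trials=5)
outputs, stats = measure_performance(model_call, args, StubBackend())
print(stats["e2e"])  # per-trial end-to-end stats; a "gpu" entry is added only on CUDA devices

On "cpu" this exercises the NoneCUDATrialExecutor path (DurationBox plus naive_timer); pointing args.device at "cuda" instead would route through CUDATrialExecutor and additionally report CUDA-event timings.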