From 2692fc4c272f5712ebb9ad7c117f0d1118753d83 Mon Sep 17 00:00:00 2001 From: roll-away <220250881@seu.edu.cn> Date: Tue, 27 Jan 2026 08:40:53 +0000 Subject: [PATCH 1/6] split eval_backend_perf --- graph_net_bench/torch/eval_backend_diff.py | 2 +- graph_net_bench/torch/util/comparison.py | 221 +++++++++++++++++++++ graph_net_bench/torch/util/timing.py | 74 +++++++ 3 files changed, 296 insertions(+), 1 deletion(-) create mode 100644 graph_net_bench/torch/util/comparison.py create mode 100644 graph_net_bench/torch/util/timing.py diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index cfa171dc6..d2d9cd417 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -9,7 +9,7 @@ import types from graph_net_bench import test_compiler_util from graph_net_bench import path_utils -from .eval_backend_perf import eval_single_model_with_single_backend +from .util.comparison import eval_single_model_with_single_backend def compare_correctness(expected_out, compiled_out, args): diff --git a/graph_net_bench/torch/util/comparison.py b/graph_net_bench/torch/util/comparison.py new file mode 100644 index 000000000..25c93aaca --- /dev/null +++ b/graph_net_bench/torch/util/comparison.py @@ -0,0 +1,221 @@ +from .. import utils +import argparse +import importlib.util +import torch +from pathlib import Path +from typing import Type +import sys +import os +import traceback +import json +import random +import numpy as np +import types +from contextlib import redirect_stdout, redirect_stderr +from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend +from graph_net_bench import test_compiler_util +from .timing import get_hardward_name, get_compiler_version, measure_performance + + +def register_op_lib(op_lib): + if op_lib == "flaggems": + import flag_gems + + flag_gems.enable() + else: + pass + + +def set_seed(random_seed): + random.seed(random_seed) + np.random.seed(random_seed) + torch.manual_seed(random_seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(random_seed) + torch.cuda.manual_seed_all(random_seed) + + +def load_class_from_file( + model_path: str, class_name: str, device: str +) -> Type[torch.nn.Module]: + file_path = f"{model_path}/model.py" + file = Path(file_path).resolve() + module_name = file.stem + + with open(file_path, "r", encoding="utf-8") as f: + model_code = f.read() + model_code = utils.modify_code_by_device(model_code, device) + spec = importlib.util.spec_from_loader(module_name, loader=None) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + compiled_code = compile(model_code, filename=file, mode="exec") + exec(compiled_code, module.__dict__) + + model_class = getattr(module, class_name, None) + setattr(model_class, "__graph_net_file_path__", file_path) + setattr(model_class, "__graph_net_device__", device) + return model_class + + +def get_compiler_backend(args) -> GraphCompilerBackend: + """ + Dynamically load backend class based on args.compiler + """ + compiler_name = args.compiler.lower() + module_name = f"graph_net_bench.torch.backend.{compiler_name}_backend" + + try: + module = __import__(module_name, fromlist=[f"{compiler_name.title()}Backend"]) + + class_name = ( + f"{''.join(part.title() for part in compiler_name.split('_'))}Backend" + ) + + backend_class = None + if hasattr(module, class_name): + backend_class = getattr(module, class_name) + else: + raise ImportError(f"No valid backend class found in 
{module_name}") + + except ImportError as e: + raise ImportError(f"Failed to import backend module for '{compiler_name}': {e}") + + backend_config = ( + test_compiler_util.convert_to_dict(args.backend_config) + if args.backend_config is not None + else {} + ) + return backend_class(backend_config) + + +def get_model(args): + device = "xla" if args.compiler == "xla" else args.device + + # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') + model_class = load_class_from_file( + args.model_path, class_name="GraphModule", device=device + ) + model = model_class().to(torch.device(args.device)) + return model + + +def get_input_dict(args): + inputs_params = utils.load_converted_from_text(f"{args.model_path}") + params = inputs_params["weight_info"] + for tensor_meta in params.values(): + if "device" in tensor_meta["info"]: + tensor_meta["info"]["device"] = args.device + return { + k: utils.replay_tensor(v).to(torch.device(args.device)) + for k, v in params.items() + } + + +def eval_single_model_with_single_backend(args): + check_and_complete_args(args) + set_seed(args.seed) + torch.set_default_device(args.device) + os.makedirs(args.output_path, exist_ok=True) + + log_path = utils.get_log_path(args.output_path, args.model_path) + output_dump_path = utils.get_output_path(args.output_path, args.model_path) + + with open(log_path, "w", encoding="utf-8") as log_f: + with redirect_stdout(log_f), redirect_stderr(log_f): + compiler = get_compiler_backend(args) + input_dict = get_input_dict(args) + model = get_model(args) + model.eval() + + test_compiler_util.print_config( + args, + get_hardward_name(args.device), + get_compiler_version(args.compiler), + ) + + success = False + time_stats = {} + try: + compiled_model = compiler(model) + + def model_call(): + return compiled_model(**input_dict) + + # 运行并测量 + outputs = model_call() + time_stats = measure_performance(model_call, args, compiler) + success = True + except Exception as e: + print( + f"Run model failed: {str(e)}\n{traceback.format_exc()}", + file=sys.stderr, + flush=True, + ) + + test_compiler_util.print_running_status(args, success) + if success: + torch.save(outputs, str(output_dump_path)) + + test_compiler_util.print_with_log_prompt( + "[Performance][eager]:", json.dumps(time_stats), args.log_prompt + ) + + with open(log_path, "r", encoding="utf-8") as f: + print(f.read(), file=sys.stderr, flush=True) + + +def check_and_complete_args(args): + """ + Ensure all required arguments are present with default values if missing + """ + defaults = { + "model_path": None, # Model path + "output_path": None, # Log and output directory + "seed": 123, # Random seed + "compiler": "inductor", # Compiler name + "device": "cuda", # Device for testing the compiler (e.g., 'cpu' or 'cuda') + "op_lib": None, # Operator library + "warmup": 3, # Number of warmup steps + "trials": 5, # Number of timing trials + "log_prompt": "graph-net-bench-log", # Log prompt for performance log filtering + "model_path_prefix": None, # Prefix path to model path in args.model-path + "backend_config": None, # backend configuration json + } + + for key, default in defaults.items(): + if not hasattr(args, key): + setattr(args, key, default) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Single Backend Performance Evaluation" + ) + parser.add_argument( + "--model-path", + type=str, + required=False, + default=None, + help="Path to model file(s), each subdirectory containing graph_net.json will be 
regarded as a model", + ) + parser.add_argument( + "--output-path", + type=str, + required=False, + default="/tmp/test_save", + help="Path to save outputs", + ) + parser.add_argument( + "--config", + type=str, + required=False, + default=None, + help="base64 encode configuration json.", + ) + args = parser.parse_args() + mut_args = types.SimpleNamespace( + model_path=args.model_path, + output_path=args.output_path, + **test_compiler_util.convert_to_dict(args.config), + ) + eval_single_model_with_single_backend(mut_args) diff --git a/graph_net_bench/torch/util/timing.py b/graph_net_bench/torch/util/timing.py new file mode 100644 index 000000000..fd1940cc6 --- /dev/null +++ b/graph_net_bench/torch/util/timing.py @@ -0,0 +1,74 @@ +import torch +import platform +import sys +from graph_net_bench import test_compiler_util + + +def get_hardward_name(device): + hardware_name = "unknown" + if "cuda" in device: + hardware_name = torch.cuda.get_device_name(device) + elif device == "cpu": + hardware_name = platform.processor() + return hardware_name + + +def get_compiler_version(compiler): + if compiler in ["inductor", "nope", "unstable_to_stable"]: + return torch.__version__ + # 兼容处理具有 version 属性的对象或字符串 + return getattr(compiler, "version", "unknown") + + +def measure_performance(model_call, args, compiler): + stats = {} + # 预热 + for _ in range(args.warmup): + model_call() + compiler.synchronize() + + print( + f"[Profiling] Warm up {args.warmup}, Trials {args.trials}", + file=sys.stderr, + flush=True, + ) + + if "cuda" in args.device: + torch.cuda.empty_cache() + e2e_times, gpu_times = [], [] + for i in range(args.trials): + duration_box = test_compiler_util.DurationBox(-1) + with test_compiler_util.naive_timer(duration_box, compiler.synchronize): + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + model_call() + end_event.record() + compiler.synchronize() + + gpu_time_ms = start_event.elapsed_time(end_event) + e2e_times.append(duration_box.value) + gpu_times.append(gpu_time_ms) + print( + f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", + file=sys.stderr, + flush=True, + ) + + stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) + stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times) + else: + e2e_times = [] + for i in range(args.trials): + duration_box = test_compiler_util.DurationBox(-1) + with test_compiler_util.naive_timer(duration_box, compiler.synchronize): + model_call() + e2e_times.append(duration_box.value) + print( + f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", + file=sys.stderr, + flush=True, + ) + stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) + + return stats From 49ea3b7b889359b24647977c000b1c05a772a6fe Mon Sep 17 00:00:00 2001 From: roll-away <220250881@seu.edu.cn> Date: Tue, 27 Jan 2026 08:58:35 +0000 Subject: [PATCH 2/6] split eval_backend_perf --- graph_net_bench/torch/util/comparison.py | 7 +++---- graph_net_bench/torch/util/timing.py | 11 +++++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/graph_net_bench/torch/util/comparison.py b/graph_net_bench/torch/util/comparison.py index 25c93aaca..7be2dac95 100644 --- a/graph_net_bench/torch/util/comparison.py +++ b/graph_net_bench/torch/util/comparison.py @@ -141,9 +141,7 @@ def eval_single_model_with_single_backend(args): def model_call(): return compiled_model(**input_dict) - # 运行并测量 - outputs = model_call() - time_stats = measure_performance(model_call, args, compiler) + 
outputs, time_stats = measure_performance(model_call, args, compiler) success = True except Exception as e: print( @@ -161,7 +159,8 @@ def model_call(): ) with open(log_path, "r", encoding="utf-8") as f: - print(f.read(), file=sys.stderr, flush=True) + content = f.read() + print(content, file=sys.stderr, flush=True) def check_and_complete_args(args): diff --git a/graph_net_bench/torch/util/timing.py b/graph_net_bench/torch/util/timing.py index fd1940cc6..8c98c467a 100644 --- a/graph_net_bench/torch/util/timing.py +++ b/graph_net_bench/torch/util/timing.py @@ -16,13 +16,16 @@ def get_hardward_name(device): def get_compiler_version(compiler): if compiler in ["inductor", "nope", "unstable_to_stable"]: return torch.__version__ - # 兼容处理具有 version 属性的对象或字符串 - return getattr(compiler, "version", "unknown") + elif compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: + # Assuming compiler object has a version attribute + return f"{compiler.capitalize()} {compiler.version}" + return "unknown" def measure_performance(model_call, args, compiler): stats = {} - # 预热 + outs = model_call() + # Warmup runs for _ in range(args.warmup): model_call() compiler.synchronize() @@ -71,4 +74,4 @@ def measure_performance(model_call, args, compiler): ) stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) - return stats + return outs, stats From dc5336da526e8c87092078e1b496708e5ad45508 Mon Sep 17 00:00:00 2001 From: roll-away <220250881@seu.edu.cn> Date: Wed, 28 Jan 2026 02:08:22 +0000 Subject: [PATCH 3/6] move some features from eval_backend_diff.py into timing.py --- graph_net_bench/torch/eval_backend_diff.py | 2 +- .../{comparison.py => eval_backend_perf.py} | 21 ++++++++++++++++++- graph_net_bench/torch/util/timing.py | 19 ----------------- 3 files changed, 21 insertions(+), 21 deletions(-) rename graph_net_bench/torch/util/{comparison.py => eval_backend_perf.py} (91%) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index d2d9cd417..49780971a 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -9,7 +9,7 @@ import types from graph_net_bench import test_compiler_util from graph_net_bench import path_utils -from .util.comparison import eval_single_model_with_single_backend +from .util.eval_backend_perf import eval_single_model_with_single_backend def compare_correctness(expected_out, compiled_out, args): diff --git a/graph_net_bench/torch/util/comparison.py b/graph_net_bench/torch/util/eval_backend_perf.py similarity index 91% rename from graph_net_bench/torch/util/comparison.py rename to graph_net_bench/torch/util/eval_backend_perf.py index 7be2dac95..8aa64f867 100644 --- a/graph_net_bench/torch/util/comparison.py +++ b/graph_net_bench/torch/util/eval_backend_perf.py @@ -10,11 +10,12 @@ import json import random import numpy as np +import platform import types from contextlib import redirect_stdout, redirect_stderr from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend from graph_net_bench import test_compiler_util -from .timing import get_hardward_name, get_compiler_version, measure_performance +from .timing import measure_performance def register_op_lib(op_lib): @@ -35,6 +36,24 @@ def set_seed(random_seed): torch.cuda.manual_seed_all(random_seed) +def get_hardward_name(device): + hardware_name = "unknown" + if "cuda" in device: + hardware_name = torch.cuda.get_device_name(device) + elif device == "cpu": + hardware_name = platform.processor() + return hardware_name + + 
+def get_compiler_version(compiler): + if compiler in ["inductor", "nope", "unstable_to_stable"]: + return torch.__version__ + elif compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: + # Assuming compiler object has a version attribute + return f"{compiler.capitalize()} {compiler.version}" + return "unknown" + + def load_class_from_file( model_path: str, class_name: str, device: str ) -> Type[torch.nn.Module]: diff --git a/graph_net_bench/torch/util/timing.py b/graph_net_bench/torch/util/timing.py index 8c98c467a..67286e07f 100644 --- a/graph_net_bench/torch/util/timing.py +++ b/graph_net_bench/torch/util/timing.py @@ -1,27 +1,8 @@ import torch -import platform import sys from graph_net_bench import test_compiler_util -def get_hardward_name(device): - hardware_name = "unknown" - if "cuda" in device: - hardware_name = torch.cuda.get_device_name(device) - elif device == "cpu": - hardware_name = platform.processor() - return hardware_name - - -def get_compiler_version(compiler): - if compiler in ["inductor", "nope", "unstable_to_stable"]: - return torch.__version__ - elif compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: - # Assuming compiler object has a version attribute - return f"{compiler.capitalize()} {compiler.version}" - return "unknown" - - def measure_performance(model_call, args, compiler): stats = {} outs = model_call() From 61dd8e077772d3480ccb044b89e8b48b2a0f3658 Mon Sep 17 00:00:00 2001 From: roll-away <220250881@seu.edu.cn> Date: Wed, 28 Jan 2026 05:15:07 +0000 Subject: [PATCH 4/6] pull some features from graph_net_bench/torch/eval_backend_perf.py to graph_net_bench/torch/util/timing.py --- graph_net_bench/torch/eval_backend_diff.py | 2 +- graph_net_bench/torch/eval_backend_perf.py | 64 +---- .../torch/util/eval_backend_perf.py | 239 ------------------ 3 files changed, 2 insertions(+), 303 deletions(-) delete mode 100644 graph_net_bench/torch/util/eval_backend_perf.py diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index 49780971a..cfa171dc6 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -9,7 +9,7 @@ import types from graph_net_bench import test_compiler_util from graph_net_bench import path_utils -from .util.eval_backend_perf import eval_single_model_with_single_backend +from .eval_backend_perf import eval_single_model_with_single_backend def compare_correctness(expected_out, compiled_out, args): diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index 3fd6db3ff..b22e322da 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -15,6 +15,7 @@ from contextlib import redirect_stdout, redirect_stderr from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend from graph_net_bench import test_compiler_util +from .util.timing import measure_performance def register_op_lib(op_lib): @@ -129,69 +130,6 @@ def get_input_dict(args): } -def measure_performance(model_call, args, compiler): - stats = {} - outs = model_call() - - # Warmup runs - for _ in range(args.warmup): - model_call() - compiler.synchronize() - - print( - f"[Profiling] Warm up {args.warmup}, Trials {args.trials}", - file=sys.stderr, - flush=True, - ) - - if "cuda" in args.device: - torch.cuda.empty_cache() - e2e_times = [] - gpu_times = [] - - for i in range(args.trials): - # End-to-end timing (naive_timer) - duration_box = test_compiler_util.DurationBox(-1) - with 
test_compiler_util.naive_timer(duration_box, compiler.synchronize): - # GPU-only timing (CUDA Events) - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - start_event.record() - - model_call() - - end_event.record() - compiler.synchronize() - - gpu_time_ms = start_event.elapsed_time(end_event) - e2e_times.append(duration_box.value) - gpu_times.append(gpu_time_ms) - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", - file=sys.stderr, - flush=True, - ) - - stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) - stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times) - - else: # CPU or other devices - e2e_times = [] - for i in range(args.trials): - duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - model_call() - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", - file=sys.stderr, - flush=True, - ) - e2e_times.append(duration_box.value) - stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) - - return outs, stats - - def eval_single_model_with_single_backend(args): check_and_complete_args(args) set_seed(args.seed) diff --git a/graph_net_bench/torch/util/eval_backend_perf.py b/graph_net_bench/torch/util/eval_backend_perf.py deleted file mode 100644 index 8aa64f867..000000000 --- a/graph_net_bench/torch/util/eval_backend_perf.py +++ /dev/null @@ -1,239 +0,0 @@ -from .. import utils -import argparse -import importlib.util -import torch -from pathlib import Path -from typing import Type -import sys -import os -import traceback -import json -import random -import numpy as np -import platform -import types -from contextlib import redirect_stdout, redirect_stderr -from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend -from graph_net_bench import test_compiler_util -from .timing import measure_performance - - -def register_op_lib(op_lib): - if op_lib == "flaggems": - import flag_gems - - flag_gems.enable() - else: - pass - - -def set_seed(random_seed): - random.seed(random_seed) - np.random.seed(random_seed) - torch.manual_seed(random_seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed(random_seed) - torch.cuda.manual_seed_all(random_seed) - - -def get_hardward_name(device): - hardware_name = "unknown" - if "cuda" in device: - hardware_name = torch.cuda.get_device_name(device) - elif device == "cpu": - hardware_name = platform.processor() - return hardware_name - - -def get_compiler_version(compiler): - if compiler in ["inductor", "nope", "unstable_to_stable"]: - return torch.__version__ - elif compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: - # Assuming compiler object has a version attribute - return f"{compiler.capitalize()} {compiler.version}" - return "unknown" - - -def load_class_from_file( - model_path: str, class_name: str, device: str -) -> Type[torch.nn.Module]: - file_path = f"{model_path}/model.py" - file = Path(file_path).resolve() - module_name = file.stem - - with open(file_path, "r", encoding="utf-8") as f: - model_code = f.read() - model_code = utils.modify_code_by_device(model_code, device) - spec = importlib.util.spec_from_loader(module_name, loader=None) - module = importlib.util.module_from_spec(spec) - sys.modules[module_name] = module - compiled_code = compile(model_code, filename=file, mode="exec") - exec(compiled_code, module.__dict__) - - model_class = getattr(module, class_name, None) - setattr(model_class, 
"__graph_net_file_path__", file_path) - setattr(model_class, "__graph_net_device__", device) - return model_class - - -def get_compiler_backend(args) -> GraphCompilerBackend: - """ - Dynamically load backend class based on args.compiler - """ - compiler_name = args.compiler.lower() - module_name = f"graph_net_bench.torch.backend.{compiler_name}_backend" - - try: - module = __import__(module_name, fromlist=[f"{compiler_name.title()}Backend"]) - - class_name = ( - f"{''.join(part.title() for part in compiler_name.split('_'))}Backend" - ) - - backend_class = None - if hasattr(module, class_name): - backend_class = getattr(module, class_name) - else: - raise ImportError(f"No valid backend class found in {module_name}") - - except ImportError as e: - raise ImportError(f"Failed to import backend module for '{compiler_name}': {e}") - - backend_config = ( - test_compiler_util.convert_to_dict(args.backend_config) - if args.backend_config is not None - else {} - ) - return backend_class(backend_config) - - -def get_model(args): - device = "xla" if args.compiler == "xla" else args.device - - # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') - model_class = load_class_from_file( - args.model_path, class_name="GraphModule", device=device - ) - model = model_class().to(torch.device(args.device)) - return model - - -def get_input_dict(args): - inputs_params = utils.load_converted_from_text(f"{args.model_path}") - params = inputs_params["weight_info"] - for tensor_meta in params.values(): - if "device" in tensor_meta["info"]: - tensor_meta["info"]["device"] = args.device - return { - k: utils.replay_tensor(v).to(torch.device(args.device)) - for k, v in params.items() - } - - -def eval_single_model_with_single_backend(args): - check_and_complete_args(args) - set_seed(args.seed) - torch.set_default_device(args.device) - os.makedirs(args.output_path, exist_ok=True) - - log_path = utils.get_log_path(args.output_path, args.model_path) - output_dump_path = utils.get_output_path(args.output_path, args.model_path) - - with open(log_path, "w", encoding="utf-8") as log_f: - with redirect_stdout(log_f), redirect_stderr(log_f): - compiler = get_compiler_backend(args) - input_dict = get_input_dict(args) - model = get_model(args) - model.eval() - - test_compiler_util.print_config( - args, - get_hardward_name(args.device), - get_compiler_version(args.compiler), - ) - - success = False - time_stats = {} - try: - compiled_model = compiler(model) - - def model_call(): - return compiled_model(**input_dict) - - outputs, time_stats = measure_performance(model_call, args, compiler) - success = True - except Exception as e: - print( - f"Run model failed: {str(e)}\n{traceback.format_exc()}", - file=sys.stderr, - flush=True, - ) - - test_compiler_util.print_running_status(args, success) - if success: - torch.save(outputs, str(output_dump_path)) - - test_compiler_util.print_with_log_prompt( - "[Performance][eager]:", json.dumps(time_stats), args.log_prompt - ) - - with open(log_path, "r", encoding="utf-8") as f: - content = f.read() - print(content, file=sys.stderr, flush=True) - - -def check_and_complete_args(args): - """ - Ensure all required arguments are present with default values if missing - """ - defaults = { - "model_path": None, # Model path - "output_path": None, # Log and output directory - "seed": 123, # Random seed - "compiler": "inductor", # Compiler name - "device": "cuda", # Device for testing the compiler (e.g., 'cpu' or 'cuda') - "op_lib": None, # Operator 
library - "warmup": 3, # Number of warmup steps - "trials": 5, # Number of timing trials - "log_prompt": "graph-net-bench-log", # Log prompt for performance log filtering - "model_path_prefix": None, # Prefix path to model path in args.model-path - "backend_config": None, # backend configuration json - } - - for key, default in defaults.items(): - if not hasattr(args, key): - setattr(args, key, default) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Single Backend Performance Evaluation" - ) - parser.add_argument( - "--model-path", - type=str, - required=False, - default=None, - help="Path to model file(s), each subdirectory containing graph_net.json will be regarded as a model", - ) - parser.add_argument( - "--output-path", - type=str, - required=False, - default="/tmp/test_save", - help="Path to save outputs", - ) - parser.add_argument( - "--config", - type=str, - required=False, - default=None, - help="base64 encode configuration json.", - ) - args = parser.parse_args() - mut_args = types.SimpleNamespace( - model_path=args.model_path, - output_path=args.output_path, - **test_compiler_util.convert_to_dict(args.config), - ) - eval_single_model_with_single_backend(mut_args) From 6c8f081f4c89927395b67c389b839cffd03f77b2 Mon Sep 17 00:00:00 2001 From: roll-away <220250881@seu.edu.cn> Date: Wed, 28 Jan 2026 06:04:49 +0000 Subject: [PATCH 5/6] Refactor code structure of graph_net_bench/torch/util/timing.py --- graph_net_bench/torch/util/timing.py | 73 ++++++++++++++++------------ 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/graph_net_bench/torch/util/timing.py b/graph_net_bench/torch/util/timing.py index 67286e07f..ced60e13d 100644 --- a/graph_net_bench/torch/util/timing.py +++ b/graph_net_bench/torch/util/timing.py @@ -19,40 +19,51 @@ def measure_performance(model_call, args, compiler): if "cuda" in args.device: torch.cuda.empty_cache() - e2e_times, gpu_times = [], [] - for i in range(args.trials): - duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - start_event.record() - model_call() - end_event.record() - compiler.synchronize() - - gpu_time_ms = start_event.elapsed_time(end_event) - e2e_times.append(duration_box.value) - gpu_times.append(gpu_time_ms) - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", - file=sys.stderr, - flush=True, - ) - + e2e_times, gpu_times = run_cuda_benchmark_timer( + model_call, args.trials, compiler + ) stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times) else: - e2e_times = [] - for i in range(args.trials): - duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - model_call() - e2e_times.append(duration_box.value) - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", - file=sys.stderr, - flush=True, - ) + e2e_times = run_non_cuda_benchmark_timer(model_call, args.trials, compiler) stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) return outs, stats + + +def run_cuda_benchmark_timer(model_call, trials, compiler): + e2e_times, gpu_times = [], [] + for i in range(trials): + duration_box = test_compiler_util.DurationBox(-1) + with test_compiler_util.naive_timer(duration_box, compiler.synchronize): + start_event = 
torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + model_call() + end_event.record() + compiler.synchronize() + + gpu_time_ms = start_event.elapsed_time(end_event) + e2e_times.append(duration_box.value) + gpu_times.append(gpu_time_ms) + print( + f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", + file=sys.stderr, + flush=True, + ) + return e2e_times, gpu_times + + +def run_non_cuda_benchmark_timer(model_call, trials, compiler): + e2e_times = [] + for i in range(trials): + duration_box = test_compiler_util.DurationBox(-1) + with test_compiler_util.naive_timer(duration_box, compiler.synchronize): + model_call() + e2e_times.append(duration_box.value) + print( + f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", + file=sys.stderr, + flush=True, + ) + return e2e_times From 1d7522bc7f3232aec7f3914ecdd54b14deac3348 Mon Sep 17 00:00:00 2001 From: roll-away <220250881@seu.edu.cn> Date: Wed, 28 Jan 2026 08:48:23 +0000 Subject: [PATCH 6/6] refactor graph_net_bench/torch/util/timing.py --- graph_net_bench/torch/util/timing.py | 95 +++++++++++++++++----------- 1 file changed, 58 insertions(+), 37 deletions(-) diff --git a/graph_net_bench/torch/util/timing.py b/graph_net_bench/torch/util/timing.py index ced60e13d..e9c42cfa7 100644 --- a/graph_net_bench/torch/util/timing.py +++ b/graph_net_bench/torch/util/timing.py @@ -19,51 +19,72 @@ def measure_performance(model_call, args, compiler): if "cuda" in args.device: torch.cuda.empty_cache() - e2e_times, gpu_times = run_cuda_benchmark_timer( - model_call, args.trials, compiler - ) - stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) - stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times) + executor = CUDATrialExecutor(model_call, compiler) else: - e2e_times = run_non_cuda_benchmark_timer(model_call, args.trials, compiler) - stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) + executor = NoneCUDATrialExecutor(model_call, compiler) + + timings = run_benchmark(args.trials, executor) + + stats = { + name: test_compiler_util.get_timing_stats(values) + for name, values in timings.items() + } return outs, stats -def run_cuda_benchmark_timer(model_call, trials, compiler): - e2e_times, gpu_times = [], [] +def run_benchmark(trials, executor): + results = {} + for i in range(trials): + timings = executor.run_one_trial() + + for k, v in timings.items(): + results.setdefault(k, []).append(v) + + log_trial(i + 1, timings) + + return results + + +def log_trial(idx, timings): + msg = ", ".join(f"{k}={v:.5f} ms" for k, v in timings.items()) + print(f"Trial {idx}: {msg}", file=sys.stderr, flush=True) + + +class BaseTrialExecutor: + def __init__(self, model_call, compiler): + self.model_call = model_call + self.compiler = compiler + + def run_one_trial(self): + raise NotImplementedError + + +class NoneCUDATrialExecutor(BaseTrialExecutor): + def run_one_trial(self): + duration_box = test_compiler_util.DurationBox(-1) + with test_compiler_util.naive_timer(duration_box, self.compiler.synchronize): + self.model_call() + return {"e2e": duration_box.value} + + +class CUDATrialExecutor(BaseTrialExecutor): + def run_one_trial(self): duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + 
with test_compiler_util.naive_timer(duration_box, self.compiler.synchronize): start_event.record() - model_call() + self.model_call() end_event.record() - compiler.synchronize() - - gpu_time_ms = start_event.elapsed_time(end_event) - e2e_times.append(duration_box.value) - gpu_times.append(gpu_time_ms) - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", - file=sys.stderr, - flush=True, - ) - return e2e_times, gpu_times + self.compiler.synchronize() + gpu_time = start_event.elapsed_time(end_event) -def run_non_cuda_benchmark_timer(model_call, trials, compiler): - e2e_times = [] - for i in range(trials): - duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - model_call() - e2e_times.append(duration_box.value) - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", - file=sys.stderr, - flush=True, - ) - return e2e_times + return { + "e2e": duration_box.value, + "gpu": gpu_time, + }
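
For quick local verification of the relocated timing helper, a minimal sketch along the following lines should work. The stub backend, dummy model, example input, and argument namespace below are illustrative assumptions, not part of this series; only measure_performance and its (model_call, args, compiler) signature, the use of args.device/args.warmup/args.trials, and the compiler.synchronize() requirement come from the patches above.

import types
import torch

from graph_net_bench.torch.util.timing import measure_performance


class StubBackend:
    # Stands in for a GraphCompilerBackend; measure_performance only needs synchronize().
    def synchronize(self):
        pass  # nothing to flush on CPU


model = torch.nn.Linear(8, 8)
example_input = torch.randn(4, 8)


def model_call():
    # Zero-argument callable, mirroring the closure built in eval_single_model_with_single_backend.
    return model(example_input)


args = types.SimpleNamespace(device="cpu", warmup=3, trials=5)
outputs, stats = measure_performance(model_call, args, StubBackend())
print(stats["e2e"])  # per-trial end-to-end stats; a "gpu" entry is added only on CUDA devices

On "cpu" this exercises the NoneCUDATrialExecutor path (DurationBox plus naive_timer); pointing args.device at "cuda" instead would route through CUDATrialExecutor and additionally report CUDA-event timings.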