From 5e0ee6989b360bb99be648cee41be15421338f97 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Thu, 15 Jan 2026 22:09:33 +0800 Subject: [PATCH 01/20] add eval_backend_perf --- graph_net_bench/torch/eval_backend_diff.py | 419 +++++---------------- graph_net_bench/torch/eval_backend_perf.py | 337 +++++++++++++++++ graph_net_bench/torch/utils.py | 11 + test/eval_backend_diff_test.sh | 13 +- 4 files changed, 458 insertions(+), 322 deletions(-) create mode 100644 graph_net_bench/torch/eval_backend_perf.py diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index 8488b71b7..07a19ff88 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -1,103 +1,18 @@ from . import utils import subprocess import argparse -import importlib.util import torch -from pathlib import Path -from typing import Type import sys import os import os.path import traceback import json -import random -import numpy as np -import platform import base64 -from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend -from graph_net_bench.torch.backend.tvm_backend import TvmBackend -from graph_net_bench.torch.backend.xla_backend import XlaBackend -from graph_net_bench.torch.backend.inductor_backend import InductorBackend -from graph_net_bench.torch.backend.tensorrt_backend import TensorRTBackend -from graph_net_bench.torch.backend.blade_disc_backend import BladeDISCBackend -from graph_net_bench.torch.backend.nope_backend import NopeBackend -from graph_net_bench.torch.backend.pass_mgr_backend import PassMgrBackend -from graph_net_bench.torch.backend.unstable_to_stable_backend import ( - UnstableToStableBackend, -) -from graph_net_bench.torch.backend.range_decomposer_validator_backend import ( - RangeDecomposerValidatorBackend, -) -from graph_net_bench.torch.backend.graph_variable_renamer_validator_backend import ( - GraphVariableRenamerValidatorBackend, -) +import types from graph_net_bench import test_compiler_util from graph_net_bench import path_utils -compiler_backend_name2class = { - "tvm": TvmBackend, - "xla": XlaBackend, - "inductor": InductorBackend, - "tensorrt": TensorRTBackend, - "bladedisc": BladeDISCBackend, - "nope": NopeBackend, - "pass_mgr": PassMgrBackend, - "unstable_to_stable": UnstableToStableBackend, - "range_decomposer_validator": RangeDecomposerValidatorBackend, - "graph_variable_renamer_validator": GraphVariableRenamerValidatorBackend, -} - - -def set_seed(random_seed): - random.seed(random_seed) - np.random.seed(random_seed) - torch.manual_seed(random_seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed(random_seed) - torch.cuda.manual_seed_all(random_seed) - - -def get_hardward_name(args): - hardware_name = "unknown" - if "cuda" in args.device: - hardware_name = torch.cuda.get_device_name(args.device) - elif args.device == "cpu": - hardware_name = platform.processor() - return hardware_name - - -def get_compile_framework_version(args): - if args.compiler in ["inductor", "nope", "unstable_to_stable"]: - return torch.__version__ - elif args.compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: - # Assuming compiler object has a version attribute - return f"{args.compiler.capitalize()} {args.compiler.version}" - return "unknown" - - -def load_class_from_file( - args: argparse.Namespace, class_name: str, device: str -) -> Type[torch.nn.Module]: - file_path = f"{args.model_path}/model.py" - file = Path(file_path).resolve() - module_name = file.stem - - with 
open(file_path, "r", encoding="utf-8") as f: - model_code = f.read() - model_code = utils.modify_code_by_device(model_code, device) - spec = importlib.util.spec_from_loader(module_name, loader=None) - module = importlib.util.module_from_spec(spec) - sys.modules[module_name] = module - compiled_code = compile(model_code, filename=file, mode="exec") - exec(compiled_code, module.__dict__) - - model_class = getattr(module, class_name, None) - setattr(model_class, "__graph_net_file_path__", file_path) - setattr(model_class, "__graph_net_device__", device) - return model_class - - def convert_to_dict(config_str): if config_str in {None, "", "null", "None"}: return {} @@ -107,203 +22,6 @@ def convert_to_dict(config_str): return config -def get_compiler_backend(args) -> GraphCompilerBackend: - assert ( - args.compiler in compiler_backend_name2class - ), f"Unknown compiler: {args.compiler}" - backend_class = compiler_backend_name2class[args.compiler] - return backend_class(args.backend_config) - - -def get_model(args): - device = "xla" if args.compiler == "xla" else args.device - - # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') - model_class = load_class_from_file(args, class_name="GraphModule", device=device) - model = model_class().to(torch.device(args.device)) - return model - - -def get_input_dict(args): - inputs_params = utils.load_converted_from_text(f"{args.model_path}") - params = inputs_params["weight_info"] - for tensor_meta in params.values(): - if "device" in tensor_meta["info"]: - tensor_meta["info"]["device"] = args.device - return { - k: utils.replay_tensor(v).to(torch.device(args.device)) - for k, v in params.items() - } - - -def measure_performance(model_call, args, compiler): - stats = {} - outs = model_call() - - # Warmup runs - for _ in range(args.warmup): - model_call() - compiler.synchronize() - - hardware_name = get_hardward_name(args) - print( - f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {args.trials}", - file=sys.stderr, - flush=True, - ) - - if "cuda" in args.device: - """ - Acknowledgement: We evaluate the performance on both end-to-end and GPU-only timings, - With reference to methods only based on CUDA events from KernelBench in https://github.com/ScalingIntelligence/KernelBench - """ - - e2e_times = [] - gpu_times = [] - - for i in range(args.trials): - # End-to-end timing (naive_timer) - duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - # GPU-only timing (CUDA Events) - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - start_event.record() - - model_call() - - end_event.record() - compiler.synchronize() - - gpu_time_ms = start_event.elapsed_time(end_event) - e2e_times.append(duration_box.value) - gpu_times.append(gpu_time_ms) - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", - file=sys.stderr, - flush=True, - ) - - stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) - stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times) - - else: # CPU or other devices - e2e_times = [] - for i in range(args.trials): - duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - model_call() - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", - file=sys.stderr, - flush=True, - ) - e2e_times.append(duration_box.value) - 
stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) - - return outs, stats - - -def test_single_model(args): - compiler = get_compiler_backend(args) - input_dict = get_input_dict(args) - model = get_model(args) - model_path = os.path.normpath(args.model_path) - test_compiler_util.print_with_log_prompt( - "[Processing]", model_path, args.log_prompt - ) - test_compiler_util.print_basic_config( - args, get_hardward_name(args), get_compile_framework_version(args) - ) - - runtime_seed = 1024 - eager_failure = False - expected_out = None - eager_time_stats = {} - - try: - - def eager_model_call(): - return model(**input_dict) - - expected_out, eager_time_stats = measure_performance( - eager_model_call, args, compiler - ) - - torch.manual_seed(runtime_seed) - if not isinstance(expected_out, tuple): - expected_out = (expected_out,) - except (TypeError, RuntimeError) as e: - print(f"Eager model execution failed: {str(e)}", file=sys.stderr) - eager_failure = True - - compiled_failure = False - compiled_model = None - compiled_time_stats = {} - - try: - compiled_model = compiler(model) - torch.manual_seed(runtime_seed) - - def compiled_model_call(): - return compiled_model(**input_dict) - - compiled_out, compiled_time_stats = measure_performance( - compiled_model_call, args, compiler - ) - - if not isinstance(compiled_out, tuple): - compiled_out = (compiled_out,) - if args.compiler == "xla": - compiled_out = tuple(item.to("cpu").to("cuda") for item in compiled_out) - except (TypeError, RuntimeError) as e: - print(f"Compiled model execution failed: {str(e)}", file=sys.stderr) - compiled_failure = True - print("\n--- Full Traceback ---") - traceback.print_exc() - print(f"debug-model-execution {type(e).__name__} {args.model_path}", flush=True) - except Exception as e: - compiled_failure = True - print("\n--- Full Traceback ---") - traceback.print_exc() - print(f"debug-model-execution {type(e).__name__} {args.model_path}", flush=True) - - if eager_failure: - print(f"{args.log_prompt} [Result] status: failed", file=sys.stderr, flush=True) - print( - f"{args.log_prompt} [Fail due to eager model execution error.]", - file=sys.stderr, - flush=True, - ) - elif compiled_failure: - print(f"{args.log_prompt} [Result] status: failed", file=sys.stderr, flush=True) - print( - f"{args.log_prompt} [Fail due to compiled model execution error.]", - file=sys.stderr, - flush=True, - ) - else: - compare_correctness(expected_out, compiled_out, args) - - print( - f"{args.log_prompt} [Result] status: success", file=sys.stderr, flush=True - ) - - test_compiler_util.print_times_and_speedup( - args, eager_time_stats, compiled_time_stats - ) - - -def print_and_store_cmp(key, cmp_func, args, expected_out, compiled_out, **kwargs): - cmp_ret = cmp_func(expected_out, compiled_out, **kwargs) - print( - f"{args.log_prompt} [Correctness]{key}: {cmp_ret}", - file=sys.stderr, - flush=True, - ) - return cmp_ret - - def compare_correctness(expected_out, compiled_out, args): eager_dtypes = [ ( @@ -386,13 +104,24 @@ def get_cmp_diff_count(expected_out, compiled_out, atol, rtol): return " ".join(results) -def get_sample_root(args): - return args.model_path_prefix +def parse_time_stats_from_reference_log(log_path): + assert os.path.isfile( + log_path + ), f"{log_path} does not exist or is not a regular file." 
+ + with open(log_path, "r", encoding="utf-8") as f: + lines = f.readlines() + for line in reversed(lines): + if "[Performance][eager]" in line: + start = line.find("{") + end = line.rfind("}") + time_stats = json.loads(line[start : end + 1]) + return time_stats -def test_multi_models(args): +def eval_multi_models(args, model_path_prefix): test_samples = test_compiler_util.get_allow_samples( - args.model_path_list, get_sample_root(args) + args.model_path_list, model_path_prefix ) sample_idx = 0 @@ -435,15 +164,15 @@ def test_multi_models(args): print(f"- {model_path}", file=sys.stderr, flush=True) -def test_multi_models_with_prefix(args): - assert os.path.isdir(args.model_path_prefix) +def eval_multi_models_with_prefix(args, model_path_prefix): + assert os.path.isdir(model_path_prefix) assert os.path.isfile(args.model_path_list) test_samples = test_compiler_util.get_allow_samples( - args.model_path_list, get_sample_root(args) + args.model_path_list, model_path_prefix ) py_module_name = os.path.splitext(os.path.basename(__file__))[0] for rel_model_path in test_samples: - model_path = os.path.join(args.model_path_prefix, rel_model_path) + model_path = os.path.join(model_path_prefix, rel_model_path) if not os.path.exists(model_path): continue if not os.path.exists(os.path.join(model_path, "model.py")): @@ -467,39 +196,92 @@ def test_multi_models_with_prefix(args): traceback.print_exc() +def compare_perf_diff(args, model_path, ref_dir, target_dir): + # A + ref_dump_path = utils.get_output_path(ref_dir, model_path) + ref_out = torch.load(str(ref_dump_path)) + + ref_log_path = utils.get_log_path(ref_dir, model_path) + ref_time_stats = parse_time_stats_from_reference_log(ref_log_path) + + # B + target_dump_path = utils.get_output_path(target_dir, model_path) + target_out = torch.load(str(target_dump_path)) + + target_log_path = utils.get_log_path(target_dir, model_path) + target_time_stats = parse_time_stats_from_reference_log(target_log_path) + + compare_correctness(ref_out, target_out, args) + + test_compiler_util.print_times_and_speedup(args, ref_time_stats, target_time_stats) + + +def eval_single_model(args): + ref_dir = "/tmp/eval_perf_diff/A" + target_dir = "/tmp/eval_perf_diff/B" + + EvalCfg = types.SimpleNamespace( + ref_env=types.SimpleNamespace(**convert_to_dict(args.config)["ref_env"]), + target_env=types.SimpleNamespace(**convert_to_dict(args.config)["target_env"]), + ) + + ref_args = build_sub_args(EvalCfg.ref_env) + target_args = build_sub_args(EvalCfg.target_env) + + run_sub_process(ref_args, args.model_path, ref_dir) + run_sub_process(target_args, args.model_path, target_dir) + compare_perf_diff(ref_args, args.model_path, ref_dir, target_dir) + + +def run_sub_process(env_args, model_path, output_path): + cmd = [sys.executable, "-m", "graph_net_bench.torch.eval_backend_perf"] + args_pairs = [ + ("--model-path", model_path), + ("--output-path", output_path), + ("--seed", str(env_args.seed)), + ("--compiler", env_args.compiler), + ("--device", env_args.device), + ("--op-lib", env_args.op_lib), + ("--warmup", str(env_args.warmup)), + ("--trials", str(env_args.trials)), + ("--log-prompt", env_args.log_prompt), + ("--model-path-prefix", env_args.model_path_prefix), + ("--config", env_args.backend_config), + ] + + for arg_name, arg_value in args_pairs: + if arg_value is not None: + cmd.extend([arg_name, arg_value]) + + subprocess.run(cmd, check=True) + + +def build_sub_args(env_ns: types.SimpleNamespace) -> argparse.Namespace: + sub = argparse.Namespace() + sub.seed = getattr(env_ns, 
"seed", 123) + sub.compiler = getattr(env_ns, "compiler", None) + sub.device = getattr(env_ns, "device", None) + sub.op_lib = getattr(env_ns, "op_lib", None) + sub.warmup = getattr(env_ns, "warmup", 3) + sub.trials = getattr(env_ns, "trials", 5) + sub.log_prompt = getattr(env_ns, "log_prompt", None) + sub.model_path_prefix = getattr(env_ns, "model_path_prefix", None) + sub.backend_config = getattr(env_ns, "backend_config", None) + return sub + + def main(args): - if args.model_path_list is not None and args.model_path_prefix is not None: - test_multi_models_with_prefix(args) + config_dict = convert_to_dict(args.config) + model_path_prefix = config_dict["ref_env"]["model_path_prefix"] + if args.model_path_list is not None and model_path_prefix is not None: + eval_multi_models_with_prefix(args, model_path_prefix) return assert os.path.isdir(args.model_path) - initalize_seed = 123 - set_seed(random_seed=initalize_seed) - if path_utils.is_single_model_dir(args.model_path): - test_single_model(args) + eval_single_model(args) else: - test_multi_models(args) - - -def complete_default_args( - mut_args, - compiler: str = "inductor", # Compiler name - device: str = "cuda", # Device for testing the compiler (e.g., 'cpu' or 'cuda') - warmup: int = 3, # Number of warmup steps - trials: int = 5, # Number of timing trials - log_prompt: str = "graph-net-test-compiler-log", # Log prompt for performance log filtering - model_path_prefix: str = None, # Prefix path to model path in --model-path-list - backend_config: dict = None, # backend configuration json -): - backend_config = backend_config if backend_config is not None else {} - mut_args.compiler = compiler - mut_args.device = device - mut_args.warmup = warmup - mut_args.trials = trials - mut_args.log_prompt = log_prompt - mut_args.model_path_prefix = model_path_prefix - mut_args.backend_config = backend_config + eval_multi_models(args, model_path_prefix) if __name__ == "__main__": @@ -526,5 +308,4 @@ def complete_default_args( help="base64 encode configuration json.", ) args = parser.parse_args() - complete_default_args(args, **convert_to_dict(args.config)) main(args=args) diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py new file mode 100644 index 000000000..7e12f6ebf --- /dev/null +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -0,0 +1,337 @@ +from . 
import utils +import argparse +import importlib.util +import torch +from pathlib import Path +from typing import Type +import sys +import os +import traceback +import json +import random +import numpy as np +import platform +import base64 +from contextlib import redirect_stdout, redirect_stderr + +from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend +from graph_net_bench.torch.backend.tvm_backend import TvmBackend +from graph_net_bench.torch.backend.xla_backend import XlaBackend +from graph_net_bench.torch.backend.inductor_backend import InductorBackend +from graph_net_bench.torch.backend.tensorrt_backend import TensorRTBackend +from graph_net_bench.torch.backend.blade_disc_backend import BladeDISCBackend +from graph_net_bench.torch.backend.nope_backend import NopeBackend +from graph_net_bench.torch.backend.pass_mgr_backend import PassMgrBackend +from graph_net_bench.torch.backend.unstable_to_stable_backend import ( + UnstableToStableBackend, +) +from graph_net_bench.torch.backend.range_decomposer_validator_backend import ( + RangeDecomposerValidatorBackend, +) +from graph_net_bench.torch.backend.graph_variable_renamer_validator_backend import ( + GraphVariableRenamerValidatorBackend, +) +from graph_net_bench import test_compiler_util + + +compiler_backend_name2class = { + "tvm": TvmBackend, + "xla": XlaBackend, + "inductor": InductorBackend, + "tensorrt": TensorRTBackend, + "bladedisc": BladeDISCBackend, + "nope": NopeBackend, + "pass_mgr": PassMgrBackend, + "unstable_to_stable": UnstableToStableBackend, + "range_decomposer_validator": RangeDecomposerValidatorBackend, + "graph_variable_renamer_validator": GraphVariableRenamerValidatorBackend, +} + + +def register_op_lib(op_lib): + if op_lib == "flaggems": + import flag_gems + + flag_gems.enable() + else: + pass + + +def set_seed(random_seed): + random.seed(random_seed) + np.random.seed(random_seed) + torch.manual_seed(random_seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(random_seed) + torch.cuda.manual_seed_all(random_seed) + + +def get_hardward_name(args): + hardware_name = "unknown" + if "cuda" in args.device: + hardware_name = torch.cuda.get_device_name(args.device) + elif args.device == "cpu": + hardware_name = platform.processor() + return hardware_name + + +def get_compile_framework_version(args): + if args.compiler in ["inductor", "nope", "unstable_to_stable"]: + return torch.__version__ + elif args.compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: + # Assuming compiler object has a version attribute + return f"{args.compiler.capitalize()} {args.compiler.version}" + return "unknown" + + +def load_class_from_file( + args: argparse.Namespace, class_name: str, device: str +) -> Type[torch.nn.Module]: + file_path = f"{args.model_path}/model.py" + file = Path(file_path).resolve() + module_name = file.stem + + with open(file_path, "r", encoding="utf-8") as f: + model_code = f.read() + model_code = utils.modify_code_by_device(model_code, device) + spec = importlib.util.spec_from_loader(module_name, loader=None) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + compiled_code = compile(model_code, filename=file, mode="exec") + exec(compiled_code, module.__dict__) + + model_class = getattr(module, class_name, None) + setattr(model_class, "__graph_net_file_path__", file_path) + setattr(model_class, "__graph_net_device__", device) + return model_class + + +def convert_to_dict(config_str): + if config_str is None or config_str == "None": + return {} + 
config_str = base64.b64decode(config_str).decode("utf-8") + config = json.loads(config_str) + assert isinstance(config, dict), f"config should be a dict. {config_str=}" + return config + + +def get_compiler_backend(args) -> GraphCompilerBackend: + assert ( + args.compiler in compiler_backend_name2class + ), f"Unknown compiler: {args.compiler}" + backend_class = compiler_backend_name2class[args.compiler] + config = convert_to_dict(args.config) if args.config is not None else {} + return backend_class(config) + + +def get_model(args): + device = "xla" if args.compiler == "xla" else args.device + + # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') + model_class = load_class_from_file(args, class_name="GraphModule", device=device) + model = model_class().to(torch.device(args.device)) + return model + + +def get_input_dict(args): + inputs_params = utils.load_converted_from_text(f"{args.model_path}") + params = inputs_params["weight_info"] + for tensor_meta in params.values(): + if "device" in tensor_meta["info"]: + tensor_meta["info"]["device"] = args.device + return { + k: utils.replay_tensor(v).to(torch.device(args.device)) + for k, v in params.items() + } + + +def measure_performance(model_call, args, compiler): + stats = {} + outs = model_call() + + # Warmup runs + for _ in range(args.warmup): + model_call() + compiler.synchronize() + + hardware_name = get_hardward_name(args) + print( + f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {args.trials}", + file=sys.stderr, + flush=True, + ) + + if "cuda" in args.device: + torch.cuda.empty_cache() + e2e_times = [] + gpu_times = [] + + for i in range(args.trials): + # End-to-end timing (naive_timer) + duration_box = test_compiler_util.DurationBox(-1) + with test_compiler_util.naive_timer(duration_box, compiler.synchronize): + # GPU-only timing (CUDA Events) + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + + model_call() + + end_event.record() + compiler.synchronize() + + gpu_time_ms = start_event.elapsed_time(end_event) + e2e_times.append(duration_box.value) + gpu_times.append(gpu_time_ms) + print( + f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", + file=sys.stderr, + flush=True, + ) + + stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) + stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times) + + else: # CPU or other devices + e2e_times = [] + for i in range(args.trials): + duration_box = test_compiler_util.DurationBox(-1) + with test_compiler_util.naive_timer(duration_box, compiler.synchronize): + model_call() + print( + f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", + file=sys.stderr, + flush=True, + ) + e2e_times.append(duration_box.value) + stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) + + return outs, stats + + +def eval_single_model(args): + log_path = utils.get_log_path(args.output_path, args.model_path) + output_dump_path = utils.get_output_path(args.output_path, args.model_path) + print(f"Log path: {log_path}", file=sys.stderr, flush=True) + print(f"Outputs path: {output_dump_path}", file=sys.stderr, flush=True) + + with open(log_path, "w", encoding="utf-8") as log_f: + with redirect_stdout(log_f), redirect_stderr(log_f): + compiler = get_compiler_backend(args) + + input_dict = get_input_dict(args) + model = get_model(args) + model.eval() + + test_compiler_util.print_with_log_prompt( + "[Config] 
seed:", args.seed, args.log_prompt + ) + + test_compiler_util.print_basic_config( + args, + get_hardward_name(args), + get_compile_framework_version(args), + ) + + test_compiler_util.print_with_log_prompt( + "[Config] op_lib:", args.op_lib, args.log_prompt + ) + + success = False + time_stats = {} + try: + compiled_model = compiler(model) + + def model_call(): + return compiled_model(**input_dict) + + outputs, time_stats = measure_performance(model_call, args, compiler) + success = True + except Exception as e: + print( + f"Run model failed: {str(e)}\n{traceback.format_exc()}", + file=sys.stderr, + flush=True, + ) + + test_compiler_util.print_running_status(args, success) + if success: + torch.save(outputs, str(output_dump_path)) + test_compiler_util.print_with_log_prompt( + "[Performance][eager]:", json.dumps(time_stats), args.log_prompt + ) + + with open(log_path, "r", encoding="utf-8") as f: + content = f.read() + print(content, file=sys.stderr, flush=True) + + +def main(args): + set_seed(args.seed) + os.makedirs(args.output_path, exist_ok=True) + eval_single_model(args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="GraphNet Backend Performance Evaluation" + ) + parser.add_argument( + "--model-path", + type=str, + required=False, + default=None, + help="Path to model file(s), each subdirectory containing graph_net.json will be regarded as a model", + ) + parser.add_argument( + "--output-path", + type=str, + required=False, + default="/tmp/test_save", + help="Path to save outputs", + ) + parser.add_argument("--seed", type=int, required=False, default=123) + parser.add_argument( + "--compiler", + type=str, + required=False, + default="inductor", + help="Path to customized compiler python file", + ) + parser.add_argument( + "--device", + type=str, + required=False, + default="cuda", + help="Device for testing the compiler (e.g., 'cpu' or 'cuda')", + ) + parser.add_argument("--op-lib", type=str, required=False, default=None) + parser.add_argument( + "--warmup", type=int, required=False, default=3, help="Number of warmup steps" + ) + parser.add_argument( + "--trials", type=int, required=False, default=5, help="Number of timing trials" + ) + parser.add_argument( + "--log-prompt", + type=str, + required=False, + default="graph-net-test-compiler-log", + help="Log prompt for performance log filtering.", + ) + parser.add_argument( + "--model-path-prefix", + type=str, + required=False, + default=None, + help="Prefix path to model path list", + ) + parser.add_argument( + "--config", + type=str, + required=False, + default=None, + help="base64 encode configuration json.", + ) + args = parser.parse_args() + main(args=args) diff --git a/graph_net_bench/torch/utils.py b/graph_net_bench/torch/utils.py index c937ff4de..700a59972 100755 --- a/graph_net_bench/torch/utils.py +++ b/graph_net_bench/torch/utils.py @@ -1,4 +1,5 @@ import torch +import os import ast import math import inspect @@ -7,6 +8,16 @@ kLiteralTensorSize = 64 +def get_log_path(log_dir, model_path): + model_name = model_path.split("torch_samples/")[-1].replace(os.sep, "_") + return os.path.join(log_dir, f"{model_name}.log") + + +def get_output_path(output_dir, model_path): + model_name = model_path.split("torch_samples/")[-1].replace(os.sep, "_") + return os.path.join(output_dir, f"{model_name}.pth") + + def get_limited_precision_float_str(value): if not isinstance(value, float): return value diff --git a/test/eval_backend_diff_test.sh b/test/eval_backend_diff_test.sh index e3fa79602..16da81903 100755 
--- a/test/eval_backend_diff_test.sh +++ b/test/eval_backend_diff_test.sh @@ -10,9 +10,16 @@ python3 -m graph_net_bench.torch.eval_backend_diff \ --model-path-list $model_list \ --config $(base64 -w 0 <&1 | tee "$OUTPUT_PATH/validation.log" From f83ab0cfd1bfdb325c356a0604e9cb717e45abde Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 08:56:00 +0800 Subject: [PATCH 02/20] Simplify eval_multi_models --- graph_net_bench/torch/eval_backend_diff.py | 123 ++++++++++----------- 1 file changed, 57 insertions(+), 66 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index 07a19ff88..50d17cb62 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -119,81 +119,69 @@ def parse_time_stats_from_reference_log(log_path): return time_stats -def eval_multi_models(args, model_path_prefix): - test_samples = test_compiler_util.get_allow_samples( - args.model_path_list, model_path_prefix - ) - +def eval_multi_models(args, model_path_prefix=None, use_model_list=False): sample_idx = 0 failed_samples = [] module_name = os.path.splitext(os.path.basename(__file__))[0] - for model_path in path_utils.get_recursively_model_path(args.model_path): - if test_samples is None or os.path.abspath(model_path) in test_samples: - print( - f"[{sample_idx}] {module_name}, model_path: {model_path}", - file=sys.stderr, - flush=True, - ) - cmd = " ".join( - [ - sys.executable, - f"-m graph_net_bench.torch.{module_name}", - f"--model-path {model_path}", - f"--config {args.config}", - ] - ) - try: - process = subprocess.Popen(cmd, shell=True) - cmd_ret = process.wait() - except KeyboardInterrupt: - print("KeyboardInterrupt") - sys.exit(1) - except Exception: - print("\n--- Full Traceback ---") - traceback.print_exc() - if cmd_ret != 0: - failed_samples.append(model_path) - sample_idx += 1 - - print( - f"Totally {sample_idx} verified samples, failed {len(failed_samples)} samples.", - file=sys.stderr, - flush=True, - ) - for model_path in failed_samples: - print(f"- {model_path}", file=sys.stderr, flush=True) - -def eval_multi_models_with_prefix(args, model_path_prefix): - assert os.path.isdir(model_path_prefix) - assert os.path.isfile(args.model_path_list) - test_samples = test_compiler_util.get_allow_samples( - args.model_path_list, model_path_prefix - ) - py_module_name = os.path.splitext(os.path.basename(__file__))[0] - for rel_model_path in test_samples: - model_path = os.path.join(model_path_prefix, rel_model_path) - if not os.path.exists(model_path): - continue - if not os.path.exists(os.path.join(model_path, "model.py")): - continue + if use_model_list: + assert os.path.isdir(model_path_prefix) + assert os.path.isfile(args.model_path_list) + test_samples = test_compiler_util.get_allow_samples( + args.model_path_list, model_path_prefix + ) + model_paths = [] + for rel_model_path in test_samples: + model_path = os.path.join(model_path_prefix, rel_model_path) + if os.path.exists(model_path) and os.path.exists( + os.path.join(model_path, "model.py") + ): + model_paths.append(model_path) + else: + assert os.path.isdir(args.model_path) + test_samples = test_compiler_util.get_allow_samples( + args.model_path_list, model_path_prefix + ) + model_paths = [] + for model_path in path_utils.get_recursively_model_path(args.model_path): + if test_samples is None or os.path.abspath(model_path) in test_samples: + model_paths.append(model_path) + + for model_path in model_paths: + print( + 
f"[{sample_idx}] {module_name}, model_path: {model_path}", + file=sys.stderr, + flush=True, + ) cmd = " ".join( [ sys.executable, - f"-m graph_net_bench.torch.{py_module_name}", + f"-m graph_net_bench.torch.{module_name}", f"--model-path {model_path}", f"--config {args.config}", ] ) try: process = subprocess.Popen(cmd, shell=True) - process.wait() + cmd_ret = process.wait() except KeyboardInterrupt: print("KeyboardInterrupt") sys.exit(1) except Exception: print("\n--- Full Traceback ---") traceback.print_exc() + if cmd_ret != 0: + failed_samples.append(model_path) + sample_idx += 1 + + print( + f"Totally {sample_idx} verified samples, failed {len(failed_samples)} samples.", + file=sys.stderr, + flush=True, + ) + if failed_samples: + for model_path in failed_samples: + print(f"- {model_path}", file=sys.stderr, flush=True) def compare_perf_diff(args, model_path, ref_dir, target_dir): @@ -272,20 +260,23 @@ def build_sub_args(env_ns: types.SimpleNamespace) -> argparse.Namespace: def main(args): config_dict = convert_to_dict(args.config) - model_path_prefix = config_dict["ref_env"]["model_path_prefix"] - if args.model_path_list is not None and model_path_prefix is not None: - eval_multi_models_with_prefix(args, model_path_prefix) - return - assert os.path.isdir(args.model_path) - - if path_utils.is_single_model_dir(args.model_path): - eval_single_model(args) + model_path_prefix = config_dict.get("ref_env", {}).get("model_path_prefix") + + if args.model_path_list and model_path_prefix: + eval_multi_models(args, model_path_prefix, use_model_list=True) + elif os.path.isdir(args.model_path): + if path_utils.is_single_model_dir(args.model_path): + eval_single_model(args) + else: + eval_multi_models(args, model_path_prefix, use_model_list=False) else: - eval_multi_models(args, model_path_prefix) + raise ValueError(f"Invalid model path: {args.model_path}") if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Test compiler performance.") + parser = argparse.ArgumentParser( + description="Evaluate backend performance difference." + ) parser.add_argument( "--model-path", type=str, From 9670c7a6787b7d8b03cee5e5232383587ed839ea Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 08:59:30 +0800 Subject: [PATCH 03/20] minor change --- graph_net_bench/torch/eval_backend_diff.py | 4 ++-- graph_net_bench/torch/eval_backend_perf.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index 50d17cb62..a5c02ec7b 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -252,7 +252,7 @@ def build_sub_args(env_ns: types.SimpleNamespace) -> argparse.Namespace: sub.op_lib = getattr(env_ns, "op_lib", None) sub.warmup = getattr(env_ns, "warmup", 3) sub.trials = getattr(env_ns, "trials", 5) - sub.log_prompt = getattr(env_ns, "log_prompt", None) + sub.log_prompt = getattr(env_ns, "log_prompt", "graph-net-bench-log") sub.model_path_prefix = getattr(env_ns, "model_path_prefix", None) sub.backend_config = getattr(env_ns, "backend_config", None) return sub @@ -275,7 +275,7 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Evaluate backend performance difference." + description="Evaluate Backend Performance Difference." 
) parser.add_argument( "--model-path", diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index 7e12f6ebf..60194ae88 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -274,7 +274,7 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="GraphNet Backend Performance Evaluation" + description="Single Backend Performance Evaluation" ) parser.add_argument( "--model-path", @@ -316,7 +316,7 @@ def main(args): "--log-prompt", type=str, required=False, - default="graph-net-test-compiler-log", + default="graph-net-bench-log", help="Log prompt for performance log filtering.", ) parser.add_argument( From cb9a4f1e3cb7aff5a68b9f34083ba913faeb276f Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 09:06:35 +0800 Subject: [PATCH 04/20] Minor change on names --- graph_net_bench/torch/eval_backend_perf.py | 28 ++++++++++++---------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index 60194ae88..c550767f0 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -66,28 +66,28 @@ def set_seed(random_seed): torch.cuda.manual_seed_all(random_seed) -def get_hardward_name(args): +def get_hardward_name(device): hardware_name = "unknown" - if "cuda" in args.device: - hardware_name = torch.cuda.get_device_name(args.device) + if "cuda" in device: + hardware_name = torch.cuda.get_device_name(device) elif args.device == "cpu": hardware_name = platform.processor() return hardware_name -def get_compile_framework_version(args): - if args.compiler in ["inductor", "nope", "unstable_to_stable"]: +def get_compiler_version(compiler): + if compiler in ["inductor", "nope", "unstable_to_stable"]: return torch.__version__ - elif args.compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: + elif compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: # Assuming compiler object has a version attribute - return f"{args.compiler.capitalize()} {args.compiler.version}" + return f"{compiler.capitalize()} {compiler.version}" return "unknown" def load_class_from_file( - args: argparse.Namespace, class_name: str, device: str + model_path: str, class_name: str, device: str ) -> Type[torch.nn.Module]: - file_path = f"{args.model_path}/model.py" + file_path = f"{model_path}/model.py" file = Path(file_path).resolve() module_name = file.stem @@ -128,7 +128,9 @@ def get_model(args): device = "xla" if args.compiler == "xla" else args.device # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') - model_class = load_class_from_file(args, class_name="GraphModule", device=device) + model_class = load_class_from_file( + args.model_path, class_name="GraphModule", device=device + ) model = model_class().to(torch.device(args.device)) return model @@ -154,7 +156,7 @@ def measure_performance(model_call, args, compiler): model_call() compiler.synchronize() - hardware_name = get_hardward_name(args) + hardware_name = get_hardward_name(args.device) print( f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {args.trials}", file=sys.stderr, @@ -229,8 +231,8 @@ def eval_single_model(args): test_compiler_util.print_basic_config( args, - get_hardward_name(args), - get_compile_framework_version(args), + get_hardward_name(args.device), + 
get_compiler_version(args.compiler), ) test_compiler_util.print_with_log_prompt( From 6b0975da11997a52ee4f8daf11215f6dd3b71564 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 14:41:35 +0800 Subject: [PATCH 05/20] use call method instead of bash --- graph_net_bench/torch/eval_backend_diff.py | 75 ++++++++++------------ graph_net_bench/torch/eval_backend_perf.py | 4 +- 2 files changed, 35 insertions(+), 44 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index a5c02ec7b..bb7811689 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -1,5 +1,4 @@ from . import utils -import subprocess import argparse import torch import sys @@ -11,6 +10,7 @@ import types from graph_net_bench import test_compiler_util from graph_net_bench import path_utils +from .eval_backend_perf import eval_single_model_with_single_backend def convert_to_dict(config_str): @@ -153,23 +153,33 @@ def eval_multi_models(args, model_path_prefix=None, use_model_list=False): file=sys.stderr, flush=True, ) - cmd = " ".join( - [ - sys.executable, - f"-m graph_net_bench.torch.{module_name}", - f"--model-path {model_path}", - f"--config {args.config}", - ] - ) + try: - process = subprocess.Popen(cmd, shell=True) - cmd_ret = process.wait() + single_model_args = argparse.Namespace() + + single_model_args.model_path = model_path + single_model_args.config = args.config + single_model_args.model_path_list = None + + if path_utils.is_single_model_dir(model_path): + eval_single_model(single_model_args) + else: + submodel_paths = path_utils.get_recursively_model_path(model_path) + for submodel_path in submodel_paths: + sub_args = argparse.Namespace() + sub_args.model_path = submodel_path + sub_args.config = args.config + sub_args.model_path_list = None + eval_single_model(sub_args) + cmd_ret = 0 except KeyboardInterrupt: print("KeyboardInterrupt") sys.exit(1) except Exception: print("\n--- Full Traceback ---") traceback.print_exc() + cmd_ret = 1 + if cmd_ret != 0: failed_samples.append(model_path) sample_idx += 1 @@ -213,48 +223,29 @@ def eval_single_model(args): target_env=types.SimpleNamespace(**convert_to_dict(args.config)["target_env"]), ) - ref_args = build_sub_args(EvalCfg.ref_env) - target_args = build_sub_args(EvalCfg.target_env) + ref_args = build_sub_args(EvalCfg.ref_env, args.model_path, ref_dir) + target_args = build_sub_args(EvalCfg.target_env, args.model_path, target_dir) - run_sub_process(ref_args, args.model_path, ref_dir) - run_sub_process(target_args, args.model_path, target_dir) + eval_single_model_with_single_backend(ref_args) + eval_single_model_with_single_backend(target_args) compare_perf_diff(ref_args, args.model_path, ref_dir, target_dir) -def run_sub_process(env_args, model_path, output_path): - cmd = [sys.executable, "-m", "graph_net_bench.torch.eval_backend_perf"] - args_pairs = [ - ("--model-path", model_path), - ("--output-path", output_path), - ("--seed", str(env_args.seed)), - ("--compiler", env_args.compiler), - ("--device", env_args.device), - ("--op-lib", env_args.op_lib), - ("--warmup", str(env_args.warmup)), - ("--trials", str(env_args.trials)), - ("--log-prompt", env_args.log_prompt), - ("--model-path-prefix", env_args.model_path_prefix), - ("--config", env_args.backend_config), - ] - - for arg_name, arg_value in args_pairs: - if arg_value is not None: - cmd.extend([arg_name, arg_value]) - - subprocess.run(cmd, check=True) - - -def 
build_sub_args(env_ns: types.SimpleNamespace) -> argparse.Namespace: +def build_sub_args( + env_ns: types.SimpleNamespace, model_path: str, output_path: str +) -> argparse.Namespace: sub = argparse.Namespace() + sub.model_path = model_path + sub.output_path = output_path sub.seed = getattr(env_ns, "seed", 123) - sub.compiler = getattr(env_ns, "compiler", None) - sub.device = getattr(env_ns, "device", None) + sub.compiler = getattr(env_ns, "compiler", "inductor") + sub.device = getattr(env_ns, "device", "cuda") sub.op_lib = getattr(env_ns, "op_lib", None) sub.warmup = getattr(env_ns, "warmup", 3) sub.trials = getattr(env_ns, "trials", 5) sub.log_prompt = getattr(env_ns, "log_prompt", "graph-net-bench-log") sub.model_path_prefix = getattr(env_ns, "model_path_prefix", None) - sub.backend_config = getattr(env_ns, "backend_config", None) + sub.config = getattr(env_ns, "backend_config", None) return sub diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index c550767f0..d099ac7d9 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -211,7 +211,7 @@ def measure_performance(model_call, args, compiler): return outs, stats -def eval_single_model(args): +def eval_single_model_with_single_backend(args): log_path = utils.get_log_path(args.output_path, args.model_path) output_dump_path = utils.get_output_path(args.output_path, args.model_path) print(f"Log path: {log_path}", file=sys.stderr, flush=True) @@ -271,7 +271,7 @@ def model_call(): def main(args): set_seed(args.seed) os.makedirs(args.output_path, exist_ok=True) - eval_single_model(args) + eval_single_model_with_single_backend(args) if __name__ == "__main__": From 980f7377973080b6bfd89250e7129cf3647a638a Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 14:46:21 +0800 Subject: [PATCH 06/20] minor change --- graph_net_bench/torch/eval_backend_perf.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index d099ac7d9..fcf313cef 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -212,6 +212,8 @@ def measure_performance(model_call, args, compiler): def eval_single_model_with_single_backend(args): + set_seed(args.seed) + os.makedirs(args.output_path, exist_ok=True) log_path = utils.get_log_path(args.output_path, args.model_path) output_dump_path = utils.get_output_path(args.output_path, args.model_path) print(f"Log path: {log_path}", file=sys.stderr, flush=True) @@ -268,12 +270,6 @@ def model_call(): print(content, file=sys.stderr, flush=True) -def main(args): - set_seed(args.seed) - os.makedirs(args.output_path, exist_ok=True) - eval_single_model_with_single_backend(args) - - if __name__ == "__main__": parser = argparse.ArgumentParser( description="Single Backend Performance Evaluation" @@ -336,4 +332,4 @@ def main(args): help="base64 encode configuration json.", ) args = parser.parse_args() - main(args=args) + eval_single_model_with_single_backend(args=args) From 5c49521d22fb87d4cf3f25549dd5ee2b76652262 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 14:55:08 +0800 Subject: [PATCH 07/20] change some names --- graph_net_bench/torch/eval_backend_diff.py | 16 ++++++++++------ graph_net_bench/torch/eval_backend_perf.py | 8 +++++--- test/eval_backend_diff_test.sh | 8 ++++++-- 3 files changed, 21 insertions(+), 11 
deletions(-) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index bb7811689..0e7229086 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -219,12 +219,16 @@ def eval_single_model(args): target_dir = "/tmp/eval_perf_diff/B" EvalCfg = types.SimpleNamespace( - ref_env=types.SimpleNamespace(**convert_to_dict(args.config)["ref_env"]), - target_env=types.SimpleNamespace(**convert_to_dict(args.config)["target_env"]), + reference_config=types.SimpleNamespace( + **convert_to_dict(args.config)["reference_config"] + ), + target_config=types.SimpleNamespace( + **convert_to_dict(args.config)["target_config"] + ), ) - ref_args = build_sub_args(EvalCfg.ref_env, args.model_path, ref_dir) - target_args = build_sub_args(EvalCfg.target_env, args.model_path, target_dir) + ref_args = build_sub_args(EvalCfg.reference_config, args.model_path, ref_dir) + target_args = build_sub_args(EvalCfg.target_config, args.model_path, target_dir) eval_single_model_with_single_backend(ref_args) eval_single_model_with_single_backend(target_args) @@ -245,13 +249,13 @@ def build_sub_args( sub.trials = getattr(env_ns, "trials", 5) sub.log_prompt = getattr(env_ns, "log_prompt", "graph-net-bench-log") sub.model_path_prefix = getattr(env_ns, "model_path_prefix", None) - sub.config = getattr(env_ns, "backend_config", None) + sub.backend_config = getattr(env_ns, "backend_config", None) return sub def main(args): config_dict = convert_to_dict(args.config) - model_path_prefix = config_dict.get("ref_env", {}).get("model_path_prefix") + model_path_prefix = config_dict.get("reference_config", {}).get("model_path_prefix") if args.model_path_list and model_path_prefix: eval_multi_models(args, model_path_prefix, use_model_list=True) diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index fcf313cef..29c40d3fb 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -120,8 +120,10 @@ def get_compiler_backend(args) -> GraphCompilerBackend: args.compiler in compiler_backend_name2class ), f"Unknown compiler: {args.compiler}" backend_class = compiler_backend_name2class[args.compiler] - config = convert_to_dict(args.config) if args.config is not None else {} - return backend_class(config) + backend_config = ( + convert_to_dict(args.backend_config) if args.backend_config is not None else {} + ) + return backend_class(backend_config) def get_model(args): @@ -325,7 +327,7 @@ def model_call(): help="Prefix path to model path list", ) parser.add_argument( - "--config", + "--backend-config", type=str, required=False, default=None, diff --git a/test/eval_backend_diff_test.sh b/test/eval_backend_diff_test.sh index 16da81903..17bba712e 100755 --- a/test/eval_backend_diff_test.sh +++ b/test/eval_backend_diff_test.sh @@ -10,14 +10,18 @@ python3 -m graph_net_bench.torch.eval_backend_diff \ --model-path-list $model_list \ --config $(base64 -w 0 < Date: Fri, 16 Jan 2026 15:51:53 +0800 Subject: [PATCH 08/20] Dynamically load backend class based on args.compiler --- graph_net_bench/torch/eval_backend_perf.py | 61 +++++++++------------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index 29c40d3fb..4d5ea94a5 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -13,41 +13,10 @@ import platform import 
base64 from contextlib import redirect_stdout, redirect_stderr - from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend -from graph_net_bench.torch.backend.tvm_backend import TvmBackend -from graph_net_bench.torch.backend.xla_backend import XlaBackend -from graph_net_bench.torch.backend.inductor_backend import InductorBackend -from graph_net_bench.torch.backend.tensorrt_backend import TensorRTBackend -from graph_net_bench.torch.backend.blade_disc_backend import BladeDISCBackend -from graph_net_bench.torch.backend.nope_backend import NopeBackend -from graph_net_bench.torch.backend.pass_mgr_backend import PassMgrBackend -from graph_net_bench.torch.backend.unstable_to_stable_backend import ( - UnstableToStableBackend, -) -from graph_net_bench.torch.backend.range_decomposer_validator_backend import ( - RangeDecomposerValidatorBackend, -) -from graph_net_bench.torch.backend.graph_variable_renamer_validator_backend import ( - GraphVariableRenamerValidatorBackend, -) from graph_net_bench import test_compiler_util -compiler_backend_name2class = { - "tvm": TvmBackend, - "xla": XlaBackend, - "inductor": InductorBackend, - "tensorrt": TensorRTBackend, - "bladedisc": BladeDISCBackend, - "nope": NopeBackend, - "pass_mgr": PassMgrBackend, - "unstable_to_stable": UnstableToStableBackend, - "range_decomposer_validator": RangeDecomposerValidatorBackend, - "graph_variable_renamer_validator": GraphVariableRenamerValidatorBackend, -} - - def register_op_lib(op_lib): if op_lib == "flaggems": import flag_gems @@ -70,7 +39,7 @@ def get_hardward_name(device): hardware_name = "unknown" if "cuda" in device: hardware_name = torch.cuda.get_device_name(device) - elif args.device == "cpu": + elif device == "cpu": hardware_name = platform.processor() return hardware_name @@ -116,10 +85,28 @@ def convert_to_dict(config_str): def get_compiler_backend(args) -> GraphCompilerBackend: - assert ( - args.compiler in compiler_backend_name2class - ), f"Unknown compiler: {args.compiler}" - backend_class = compiler_backend_name2class[args.compiler] + """ + Dynamically load backend class based on args.compiler + """ + compiler_name = args.compiler.lower() + module_name = f"graph_net_bench.torch.backend.{compiler_name}_backend" + + try: + module = __import__(module_name, fromlist=[f"{compiler_name.title()}Backend"]) + + class_name = ( + f"{''.join(part.title() for part in compiler_name.split('_'))}Backend" + ) + + backend_class = None + if hasattr(module, class_name): + backend_class = getattr(module, class_name) + else: + raise ImportError(f"No valid backend class found in {module_name}") + + except ImportError as e: + raise ImportError(f"Failed to import backend module for '{compiler_name}': {e}") + backend_config = ( convert_to_dict(args.backend_config) if args.backend_config is not None else {} ) @@ -327,7 +314,7 @@ def model_call(): help="Prefix path to model path list", ) parser.add_argument( - "--backend-config", + "--config", type=str, required=False, default=None, From 8c2b1c3799020940836c34ace41350557b1e7ac8 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 16:28:01 +0800 Subject: [PATCH 09/20] Change argument passing to json config --- graph_net_bench/test_compiler_util.py | 28 +++++ graph_net_bench/torch/eval_backend_diff.py | 50 +++----- graph_net_bench/torch/eval_backend_perf.py | 131 +++++++-------------- 3 files changed, 88 insertions(+), 121 deletions(-) diff --git a/graph_net_bench/test_compiler_util.py b/graph_net_bench/test_compiler_util.py index 
f587da2ff..de38a29fa 100644 --- a/graph_net_bench/test_compiler_util.py +++ b/graph_net_bench/test_compiler_util.py @@ -5,6 +5,7 @@ import time import subprocess import shutil +import base64 import numpy as np from dataclasses import dataclass from contextlib import contextmanager @@ -156,6 +157,24 @@ def print_basic_config(args, hardware_name, compile_framework_version): ) +def print_config(model_path, config, hardware_name, compiler_version): + model_path = os.path.normpath(model_path) + model_name = get_model_name(model_path) + print_with_log_prompt("[Config] model:", model_name, config.log_prompt) + print_with_log_prompt("[Config] seed:", config.seed, config.log_prompt) + print_with_log_prompt("[Config] device:", config.device, config.log_prompt) + print_with_log_prompt("[Config] hardware:", hardware_name, config.log_prompt) + print_with_log_prompt("[Config] op_lib:", config.op_lib, config.log_prompt) + print_with_log_prompt("[Config] compiler:", config.compiler, config.log_prompt) + print_with_log_prompt("[Config] warmup:", config.warmup, config.log_prompt) + print_with_log_prompt("[Config] trials:", config.trials, config.log_prompt) + print_with_log_prompt( + "[Config] compile_framework_version:", + compiler_version, + config.log_prompt, + ) + + def print_running_status(args, eager_success, compiled_success=None): def convert_to_str(b): return "success" if b else "failed" @@ -353,3 +372,12 @@ def get_allow_samples(allow_list, model_path_prefix): test_samples.append(os.path.join(model_path_prefix, line.strip())) return test_samples + + +def convert_to_dict(config_str): + if config_str in {None, "", "null", "None"}: + return {} + config_str = base64.b64decode(config_str).decode("utf-8") + config = json.loads(config_str) + assert isinstance(config, dict), f"config should be a dict. {config_str=}" + return config diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index 0e7229086..6f8dc550b 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -6,22 +6,12 @@ import os.path import traceback import json -import base64 import types from graph_net_bench import test_compiler_util from graph_net_bench import path_utils from .eval_backend_perf import eval_single_model_with_single_backend -def convert_to_dict(config_str): - if config_str in {None, "", "null", "None"}: - return {} - config_str = base64.b64decode(config_str).decode("utf-8") - config = json.loads(config_str) - assert isinstance(config, dict), f"config should be a dict. 
{config_str=}" - return config - - def compare_correctness(expected_out, compiled_out, args): eager_dtypes = [ ( @@ -220,41 +210,37 @@ def eval_single_model(args): EvalCfg = types.SimpleNamespace( reference_config=types.SimpleNamespace( - **convert_to_dict(args.config)["reference_config"] + **test_compiler_util.convert_to_dict(args.config)["reference_config"] ), target_config=types.SimpleNamespace( - **convert_to_dict(args.config)["target_config"] + **test_compiler_util.convert_to_dict(args.config)["target_config"] ), ) - ref_args = build_sub_args(EvalCfg.reference_config, args.model_path, ref_dir) - target_args = build_sub_args(EvalCfg.target_config, args.model_path, target_dir) + reference_config = build_sub_config(EvalCfg.reference_config) + target_config = build_sub_config(EvalCfg.target_config) - eval_single_model_with_single_backend(ref_args) - eval_single_model_with_single_backend(target_args) - compare_perf_diff(ref_args, args.model_path, ref_dir, target_dir) + eval_single_model_with_single_backend(args.model_path, ref_dir, reference_config) + eval_single_model_with_single_backend(args.model_path, target_dir, target_config) + compare_perf_diff(reference_config, args.model_path, ref_dir, target_dir) -def build_sub_args( - env_ns: types.SimpleNamespace, model_path: str, output_path: str -) -> argparse.Namespace: +def build_sub_config(config): sub = argparse.Namespace() - sub.model_path = model_path - sub.output_path = output_path - sub.seed = getattr(env_ns, "seed", 123) - sub.compiler = getattr(env_ns, "compiler", "inductor") - sub.device = getattr(env_ns, "device", "cuda") - sub.op_lib = getattr(env_ns, "op_lib", None) - sub.warmup = getattr(env_ns, "warmup", 3) - sub.trials = getattr(env_ns, "trials", 5) - sub.log_prompt = getattr(env_ns, "log_prompt", "graph-net-bench-log") - sub.model_path_prefix = getattr(env_ns, "model_path_prefix", None) - sub.backend_config = getattr(env_ns, "backend_config", None) + sub.seed = getattr(config, "seed", 123) + sub.compiler = getattr(config, "compiler", "inductor") + sub.device = getattr(config, "device", "cuda") + sub.op_lib = getattr(config, "op_lib", None) + sub.warmup = getattr(config, "warmup", 3) + sub.trials = getattr(config, "trials", 5) + sub.log_prompt = getattr(config, "log_prompt", "graph-net-bench-log") + sub.model_path_prefix = getattr(config, "model_path_prefix", None) + sub.backend_config = getattr(config, "backend_config", None) return sub def main(args): - config_dict = convert_to_dict(args.config) + config_dict = test_compiler_util.convert_to_dict(args.config) model_path_prefix = config_dict.get("reference_config", {}).get("model_path_prefix") if args.model_path_list and model_path_prefix: diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index 4d5ea94a5..3774d4176 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -11,7 +11,6 @@ import random import numpy as np import platform -import base64 from contextlib import redirect_stdout, redirect_stderr from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend from graph_net_bench import test_compiler_util @@ -75,20 +74,11 @@ def load_class_from_file( return model_class -def convert_to_dict(config_str): - if config_str is None or config_str == "None": - return {} - config_str = base64.b64decode(config_str).decode("utf-8") - config = json.loads(config_str) - assert isinstance(config, dict), f"config should be a dict. 
{config_str=}" - return config - - -def get_compiler_backend(args) -> GraphCompilerBackend: +def get_compiler_backend(config) -> GraphCompilerBackend: """ - Dynamically load backend class based on args.compiler + Dynamically load backend class based on config.compiler """ - compiler_name = args.compiler.lower() + compiler_name = config.compiler.lower() module_name = f"graph_net_bench.torch.backend.{compiler_name}_backend" try: @@ -108,56 +98,58 @@ def get_compiler_backend(args) -> GraphCompilerBackend: raise ImportError(f"Failed to import backend module for '{compiler_name}': {e}") backend_config = ( - convert_to_dict(args.backend_config) if args.backend_config is not None else {} + test_compiler_util.convert_to_dict(config.backend_config) + if config.backend_config is not None + else {} ) return backend_class(backend_config) -def get_model(args): - device = "xla" if args.compiler == "xla" else args.device +def get_model(model_path, config): + device = "xla" if config.compiler == "xla" else config.device # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') model_class = load_class_from_file( - args.model_path, class_name="GraphModule", device=device + model_path, class_name="GraphModule", device=device ) - model = model_class().to(torch.device(args.device)) + model = model_class().to(torch.device(config.device)) return model -def get_input_dict(args): - inputs_params = utils.load_converted_from_text(f"{args.model_path}") +def get_input_dict(model_path, config): + inputs_params = utils.load_converted_from_text(f"{model_path}") params = inputs_params["weight_info"] for tensor_meta in params.values(): if "device" in tensor_meta["info"]: - tensor_meta["info"]["device"] = args.device + tensor_meta["info"]["device"] = config.device return { - k: utils.replay_tensor(v).to(torch.device(args.device)) + k: utils.replay_tensor(v).to(torch.device(config.device)) for k, v in params.items() } -def measure_performance(model_call, args, compiler): +def measure_performance(model_call, config, compiler): stats = {} outs = model_call() # Warmup runs - for _ in range(args.warmup): + for _ in range(config.warmup): model_call() compiler.synchronize() - hardware_name = get_hardward_name(args.device) + hardware_name = get_hardward_name(config.device) print( - f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {args.trials}", + f"[Profiling] Using device: {config.device} {hardware_name}, warm up {config.warmup}, trials {config.trials}", file=sys.stderr, flush=True, ) - if "cuda" in args.device: + if "cuda" in config.device: torch.cuda.empty_cache() e2e_times = [] gpu_times = [] - for i in range(args.trials): + for i in range(config.trials): # End-to-end timing (naive_timer) duration_box = test_compiler_util.DurationBox(-1) with test_compiler_util.naive_timer(duration_box, compiler.synchronize): @@ -185,7 +177,7 @@ def measure_performance(model_call, args, compiler): else: # CPU or other devices e2e_times = [] - for i in range(args.trials): + for i in range(config.trials): duration_box = test_compiler_util.DurationBox(-1) with test_compiler_util.naive_timer(duration_box, compiler.synchronize): model_call() @@ -200,34 +192,27 @@ def measure_performance(model_call, args, compiler): return outs, stats -def eval_single_model_with_single_backend(args): - set_seed(args.seed) - os.makedirs(args.output_path, exist_ok=True) - log_path = utils.get_log_path(args.output_path, args.model_path) - output_dump_path = 
utils.get_output_path(args.output_path, args.model_path) +def eval_single_model_with_single_backend(model_path, output_path, config): + set_seed(config.seed) + os.makedirs(output_path, exist_ok=True) + log_path = utils.get_log_path(output_path, model_path) + output_dump_path = utils.get_output_path(output_path, model_path) print(f"Log path: {log_path}", file=sys.stderr, flush=True) print(f"Outputs path: {output_dump_path}", file=sys.stderr, flush=True) with open(log_path, "w", encoding="utf-8") as log_f: with redirect_stdout(log_f), redirect_stderr(log_f): - compiler = get_compiler_backend(args) + compiler = get_compiler_backend(config) - input_dict = get_input_dict(args) - model = get_model(args) + input_dict = get_input_dict(model_path, config) + model = get_model(model_path, config) model.eval() - test_compiler_util.print_with_log_prompt( - "[Config] seed:", args.seed, args.log_prompt - ) - - test_compiler_util.print_basic_config( - args, - get_hardward_name(args.device), - get_compiler_version(args.compiler), - ) - - test_compiler_util.print_with_log_prompt( - "[Config] op_lib:", args.op_lib, args.log_prompt + test_compiler_util.print_config( + model_path, + config, + get_hardward_name(config.device), + get_compiler_version(config.compiler), ) success = False @@ -238,7 +223,7 @@ def eval_single_model_with_single_backend(args): def model_call(): return compiled_model(**input_dict) - outputs, time_stats = measure_performance(model_call, args, compiler) + outputs, time_stats = measure_performance(model_call, config, compiler) success = True except Exception as e: print( @@ -247,11 +232,11 @@ def model_call(): flush=True, ) - test_compiler_util.print_running_status(args, success) + test_compiler_util.print_running_status(config, success) if success: torch.save(outputs, str(output_dump_path)) test_compiler_util.print_with_log_prompt( - "[Performance][eager]:", json.dumps(time_stats), args.log_prompt + "[Performance][eager]:", json.dumps(time_stats), config.log_prompt ) with open(log_path, "r", encoding="utf-8") as f: @@ -277,42 +262,6 @@ def model_call(): default="/tmp/test_save", help="Path to save outputs", ) - parser.add_argument("--seed", type=int, required=False, default=123) - parser.add_argument( - "--compiler", - type=str, - required=False, - default="inductor", - help="Path to customized compiler python file", - ) - parser.add_argument( - "--device", - type=str, - required=False, - default="cuda", - help="Device for testing the compiler (e.g., 'cpu' or 'cuda')", - ) - parser.add_argument("--op-lib", type=str, required=False, default=None) - parser.add_argument( - "--warmup", type=int, required=False, default=3, help="Number of warmup steps" - ) - parser.add_argument( - "--trials", type=int, required=False, default=5, help="Number of timing trials" - ) - parser.add_argument( - "--log-prompt", - type=str, - required=False, - default="graph-net-bench-log", - help="Log prompt for performance log filtering.", - ) - parser.add_argument( - "--model-path-prefix", - type=str, - required=False, - default=None, - help="Prefix path to model path list", - ) parser.add_argument( "--config", type=str, @@ -321,4 +270,8 @@ def model_call(): help="base64 encode configuration json.", ) args = parser.parse_args() - eval_single_model_with_single_backend(args=args) + eval_single_model_with_single_backend( + args.model_path, + args.output_path, + **test_compiler_util.convert_to_dict(args.config), + ) From db877bdb37c4cf20ca29414340d6a5707f072be9 Mon Sep 17 00:00:00 2001 From: JewelRoam 
<2752594773@qq.com> Date: Fri, 16 Jan 2026 18:11:20 +0800 Subject: [PATCH 10/20] Add check_and_complete_args --- graph_net_bench/test_compiler_util.py | 22 ++--- graph_net_bench/torch/eval_backend_diff.py | 73 ++++++--------- graph_net_bench/torch/eval_backend_perf.py | 100 +++++++++++++-------- 3 files changed, 101 insertions(+), 94 deletions(-) diff --git a/graph_net_bench/test_compiler_util.py b/graph_net_bench/test_compiler_util.py index de38a29fa..44ccc703e 100644 --- a/graph_net_bench/test_compiler_util.py +++ b/graph_net_bench/test_compiler_util.py @@ -157,21 +157,21 @@ def print_basic_config(args, hardware_name, compile_framework_version): ) -def print_config(model_path, config, hardware_name, compiler_version): - model_path = os.path.normpath(model_path) +def print_config(args, hardware_name, compiler_version): + model_path = os.path.normpath(args.model_path) model_name = get_model_name(model_path) - print_with_log_prompt("[Config] model:", model_name, config.log_prompt) - print_with_log_prompt("[Config] seed:", config.seed, config.log_prompt) - print_with_log_prompt("[Config] device:", config.device, config.log_prompt) - print_with_log_prompt("[Config] hardware:", hardware_name, config.log_prompt) - print_with_log_prompt("[Config] op_lib:", config.op_lib, config.log_prompt) - print_with_log_prompt("[Config] compiler:", config.compiler, config.log_prompt) - print_with_log_prompt("[Config] warmup:", config.warmup, config.log_prompt) - print_with_log_prompt("[Config] trials:", config.trials, config.log_prompt) + print_with_log_prompt("[Config] model:", model_name, args.log_prompt) + print_with_log_prompt("[Config] seed:", args.seed, args.log_prompt) + print_with_log_prompt("[Config] device:", args.device, args.log_prompt) + print_with_log_prompt("[Config] hardware:", hardware_name, args.log_prompt) + print_with_log_prompt("[Config] op_lib:", args.op_lib, args.log_prompt) + print_with_log_prompt("[Config] compiler:", args.compiler, args.log_prompt) + print_with_log_prompt("[Config] warmup:", args.warmup, args.log_prompt) + print_with_log_prompt("[Config] trials:", args.trials, args.log_prompt) print_with_log_prompt( "[Config] compile_framework_version:", compiler_version, - config.log_prompt, + args.log_prompt, ) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index 6f8dc550b..c230f6bd8 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -146,10 +146,9 @@ def eval_multi_models(args, model_path_prefix=None, use_model_list=False): try: single_model_args = argparse.Namespace() - single_model_args.model_path = model_path - single_model_args.config = args.config single_model_args.model_path_list = None + single_model_args.config = args.config if path_utils.is_single_model_dir(model_path): eval_single_model(single_model_args) @@ -158,8 +157,8 @@ def eval_multi_models(args, model_path_prefix=None, use_model_list=False): for submodel_path in submodel_paths: sub_args = argparse.Namespace() sub_args.model_path = submodel_path - sub_args.config = args.config sub_args.model_path_list = None + sub_args.config = args.config eval_single_model(sub_args) cmd_ret = 0 except KeyboardInterrupt: @@ -184,60 +183,44 @@ def eval_multi_models(args, model_path_prefix=None, use_model_list=False): print(f"- {model_path}", file=sys.stderr, flush=True) -def compare_perf_diff(args, model_path, ref_dir, target_dir): +def eval_single_model(args): + ref_dir = "/tmp/eval_perf_diff/A" + target_dir = 
"/tmp/eval_perf_diff/B" + + ref_args = types.SimpleNamespace( + model_path=args.model_path, + output_path=ref_dir, + **test_compiler_util.convert_to_dict(args.config)["reference_config"], + ) + target_args = types.SimpleNamespace( + model_path=args.model_path, + output_path=target_dir, + **test_compiler_util.convert_to_dict(args.config)["target_config"], + ) + + eval_single_model_with_single_backend(ref_args) + eval_single_model_with_single_backend(target_args) + + # compare_perf_diff # A - ref_dump_path = utils.get_output_path(ref_dir, model_path) + ref_dump_path = utils.get_output_path(ref_dir, args.model_path) ref_out = torch.load(str(ref_dump_path)) - ref_log_path = utils.get_log_path(ref_dir, model_path) + ref_log_path = utils.get_log_path(ref_dir, args.model_path) ref_time_stats = parse_time_stats_from_reference_log(ref_log_path) # B - target_dump_path = utils.get_output_path(target_dir, model_path) + target_dump_path = utils.get_output_path(target_dir, args.model_path) target_out = torch.load(str(target_dump_path)) - target_log_path = utils.get_log_path(target_dir, model_path) + target_log_path = utils.get_log_path(target_dir, args.model_path) target_time_stats = parse_time_stats_from_reference_log(target_log_path) - compare_correctness(ref_out, target_out, args) - - test_compiler_util.print_times_and_speedup(args, ref_time_stats, target_time_stats) - - -def eval_single_model(args): - ref_dir = "/tmp/eval_perf_diff/A" - target_dir = "/tmp/eval_perf_diff/B" - - EvalCfg = types.SimpleNamespace( - reference_config=types.SimpleNamespace( - **test_compiler_util.convert_to_dict(args.config)["reference_config"] - ), - target_config=types.SimpleNamespace( - **test_compiler_util.convert_to_dict(args.config)["target_config"] - ), + compare_correctness(ref_out, target_out, ref_args) + test_compiler_util.print_times_and_speedup( + ref_args, ref_time_stats, target_time_stats ) - reference_config = build_sub_config(EvalCfg.reference_config) - target_config = build_sub_config(EvalCfg.target_config) - - eval_single_model_with_single_backend(args.model_path, ref_dir, reference_config) - eval_single_model_with_single_backend(args.model_path, target_dir, target_config) - compare_perf_diff(reference_config, args.model_path, ref_dir, target_dir) - - -def build_sub_config(config): - sub = argparse.Namespace() - sub.seed = getattr(config, "seed", 123) - sub.compiler = getattr(config, "compiler", "inductor") - sub.device = getattr(config, "device", "cuda") - sub.op_lib = getattr(config, "op_lib", None) - sub.warmup = getattr(config, "warmup", 3) - sub.trials = getattr(config, "trials", 5) - sub.log_prompt = getattr(config, "log_prompt", "graph-net-bench-log") - sub.model_path_prefix = getattr(config, "model_path_prefix", None) - sub.backend_config = getattr(config, "backend_config", None) - return sub - def main(args): config_dict = test_compiler_util.convert_to_dict(args.config) diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index 3774d4176..5c8586f30 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -11,6 +11,7 @@ import random import numpy as np import platform +import types from contextlib import redirect_stdout, redirect_stderr from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend from graph_net_bench import test_compiler_util @@ -74,11 +75,11 @@ def load_class_from_file( return model_class -def get_compiler_backend(config) -> GraphCompilerBackend: +def 
get_compiler_backend(args) -> GraphCompilerBackend: """ - Dynamically load backend class based on config.compiler + Dynamically load backend class based on args.compiler """ - compiler_name = config.compiler.lower() + compiler_name = args.compiler.lower() module_name = f"graph_net_bench.torch.backend.{compiler_name}_backend" try: @@ -98,58 +99,57 @@ def get_compiler_backend(config) -> GraphCompilerBackend: raise ImportError(f"Failed to import backend module for '{compiler_name}': {e}") backend_config = ( - test_compiler_util.convert_to_dict(config.backend_config) - if config.backend_config is not None + test_compiler_util.convert_to_dict(args.backend_config) + if args.backend_config is not None else {} ) return backend_class(backend_config) -def get_model(model_path, config): - device = "xla" if config.compiler == "xla" else config.device +def get_model(args): + device = "xla" if args.compiler == "xla" else args.device # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') model_class = load_class_from_file( - model_path, class_name="GraphModule", device=device + args.model_path, class_name="GraphModule", device=device ) - model = model_class().to(torch.device(config.device)) + model = model_class().to(torch.device(args.device)) return model -def get_input_dict(model_path, config): - inputs_params = utils.load_converted_from_text(f"{model_path}") +def get_input_dict(args): + inputs_params = utils.load_converted_from_text(f"{args.model_path}") params = inputs_params["weight_info"] for tensor_meta in params.values(): if "device" in tensor_meta["info"]: - tensor_meta["info"]["device"] = config.device + tensor_meta["info"]["device"] = args.device return { - k: utils.replay_tensor(v).to(torch.device(config.device)) + k: utils.replay_tensor(v).to(torch.device(args.device)) for k, v in params.items() } -def measure_performance(model_call, config, compiler): +def measure_performance(model_call, args, compiler): stats = {} outs = model_call() # Warmup runs - for _ in range(config.warmup): + for _ in range(args.warmup): model_call() compiler.synchronize() - hardware_name = get_hardward_name(config.device) print( - f"[Profiling] Using device: {config.device} {hardware_name}, warm up {config.warmup}, trials {config.trials}", + f"[Profiling] Warm up {args.warmup}, Trials {args.trials}", file=sys.stderr, flush=True, ) - if "cuda" in config.device: + if "cuda" in args.device: torch.cuda.empty_cache() e2e_times = [] gpu_times = [] - for i in range(config.trials): + for i in range(args.trials): # End-to-end timing (naive_timer) duration_box = test_compiler_util.DurationBox(-1) with test_compiler_util.naive_timer(duration_box, compiler.synchronize): @@ -177,7 +177,7 @@ def measure_performance(model_call, config, compiler): else: # CPU or other devices e2e_times = [] - for i in range(config.trials): + for i in range(args.trials): duration_box = test_compiler_util.DurationBox(-1) with test_compiler_util.naive_timer(duration_box, compiler.synchronize): model_call() @@ -192,27 +192,27 @@ def measure_performance(model_call, config, compiler): return outs, stats -def eval_single_model_with_single_backend(model_path, output_path, config): - set_seed(config.seed) - os.makedirs(output_path, exist_ok=True) - log_path = utils.get_log_path(output_path, model_path) - output_dump_path = utils.get_output_path(output_path, model_path) +def eval_single_model_with_single_backend(args): + check_and_complete_args(args) + set_seed(args.seed) + os.makedirs(args.output_path, 
exist_ok=True) + log_path = utils.get_log_path(args.output_path, args.model_path) + output_dump_path = utils.get_output_path(args.output_path, args.model_path) print(f"Log path: {log_path}", file=sys.stderr, flush=True) print(f"Outputs path: {output_dump_path}", file=sys.stderr, flush=True) with open(log_path, "w", encoding="utf-8") as log_f: with redirect_stdout(log_f), redirect_stderr(log_f): - compiler = get_compiler_backend(config) + compiler = get_compiler_backend(args) - input_dict = get_input_dict(model_path, config) - model = get_model(model_path, config) + input_dict = get_input_dict(args) + model = get_model(args) model.eval() test_compiler_util.print_config( - model_path, - config, - get_hardward_name(config.device), - get_compiler_version(config.compiler), + args, + get_hardward_name(args.device), + get_compiler_version(args.compiler), ) success = False @@ -223,7 +223,7 @@ def eval_single_model_with_single_backend(model_path, output_path, config): def model_call(): return compiled_model(**input_dict) - outputs, time_stats = measure_performance(model_call, config, compiler) + outputs, time_stats = measure_performance(model_call, args, compiler) success = True except Exception as e: print( @@ -232,11 +232,11 @@ def model_call(): flush=True, ) - test_compiler_util.print_running_status(config, success) + test_compiler_util.print_running_status(args, success) if success: torch.save(outputs, str(output_dump_path)) test_compiler_util.print_with_log_prompt( - "[Performance][eager]:", json.dumps(time_stats), config.log_prompt + "[Performance][eager]:", json.dumps(time_stats), args.log_prompt ) with open(log_path, "r", encoding="utf-8") as f: @@ -244,6 +244,29 @@ def model_call(): print(content, file=sys.stderr, flush=True) +def check_and_complete_args(args): + """ + Ensure all required arguments are present with default values if missing + """ + defaults = { + "model_path": None, # Model path + "output_path": None, # Log and output directory + "seed": 123, # Random seed + "compiler": "inductor", # Compiler name + "device": "cuda", # Device for testing the compiler (e.g., 'cpu' or 'cuda') + "op_lib": None, # Operator library + "warmup": 3, # Number of warmup steps + "trials": 5, # Number of timing trials + "log_prompt": "graph-net-bench-log", # Log prompt for performance log filtering + "model_path_prefix": None, # Prefix path to model path in args.model-path + "backend_config": None, # backend configuration json + } + + for key, default in defaults.items(): + if not hasattr(args, key): + setattr(args, key, default) + + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Single Backend Performance Evaluation" @@ -270,8 +293,9 @@ def model_call(): help="base64 encode configuration json.", ) args = parser.parse_args() - eval_single_model_with_single_backend( - args.model_path, - args.output_path, + mut_args = types.SimpleNamespace( + model_path=args.model_path, + output_path=args.output_path, **test_compiler_util.convert_to_dict(args.config), ) + eval_single_model_with_single_backend(mut_args) From 0e6ec45faf2fe026640c7535e0ed4d2e567dfe02 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Fri, 16 Jan 2026 18:25:24 +0800 Subject: [PATCH 11/20] Simplify --- graph_net_bench/torch/eval_backend_diff.py | 78 +++++++++++----------- 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index c230f6bd8..ecafb71ae 100755 --- 
a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -109,72 +109,74 @@ def parse_time_stats_from_reference_log(log_path): return time_stats -def eval_multi_models(args, model_path_prefix=None, use_model_list=False): - sample_idx = 0 - failed_samples = [] - module_name = os.path.splitext(os.path.basename(__file__))[0] - +def _get_model_paths(args, model_path_prefix, use_model_list): if use_model_list: - assert os.path.isdir(model_path_prefix) - assert os.path.isfile(args.model_path_list) + assert os.path.isdir(model_path_prefix) and os.path.isfile(args.model_path_list) + test_samples = test_compiler_util.get_allow_samples( args.model_path_list, model_path_prefix ) - model_paths = [] - for rel_model_path in test_samples: - model_path = os.path.join(model_path_prefix, rel_model_path) - if os.path.exists(model_path) and os.path.exists( - os.path.join(model_path, "model.py") - ): - model_paths.append(model_path) + model_paths = [ + os.path.join(model_path_prefix, rel_model_path) + for rel_model_path in test_samples + if os.path.exists( + os.path.join(model_path_prefix, rel_model_path, "model.py") + ) + ] else: assert os.path.isdir(args.model_path) + test_samples = test_compiler_util.get_allow_samples( args.model_path_list, model_path_prefix ) - model_paths = [] - for model_path in path_utils.get_recursively_model_path(args.model_path): - if test_samples is None or os.path.abspath(model_path) in test_samples: - model_paths.append(model_path) + model_paths = [ + model_path + for model_path in path_utils.get_recursively_model_path(args.model_path) + if test_samples is None or os.path.abspath(model_path) in test_samples + ] + + return model_paths + - for model_path in model_paths: +def _create_model_args(model_path, config): + args = argparse.Namespace() + args.model_path = model_path + args.model_path_list = None + args.config = config + return args + + +def eval_multi_models(args, model_path_prefix=None, use_model_list=False): + module_name = os.path.splitext(os.path.basename(__file__))[0] + + model_paths = _get_model_paths(args, model_path_prefix, use_model_list) + failed_samples = [] + for sample_idx, model_path in enumerate(model_paths): print( f"[{sample_idx}] {module_name}, model_path: {model_path}", file=sys.stderr, flush=True, ) - try: - single_model_args = argparse.Namespace() - single_model_args.model_path = model_path - single_model_args.model_path_list = None - single_model_args.config = args.config - if path_utils.is_single_model_dir(model_path): - eval_single_model(single_model_args) + eval_single_model(_create_model_args(model_path, args.config)) else: - submodel_paths = path_utils.get_recursively_model_path(model_path) - for submodel_path in submodel_paths: - sub_args = argparse.Namespace() - sub_args.model_path = submodel_path - sub_args.model_path_list = None - sub_args.config = args.config - eval_single_model(sub_args) - cmd_ret = 0 + for submodel_path in path_utils.get_recursively_model_path(model_path): + eval_single_model(_create_model_args(submodel_path, args.config)) + success = True except KeyboardInterrupt: print("KeyboardInterrupt") sys.exit(1) except Exception: print("\n--- Full Traceback ---") traceback.print_exc() - cmd_ret = 1 + success = False - if cmd_ret != 0: + if not success: failed_samples.append(model_path) - sample_idx += 1 print( - f"Totally {sample_idx} verified samples, failed {len(failed_samples)} samples.", + f"Totally {len(model_paths)} verified samples, failed {len(failed_samples)} samples.", 
file=sys.stderr, flush=True, ) From a5fa17369258592c16abdd0ef69a47a92c5f677c Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Tue, 20 Jan 2026 13:57:16 +0800 Subject: [PATCH 12/20] modify args.config to separate args.reference_config and args.target_config --- graph_net_bench/torch/eval_backend_diff.py | 39 +++++++++++++--------- test/eval_backend_diff_test.sh | 31 ++++++++--------- 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index ecafb71ae..c254eafaf 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -138,11 +138,12 @@ def _get_model_paths(args, model_path_prefix, use_model_list): return model_paths -def _create_model_args(model_path, config): +def _create_model_args(model_path, reference_config, target_config): args = argparse.Namespace() args.model_path = model_path args.model_path_list = None - args.config = config + args.reference_config = reference_config + args.target_config = target_config return args @@ -157,12 +158,15 @@ def eval_multi_models(args, model_path_prefix=None, use_model_list=False): file=sys.stderr, flush=True, ) + + model_args = argparse.Namespace() + model_args.model_path = model_path + model_args.model_path_list = None + model_args.reference_config = args.reference_config + model_args.target_config = args.target_config + try: - if path_utils.is_single_model_dir(model_path): - eval_single_model(_create_model_args(model_path, args.config)) - else: - for submodel_path in path_utils.get_recursively_model_path(model_path): - eval_single_model(_create_model_args(submodel_path, args.config)) + eval_single_model(model_args) success = True except KeyboardInterrupt: print("KeyboardInterrupt") @@ -192,12 +196,12 @@ def eval_single_model(args): ref_args = types.SimpleNamespace( model_path=args.model_path, output_path=ref_dir, - **test_compiler_util.convert_to_dict(args.config)["reference_config"], + **test_compiler_util.convert_to_dict(args.reference_config), ) target_args = types.SimpleNamespace( model_path=args.model_path, output_path=target_dir, - **test_compiler_util.convert_to_dict(args.config)["target_config"], + **test_compiler_util.convert_to_dict(args.target_config), ) eval_single_model_with_single_backend(ref_args) @@ -225,8 +229,8 @@ def eval_single_model(args): def main(args): - config_dict = test_compiler_util.convert_to_dict(args.config) - model_path_prefix = config_dict.get("reference_config", {}).get("model_path_prefix") + ref_config = test_compiler_util.convert_to_dict(args.reference_config) + model_path_prefix = ref_config.get("model_path_prefix") if args.model_path_list and model_path_prefix: eval_multi_models(args, model_path_prefix, use_model_list=True) @@ -258,11 +262,16 @@ def main(args): help="Path to samples list, each line contains a sample path", ) parser.add_argument( - "--config", + "--reference-config", type=str, - required=False, - default=None, - help="base64 encode configuration json.", + required=True, + help="base64 encode reference config json.", + ) + parser.add_argument( + "--target-config", + type=str, + required=True, + help="base64 encode target config json.", ) args = parser.parse_args() main(args=args) diff --git a/test/eval_backend_diff_test.sh b/test/eval_backend_diff_test.sh index 17bba712e..1eaca5ecd 100755 --- a/test/eval_backend_diff_test.sh +++ b/test/eval_backend_diff_test.sh @@ -8,22 +8,23 @@ 
model_list="$AI4C_ROOT/test/workspace_eval_backend_diff/sample_list.txt" python3 -m graph_net_bench.torch.eval_backend_diff \ --model-path-list $model_list \ - --config $(base64 -w 0 <&1 | tee "$OUTPUT_PATH/validation.log" From 0c9e07b8d93e9f1aba28569a86995edab583b383 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Tue, 20 Jan 2026 14:24:51 +0800 Subject: [PATCH 13/20] reuse some code --- graph_net/torch/test_reference_device.py | 26 ++++-------------------- graph_net/torch/test_target_device.py | 22 +++++++------------- 2 files changed, 11 insertions(+), 37 deletions(-) diff --git a/graph_net/torch/test_reference_device.py b/graph_net/torch/test_reference_device.py index f022d2ba5..33d0ec8e4 100644 --- a/graph_net/torch/test_reference_device.py +++ b/graph_net/torch/test_reference_device.py @@ -11,30 +11,12 @@ from graph_net_bench import test_compiler_util from graph_net import model_path_util from graph_net_bench.torch import test_compiler - - -def get_reference_log_path(reference_dir, model_path): - model_name = model_path.split("torch_samples/")[-1].replace(os.sep, "_") - return os.path.join(reference_dir, f"{model_name}.log") - - -def get_reference_output_path(reference_dir, model_path): - model_name = model_path.split("torch_samples/")[-1].replace(os.sep, "_") - return os.path.join(reference_dir, f"{model_name}.pth") - - -def register_op_lib(op_lib): - if op_lib == "flaggems": - import flag_gems - - flag_gems.enable() - else: - pass +from graph_net_bench.torch import utils, eval_backend_perf def test_single_model(args): - ref_log = get_reference_log_path(args.reference_dir, args.model_path) - ref_dump = get_reference_output_path(args.reference_dir, args.model_path) + ref_log = utils.get_log_path(args.reference_dir, args.model_path) + ref_dump = utils.get_output_path(args.reference_dir, args.model_path) print(f"Reference log path: {ref_log}", file=sys.stderr, flush=True) print(f"Reference outputs path: {ref_dump}", file=sys.stderr, flush=True) @@ -149,7 +131,7 @@ def main(args): ref_dump_dir.mkdir(parents=True, exist_ok=True) if path_utils.is_single_model_dir(args.model_path): - register_op_lib(args.op_lib) + eval_backend_perf.register_op_lib(args.op_lib) test_single_model(args) else: test_multi_models(args) diff --git a/graph_net/torch/test_target_device.py b/graph_net/torch/test_target_device.py index ec2085a32..cf56dee69 100644 --- a/graph_net/torch/test_target_device.py +++ b/graph_net/torch/test_target_device.py @@ -8,7 +8,7 @@ from graph_net_bench import path_utils from graph_net_bench import test_compiler_util from graph_net import model_path_util -from graph_net_bench.torch import test_compiler, test_reference_device +from graph_net_bench.torch import test_compiler, utils, eval_backend_perf def parse_config_from_reference_log(log_path): @@ -46,9 +46,7 @@ def parse_time_stats_from_reference_log(log_path): def update_args_and_set_seed(args, model_path): - ref_log = test_reference_device.get_reference_log_path( - args.reference_dir, model_path - ) + ref_log = utils.get_log_path(args.reference_dir, model_path) config = parse_config_from_reference_log(ref_log) vars(args)["model_path"] = model_path vars(args)["compiler"] = config.get("compiler") @@ -100,14 +98,10 @@ def model_call(): if test_compiler_util.get_subgraph_tag(args.model_path): model_name += "_" + test_compiler_util.get_subgraph_tag(args.model_path) - ref_dump = test_reference_device.get_reference_output_path( - args.reference_dir, args.model_path - ) + ref_dump = 
utils.get_output_path(args.reference_dir, args.model_path) ref_out = torch.load(str(ref_dump)) - ref_log = test_reference_device.get_reference_log_path( - args.reference_dir, args.model_path - ) + ref_log = utils.get_log_path(args.reference_dir, args.model_path) ref_time_stats = parse_time_stats_from_reference_log(ref_log) if success: @@ -117,7 +111,7 @@ def model_call(): def is_reference_log_exist(reference_dir, model_path): - log_path = test_reference_device.get_reference_log_path(reference_dir, model_path) + log_path = utils.get_log_path(reference_dir, model_path) return os.path.isfile(log_path) @@ -171,16 +165,14 @@ def main(args): if path_utils.is_single_model_dir(args.model_path): if args.op_lib == "origin": - ref_log = test_reference_device.get_reference_log_path( - args.reference_dir, args.model_path - ) + ref_log = utils.get_log_path(args.reference_dir, args.model_path) config = parse_config_from_reference_log(ref_log) vars(args)["op_lib"] = config.get("op_lib") test_compiler_util.print_with_log_prompt( "[Config] op_lib:", args.op_lib, args.log_prompt ) else: - test_reference_device.register_op_lib(args.op_lib) + eval_backend_perf.register_op_lib(args.op_lib) args = update_args_and_set_seed(args, args.model_path) test_single_model(args) From ebd46af74be6fb0ee7828cb4eca27754afcf1a11 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Tue, 20 Jan 2026 14:37:02 +0800 Subject: [PATCH 14/20] Add unittest on test device; minor fix --- graph_net/torch/test_reference_device.py | 2 +- test/eval_device_diff_test.sh | 37 ++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100755 test/eval_device_diff_test.sh diff --git a/graph_net/torch/test_reference_device.py b/graph_net/torch/test_reference_device.py index 33d0ec8e4..6a28095e4 100644 --- a/graph_net/torch/test_reference_device.py +++ b/graph_net/torch/test_reference_device.py @@ -119,7 +119,7 @@ def test_multi_models(args): def main(args): assert os.path.isdir(args.model_path) # Support all torch compilers - valid_compilers = list(test_compiler.registry_backend.keys()) + valid_compilers = list(test_compiler.compiler_backend_name2class.keys()) assert ( args.compiler in valid_compilers ), f"Compiler must be one of {valid_compilers}" diff --git a/test/eval_device_diff_test.sh b/test/eval_device_diff_test.sh new file mode 100755 index 000000000..10e0ab766 --- /dev/null +++ b/test/eval_device_diff_test.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +AI4C_ROOT=$(python3 -c "import graph_net_bench; import os; print(os.path.dirname(os.path.dirname(graph_net_bench.__file__)))") +OUTPUT_PATH=/tmp/workspace_eval_device_diff_test +REFERENCE_DIR="$OUTPUT_PATH/reference" + +mkdir -p "$OUTPUT_PATH" +mkdir -p "$REFERENCE_DIR" + +MODEL_PATH="$AI4C_ROOT/samples/ultralytics/yolov3-tinyu" + +echo "==========================================" +echo "Step 1: Generate reference on device A (simulated)" +echo "==========================================" +python3 -m graph_net.torch.test_reference_device \ + --model-path "$MODEL_PATH" \ + --compiler nope \ + --device cuda \ + --warmup 1 \ + --trials 1 \ + --reference-dir "$REFERENCE_DIR" \ + 2>&1 | tee "$OUTPUT_PATH/reference.log" + +echo "" +echo "==========================================" +echo "Step 2: Compare on device B (simulated)" +echo "==========================================" +python3 -m graph_net.torch.test_target_device \ + --model-path "$MODEL_PATH" \ + --device cuda \ + --reference-dir "$REFERENCE_DIR" \ + 2>&1 | tee "$OUTPUT_PATH/target.log" + +echo "" +echo 
"==========================================" +echo "Test completed. Logs saved to: $OUTPUT_PATH" +echo "==========================================" \ No newline at end of file From 74b5238ef64ed5a1c08eceed44bbe0e77cc3f72c Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Tue, 20 Jan 2026 15:15:45 +0800 Subject: [PATCH 15/20] reuse eval_backend_perf, eval_backend_diff instead of test_compiler in test_device --- graph_net/torch/test_reference_device.py | 96 +++++-------------- graph_net/torch/test_target_device.py | 114 +++++++++-------------- 2 files changed, 66 insertions(+), 144 deletions(-) diff --git a/graph_net/torch/test_reference_device.py b/graph_net/torch/test_reference_device.py index 6a28095e4..bb80c1e8c 100644 --- a/graph_net/torch/test_reference_device.py +++ b/graph_net/torch/test_reference_device.py @@ -1,76 +1,33 @@ import argparse -import torch import os -from pathlib import Path -from contextlib import redirect_stdout, redirect_stderr -import json import sys -import traceback +import types +from pathlib import Path from graph_net_bench import path_utils -from graph_net_bench import test_compiler_util from graph_net import model_path_util -from graph_net_bench.torch import test_compiler -from graph_net_bench.torch import utils, eval_backend_perf +from graph_net_bench.torch import eval_backend_perf + + +def convert_args_for_eval_backend(args): + """Convert test_reference_device args to eval_backend_perf args format.""" + return types.SimpleNamespace( + model_path=args.model_path, + output_path=args.reference_dir, + seed=args.seed, + compiler=args.compiler, + device=args.device, + op_lib=args.op_lib, + warmup=args.warmup, + trials=args.trials, + log_prompt=args.log_prompt, + backend_config=getattr(args, "config", None), + ) def test_single_model(args): - ref_log = utils.get_log_path(args.reference_dir, args.model_path) - ref_dump = utils.get_output_path(args.reference_dir, args.model_path) - print(f"Reference log path: {ref_log}", file=sys.stderr, flush=True) - print(f"Reference outputs path: {ref_dump}", file=sys.stderr, flush=True) - - with open(ref_log, "w", encoding="utf-8") as log_f: - with redirect_stdout(log_f), redirect_stderr(log_f): - compiler = test_compiler.get_compiler_backend(args) - - input_dict = test_compiler.get_input_dict(args) - model = test_compiler.get_model(args) - model.eval() - - test_compiler_util.print_with_log_prompt( - "[Config] seed:", args.seed, args.log_prompt - ) - - test_compiler_util.print_basic_config( - args, - test_compiler.get_hardward_name(args), - test_compiler.get_compile_framework_version(args), - ) - - test_compiler_util.print_with_log_prompt( - "[Config] op_lib:", args.op_lib, args.log_prompt - ) - - success = False - time_stats = {} - try: - compiled_model = compiler(model) - - def model_call(): - return compiled_model(**input_dict) - - outputs, time_stats = test_compiler.measure_performance( - model_call, args, compiler - ) - success = True - except Exception as e: - print( - f"Run model failed: {str(e)}\n{traceback.format_exc()}", - file=sys.stderr, - flush=True, - ) - - test_compiler_util.print_running_status(args, success) - if success: - torch.save(outputs, str(ref_dump)) - test_compiler_util.print_with_log_prompt( - "[Performance][eager]:", json.dumps(time_stats), args.log_prompt - ) - - with open(ref_log, "r", encoding="utf-8") as f: - content = f.read() - print(content, file=sys.stderr, flush=True) + eval_args = convert_args_for_eval_backend(args) + 
eval_backend_perf.eval_single_model_with_single_backend(eval_args) def test_multi_models(args): @@ -118,14 +75,9 @@ def test_multi_models(args): def main(args): assert os.path.isdir(args.model_path) - # Support all torch compilers - valid_compilers = list(test_compiler.compiler_backend_name2class.keys()) - assert ( - args.compiler in valid_compilers - ), f"Compiler must be one of {valid_compilers}" - assert args.device in ["cuda"] - - test_compiler.set_seed(random_seed=args.seed) + assert args.device in ["cuda", "cpu"] + + eval_backend_perf.set_seed(args.seed) ref_dump_dir = Path(args.reference_dir) ref_dump_dir.mkdir(parents=True, exist_ok=True) diff --git a/graph_net/torch/test_target_device.py b/graph_net/torch/test_target_device.py index cf56dee69..ee46ceee6 100644 --- a/graph_net/torch/test_target_device.py +++ b/graph_net/torch/test_target_device.py @@ -1,14 +1,13 @@ import argparse import os -import json import sys -import traceback +import types import torch from graph_net_bench import path_utils from graph_net_bench import test_compiler_util from graph_net import model_path_util -from graph_net_bench.torch import test_compiler, utils, eval_backend_perf +from graph_net_bench.torch import utils, eval_backend_perf, eval_backend_diff def parse_config_from_reference_log(log_path): @@ -30,84 +29,55 @@ def parse_config_from_reference_log(log_path): return config -def parse_time_stats_from_reference_log(log_path): - assert os.path.isfile( - log_path - ), f"{log_path} does not exist or is not a regular file." - - with open(log_path, "r", encoding="utf-8") as f: - lines = f.readlines() - for line in reversed(lines): - if "[Performance][eager]" in line: - start = line.find("{") - end = line.rfind("}") - time_stats = json.loads(line[start : end + 1]) - return time_stats - - -def update_args_and_set_seed(args, model_path): +def get_ref_config_from_log(args, model_path): + """Extract config from reference log file.""" ref_log = utils.get_log_path(args.reference_dir, model_path) config = parse_config_from_reference_log(ref_log) - vars(args)["model_path"] = model_path - vars(args)["compiler"] = config.get("compiler") - vars(args)["trials"] = int(config.get("trials")) - vars(args)["warmup"] = int(config.get("warmup")) - test_compiler.set_seed(random_seed=int(config.get("seed"))) - return args - - -def test_single_model(args): - compiler = test_compiler.get_compiler_backend(args) + return config - input_dict = test_compiler.get_input_dict(args) - model = test_compiler.get_model(args) - model.eval() - model_path = os.path.normpath(args.model_path) - test_compiler_util.print_with_log_prompt( - "[Processing]", model_path, args.log_prompt - ) - test_compiler_util.print_basic_config( - args, - test_compiler.get_hardward_name(args), - test_compiler.get_compile_framework_version(args), +def convert_args_for_eval_backend(args, output_path): + """Convert test_target_device args to eval_backend_perf args format.""" + return types.SimpleNamespace( + model_path=args.model_path, + output_path=output_path, + seed=args.seed, + compiler=args.compiler, + device=args.device, + op_lib=args.op_lib, + warmup=args.warmup, + trials=args.trials, + log_prompt=args.log_prompt, + backend_config=getattr(args, "config", None), ) - success = False - time_stats = {} - try: - compiled_model = compiler(model) - def model_call(): - return compiled_model(**input_dict) - - outputs, time_stats = test_compiler.measure_performance( - model_call, args, compiler - ) - success = True - except Exception as e: - print( - f"Run model failed: 
{str(e)}\n{traceback.format_exc()}", - file=sys.stderr, - flush=True, - ) +def test_single_model(args): + target_dir = "/tmp/eval_device_diff/target" - test_compiler_util.print_running_status(args, success) + ref_config = get_ref_config_from_log(args, args.model_path) + vars(args)["compiler"] = ref_config.get("compiler") + vars(args)["trials"] = int(ref_config.get("trials")) + vars(args)["warmup"] = int(ref_config.get("warmup")) + vars(args)["seed"] = int(ref_config.get("seed")) - model_name = test_compiler_util.get_model_name(args.model_path) - if test_compiler_util.get_subgraph_tag(args.model_path): - model_name += "_" + test_compiler_util.get_subgraph_tag(args.model_path) + eval_args = convert_args_for_eval_backend(args, target_dir) + eval_backend_perf.eval_single_model_with_single_backend(eval_args) ref_dump = utils.get_output_path(args.reference_dir, args.model_path) ref_out = torch.load(str(ref_dump)) - ref_log = utils.get_log_path(args.reference_dir, args.model_path) - ref_time_stats = parse_time_stats_from_reference_log(ref_log) + ref_time_stats = eval_backend_diff.parse_time_stats_from_reference_log(ref_log) - if success: - test_compiler.compare_correctness(ref_out, outputs, args) + target_dump = utils.get_output_path(target_dir, args.model_path) + target_out = torch.load(str(target_dump)) + target_log = utils.get_log_path(target_dir, args.model_path) + target_time_stats = eval_backend_diff.parse_time_stats_from_reference_log( + target_log + ) - test_compiler_util.print_times_and_speedup(args, ref_time_stats, time_stats) + eval_backend_diff.compare_correctness(ref_out, target_out, eval_args) + test_compiler_util.print_times_and_speedup(args, ref_time_stats, target_time_stats) def is_reference_log_exist(reference_dir, model_path): @@ -165,16 +135,16 @@ def main(args): if path_utils.is_single_model_dir(args.model_path): if args.op_lib == "origin": - ref_log = utils.get_log_path(args.reference_dir, args.model_path) - config = parse_config_from_reference_log(ref_log) - vars(args)["op_lib"] = config.get("op_lib") - test_compiler_util.print_with_log_prompt( - "[Config] op_lib:", args.op_lib, args.log_prompt + ref_config = get_ref_config_from_log(args, args.model_path) + vars(args)["op_lib"] = ref_config.get("op_lib") + print( + f"{args.log_prompt} [Config] op_lib: {args.op_lib}", + file=sys.stderr, + flush=True, ) else: eval_backend_perf.register_op_lib(args.op_lib) - args = update_args_and_set_seed(args, args.model_path) test_single_model(args) else: test_multi_models(args) From d8514e4a13f67aac1bcd293cbba62cd860008b21 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Tue, 20 Jan 2026 16:48:12 +0800 Subject: [PATCH 16/20] move utest --- .../test/test_device_test.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/eval_device_diff_test.sh => graph_net/test/test_device_test.sh (100%) diff --git a/test/eval_device_diff_test.sh b/graph_net/test/test_device_test.sh similarity index 100% rename from test/eval_device_diff_test.sh rename to graph_net/test/test_device_test.sh From b83b6a967770a644881a6751800ef7e7dc144a28 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Tue, 20 Jan 2026 16:53:13 +0800 Subject: [PATCH 17/20] minor change --- graph_net_bench/torch/eval_backend_diff.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index c254eafaf..cfa171dc6 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ 
b/graph_net_bench/torch/eval_backend_diff.py @@ -190,8 +190,8 @@ def eval_multi_models(args, model_path_prefix=None, use_model_list=False): def eval_single_model(args): - ref_dir = "/tmp/eval_perf_diff/A" - target_dir = "/tmp/eval_perf_diff/B" + ref_dir = "/tmp/eval_perf_diff/reference" + target_dir = "/tmp/eval_perf_diff/target" ref_args = types.SimpleNamespace( model_path=args.model_path, From 3a7d9baa928c547d258a327ef8cd7237f1da683f Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Tue, 20 Jan 2026 21:45:02 +0800 Subject: [PATCH 18/20] =?UTF-8?q?Add=20local=5Frunner=EF=BC=8Cprocess=5Fru?= =?UTF-8?q?nner=20and=20remote=5Frunner?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- graph_net/torch/test_target_device.py | 6 +- graph_net_bench/test_compiler_util.py | 62 ++++++++ graph_net_bench/torch/eval_backend_diff.py | 85 +++++----- graph_net_bench/torch/runner/__init__.py | 14 ++ graph_net_bench/torch/runner/base_runner.py | 149 ++++++++++++++++++ graph_net_bench/torch/runner/local_runner.py | 99 ++++++++++++ .../torch/runner/process_runner.py | 102 ++++++++++++ graph_net_bench/torch/runner/remote_runner.py | 135 ++++++++++++++++ test/eval_device_diff_test.sh | 38 +++++ 9 files changed, 645 insertions(+), 45 deletions(-) create mode 100644 graph_net_bench/torch/runner/__init__.py create mode 100644 graph_net_bench/torch/runner/base_runner.py create mode 100644 graph_net_bench/torch/runner/local_runner.py create mode 100644 graph_net_bench/torch/runner/process_runner.py create mode 100644 graph_net_bench/torch/runner/remote_runner.py create mode 100755 test/eval_device_diff_test.sh diff --git a/graph_net/torch/test_target_device.py b/graph_net/torch/test_target_device.py index ee46ceee6..88cc9a650 100644 --- a/graph_net/torch/test_target_device.py +++ b/graph_net/torch/test_target_device.py @@ -67,14 +67,12 @@ def test_single_model(args): ref_dump = utils.get_output_path(args.reference_dir, args.model_path) ref_out = torch.load(str(ref_dump)) ref_log = utils.get_log_path(args.reference_dir, args.model_path) - ref_time_stats = eval_backend_diff.parse_time_stats_from_reference_log(ref_log) + ref_time_stats = test_compiler_util.parse_performance_stats(str(ref_log)) target_dump = utils.get_output_path(target_dir, args.model_path) target_out = torch.load(str(target_dump)) target_log = utils.get_log_path(target_dir, args.model_path) - target_time_stats = eval_backend_diff.parse_time_stats_from_reference_log( - target_log - ) + target_time_stats = test_compiler_util.parse_performance_stats(str(target_log)) eval_backend_diff.compare_correctness(ref_out, target_out, eval_args) test_compiler_util.print_times_and_speedup(args, ref_time_stats, target_time_stats) diff --git a/graph_net_bench/test_compiler_util.py b/graph_net_bench/test_compiler_util.py index 44ccc703e..a83f55994 100644 --- a/graph_net_bench/test_compiler_util.py +++ b/graph_net_bench/test_compiler_util.py @@ -7,6 +7,7 @@ import shutil import base64 import numpy as np +from typing import Dict, Any from dataclasses import dataclass from contextlib import contextmanager @@ -381,3 +382,64 @@ def convert_to_dict(config_str): config = json.loads(config_str) assert isinstance(config, dict), f"config should be a dict. 
{config_str=}" return config + + +def convert_to_base64(config_dict): + """Convert a dict to base64 encoded JSON string.""" + if config_dict is None: + return "" + config_str = json.dumps(config_dict) + return base64.b64encode(config_str.encode("utf-8")).decode("utf-8") + + +def parse_performance_stats(log_path: str) -> Dict[str, Any]: + """Parse performance statistics from log file. + + Args: + log_path: Path to the log file + + Returns: + Dictionary containing time statistics + + Raises: + FileNotFoundError: If log_path does not exist + ValueError: If performance data cannot be parsed + """ + if not os.path.isfile(log_path): + raise FileNotFoundError(f"Log file not found: {log_path}") + + with open(log_path, "r", encoding="utf-8") as f: + lines = f.readlines() + + # Search backwards for performance data + for line in reversed(lines): + if "[Performance][eager]" in line: + start = line.find("{") + end = line.rfind("}") + if start != -1 and end != -1: + try: + time_stats = json.loads(line[start : end + 1]) + return time_stats + except json.JSONDecodeError as e: + raise ValueError(f"Failed to parse performance stats: {e}") + + raise ValueError("No performance statistics found in log file") + + +def extract_log_content(log_path: str) -> str: + """Extract and return the entire content of a log file. + + Args: + log_path: Path to the log file + + Returns: + String containing the log content + + Raises: + FileNotFoundError: If log_path does not exist + """ + if not os.path.isfile(log_path): + raise FileNotFoundError(f"Log file not found: {log_path}") + + with open(log_path, "r", encoding="utf-8") as f: + return f.read() diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index cfa171dc6..68e1f2f02 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -1,15 +1,13 @@ -from . import utils import argparse import torch import sys import os import os.path import traceback -import json import types from graph_net_bench import test_compiler_util from graph_net_bench import path_utils -from .eval_backend_perf import eval_single_model_with_single_backend +from .runner import RunnerConfig, RunResult, create_runner def compare_correctness(expected_out, compiled_out, args): @@ -94,21 +92,6 @@ def get_cmp_diff_count(expected_out, compiled_out, atol, rtol): return " ".join(results) -def parse_time_stats_from_reference_log(log_path): - assert os.path.isfile( - log_path - ), f"{log_path} does not exist or is not a regular file." - - with open(log_path, "r", encoding="utf-8") as f: - lines = f.readlines() - for line in reversed(lines): - if "[Performance][eager]" in line: - start = line.find("{") - end = line.rfind("}") - time_stats = json.loads(line[start : end + 1]) - return time_stats - - def _get_model_paths(args, model_path_prefix, use_model_list): if use_model_list: assert os.path.isdir(model_path_prefix) and os.path.isfile(args.model_path_list) @@ -190,41 +173,61 @@ def eval_multi_models(args, model_path_prefix=None, use_model_list=False): def eval_single_model(args): + """ + Unified evaluation using Runner abstraction. + Supports local, process, and remote execution via runner_type in config. 
+ """ ref_dir = "/tmp/eval_perf_diff/reference" target_dir = "/tmp/eval_perf_diff/target" - ref_args = types.SimpleNamespace( - model_path=args.model_path, - output_path=ref_dir, - **test_compiler_util.convert_to_dict(args.reference_config), + ref_config_dict = test_compiler_util.convert_to_dict(args.reference_config) + target_config_dict = test_compiler_util.convert_to_dict(args.target_config) + + ref_runner_config = RunnerConfig.from_dict(ref_config_dict) + target_runner_config = RunnerConfig.from_dict(target_config_dict) + + ref_runner = create_runner(ref_runner_config) + target_runner = create_runner(target_runner_config) + + print( + f"[eval_backend_diff] Reference runner: {ref_runner_config.strategy.runner_type.value}", + file=sys.stderr, + flush=True, ) - target_args = types.SimpleNamespace( - model_path=args.model_path, - output_path=target_dir, - **test_compiler_util.convert_to_dict(args.target_config), + print( + f"[eval_backend_diff] Target runner: {target_runner_config.strategy.runner_type.value}", + file=sys.stderr, + flush=True, ) - eval_single_model_with_single_backend(ref_args) - eval_single_model_with_single_backend(target_args) + ref_result = ref_runner.run(args.model_path, ref_dir) + if not ref_result.success: + raise RuntimeError(f"Reference run failed: {ref_result.error_message}") + + target_result = target_runner.run(args.model_path, target_dir) + if not target_result.success: + raise RuntimeError(f"Target run failed: {target_result.error_message}") - # compare_perf_diff - # A - ref_dump_path = utils.get_output_path(ref_dir, args.model_path) - ref_out = torch.load(str(ref_dump_path)) + compare_results(ref_result, target_result, ref_runner_config) - ref_log_path = utils.get_log_path(ref_dir, args.model_path) - ref_time_stats = parse_time_stats_from_reference_log(ref_log_path) - # B - target_dump_path = utils.get_output_path(target_dir, args.model_path) - target_out = torch.load(str(target_dump_path)) +def compare_results( + ref_result: RunResult, target_result: RunResult, config: RunnerConfig +): + """Compare outputs and performance between reference and target results.""" + if ref_result.outputs is None or target_result.outputs is None: + print("[Warning] Cannot compare: missing outputs", file=sys.stderr) + return - target_log_path = utils.get_log_path(target_dir, args.model_path) - target_time_stats = parse_time_stats_from_reference_log(target_log_path) + dummy_args = types.SimpleNamespace( + log_prompt=config.execution.log_prompt, + compiler=config.execution.compiler, + device=config.execution.device, + ) - compare_correctness(ref_out, target_out, ref_args) + compare_correctness(ref_result.outputs, target_result.outputs, dummy_args) test_compiler_util.print_times_and_speedup( - ref_args, ref_time_stats, target_time_stats + dummy_args, ref_result.time_stats, target_result.time_stats ) diff --git a/graph_net_bench/torch/runner/__init__.py b/graph_net_bench/torch/runner/__init__.py new file mode 100644 index 000000000..643f28f91 --- /dev/null +++ b/graph_net_bench/torch/runner/__init__.py @@ -0,0 +1,14 @@ +from .base_runner import BaseRunner, RunResult, RunnerConfig, create_runner +from .local_runner import LocalRunner +from .process_runner import ProcessRunner +from .remote_runner import RemoteRunner + +__all__ = [ + "BaseRunner", + "RunResult", + "RunnerConfig", + "LocalRunner", + "ProcessRunner", + "RemoteRunner", + "create_runner", +] diff --git a/graph_net_bench/torch/runner/base_runner.py b/graph_net_bench/torch/runner/base_runner.py new file mode 100644 index 
000000000..25d0882c8 --- /dev/null +++ b/graph_net_bench/torch/runner/base_runner.py @@ -0,0 +1,149 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Any, Dict, Optional, Tuple +from pathlib import Path +from enum import Enum + + +class RunnerType(Enum): + LOCAL = "local" + PROCESS = "process" + REMOTE = "remote" + + +@dataclass +class ExecutionConfig: + """Configuration specific to model execution.""" + + compiler: str = "inductor" + device: str = "cuda" + op_lib: str = "default" + warmup: int = 5 + trials: int = 10 + seed: int = 123 + log_prompt: str = "graph-net-runner-log" + backend_config: Optional[str] = None + + def to_dict(self) -> Dict[str, Any]: + return { + k: v + for k, v in self.__dict__.items() + if v is not None and not k.startswith("_") + } + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> "ExecutionConfig": + return cls(**{k: v for k, v in d.items() if hasattr(cls, k)}) + + +@dataclass +class RunnerStrategyConfig: + """Configuration for runner strategy selection.""" + + runner_type: RunnerType = RunnerType.LOCAL + remote_machine: str = "localhost" + remote_port: int = 50052 + subprocess_timeout: int = 600 + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> "RunnerStrategyConfig": + runner_type_str = d.get("runner_type", "local") + try: + runner_type = RunnerType(runner_type_str.lower()) + except ValueError: + runner_type = RunnerType.LOCAL + + return cls( + runner_type=runner_type, + remote_machine=d.get("machine", "localhost"), + remote_port=d.get("port", 50052), + subprocess_timeout=d.get("subprocess_timeout", 600), + ) + + +@dataclass +class RunnerConfig: + """Unified configuration combining execution and strategy configs.""" + + execution: ExecutionConfig + strategy: RunnerStrategyConfig + + @classmethod + def from_dict(cls, d: Dict[str, Any]) -> "RunnerConfig": + execution_config = ExecutionConfig.from_dict(d) + strategy_config = RunnerStrategyConfig.from_dict(d) + return cls(execution=execution_config, strategy=strategy_config) + + def to_dict(self) -> Dict[str, Any]: + return { + **self.execution.to_dict(), + "runner_type": self.strategy.runner_type.value, + "machine": self.strategy.remote_machine, + "port": self.strategy.remote_port, + "subprocess_timeout": self.strategy.subprocess_timeout, + } + + +@dataclass +class RunResult: + """Result of a single backend run.""" + + success: bool = False + outputs: Optional[Tuple[Any, ...]] = None + time_stats: Dict[str, Any] = field(default_factory=dict) + log_content: str = "" + error_message: str = "" + + output_path: Optional[Path] = None + log_path: Optional[Path] = None + + +class BaseRunner(ABC): + """Abstract base class for model execution runners.""" + + def __init__(self, config: RunnerConfig): + self.config = config + + @abstractmethod + def run(self, model_path: str, output_dir: str) -> RunResult: + """ + Execute model evaluation and return results. + + Args: + model_path: Path to model directory (containing model.py, graph_net.json, etc.) 
+ output_dir: Directory to store outputs and logs + + Returns: + RunResult containing outputs, timing stats, and logs + """ + pass + + def _get_output_path(self, output_dir: str, model_path: str) -> Path: + from graph_net_bench.torch import utils + + return Path(utils.get_output_path(output_dir, model_path)) + + def _get_log_path(self, output_dir: str, model_path: str) -> Path: + from graph_net_bench.torch import utils + + return Path(utils.get_log_path(output_dir, model_path)) + + +def create_runner(config: RunnerConfig) -> BaseRunner: + """Factory function to create appropriate runner based on config.""" + runner_type = config.strategy.runner_type + + if runner_type == RunnerType.LOCAL: + from .local_runner import LocalRunner + + return LocalRunner(config) + elif runner_type == RunnerType.PROCESS: + from .process_runner import ProcessRunner + + return ProcessRunner(config) + elif runner_type == RunnerType.REMOTE: + from .remote_runner import RemoteRunner + + return RemoteRunner(config) + else: + raise ValueError(f"Unknown runner_type: {runner_type}") diff --git a/graph_net_bench/torch/runner/local_runner.py b/graph_net_bench/torch/runner/local_runner.py new file mode 100644 index 000000000..3d07a6470 --- /dev/null +++ b/graph_net_bench/torch/runner/local_runner.py @@ -0,0 +1,99 @@ +import os +import sys +import json +import types +import traceback +from io import StringIO +from contextlib import redirect_stdout, redirect_stderr + +import torch + +from .base_runner import BaseRunner, RunResult + + +class LocalRunner(BaseRunner): + """Execute model evaluation in the current process.""" + + def run(self, model_path: str, output_dir: str) -> RunResult: + from graph_net_bench.torch import eval_backend_perf + + os.makedirs(output_dir, exist_ok=True) + + log_path = self._get_log_path(output_dir, model_path) + output_path = self._get_output_path(output_dir, model_path) + + eval_args = types.SimpleNamespace( + model_path=model_path, + output_path=output_dir, + seed=self.config.execution.seed, + compiler=self.config.execution.compiler, + device=self.config.execution.device, + op_lib=self.config.execution.op_lib, + warmup=self.config.execution.warmup, + trials=self.config.execution.trials, + log_prompt=self.config.execution.log_prompt, + backend_config=self.config.execution.backend_config, + ) + + log_buffer = StringIO() + result = RunResult( + output_path=output_path, + log_path=log_path, + ) + + try: + eval_backend_perf.register_op_lib(self.config.execution.op_lib) + eval_backend_perf.set_seed(self.config.execution.seed) + + with redirect_stdout(log_buffer), redirect_stderr(log_buffer): + self._run_evaluation(eval_args, result) + + except Exception as e: + result.success = False + result.error_message = f"{str(e)}\n{traceback.format_exc()}" + log_buffer.write(f"\n[ERROR] {result.error_message}\n") + + result.log_content = log_buffer.getvalue() + + with open(log_path, "w", encoding="utf-8") as f: + f.write(result.log_content) + + print(result.log_content, file=sys.stderr, flush=True) + + return result + + def _run_evaluation(self, args: types.SimpleNamespace, result: RunResult): + from graph_net_bench.torch import eval_backend_perf + from graph_net_bench import test_compiler_util + + compiler = eval_backend_perf.get_compiler_backend(args) + input_dict = eval_backend_perf.get_input_dict(args) + model = eval_backend_perf.get_model(args) + model.eval() + + test_compiler_util.print_config( + args, + eval_backend_perf.get_hardward_name(args.device), + 
eval_backend_perf.get_compiler_version(args.compiler), + ) + + compiled_model = compiler(model) + + def model_call(): + return compiled_model(**input_dict) + + outputs, time_stats = eval_backend_perf.measure_performance( + model_call, args, compiler + ) + + result.success = True + result.outputs = outputs + result.time_stats = time_stats + + if result.output_path: + torch.save(outputs, str(result.output_path)) + + test_compiler_util.print_running_status(args, True) + test_compiler_util.print_with_log_prompt( + "[Performance][eager]:", json.dumps(time_stats), args.log_prompt + ) diff --git a/graph_net_bench/torch/runner/process_runner.py b/graph_net_bench/torch/runner/process_runner.py new file mode 100644 index 000000000..9ac68607e --- /dev/null +++ b/graph_net_bench/torch/runner/process_runner.py @@ -0,0 +1,102 @@ +import os +import sys +import subprocess +from pathlib import Path + +import torch + +from .base_runner import BaseRunner, RunResult + + +class ProcessRunner(BaseRunner): + """Execute model evaluation in a separate subprocess on the local machine.""" + + def run(self, model_path: str, output_dir: str) -> RunResult: + os.makedirs(output_dir, exist_ok=True) + + log_path = self._get_log_path(output_dir, model_path) + output_path = self._get_output_path(output_dir, model_path) + + result = RunResult( + output_path=output_path, + log_path=log_path, + ) + + cmd = self._build_command(model_path, output_dir) + print(f"[ProcessRunner] Executing: {cmd}", file=sys.stderr, flush=True) + + try: + env = os.environ.copy() + repo_root = Path(__file__).resolve().parents[3] + env["PYTHONPATH"] = f"{repo_root}:{env.get('PYTHONPATH', '')}" + + proc = subprocess.run( + cmd, + shell=True, + env=env, + capture_output=True, + text=True, + timeout=self.config.strategy.subprocess_timeout, + ) + + result.log_content = proc.stderr or "" + + if proc.returncode != 0: + result.success = False + result.error_message = ( + f"Process exited with code {proc.returncode}\n" + f"stdout: {proc.stdout}\n" + f"stderr: {proc.stderr}" + ) + else: + result.success = True + self._parse_result(result, output_dir, model_path) + + except subprocess.TimeoutExpired as e: + result.success = False + result.error_message = f"Process timed out: {e}" + except Exception as e: + result.success = False + result.error_message = f"Process execution failed: {e}" + + print(result.log_content, file=sys.stderr, flush=True) + return result + + def _build_command(self, model_path: str, output_dir: str) -> str: + cmd_parts = [ + sys.executable, + "-m", + "graph_net_bench.torch.eval_backend_perf", + "--model-path", + model_path, + "--output-path", + output_dir, + ] + + config_dict = self.config.to_dict() + from graph_net_bench import test_compiler_util + + config_str = test_compiler_util.convert_to_base64(config_dict) + cmd_parts.extend(["--config", config_str]) + + return " ".join(cmd_parts) + + def _parse_result(self, result: RunResult, output_dir: str, model_path: str): + from graph_net_bench import test_compiler_util + + if result.output_path and result.output_path.exists(): + try: + result.outputs = torch.load(str(result.output_path)) + except Exception as e: + result.error_message += f"\nFailed to load outputs: {e}" + + if result.log_path and result.log_path.exists(): + try: + result.log_content = test_compiler_util.extract_log_content( + str(result.log_path) + ) + result.time_stats = test_compiler_util.parse_performance_stats( + str(result.log_path) + ) + except Exception as e: + result.error_message += f"\nFailed to parse log: {e}" diff 
--git a/graph_net_bench/torch/runner/remote_runner.py b/graph_net_bench/torch/runner/remote_runner.py new file mode 100644 index 000000000..82f580db5 --- /dev/null +++ b/graph_net_bench/torch/runner/remote_runner.py @@ -0,0 +1,135 @@ +import os +import sys +from typing import Dict + +import torch + +from .base_runner import BaseRunner, RunResult + + +class RemoteRunner(BaseRunner): + """Execute model evaluation on a remote machine via gRPC.""" + + def run(self, model_path: str, output_dir: str) -> RunResult: + from graph_net_rpc.sample_remote_executor import SampleRemoteExecutor + + os.makedirs(output_dir, exist_ok=True) + + log_path = self._get_log_path(output_dir, model_path) + output_path = self._get_output_path(output_dir, model_path) + + result = RunResult( + output_path=output_path, + log_path=log_path, + ) + + rpc_cmd = self._build_rpc_command() + executor = SampleRemoteExecutor( + machine=self.config.strategy.remote_machine, + port=self.config.strategy.remote_port, + ) + + try: + print( + f"[RemoteRunner] Sending to {self.config.machine}:{self.config.port}", + file=sys.stderr, + flush=True, + ) + print(f"[RemoteRunner] rpc_cmd: {rpc_cmd}", file=sys.stderr, flush=True) + + files_dict = executor.execute(model_path, rpc_cmd) + self._process_remote_output(result, files_dict, output_dir, model_path) + result.success = True + + except Exception as e: + import traceback + + result.success = False + result.error_message = ( + f"Remote execution failed: {e}\n{traceback.format_exc()}" + ) + print(result.error_message, file=sys.stderr, flush=True) + + finally: + executor.close() + + return result + + def _build_rpc_command(self) -> str: + cmd = "python3 -m graph_net.torch.test_reference_device" + cmd += ' --model-path "$INPUT_WORKSPACE"' + cmd += ' --reference-dir "$OUTPUT_WORKSPACE"' + cmd += f" --compiler {self.config.execution.compiler}" + cmd += f" --device {self.config.execution.device}" + cmd += f" --op-lib {self.config.execution.op_lib}" + cmd += f" --warmup {self.config.execution.warmup}" + cmd += f" --trials {self.config.execution.trials}" + cmd += f" --seed {self.config.execution.seed}" + + if self.config.execution.log_prompt: + cmd += f" --log-prompt {self.config.execution.log_prompt}" + if self.config.execution.backend_config: + cmd += f" --config {self.config.execution.backend_config}" + + return cmd + + def _process_remote_output( + self, + result: RunResult, + files_dict: Dict[str, bytes], + output_dir: str, + model_path: str, + ): + from graph_net_bench import test_compiler_util + + log_filename = result.log_path.name if result.log_path else None + pth_filename = result.output_path.name if result.output_path else None + + available_logs = sorted([k for k in files_dict.keys() if k.endswith(".log")]) + available_pths = sorted([k for k in files_dict.keys() if k.endswith(".pth")]) + + if log_filename not in files_dict and len(available_logs) == 1: + log_filename = available_logs[0] + if pth_filename not in files_dict and len(available_pths) == 1: + pth_filename = available_pths[0] + + if log_filename and log_filename in files_dict: + log_bytes = files_dict[log_filename] + if result.log_path: + with open(result.log_path, "wb") as f: + f.write(log_bytes) + try: + result.log_content = log_bytes.decode("utf-8") + print(result.log_content, file=sys.stderr, flush=True) + except Exception: + result.log_content = f"[Binary log, {len(log_bytes)} bytes]" + # Write binary content as text for parsing + with open(result.log_path, "wb") as f: + f.write(log_bytes) + + try: + result.time_stats = 
test_compiler_util.parse_performance_stats( + str(result.log_path) + ) + except Exception as e: + print(f"Warning: Failed to parse time stats: {e}", file=sys.stderr) + else: + print( + f"Warning: log not found. expected={log_filename}, available={available_logs}", + file=sys.stderr, + ) + + if pth_filename and pth_filename in files_dict: + pth_bytes = files_dict[pth_filename] + if result.output_path: + with open(result.output_path, "wb") as f: + f.write(pth_bytes) + try: + result.outputs = torch.load(str(result.output_path)) + except Exception as e: + print(f"Warning: Failed to load outputs: {e}", file=sys.stderr) + else: + print( + f"Warning: output not found. expected={pth_filename}, available={available_pths}", + file=sys.stderr, + ) diff --git a/test/eval_device_diff_test.sh b/test/eval_device_diff_test.sh new file mode 100755 index 000000000..6840b53a7 --- /dev/null +++ b/test/eval_device_diff_test.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +AI4C_ROOT=$(python3 -c "import graph_net_bench; import os; print(os.path.dirname(os.path.dirname(graph_net_bench.__file__)))") +OUTPUT_PATH=/tmp/workspace_eval_device_diff_test + +mkdir -p "$OUTPUT_PATH" +model_list="$AI4C_ROOT/test/workspace_eval_backend_diff/sample_list.txt" + +# Default remote server settings (can be overridden by environment variables) +REMOTE_MACHINE="${REMOTE_MACHINE:-localhost}" +REMOTE_PORT="${REMOTE_PORT:-50052}" + +python3 -m graph_net_bench.torch.eval_backend_diff \ + --model-path-list $model_list \ + --reference-config $(base64 -w 0 <&1 | tee "$OUTPUT_PATH/validation.log" \ No newline at end of file From 1250435c79cf4ec7322145ca63df9dbe50381c00 Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Mon, 26 Jan 2026 13:30:09 +0800 Subject: [PATCH 19/20] minor fix --- graph_net_bench/torch/runner/remote_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graph_net_bench/torch/runner/remote_runner.py b/graph_net_bench/torch/runner/remote_runner.py index 82f580db5..74c5e651f 100644 --- a/graph_net_bench/torch/runner/remote_runner.py +++ b/graph_net_bench/torch/runner/remote_runner.py @@ -31,7 +31,7 @@ def run(self, model_path: str, output_dir: str) -> RunResult: try: print( - f"[RemoteRunner] Sending to {self.config.machine}:{self.config.port}", + f"[RemoteRunner] Sending to {self.config.strategy.remote_machine}:{self.config.strategy.remote_port}", file=sys.stderr, flush=True, ) From c92f93133977ad5e8651971bcc8e780c7774e60b Mon Sep 17 00:00:00 2001 From: JewelRoam <2752594773@qq.com> Date: Mon, 26 Jan 2026 14:43:54 +0800 Subject: [PATCH 20/20] Optimize code style --- graph_net/paddle/test_compiler.py | 6 +- graph_net/paddle/test_reference_device.py | 2 +- graph_net/paddle/test_target_device.py | 2 +- graph_net_bench/torch/eval_backend_diff.py | 413 ++++++++------ graph_net_bench/torch/eval_backend_perf.py | 508 +++++++++++------- graph_net_bench/torch/runner/base_runner.py | 25 +- graph_net_bench/torch/runner/local_runner.py | 127 +++-- .../torch/runner/process_runner.py | 143 +++-- graph_net_bench/torch/runner/remote_runner.py | 222 +++++--- graph_net_bench/torch/test_compiler.py | 6 +- 10 files changed, 906 insertions(+), 548 deletions(-) diff --git a/graph_net/paddle/test_compiler.py b/graph_net/paddle/test_compiler.py index 76c3d5610..8beea67fb 100644 --- a/graph_net/paddle/test_compiler.py +++ b/graph_net/paddle/test_compiler.py @@ -43,7 +43,7 @@ def init_env(args): paddle.set_flags({"FLAGS_cudnn_exhaustive_search": 1}) -def get_hardward_name(args): +def get_hardware_name(args): hardware = 
"unknown" if test_compiler_util.is_gpu_device(args.device): hardware = paddle.device.cuda.get_device_name(0) @@ -149,7 +149,7 @@ def measure_performance(model_call, args, compiler, profile=False): min_trials = int(100 / np.mean(warmup_e2e_times[1:])) trials = max(args.trials, min_trials) - hardware_name = get_hardward_name(args) + hardware_name = get_hardware_name(args) print( f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {trials}", file=sys.stderr, @@ -327,7 +327,7 @@ def test_single_model(args): model.eval() test_compiler_util.print_basic_config( - args, get_hardward_name(args), get_compile_framework_version(args) + args, get_hardware_name(args), get_compile_framework_version(args) ) # Run on eager mode diff --git a/graph_net/paddle/test_reference_device.py b/graph_net/paddle/test_reference_device.py index f1db9bc0f..4c7c60b5b 100644 --- a/graph_net/paddle/test_reference_device.py +++ b/graph_net/paddle/test_reference_device.py @@ -49,7 +49,7 @@ def test_single_model(args): test_compiler_util.print_basic_config( args, - test_compiler.get_hardward_name(args), + test_compiler.get_hardware_name(args), test_compiler.get_compile_framework_version(args), ) diff --git a/graph_net/paddle/test_target_device.py b/graph_net/paddle/test_target_device.py index 9697aea5d..08176680d 100644 --- a/graph_net/paddle/test_target_device.py +++ b/graph_net/paddle/test_target_device.py @@ -89,7 +89,7 @@ def test_single_model(args): test_compiler_util.print_basic_config( args, - test_compiler.get_hardward_name(args), + test_compiler.get_hardware_name(args), test_compiler.get_compile_framework_version(args), ) diff --git a/graph_net_bench/torch/eval_backend_diff.py b/graph_net_bench/torch/eval_backend_diff.py index 68e1f2f02..ce9f27b2f 100755 --- a/graph_net_bench/torch/eval_backend_diff.py +++ b/graph_net_bench/torch/eval_backend_diff.py @@ -1,220 +1,320 @@ +"""Backend Performance Difference Evaluation Script. + +Compares outputs and performance between reference and target compiler backends. +""" + import argparse -import torch -import sys import os -import os.path +import sys import traceback import types -from graph_net_bench import test_compiler_util +from typing import Any, List, Optional, Tuple + +import torch + from graph_net_bench import path_utils +from graph_net_bench import test_compiler_util from .runner import RunnerConfig, RunResult, create_runner +_DEFAULT_REF_DIR = "/tmp/eval_perf_diff/reference" +_DEFAULT_TARGET_DIR = "/tmp/eval_perf_diff/target" + + +def _get_dtype_name(value: Any) -> str: + """Extract dtype name from tensor or type name from other objects.""" + if isinstance(value, torch.Tensor): + return str(value.dtype).replace("torch.", "") + return type(value).__name__ + + +def _extract_dtypes(outputs: List[Any]) -> List[str]: + """Extract dtype/type names from a list of outputs.""" + return [_get_dtype_name(x) for x in outputs] -def compare_correctness(expected_out, compiled_out, args): - eager_dtypes = [ - ( - str(x.dtype).replace("torch.", "") - if isinstance(x, torch.Tensor) - else type(x).__name__ - ) - for x in expected_out - ] - compiled_dtypes = [ - ( - str(x.dtype).replace("torch.", "") - if isinstance(x, torch.Tensor) - else type(x).__name__ - ) - for x in compiled_out - ] - # datatype check +def compare_correctness( + expected_out: List[torch.Tensor], + compiled_out: List[torch.Tensor], + args, +) -> None: + """Compare correctness between expected and compiled outputs. + + Args: + expected_out: List of expected output tensors. 
+ compiled_out: List of compiled output tensors. + args: Arguments containing log_prompt and other settings. + """ + eager_dtypes = _extract_dtypes(expected_out) + compiled_dtypes = _extract_dtypes(compiled_out) + type_match = test_compiler_util.check_output_datatype( args, eager_dtypes, compiled_dtypes ) + if not type_match: + return - if type_match: - test_compiler_util.check_equal( - args, - expected_out, - compiled_out, - cmp_equal_func=get_cmp_equal, - ) - - test_compiler_util.check_allclose( - args, - expected_out, - compiled_out, - cmp_all_close_func=get_cmp_all_close, - cmp_max_diff_func=get_cmp_max_diff, - cmp_mean_diff_func=get_cmp_mean_diff, - ) + test_compiler_util.check_equal( + args, + expected_out, + compiled_out, + cmp_equal_func=get_cmp_equal, + ) + test_compiler_util.check_allclose( + args, + expected_out, + compiled_out, + cmp_all_close_func=get_cmp_all_close, + cmp_max_diff_func=get_cmp_max_diff, + cmp_mean_diff_func=get_cmp_mean_diff, + ) -def get_cmp_equal(expected_out, compiled_out): +def get_cmp_equal( + expected_out: List[torch.Tensor], compiled_out: List[torch.Tensor] +) -> str: + """Get space-separated string of equality check results (1=equal, 0=not).""" return " ".join( str(int(torch.equal(a, b))) for a, b in zip(expected_out, compiled_out) ) -def get_cmp_all_close(expected_out, compiled_out, atol, rtol): +def get_cmp_all_close( + expected_out: List[torch.Tensor], + compiled_out: List[torch.Tensor], + atol: float, + rtol: float, +) -> str: + """Get space-separated string of allclose check results.""" return " ".join( str(int(torch.allclose(a, b, atol=atol, rtol=rtol))) for a, b in zip(compiled_out, expected_out) ) -def get_cmp_max_diff(expected_out, compiled_out): +def _compute_abs_diff(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + """Compute absolute difference, converting to float for LongTensor compatibility.""" + return torch.abs(a.float() - b.float()) + + +def get_cmp_max_diff( + expected_out: List[torch.Tensor], compiled_out: List[torch.Tensor] +) -> str: + """Get space-separated string of max absolute differences.""" + return " ".join( + str(torch.max(_compute_abs_diff(a, b)).item()) + for a, b in zip(expected_out, compiled_out) + ) + + +def get_cmp_mean_diff( + expected_out: List[torch.Tensor], compiled_out: List[torch.Tensor] +) -> str: + """Get space-separated string of mean absolute differences.""" return " ".join( - # Transform to float to handle LongTensor output of some models, which cannnot be processed with torch.max(). 
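The comparison helpers above each return one space-separated value per output tensor. A toy illustration, not part of the patch, of that format:

# Toy illustration only.
import torch
from graph_net_bench.torch.eval_backend_diff import get_cmp_equal, get_cmp_max_diff

ref = [torch.tensor([1.0, 2.0]), torch.tensor([3, 4])]
tgt = [torch.tensor([1.0, 2.5]), torch.tensor([3, 4])]
print(get_cmp_equal(ref, tgt))     # "0 1"   (second output is bit-exact)
print(get_cmp_max_diff(ref, tgt))  # "0.5 0.0"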
- str(torch.max(torch.abs(a.float() - b.float())).item()) + str(torch.mean(_compute_abs_diff(a, b)).item()) for a, b in zip(expected_out, compiled_out) ) -def get_cmp_mean_diff(expected_out, compiled_out): +def _count_diff_elements( + a: torch.Tensor, b: torch.Tensor, atol: float, rtol: float +) -> int: + """Count number of differing elements between two tensors.""" + if a.is_floating_point() and b.is_floating_point(): + return torch.sum(~torch.isclose(a, b, atol=atol, rtol=rtol)).item() + return torch.sum(a != b).item() + + +def get_cmp_diff_count( + expected_out: List[torch.Tensor], + compiled_out: List[torch.Tensor], + atol: float, + rtol: float, +) -> str: + """Get space-separated string of element difference counts.""" return " ".join( - # To handle LongTensor - str(torch.mean(torch.abs(a.float() - b.float())).item()) + str(_count_diff_elements(a, b, atol, rtol)) for a, b in zip(expected_out, compiled_out) ) -def get_cmp_diff_count(expected_out, compiled_out, atol, rtol): - results = [] - for a, b in zip(expected_out, compiled_out): - # To handle LongTensor - if a.is_floating_point() and b.is_floating_point(): - diff_count = torch.sum(~torch.isclose(a, b, atol=atol, rtol=rtol)).item() - else: - diff_count = torch.sum(a != b).item() - results.append(str(diff_count)) - return " ".join(results) +def _has_model_file(path: str) -> bool: + """Check if directory contains model.py.""" + return os.path.exists(os.path.join(path, "model.py")) -def _get_model_paths(args, model_path_prefix, use_model_list): +def _get_model_paths_from_list( + model_path_list: str, model_path_prefix: str +) -> List[str]: + """Get model paths from a list file with prefix.""" + assert os.path.isdir(model_path_prefix), f"Not a directory: {model_path_prefix}" + assert os.path.isfile(model_path_list), f"Not a file: {model_path_list}" + + test_samples = test_compiler_util.get_allow_samples( + model_path_list, model_path_prefix + ) + return [ + os.path.join(model_path_prefix, rel_path) + for rel_path in test_samples + if _has_model_file(os.path.join(model_path_prefix, rel_path)) + ] + + +def _get_model_paths_from_dir( + model_path: str, model_path_list: Optional[str], model_path_prefix: Optional[str] +) -> List[str]: + """Get model paths by recursively scanning a directory.""" + assert os.path.isdir(model_path), f"Not a directory: {model_path}" + + test_samples = test_compiler_util.get_allow_samples( + model_path_list, model_path_prefix + ) + all_paths = path_utils.get_recursively_model_path(model_path) + + if test_samples is None: + return list(all_paths) + return [p for p in all_paths if os.path.abspath(p) in test_samples] + + +def _get_model_paths( + args, model_path_prefix: Optional[str], use_model_list: bool +) -> List[str]: + """Get list of model paths based on configuration.""" if use_model_list: - assert os.path.isdir(model_path_prefix) and os.path.isfile(args.model_path_list) + return _get_model_paths_from_list(args.model_path_list, model_path_prefix) + return _get_model_paths_from_dir( + args.model_path, args.model_path_list, model_path_prefix + ) - test_samples = test_compiler_util.get_allow_samples( - args.model_path_list, model_path_prefix - ) - model_paths = [ - os.path.join(model_path_prefix, rel_model_path) - for rel_model_path in test_samples - if os.path.exists( - os.path.join(model_path_prefix, rel_model_path, "model.py") - ) - ] - else: - assert os.path.isdir(args.model_path) - - test_samples = test_compiler_util.get_allow_samples( - args.model_path_list, model_path_prefix - ) - model_paths = [ - 
model_path - for model_path in path_utils.get_recursively_model_path(args.model_path) - if test_samples is None or os.path.abspath(model_path) in test_samples - ] - return model_paths +def _create_model_args( + model_path: str, reference_config: str, target_config: str +) -> argparse.Namespace: + """Create namespace for single model evaluation.""" + return argparse.Namespace( + model_path=model_path, + model_path_list=None, + reference_config=reference_config, + target_config=target_config, + ) + +def _eval_single_model_safe(model_args: argparse.Namespace) -> bool: + """Evaluate single model with exception handling. -def _create_model_args(model_path, reference_config, target_config): - args = argparse.Namespace() - args.model_path = model_path - args.model_path_list = None - args.reference_config = reference_config - args.target_config = target_config - return args + Returns: + True if evaluation succeeded, False otherwise. + """ + try: + eval_single_model(model_args) + return True + except KeyboardInterrupt: + print("KeyboardInterrupt") + sys.exit(1) + except Exception: + print("\n--- Full Traceback ---") + traceback.print_exc() + return False + + +def _print_evaluation_summary(total_count: int, failed_samples: List[str]) -> None: + """Print summary of multi-model evaluation.""" + print( + f"Totally {total_count} verified samples, failed {len(failed_samples)} samples.", + file=sys.stderr, + flush=True, + ) + for model_path in failed_samples: + print(f"- {model_path}", file=sys.stderr, flush=True) -def eval_multi_models(args, model_path_prefix=None, use_model_list=False): +def eval_multi_models( + args, + model_path_prefix: Optional[str] = None, + use_model_list: bool = False, +) -> None: + """Evaluate multiple models and collect results.""" module_name = os.path.splitext(os.path.basename(__file__))[0] - model_paths = _get_model_paths(args, model_path_prefix, use_model_list) - failed_samples = [] + failed_samples: List[str] = [] + for sample_idx, model_path in enumerate(model_paths): print( f"[{sample_idx}] {module_name}, model_path: {model_path}", file=sys.stderr, flush=True, ) - - model_args = argparse.Namespace() - model_args.model_path = model_path - model_args.model_path_list = None - model_args.reference_config = args.reference_config - model_args.target_config = args.target_config - - try: - eval_single_model(model_args) - success = True - except KeyboardInterrupt: - print("KeyboardInterrupt") - sys.exit(1) - except Exception: - print("\n--- Full Traceback ---") - traceback.print_exc() - success = False - + model_args = _create_model_args( + model_path, args.reference_config, args.target_config + ) + success = _eval_single_model_safe(model_args) if not success: failed_samples.append(model_path) - print( - f"Totally {len(model_paths)} verified samples, failed {len(failed_samples)} samples.", - file=sys.stderr, - flush=True, + _print_evaluation_summary(len(model_paths), failed_samples) + + +def _parse_runner_configs(args) -> Tuple[RunnerConfig, RunnerConfig]: + """Parse reference and target runner configurations.""" + return ( + RunnerConfig.from_dict( + test_compiler_util.convert_to_dict(args.reference_config) + ), + RunnerConfig.from_dict(test_compiler_util.convert_to_dict(args.target_config)), ) - if failed_samples: - for model_path in failed_samples: - print(f"- {model_path}", file=sys.stderr, flush=True) -def eval_single_model(args): - """ - Unified evaluation using Runner abstraction. - Supports local, process, and remote execution via runner_type in config. 
- """ - ref_dir = "/tmp/eval_perf_diff/reference" - target_dir = "/tmp/eval_perf_diff/target" +def _log_runner_info(ref_config: RunnerConfig, target_config: RunnerConfig) -> None: + """Log runner type information.""" + for label, cfg in [("Reference", ref_config), ("Target", target_config)]: + print( + f"[eval_backend_diff] {label} runner: {cfg.strategy.runner_type.value}", + file=sys.stderr, + flush=True, + ) + + +def _run_and_validate( + runner, model_path: str, output_dir: str, label: str +) -> RunResult: + """Run model and validate result.""" + result = runner.run(model_path, output_dir) + if not result.success: + raise RuntimeError(f"{label} run failed: {result.error_message}") + return result - ref_config_dict = test_compiler_util.convert_to_dict(args.reference_config) - target_config_dict = test_compiler_util.convert_to_dict(args.target_config) - ref_runner_config = RunnerConfig.from_dict(ref_config_dict) - target_runner_config = RunnerConfig.from_dict(target_config_dict) +def eval_single_model(args) -> None: + """Evaluate single model using Runner abstraction. + + Supports local, process, and remote execution via runner_type in config. + """ + ref_runner_config, target_runner_config = _parse_runner_configs(args) + _log_runner_info(ref_runner_config, target_runner_config) ref_runner = create_runner(ref_runner_config) target_runner = create_runner(target_runner_config) - print( - f"[eval_backend_diff] Reference runner: {ref_runner_config.strategy.runner_type.value}", - file=sys.stderr, - flush=True, + ref_result = _run_and_validate( + ref_runner, args.model_path, _DEFAULT_REF_DIR, "Reference" ) - print( - f"[eval_backend_diff] Target runner: {target_runner_config.strategy.runner_type.value}", - file=sys.stderr, - flush=True, + target_result = _run_and_validate( + target_runner, args.model_path, _DEFAULT_TARGET_DIR, "Target" ) - ref_result = ref_runner.run(args.model_path, ref_dir) - if not ref_result.success: - raise RuntimeError(f"Reference run failed: {ref_result.error_message}") - - target_result = target_runner.run(args.model_path, target_dir) - if not target_result.success: - raise RuntimeError(f"Target run failed: {target_result.error_message}") - compare_results(ref_result, target_result, ref_runner_config) def compare_results( ref_result: RunResult, target_result: RunResult, config: RunnerConfig -): - """Compare outputs and performance between reference and target results.""" +) -> None: + """Compare outputs and performance between reference and target results. + + Args: + ref_result: Result from reference runner. + target_result: Result from target runner. + config: Runner configuration for logging settings. + """ if ref_result.outputs is None or target_result.outputs is None: print("[Warning] Cannot compare: missing outputs", file=sys.stderr) return @@ -231,20 +331,31 @@ def compare_results( ) -def main(args): +def main(args: argparse.Namespace) -> None: + """Main entry point for backend difference evaluation. + + Args: + args: Parsed command-line arguments. + + Raises: + ValueError: If model_path is invalid. 
+ """ ref_config = test_compiler_util.convert_to_dict(args.reference_config) model_path_prefix = ref_config.get("model_path_prefix") if args.model_path_list and model_path_prefix: eval_multi_models(args, model_path_prefix, use_model_list=True) - elif os.path.isdir(args.model_path): - if path_utils.is_single_model_dir(args.model_path): - eval_single_model(args) - else: - eval_multi_models(args, model_path_prefix, use_model_list=False) - else: + return + + if not os.path.isdir(args.model_path): raise ValueError(f"Invalid model path: {args.model_path}") + if path_utils.is_single_model_dir(args.model_path): + eval_single_model(args) + return + + eval_multi_models(args, model_path_prefix, use_model_list=False) + if __name__ == "__main__": parser = argparse.ArgumentParser( diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py index 5c8586f30..30bf2dacb 100644 --- a/graph_net_bench/torch/eval_backend_perf.py +++ b/graph_net_bench/torch/eval_backend_perf.py @@ -1,142 +1,288 @@ -from . import utils +"""Single Backend Performance Evaluation Script.""" + import argparse import importlib.util -import torch -from pathlib import Path -from typing import Type -import sys -import os -import traceback import json -import random -import numpy as np +import os import platform +import random +import sys +import traceback import types from contextlib import redirect_stdout, redirect_stderr -from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend +from pathlib import Path +from typing import Callable, Dict, Any, List, Tuple, Type, Optional + +import numpy as np +import torch + +from . import utils from graph_net_bench import test_compiler_util +from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend +_ARG_DEFAULTS: Dict[str, Any] = { + "model_path": None, + "output_path": None, + "seed": 123, + "compiler": "inductor", + "device": "cuda", + "op_lib": None, + "warmup": 3, + "trials": 5, + "log_prompt": "graph-net-bench-log", + "model_path_prefix": None, + "backend_config": None, +} -def register_op_lib(op_lib): - if op_lib == "flaggems": - import flag_gems - flag_gems.enable() - else: - pass +def register_op_lib(op_lib: Optional[str]) -> None: + """Register operator library if specified.""" + if op_lib != "flaggems": + return + import flag_gems + flag_gems.enable() -def set_seed(random_seed): + +def set_seed(random_seed: int) -> None: + """Set random seed for reproducibility across all frameworks.""" random.seed(random_seed) np.random.seed(random_seed) torch.manual_seed(random_seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed(random_seed) - torch.cuda.manual_seed_all(random_seed) + if not torch.cuda.is_available(): + return + torch.cuda.manual_seed(random_seed) + torch.cuda.manual_seed_all(random_seed) -def get_hardward_name(device): - hardware_name = "unknown" +def get_hardware_name(device: str) -> str: + """Get hardware name based on device type.""" if "cuda" in device: - hardware_name = torch.cuda.get_device_name(device) - elif device == "cpu": - hardware_name = platform.processor() - return hardware_name + return torch.cuda.get_device_name(device) + if device == "cpu": + return platform.processor() + return "unknown" + + +def get_compiler_version(compiler_name: str) -> str: + """Get version string for the given compiler. + Args: + compiler_name: Name of the compiler (e.g., 'inductor', 'tvm'). 
-def get_compiler_version(compiler): - if compiler in ["inductor", "nope", "unstable_to_stable"]: + Returns: + Version string or 'unknown' if not determinable. + """ + torch_based_compilers = {"inductor", "nope", "unstable_to_stable"} + if compiler_name in torch_based_compilers: return torch.__version__ - elif compiler in ["tvm", "xla", "tensorrt", "bladedisc"]: - # Assuming compiler object has a version attribute - return f"{compiler.capitalize()} {compiler.version}" + # TODO: For external compilers, version detection would require runtime introspection + # which is not reliably available here. Return a placeholder. return "unknown" -def load_class_from_file( - model_path: str, class_name: str, device: str -) -> Type[torch.nn.Module]: - file_path = f"{model_path}/model.py" - file = Path(file_path).resolve() - module_name = file.stem - +def _read_and_modify_model_code(file_path: str, device: str) -> str: + """Read model file and modify code for target device.""" with open(file_path, "r", encoding="utf-8") as f: model_code = f.read() - model_code = utils.modify_code_by_device(model_code, device) + return utils.modify_code_by_device(model_code, device) + + +def _create_module_from_code( + module_name: str, code: str, file_path: Path +) -> types.ModuleType: + """Create a module by executing code.""" spec = importlib.util.spec_from_loader(module_name, loader=None) module = importlib.util.module_from_spec(spec) sys.modules[module_name] = module - compiled_code = compile(model_code, filename=file, mode="exec") + compiled_code = compile(code, filename=file_path, mode="exec") exec(compiled_code, module.__dict__) + return module - model_class = getattr(module, class_name, None) - setattr(model_class, "__graph_net_file_path__", file_path) - setattr(model_class, "__graph_net_device__", device) - return model_class +def load_class_from_file( + model_path: str, class_name: str, device: str +) -> Type[torch.nn.Module]: + """Dynamically load a model class from file. -def get_compiler_backend(args) -> GraphCompilerBackend: - """ - Dynamically load backend class based on args.compiler + Args: + model_path: Directory containing model.py. + class_name: Name of the class to load. + device: Target device for code modification. + + Returns: + The loaded model class with metadata attributes set. + + Raises: + AttributeError: If class_name not found in module. 
""" - compiler_name = args.compiler.lower() - module_name = f"graph_net_bench.torch.backend.{compiler_name}_backend" + file_path = f"{model_path}/model.py" + resolved_path = Path(file_path).resolve() + module_name = resolved_path.stem - try: - module = __import__(module_name, fromlist=[f"{compiler_name.title()}Backend"]) + model_code = _read_and_modify_model_code(file_path, device) + module = _create_module_from_code(module_name, model_code, resolved_path) - class_name = ( - f"{''.join(part.title() for part in compiler_name.split('_'))}Backend" - ) + model_class = getattr(module, class_name) + model_class.__graph_net_file_path__ = file_path + model_class.__graph_net_device__ = device + return model_class - backend_class = None - if hasattr(module, class_name): - backend_class = getattr(module, class_name) - else: - raise ImportError(f"No valid backend class found in {module_name}") - except ImportError as e: - raise ImportError(f"Failed to import backend module for '{compiler_name}': {e}") +def _build_backend_class_name(compiler_name: str) -> str: + """Convert compiler name to PascalCase backend class name.""" + return "".join(part.title() for part in compiler_name.split("_")) + "Backend" - backend_config = ( - test_compiler_util.convert_to_dict(args.backend_config) - if args.backend_config is not None - else {} - ) - return backend_class(backend_config) +def _load_backend_class(compiler_name: str) -> Type[GraphCompilerBackend]: + """Load backend class by compiler name.""" + module_name = f"graph_net_bench.torch.backend.{compiler_name}_backend" + class_name = _build_backend_class_name(compiler_name) + + module = __import__(module_name, fromlist=[class_name]) + if not hasattr(module, class_name): + raise ImportError( + f"No valid backend class '{class_name}' found in {module_name}" + ) + return getattr(module, class_name) -def get_model(args): - device = "xla" if args.compiler == "xla" else args.device - # device: Torch device object specifying the target device for model loading (e.g., 'cuda', 'cpu', 'xla') - model_class = load_class_from_file( - args.model_path, class_name="GraphModule", device=device - ) - model = model_class().to(torch.device(args.device)) - return model +def get_compiler_backend(args) -> GraphCompilerBackend: + """Dynamically load and instantiate backend class based on args.compiler.""" + backend_class = _load_backend_class(args.compiler.lower()) + backend_config = test_compiler_util.convert_to_dict(args.backend_config) or {} + return backend_class(backend_config) -def get_input_dict(args): - inputs_params = utils.load_converted_from_text(f"{args.model_path}") - params = inputs_params["weight_info"] +def get_model(args) -> torch.nn.Module: + """Load and prepare model for evaluation.""" + load_device = "xla" if args.compiler == "xla" else args.device + model_class = load_class_from_file(args.model_path, "GraphModule", load_device) + return model_class().to(torch.device(args.device)) + + +def _update_tensor_device(params: Dict[str, Any], device: str) -> None: + """Update device info in tensor metadata in-place.""" for tensor_meta in params.values(): if "device" in tensor_meta["info"]: - tensor_meta["info"]["device"] = args.device + tensor_meta["info"]["device"] = device + + +def get_input_dict(args) -> Dict[str, torch.Tensor]: + """Load and prepare input tensors for model evaluation. + + Args: + args: Arguments containing model_path and device settings. + + Returns: + Dictionary mapping parameter names to tensors on target device. 
+ """ + inputs_params = utils.load_converted_from_text(args.model_path) + params = inputs_params["weight_info"] + _update_tensor_device(params, args.device) + + target_device = torch.device(args.device) + return {k: utils.replay_tensor(v).to(target_device) for k, v in params.items()} + + +def _run_warmup(model_call: Callable, warmup_count: int, sync_fn: Callable) -> None: + """Execute warmup runs.""" + for _ in range(warmup_count): + model_call() + sync_fn() + + +def _measure_single_trial_cuda( + model_call: Callable, sync_fn: Callable +) -> Tuple[float, float]: + """Measure a single trial on CUDA device. + + Returns: + Tuple of (e2e_time_ms, gpu_time_ms). + """ + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + duration_box = test_compiler_util.DurationBox(-1) + + with test_compiler_util.naive_timer(duration_box, sync_fn): + start_event.record() + model_call() + end_event.record() + sync_fn() + + gpu_time_ms = start_event.elapsed_time(end_event) + return duration_box.value, gpu_time_ms + + +def _measure_single_trial_cpu(model_call: Callable, sync_fn: Callable) -> float: + """Measure a single trial on CPU or other devices. + + Returns: + End-to-end time in milliseconds. + """ + duration_box = test_compiler_util.DurationBox(-1) + with test_compiler_util.naive_timer(duration_box, sync_fn): + model_call() + return duration_box.value + + +def _run_cuda_trials( + model_call: Callable, trials: int, sync_fn: Callable +) -> Dict[str, Any]: + """Run multiple timing trials on CUDA device.""" + torch.cuda.empty_cache() + e2e_times: List[float] = [] + gpu_times: List[float] = [] + + for i in range(trials): + e2e_time, gpu_time = _measure_single_trial_cuda(model_call, sync_fn) + e2e_times.append(e2e_time) + gpu_times.append(gpu_time) + print( + f"Trial {i + 1}: e2e={e2e_time:.5f} ms, gpu={gpu_time:.5f} ms", + file=sys.stderr, + flush=True, + ) + return { - k: utils.replay_tensor(v).to(torch.device(args.device)) - for k, v in params.items() + "e2e": test_compiler_util.get_timing_stats(e2e_times), + "gpu": test_compiler_util.get_timing_stats(gpu_times), } -def measure_performance(model_call, args, compiler): - stats = {} - outs = model_call() +def _run_cpu_trials( + model_call: Callable, trials: int, sync_fn: Callable +) -> Dict[str, Any]: + """Run multiple timing trials on CPU or other devices.""" + e2e_times: List[float] = [] + + for i in range(trials): + e2e_time = _measure_single_trial_cpu(model_call, sync_fn) + e2e_times.append(e2e_time) + print( + f"Trial {i + 1}: e2e={e2e_time:.5f} ms", + file=sys.stderr, + flush=True, + ) + + return {"e2e": test_compiler_util.get_timing_stats(e2e_times)} - # Warmup runs - for _ in range(args.warmup): - model_call() - compiler.synchronize() + +def measure_performance( + model_call: Callable, args, compiler +) -> Tuple[Any, Dict[str, Any]]: + """Measure model inference performance. + + Args: + model_call: Callable that executes the model. + args: Arguments containing device, warmup, and trials settings. + compiler: Compiler backend with synchronize method. + + Returns: + Tuple of (model_outputs, timing_stats). 
+ """ + outs = model_call() + _run_warmup(model_call, args.warmup, compiler.synchronize) print( f"[Profiling] Warm up {args.warmup}, Trials {args.trials}", @@ -144,58 +290,83 @@ def measure_performance(model_call, args, compiler): flush=True, ) - if "cuda" in args.device: - torch.cuda.empty_cache() - e2e_times = [] - gpu_times = [] - - for i in range(args.trials): - # End-to-end timing (naive_timer) - duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - # GPU-only timing (CUDA Events) - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - start_event.record() - - model_call() - - end_event.record() - compiler.synchronize() - - gpu_time_ms = start_event.elapsed_time(end_event) - e2e_times.append(duration_box.value) - gpu_times.append(gpu_time_ms) - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms", - file=sys.stderr, - flush=True, - ) - - stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) - stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times) - - else: # CPU or other devices - e2e_times = [] - for i in range(args.trials): - duration_box = test_compiler_util.DurationBox(-1) - with test_compiler_util.naive_timer(duration_box, compiler.synchronize): - model_call() - print( - f"Trial {i + 1}: e2e={duration_box.value:.5f} ms", - file=sys.stderr, - flush=True, - ) - e2e_times.append(duration_box.value) - stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times) + is_cuda = "cuda" in args.device + if is_cuda: + stats = _run_cuda_trials(model_call, args.trials, compiler.synchronize) + else: + stats = _run_cpu_trials(model_call, args.trials, compiler.synchronize) return outs, stats -def eval_single_model_with_single_backend(args): +def _compile_and_benchmark( + args, compiler: GraphCompilerBackend, model: torch.nn.Module, input_dict: Dict +) -> Tuple[bool, Any, Dict[str, Any]]: + """Compile model and run performance benchmark. + + Returns: + Tuple of (success, outputs, time_stats). 
+ """ + try: + compiled_model = compiler(model) + + def model_call(): + return compiled_model(**input_dict) + + outputs, time_stats = measure_performance(model_call, args, compiler) + return True, outputs, time_stats + except Exception as e: + print( + f"Run model failed: {str(e)}\n{traceback.format_exc()}", + file=sys.stderr, + flush=True, + ) + return False, None, {} + + +def _run_evaluation_core(args) -> Tuple[bool, Any, Dict[str, Any]]: + """Core evaluation logic: load model, compile, and benchmark.""" + compiler = get_compiler_backend(args) + input_dict = get_input_dict(args) + model = get_model(args) + model.eval() + + test_compiler_util.print_config( + args, + get_hardware_name(args.device), + get_compiler_version(args.compiler), + ) + + return _compile_and_benchmark(args, compiler, model, input_dict) + + +def _finalize_evaluation( + args, + success: bool, + outputs: Any, + time_stats: Dict[str, Any], + output_dump_path: Path, +) -> None: + """Finalize evaluation: save outputs and print status.""" + test_compiler_util.print_running_status(args, success) + if success: + torch.save(outputs, str(output_dump_path)) + test_compiler_util.print_with_log_prompt( + "[Performance][eager]:", json.dumps(time_stats), args.log_prompt + ) + + +def _print_log_file(log_path: Path) -> None: + """Read and print log file content to stderr.""" + print(Path(log_path).read_text(encoding="utf-8"), file=sys.stderr, flush=True) + + +def eval_single_model_with_single_backend(args) -> None: + """Evaluate a single model with a single compiler backend.""" check_and_complete_args(args) set_seed(args.seed) os.makedirs(args.output_path, exist_ok=True) + log_path = utils.get_log_path(args.output_path, args.model_path) output_dump_path = utils.get_output_path(args.output_path, args.model_path) print(f"Log path: {log_path}", file=sys.stderr, flush=True) @@ -203,66 +374,19 @@ def eval_single_model_with_single_backend(args): with open(log_path, "w", encoding="utf-8") as log_f: with redirect_stdout(log_f), redirect_stderr(log_f): - compiler = get_compiler_backend(args) - - input_dict = get_input_dict(args) - model = get_model(args) - model.eval() - - test_compiler_util.print_config( - args, - get_hardward_name(args.device), - get_compiler_version(args.compiler), - ) - - success = False - time_stats = {} - try: - compiled_model = compiler(model) - - def model_call(): - return compiled_model(**input_dict) - - outputs, time_stats = measure_performance(model_call, args, compiler) - success = True - except Exception as e: - print( - f"Run model failed: {str(e)}\n{traceback.format_exc()}", - file=sys.stderr, - flush=True, - ) - - test_compiler_util.print_running_status(args, success) - if success: - torch.save(outputs, str(output_dump_path)) - test_compiler_util.print_with_log_prompt( - "[Performance][eager]:", json.dumps(time_stats), args.log_prompt - ) - - with open(log_path, "r", encoding="utf-8") as f: - content = f.read() - print(content, file=sys.stderr, flush=True) - - -def check_and_complete_args(args): - """ - Ensure all required arguments are present with default values if missing - """ - defaults = { - "model_path": None, # Model path - "output_path": None, # Log and output directory - "seed": 123, # Random seed - "compiler": "inductor", # Compiler name - "device": "cuda", # Device for testing the compiler (e.g., 'cpu' or 'cuda') - "op_lib": None, # Operator library - "warmup": 3, # Number of warmup steps - "trials": 5, # Number of timing trials - "log_prompt": "graph-net-bench-log", # Log prompt for performance log 
filtering - "model_path_prefix": None, # Prefix path to model path in args.model-path - "backend_config": None, # backend configuration json - } + success, outputs, time_stats = _run_evaluation_core(args) + _finalize_evaluation(args, success, outputs, time_stats, output_dump_path) + + _print_log_file(log_path) - for key, default in defaults.items(): + +def check_and_complete_args(args) -> None: + """Ensure all required arguments are present with default values if missing. + + Args: + args: Namespace object to be validated and completed in-place. + """ + for key, default in _ARG_DEFAULTS.items(): if not hasattr(args, key): setattr(args, key, default) diff --git a/graph_net_bench/torch/runner/base_runner.py b/graph_net_bench/torch/runner/base_runner.py index 25d0882c8..809f5f281 100644 --- a/graph_net_bench/torch/runner/base_runner.py +++ b/graph_net_bench/torch/runner/base_runner.py @@ -129,21 +129,24 @@ def _get_log_path(self, output_dir: str, model_path: str) -> Path: return Path(utils.get_log_path(output_dir, model_path)) -def create_runner(config: RunnerConfig) -> BaseRunner: - """Factory function to create appropriate runner based on config.""" - runner_type = config.strategy.runner_type - +def _get_runner_class(runner_type: RunnerType) -> type: + """Get runner class by type with lazy imports.""" if runner_type == RunnerType.LOCAL: from .local_runner import LocalRunner - return LocalRunner(config) - elif runner_type == RunnerType.PROCESS: + return LocalRunner + if runner_type == RunnerType.PROCESS: from .process_runner import ProcessRunner - return ProcessRunner(config) - elif runner_type == RunnerType.REMOTE: + return ProcessRunner + if runner_type == RunnerType.REMOTE: from .remote_runner import RemoteRunner - return RemoteRunner(config) - else: - raise ValueError(f"Unknown runner_type: {runner_type}") + return RemoteRunner + raise ValueError(f"Unknown runner_type: {runner_type}") + + +def create_runner(config: RunnerConfig) -> BaseRunner: + """Factory function to create appropriate runner based on config.""" + runner_class = _get_runner_class(config.strategy.runner_type) + return runner_class(config) diff --git a/graph_net_bench/torch/runner/local_runner.py b/graph_net_bench/torch/runner/local_runner.py index 3d07a6470..1f21dddc0 100644 --- a/graph_net_bench/torch/runner/local_runner.py +++ b/graph_net_bench/torch/runner/local_runner.py @@ -1,98 +1,143 @@ +"""Local runner for in-process model evaluation.""" + +import json import os import sys -import json -import types import traceback +import types from io import StringIO from contextlib import redirect_stdout, redirect_stderr +from pathlib import Path +from typing import Any import torch -from .base_runner import BaseRunner, RunResult +from .base_runner import BaseRunner, RunResult, RunnerConfig + + +def _write_log_file(log_path: Path, content: str) -> None: + """Write log content to file.""" + with open(log_path, "w", encoding="utf-8") as f: + f.write(content) + + +def _create_eval_args( + model_path: str, output_dir: str, config: RunnerConfig +) -> types.SimpleNamespace: + """Create evaluation arguments from config.""" + return types.SimpleNamespace( + model_path=model_path, + output_path=output_dir, + seed=config.execution.seed, + compiler=config.execution.compiler, + device=config.execution.device, + op_lib=config.execution.op_lib, + warmup=config.execution.warmup, + trials=config.execution.trials, + log_prompt=config.execution.log_prompt, + backend_config=config.execution.backend_config, + ) class LocalRunner(BaseRunner): 
"""Execute model evaluation in the current process.""" def run(self, model_path: str, output_dir: str) -> RunResult: - from graph_net_bench.torch import eval_backend_perf - os.makedirs(output_dir, exist_ok=True) log_path = self._get_log_path(output_dir, model_path) output_path = self._get_output_path(output_dir, model_path) - - eval_args = types.SimpleNamespace( - model_path=model_path, - output_path=output_dir, - seed=self.config.execution.seed, - compiler=self.config.execution.compiler, - device=self.config.execution.device, - op_lib=self.config.execution.op_lib, - warmup=self.config.execution.warmup, - trials=self.config.execution.trials, - log_prompt=self.config.execution.log_prompt, - backend_config=self.config.execution.backend_config, - ) + eval_args = _create_eval_args(model_path, output_dir, self.config) log_buffer = StringIO() - result = RunResult( - output_path=output_path, - log_path=log_path, - ) + result = RunResult(output_path=output_path, log_path=log_path) + + self._execute_with_logging(eval_args, result, log_buffer) + self._finalize_result(result, log_buffer, log_path) + + return result + + def _execute_with_logging( + self, + eval_args: types.SimpleNamespace, + result: RunResult, + log_buffer: StringIO, + ) -> None: + """Execute evaluation with output redirection.""" + from graph_net_bench.torch import eval_backend_perf try: eval_backend_perf.register_op_lib(self.config.execution.op_lib) eval_backend_perf.set_seed(self.config.execution.seed) - with redirect_stdout(log_buffer), redirect_stderr(log_buffer): self._run_evaluation(eval_args, result) - except Exception as e: result.success = False result.error_message = f"{str(e)}\n{traceback.format_exc()}" log_buffer.write(f"\n[ERROR] {result.error_message}\n") + def _finalize_result( + self, result: RunResult, log_buffer: StringIO, log_path: Path + ) -> None: + """Finalize result: save log and print to stderr.""" result.log_content = log_buffer.getvalue() + _write_log_file(log_path, result.log_content) + print(result.log_content, file=sys.stderr, flush=True) - with open(log_path, "w", encoding="utf-8") as f: - f.write(result.log_content) + def _run_evaluation(self, args: types.SimpleNamespace, result: RunResult) -> None: + """Run model evaluation and populate result.""" + from graph_net_bench.torch import eval_backend_perf - print(result.log_content, file=sys.stderr, flush=True) + compiler, model, input_dict = self._prepare_model(args) + self._log_config(args) - return result + compiled_model = compiler(model) + + def model_call(): + return compiled_model(**input_dict) - def _run_evaluation(self, args: types.SimpleNamespace, result: RunResult): + outputs, time_stats = eval_backend_perf.measure_performance( + model_call, args, compiler + ) + + self._populate_result(result, outputs, time_stats) + self._log_completion(args, time_stats) + + def _prepare_model(self, args: types.SimpleNamespace) -> tuple: + """Prepare compiler, model, and inputs.""" from graph_net_bench.torch import eval_backend_perf - from graph_net_bench import test_compiler_util compiler = eval_backend_perf.get_compiler_backend(args) input_dict = eval_backend_perf.get_input_dict(args) model = eval_backend_perf.get_model(args) model.eval() + return compiler, model, input_dict + + def _log_config(self, args: types.SimpleNamespace) -> None: + """Log configuration information.""" + from graph_net_bench.torch import eval_backend_perf + from graph_net_bench import test_compiler_util test_compiler_util.print_config( args, - 
eval_backend_perf.get_hardward_name(args.device), + eval_backend_perf.get_hardware_name(args.device), eval_backend_perf.get_compiler_version(args.compiler), ) - compiled_model = compiler(model) - - def model_call(): - return compiled_model(**input_dict) - - outputs, time_stats = eval_backend_perf.measure_performance( - model_call, args, compiler - ) - + def _populate_result( + self, result: RunResult, outputs: Any, time_stats: dict + ) -> None: + """Populate result with outputs and stats.""" result.success = True result.outputs = outputs result.time_stats = time_stats - if result.output_path: torch.save(outputs, str(result.output_path)) + def _log_completion(self, args: types.SimpleNamespace, time_stats: dict) -> None: + """Log completion status and performance stats.""" + from graph_net_bench import test_compiler_util + test_compiler_util.print_running_status(args, True) test_compiler_util.print_with_log_prompt( "[Performance][eager]:", json.dumps(time_stats), args.log_prompt diff --git a/graph_net_bench/torch/runner/process_runner.py b/graph_net_bench/torch/runner/process_runner.py index 9ac68607e..1a48fec8e 100644 --- a/graph_net_bench/torch/runner/process_runner.py +++ b/graph_net_bench/torch/runner/process_runner.py @@ -1,57 +1,51 @@ +"""Process runner for subprocess-based model evaluation.""" + import os -import sys import subprocess +import sys from pathlib import Path +from typing import Dict import torch from .base_runner import BaseRunner, RunResult +def _get_env_with_pythonpath() -> Dict[str, str]: + """Get environment with PYTHONPATH set to repo root.""" + env = os.environ.copy() + repo_root = Path(__file__).resolve().parents[3] + env["PYTHONPATH"] = f"{repo_root}:{env.get('PYTHONPATH', '')}" + return env + + class ProcessRunner(BaseRunner): """Execute model evaluation in a separate subprocess on the local machine.""" def run(self, model_path: str, output_dir: str) -> RunResult: os.makedirs(output_dir, exist_ok=True) - log_path = self._get_log_path(output_dir, model_path) - output_path = self._get_output_path(output_dir, model_path) - result = RunResult( - output_path=output_path, - log_path=log_path, + output_path=self._get_output_path(output_dir, model_path), + log_path=self._get_log_path(output_dir, model_path), ) cmd = self._build_command(model_path, output_dir) print(f"[ProcessRunner] Executing: {cmd}", file=sys.stderr, flush=True) - try: - env = os.environ.copy() - repo_root = Path(__file__).resolve().parents[3] - env["PYTHONPATH"] = f"{repo_root}:{env.get('PYTHONPATH', '')}" - - proc = subprocess.run( - cmd, - shell=True, - env=env, - capture_output=True, - text=True, - timeout=self.config.strategy.subprocess_timeout, - ) - - result.log_content = proc.stderr or "" + self._execute_subprocess(cmd, result, output_dir, model_path) + print(result.log_content, file=sys.stderr, flush=True) - if proc.returncode != 0: - result.success = False - result.error_message = ( - f"Process exited with code {proc.returncode}\n" - f"stdout: {proc.stdout}\n" - f"stderr: {proc.stderr}" - ) - else: - result.success = True - self._parse_result(result, output_dir, model_path) + return result + def _execute_subprocess( + self, cmd: str, result: RunResult, output_dir: str, model_path: str + ) -> None: + """Execute subprocess and handle results.""" + try: + proc = self._run_process(cmd) + result.log_content = proc.stderr or "" + self._handle_process_result(proc, result, output_dir, model_path) except subprocess.TimeoutExpired as e: result.success = False result.error_message = f"Process timed out: 
{e}" @@ -59,10 +53,41 @@ def run(self, model_path: str, output_dir: str) -> RunResult: result.success = False result.error_message = f"Process execution failed: {e}" - print(result.log_content, file=sys.stderr, flush=True) - return result + def _run_process(self, cmd: str) -> subprocess.CompletedProcess: + """Run subprocess with configured timeout.""" + return subprocess.run( + cmd, + shell=True, + env=_get_env_with_pythonpath(), + capture_output=True, + text=True, + timeout=self.config.strategy.subprocess_timeout, + ) + + def _handle_process_result( + self, + proc: subprocess.CompletedProcess, + result: RunResult, + output_dir: str, + model_path: str, + ) -> None: + """Handle subprocess completion result.""" + if proc.returncode != 0: + result.success = False + result.error_message = ( + f"Process exited with code {proc.returncode}\n" + f"stdout: {proc.stdout}\n" + f"stderr: {proc.stderr}" + ) + return + result.success = True + self._parse_result(result, output_dir, model_path) def _build_command(self, model_path: str, output_dir: str) -> str: + """Build subprocess command string.""" + from graph_net_bench import test_compiler_util + + config_str = test_compiler_util.convert_to_base64(self.config.to_dict()) cmd_parts = [ sys.executable, "-m", @@ -71,32 +96,36 @@ def _build_command(self, model_path: str, output_dir: str) -> str: model_path, "--output-path", output_dir, + "--config", + config_str, ] - - config_dict = self.config.to_dict() - from graph_net_bench import test_compiler_util - - config_str = test_compiler_util.convert_to_base64(config_dict) - cmd_parts.extend(["--config", config_str]) - return " ".join(cmd_parts) - def _parse_result(self, result: RunResult, output_dir: str, model_path: str): + def _parse_result( + self, result: RunResult, output_dir: str, model_path: str + ) -> None: + """Parse outputs and logs from subprocess result.""" + self._load_outputs(result) + self._parse_log(result) + + def _load_outputs(self, result: RunResult) -> None: + """Load model outputs from file.""" + if not result.output_path or not result.output_path.exists(): + return + try: + result.outputs = torch.load(str(result.output_path)) + except Exception as e: + result.error_message += f"\nFailed to load outputs: {e}" + + def _parse_log(self, result: RunResult) -> None: + """Parse log file for content and timing stats.""" + if not result.log_path or not result.log_path.exists(): + return from graph_net_bench import test_compiler_util - if result.output_path and result.output_path.exists(): - try: - result.outputs = torch.load(str(result.output_path)) - except Exception as e: - result.error_message += f"\nFailed to load outputs: {e}" - - if result.log_path and result.log_path.exists(): - try: - result.log_content = test_compiler_util.extract_log_content( - str(result.log_path) - ) - result.time_stats = test_compiler_util.parse_performance_stats( - str(result.log_path) - ) - except Exception as e: - result.error_message += f"\nFailed to parse log: {e}" + try: + log_path_str = str(result.log_path) + result.log_content = test_compiler_util.extract_log_content(log_path_str) + result.time_stats = test_compiler_util.parse_performance_stats(log_path_str) + except Exception as e: + result.error_message += f"\nFailed to parse log: {e}" diff --git a/graph_net_bench/torch/runner/remote_runner.py b/graph_net_bench/torch/runner/remote_runner.py index 74c5e651f..c7c371278 100644 --- a/graph_net_bench/torch/runner/remote_runner.py +++ b/graph_net_bench/torch/runner/remote_runner.py @@ -1,135 +1,181 @@ +"""Remote 
diff --git a/graph_net_bench/torch/runner/remote_runner.py b/graph_net_bench/torch/runner/remote_runner.py
index 74c5e651f..c7c371278 100644
--- a/graph_net_bench/torch/runner/remote_runner.py
+++ b/graph_net_bench/torch/runner/remote_runner.py
@@ -1,135 +1,181 @@
+"""Remote runner for gRPC-based model evaluation."""
+
 import os
 import sys
-from typing import Dict
+import traceback
+from pathlib import Path
+from typing import Dict, Optional

 import torch

 from .base_runner import BaseRunner, RunResult


+def _find_file_by_extension(
+    files_dict: Dict[str, bytes], expected_name: Optional[str], extension: str
+) -> Optional[str]:
+    """Find file in dict by expected name or by extension if only one exists."""
+    if expected_name and expected_name in files_dict:
+        return expected_name
+    available = sorted(k for k in files_dict.keys() if k.endswith(extension))
+    if len(available) == 1:
+        return available[0]
+    return None
+
+
+def _save_bytes_to_file(path: Path, content: bytes) -> None:
+    """Save bytes content to file."""
+    with open(path, "wb") as f:
+        f.write(content)
+
+
 class RemoteRunner(BaseRunner):
     """Execute model evaluation on a remote machine via gRPC."""

     def run(self, model_path: str, output_dir: str) -> RunResult:
-        from graph_net_rpc.sample_remote_executor import SampleRemoteExecutor
-
         os.makedirs(output_dir, exist_ok=True)
-        log_path = self._get_log_path(output_dir, model_path)
-        output_path = self._get_output_path(output_dir, model_path)
-
         result = RunResult(
-            output_path=output_path,
-            log_path=log_path,
+            output_path=self._get_output_path(output_dir, model_path),
+            log_path=self._get_log_path(output_dir, model_path),
         )
-        rpc_cmd = self._build_rpc_command()
+        self._execute_remote(model_path, result)
+        return result
+
+    def _execute_remote(self, model_path: str, result: RunResult) -> None:
+        """Execute model on remote machine."""
+        from graph_net_rpc.sample_remote_executor import SampleRemoteExecutor
+
         executor = SampleRemoteExecutor(
             machine=self.config.strategy.remote_machine,
             port=self.config.strategy.remote_port,
         )

         try:
-            print(
-                f"[RemoteRunner] Sending to {self.config.strategy.remote_machine}:{self.config.strategy.remote_port}",
-                file=sys.stderr,
-                flush=True,
-            )
+            self._log_execution_start()
+            rpc_cmd = self._build_rpc_command()
             print(f"[RemoteRunner] rpc_cmd: {rpc_cmd}", file=sys.stderr, flush=True)

             files_dict = executor.execute(model_path, rpc_cmd)
-            self._process_remote_output(result, files_dict, output_dir, model_path)
+            self._process_remote_output(result, files_dict)
             result.success = True
-
         except Exception as e:
-            import traceback
-
             result.success = False
             result.error_message = (
                 f"Remote execution failed: {e}\n{traceback.format_exc()}"
             )
             print(result.error_message, file=sys.stderr, flush=True)
-
         finally:
             executor.close()

-        return result
+    def _log_execution_start(self) -> None:
+        """Log remote execution start."""
+        machine = self.config.strategy.remote_machine
+        port = self.config.strategy.remote_port
+        print(
+            f"[RemoteRunner] Sending to {machine}:{port}", file=sys.stderr, flush=True
+        )

     def _build_rpc_command(self) -> str:
-        cmd = "python3 -m graph_net.torch.test_reference_device"
-        cmd += ' --model-path "$INPUT_WORKSPACE"'
-        cmd += ' --reference-dir "$OUTPUT_WORKSPACE"'
-        cmd += f" --compiler {self.config.execution.compiler}"
-        cmd += f" --device {self.config.execution.device}"
-        cmd += f" --op-lib {self.config.execution.op_lib}"
-        cmd += f" --warmup {self.config.execution.warmup}"
-        cmd += f" --trials {self.config.execution.trials}"
-        cmd += f" --seed {self.config.execution.seed}"
-
-        if self.config.execution.log_prompt:
-            cmd += f" --log-prompt {self.config.execution.log_prompt}"
-        if self.config.execution.backend_config:
-            cmd += f" --config {self.config.execution.backend_config}"
-
-        return cmd
+        """Build remote execution command string."""
+        exec_cfg = self.config.execution
+        cmd_parts = [
+            "python3 -m graph_net.torch.test_reference_device",
+            '--model-path "$INPUT_WORKSPACE"',
+            '--reference-dir "$OUTPUT_WORKSPACE"',
+            f"--compiler {exec_cfg.compiler}",
+            f"--device {exec_cfg.device}",
+            f"--op-lib {exec_cfg.op_lib}",
+            f"--warmup {exec_cfg.warmup}",
+            f"--trials {exec_cfg.trials}",
+            f"--seed {exec_cfg.seed}",
+        ]
+        if exec_cfg.log_prompt:
+            cmd_parts.append(f"--log-prompt {exec_cfg.log_prompt}")
+        if exec_cfg.backend_config:
+            cmd_parts.append(f"--config {exec_cfg.backend_config}")
+        return " ".join(cmd_parts)
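For a concrete feel of what `_build_rpc_command` emits, a sketch with made-up execution settings (the field values below are illustrative only; the real values come from `self.config.execution`):

from types import SimpleNamespace

# Hypothetical stand-in for self.config.execution; field names mirror the patch.
exec_cfg = SimpleNamespace(
    compiler="inductor",
    device="cuda:0",
    op_lib="default",
    warmup=5,
    trials=10,
    seed=1024,
)
cmd_parts = [
    "python3 -m graph_net.torch.test_reference_device",
    '--model-path "$INPUT_WORKSPACE"',
    '--reference-dir "$OUTPUT_WORKSPACE"',
    f"--compiler {exec_cfg.compiler}",
    f"--device {exec_cfg.device}",
    f"--op-lib {exec_cfg.op_lib}",
    f"--warmup {exec_cfg.warmup}",
    f"--trials {exec_cfg.trials}",
    f"--seed {exec_cfg.seed}",
]
print(" ".join(cmd_parts))
# -> python3 -m graph_net.torch.test_reference_device --model-path "$INPUT_WORKSPACE" ...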
string.""" + exec_cfg = self.config.execution + cmd_parts = [ + "python3 -m graph_net.torch.test_reference_device", + '--model-path "$INPUT_WORKSPACE"', + '--reference-dir "$OUTPUT_WORKSPACE"', + f"--compiler {exec_cfg.compiler}", + f"--device {exec_cfg.device}", + f"--op-lib {exec_cfg.op_lib}", + f"--warmup {exec_cfg.warmup}", + f"--trials {exec_cfg.trials}", + f"--seed {exec_cfg.seed}", + ] + if exec_cfg.log_prompt: + cmd_parts.append(f"--log-prompt {exec_cfg.log_prompt}") + if exec_cfg.backend_config: + cmd_parts.append(f"--config {exec_cfg.backend_config}") + return " ".join(cmd_parts) def _process_remote_output( - self, - result: RunResult, - files_dict: Dict[str, bytes], - output_dir: str, - model_path: str, - ): - from graph_net_bench import test_compiler_util - - log_filename = result.log_path.name if result.log_path else None - pth_filename = result.output_path.name if result.output_path else None - - available_logs = sorted([k for k in files_dict.keys() if k.endswith(".log")]) - available_pths = sorted([k for k in files_dict.keys() if k.endswith(".pth")]) - - if log_filename not in files_dict and len(available_logs) == 1: - log_filename = available_logs[0] - if pth_filename not in files_dict and len(available_pths) == 1: - pth_filename = available_pths[0] - - if log_filename and log_filename in files_dict: - log_bytes = files_dict[log_filename] - if result.log_path: - with open(result.log_path, "wb") as f: - f.write(log_bytes) - try: - result.log_content = log_bytes.decode("utf-8") - print(result.log_content, file=sys.stderr, flush=True) - except Exception: - result.log_content = f"[Binary log, {len(log_bytes)} bytes]" - # Write binary content as text for parsing - with open(result.log_path, "wb") as f: - f.write(log_bytes) - - try: - result.time_stats = test_compiler_util.parse_performance_stats( - str(result.log_path) - ) - except Exception as e: - print(f"Warning: Failed to parse time stats: {e}", file=sys.stderr) - else: + self, result: RunResult, files_dict: Dict[str, bytes] + ) -> None: + """Process files received from remote execution.""" + self._process_log_file(result, files_dict) + self._process_output_file(result, files_dict) + + def _process_log_file( + self, result: RunResult, files_dict: Dict[str, bytes] + ) -> None: + """Process log file from remote output.""" + expected_name = result.log_path.name if result.log_path else None + log_filename = _find_file_by_extension(files_dict, expected_name, ".log") + + if not log_filename: + available = [k for k in files_dict.keys() if k.endswith(".log")] print( - f"Warning: log not found. expected={log_filename}, available={available_logs}", + f"Warning: log not found. 
+
+    def _save_and_parse_log(self, result: RunResult, log_bytes: bytes) -> None:
+        """Save log file and parse timing stats."""
+
+        if result.log_path:
+            _save_bytes_to_file(result.log_path, log_bytes)
+
+        result.log_content = self._decode_log_content(log_bytes)
+        print(result.log_content, file=sys.stderr, flush=True)
+
+        self._parse_time_stats(result)
+
+    def _decode_log_content(self, log_bytes: bytes) -> str:
+        """Decode log bytes to string."""
+        try:
+            return log_bytes.decode("utf-8")
+        except Exception:
+            return f"[Binary log, {len(log_bytes)} bytes]"
+
+    def _parse_time_stats(self, result: RunResult) -> None:
+        """Parse performance stats from log file."""
+        if not result.log_path:
+            return
+        from graph_net_bench import test_compiler_util

-        if pth_filename and pth_filename in files_dict:
-            pth_bytes = files_dict[pth_filename]
-            if result.output_path:
-                with open(result.output_path, "wb") as f:
-                    f.write(pth_bytes)
-            try:
-                result.outputs = torch.load(str(result.output_path))
-            except Exception as e:
-                print(f"Warning: Failed to load outputs: {e}", file=sys.stderr)
-        else:
+        try:
+            result.time_stats = test_compiler_util.parse_performance_stats(
+                str(result.log_path)
+            )
+        except Exception as e:
+            print(f"Warning: Failed to parse time stats: {e}", file=sys.stderr)
+
+    def _process_output_file(
+        self, result: RunResult, files_dict: Dict[str, bytes]
+    ) -> None:
+        """Process output .pth file from remote output."""
+        expected_name = result.output_path.name if result.output_path else None
+        pth_filename = _find_file_by_extension(files_dict, expected_name, ".pth")
+
+        if not pth_filename:
+            available = [k for k in files_dict.keys() if k.endswith(".pth")]
             print(
-                f"Warning: output not found. expected={pth_filename}, available={available_pths}",
+                f"Warning: output not found. expected={expected_name}, available={available}",
                 file=sys.stderr,
             )
+            return
+
+        pth_bytes = files_dict[pth_filename]
+        self._save_and_load_outputs(result, pth_bytes)
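A quick illustration of the filename-matching fallback used by `_process_log_file` and `_process_output_file` above (file names are made up; the function body mirrors `_find_file_by_extension`):

from typing import Dict, Optional


def find_file_by_extension(
    files_dict: Dict[str, bytes], expected_name: Optional[str], extension: str
) -> Optional[str]:
    # Prefer the exact expected name; otherwise fall back to the unique file
    # with the right extension, if there is exactly one candidate.
    if expected_name and expected_name in files_dict:
        return expected_name
    available = sorted(k for k in files_dict if k.endswith(extension))
    return available[0] if len(available) == 1 else None


files = {"run_01.log": b"...", "model.pth": b"..."}
assert find_file_by_extension(files, "missing.log", ".log") == "run_01.log"
assert find_file_by_extension(files, None, ".pth") == "model.pth"
assert find_file_by_extension({}, None, ".log") is None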
+
+    def _save_and_load_outputs(self, result: RunResult, pth_bytes: bytes) -> None:
+        """Save output file and load tensors."""
+        if result.output_path:
+            _save_bytes_to_file(result.output_path, pth_bytes)
+        try:
+            result.outputs = torch.load(str(result.output_path))
+        except Exception as e:
+            print(f"Warning: Failed to load outputs: {e}", file=sys.stderr)
diff --git a/graph_net_bench/torch/test_compiler.py b/graph_net_bench/torch/test_compiler.py
index 8ee670fd2..52e027c65 100755
--- a/graph_net_bench/torch/test_compiler.py
+++ b/graph_net_bench/torch/test_compiler.py
@@ -58,7 +58,7 @@ def set_seed(random_seed):
         torch.cuda.manual_seed_all(random_seed)


-def get_hardward_name(args):
+def get_hardware_name(args):
     hardware_name = "unknown"
     if "cuda" in args.device:
         hardware_name = torch.cuda.get_device_name(args.device)
@@ -146,7 +146,7 @@ def measure_performance(model_call, args, compiler):
         model_call()
         compiler.synchronize()

-    hardware_name = get_hardward_name(args)
+    hardware_name = get_hardware_name(args)
     print(
         f"[Profiling] Using device: {args.device} {hardware_name}, warm up {args.warmup}, trials {args.trials}",
         file=sys.stderr,
@@ -214,7 +214,7 @@ def test_single_model(args):
         "[Processing]", model_path, args.log_prompt
     )
     test_compiler_util.print_basic_config(
-        args, get_hardward_name(args), get_compile_framework_version(args)
+        args, get_hardware_name(args), get_compile_framework_version(args)
     )

     runtime_seed = 1024
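The rename above only fixes the spelling (get_hardward_name -> get_hardware_name); the lookup logic is unchanged. A minimal standalone version of the same idea, taking the device string directly instead of `args` (a simplification assumed here, not part of the patch):

import platform

import torch


def get_hardware_name(device: str) -> str:
    # CUDA devices report the GPU name; CPU falls back to the platform string.
    if "cuda" in device and torch.cuda.is_available():
        return torch.cuda.get_device_name(device)
    if device == "cpu":
        return platform.processor()
    return "unknown"


if __name__ == "__main__":
    print(get_hardware_name("cuda:0" if torch.cuda.is_available() else "cpu"))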