diff --git a/graph_net_bench/torch/eval_backend_perf.py b/graph_net_bench/torch/eval_backend_perf.py
index 3fd6db3ff..b22e322da 100644
--- a/graph_net_bench/torch/eval_backend_perf.py
+++ b/graph_net_bench/torch/eval_backend_perf.py
@@ -15,6 +15,7 @@
 from contextlib import redirect_stdout, redirect_stderr
 from graph_net_bench.torch.backend.graph_compiler_backend import GraphCompilerBackend
 from graph_net_bench import test_compiler_util
+from .util.timing import measure_performance
 
 
 def register_op_lib(op_lib):
@@ -129,69 +130,6 @@ def get_input_dict(args):
     }
 
 
-def measure_performance(model_call, args, compiler):
-    stats = {}
-    outs = model_call()
-
-    # Warmup runs
-    for _ in range(args.warmup):
-        model_call()
-    compiler.synchronize()
-
-    print(
-        f"[Profiling] Warm up {args.warmup}, Trials {args.trials}",
-        file=sys.stderr,
-        flush=True,
-    )
-
-    if "cuda" in args.device:
-        torch.cuda.empty_cache()
-        e2e_times = []
-        gpu_times = []
-
-        for i in range(args.trials):
-            # End-to-end timing (naive_timer)
-            duration_box = test_compiler_util.DurationBox(-1)
-            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
-                # GPU-only timing (CUDA Events)
-                start_event = torch.cuda.Event(enable_timing=True)
-                end_event = torch.cuda.Event(enable_timing=True)
-                start_event.record()
-
-                model_call()
-
-                end_event.record()
-                compiler.synchronize()
-
-            gpu_time_ms = start_event.elapsed_time(end_event)
-            e2e_times.append(duration_box.value)
-            gpu_times.append(gpu_time_ms)
-            print(
-                f"Trial {i + 1}: e2e={duration_box.value:.5f} ms, gpu={gpu_time_ms:.5f} ms",
-                file=sys.stderr,
-                flush=True,
-            )
-
-        stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times)
-        stats["gpu"] = test_compiler_util.get_timing_stats(gpu_times)
-
-    else:  # CPU or other devices
-        e2e_times = []
-        for i in range(args.trials):
-            duration_box = test_compiler_util.DurationBox(-1)
-            with test_compiler_util.naive_timer(duration_box, compiler.synchronize):
-                model_call()
-            print(
-                f"Trial {i + 1}: e2e={duration_box.value:.5f} ms",
-                file=sys.stderr,
-                flush=True,
-            )
-            e2e_times.append(duration_box.value)
-        stats["e2e"] = test_compiler_util.get_timing_stats(e2e_times)
-
-    return outs, stats
-
-
 def eval_single_model_with_single_backend(args):
     check_and_complete_args(args)
     set_seed(args.seed)
diff --git a/graph_net_bench/torch/util/timing.py b/graph_net_bench/torch/util/timing.py
new file mode 100644
index 000000000..e9c42cfa7
--- /dev/null
+++ b/graph_net_bench/torch/util/timing.py
@@ -0,0 +1,90 @@
+import torch
+import sys
+from graph_net_bench import test_compiler_util
+
+
+def measure_performance(model_call, args, compiler):
+    stats = {}
+    outs = model_call()
+    # Warmup runs
+    for _ in range(args.warmup):
+        model_call()
+    compiler.synchronize()
+
+    print(
+        f"[Profiling] Warm up {args.warmup}, Trials {args.trials}",
+        file=sys.stderr,
+        flush=True,
+    )
+
+    if "cuda" in args.device:
+        torch.cuda.empty_cache()
+        executor = CUDATrialExecutor(model_call, compiler)
+    else:
+        executor = NoneCUDATrialExecutor(model_call, compiler)
+
+    timings = run_benchmark(args.trials, executor)
+
+    stats = {
+        name: test_compiler_util.get_timing_stats(values)
+        for name, values in timings.items()
+    }
+
+    return outs, stats
+
+
+def run_benchmark(trials, executor):
+    results = {}
+
+    for i in range(trials):
+        timings = executor.run_one_trial()
+
+        for k, v in timings.items():
+            results.setdefault(k, []).append(v)
+
+        log_trial(i + 1, timings)
+
+    return results
+
+
+def log_trial(idx, timings):
+    msg = ", ".join(f"{k}={v:.5f} ms" for k, v in timings.items())
+    print(f"Trial {idx}: {msg}", file=sys.stderr, flush=True)
+
+
+class BaseTrialExecutor:
+    def __init__(self, model_call, compiler):
+        self.model_call = model_call
+        self.compiler = compiler
+
+    def run_one_trial(self):
+        raise NotImplementedError
+
+
+class NoneCUDATrialExecutor(BaseTrialExecutor):
+    def run_one_trial(self):
+        duration_box = test_compiler_util.DurationBox(-1)
+        with test_compiler_util.naive_timer(duration_box, self.compiler.synchronize):
+            self.model_call()
+        return {"e2e": duration_box.value}
+
+
+class CUDATrialExecutor(BaseTrialExecutor):
+    def run_one_trial(self):
+        duration_box = test_compiler_util.DurationBox(-1)
+
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+
+        with test_compiler_util.naive_timer(duration_box, self.compiler.synchronize):
+            start_event.record()
+            self.model_call()
+            end_event.record()
+            self.compiler.synchronize()
+
+        gpu_time = start_event.elapsed_time(end_event)
+
+        return {
+            "e2e": duration_box.value,
+            "gpu": gpu_time,
+        }