tact/benchmark.py at master · jonchang/tact · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
#!/usr/bin/env python3
"""Benchmarking script for TACT performance across different Python implementations.

This script measures the execution time of TACT commands using uv to manage Python versions:
- uv (pypy) - PyPy 3.11
- uv (python3.14) - Python 3.14 without JIT
- uv (python3.14-jit) - Python 3.14 with experimental JIT (PYTHON_JIT=1)

uv automatically downloads and manages Python versions, ensuring Python 3.14 has JIT enabled.
The JIT version sets PYTHON_JIT=1 environment variable and verifies JIT is enabled.

Usage:
    python benchmark.py [--dataset carangaria|percomorphaceae] [--runs N] [--warmup N]
"""

import argparse
import json
import os
import shutil
import subprocess
import tempfile
import time
from pathlib import Path
from statistics import mean, stdev
from typing import Any


class BenchmarkRunner:
    """Runs benchmarks across different Python implementations."""

    def __init__(
        self,
        dataset: str = "carangaria",
        runs: int = 5,
        warmup: int = 1,
        examples_dir: Path | None = None,
    ):
        """Initialize the benchmark runner.

        Args:
            dataset: Dataset to use ('carangaria' or 'percomorphaceae')
            runs: Number of benchmark runs to perform
            warmup: Number of warmup runs before benchmarking
            examples_dir: Path to examples directory (defaults to ./examples)
        """
        self.dataset = dataset.lower()
        self.runs = runs
        self.warmup = warmup
        self.examples_dir = examples_dir or Path(__file__).parent / "examples"
        self.temp_dir = None

        # Python implementations to test (using uv to manage Python versions)
        # uv will download and manage these Python versions automatically
        self.implementations = {
            "uv (pypy)": {
                "python_version": "pypy3.11",
                "install_cmd": ["uv", "python", "install", "pypy3.11"],
                "run_cmd": ["uv", "run", "--python", "pypy3.11", "--with", "tact"],
                "env": None,  # No special environment variables
            },
            "uv (python3.14)": {
                "python_version": "3.14",
                "install_cmd": ["uv", "python", "install", "3.14"],
                "run_cmd": ["uv", "run", "--python", "3.14", "--with", "tact"],
                "env": None,  # No JIT enabled
            },
            "uv (python3.14-jit)": {
                "python_version": "3.14",
                "install_cmd": ["uv", "python", "install", "3.14"],
                "run_cmd": ["uv", "run", "--python", "3.14", "--with", "tact"],
                "env": {"PYTHON_JIT": "1"},  # Enable JIT
            },
        }

    def setup_dataset(self) -> tuple[Path, Path, Path]:
        """Set up the dataset files and return paths to backbone, taxonomy, and output.

        Returns:
            Tuple of (backbone_path, taxonomy_path, output_base)
        """
        # Create temporary directory for outputs
        self.temp_dir = Path(tempfile.mkdtemp(prefix="tact_benchmark_"))
        output_base = self.temp_dir / "result"

        if self.dataset == "carangaria":
            csv_file = self.examples_dir / "Carangaria.csv"
            tre_file = self.examples_dir / "Carangaria.tre"
            taxonomy_file = self.temp_dir / "Carangaria.taxonomy.tre"
        elif self.dataset == "percomorphaceae":
            # For Percomorphaceae, look for files in examples directory
            # or allow user to provide via --examples-dir
            csv_file = self.examples_dir / "Percomorphaceae.csv"
            tre_file = self.examples_dir / "Percomorphaceae.tre"
            taxonomy_file = self.temp_dir / "Percomorphaceae.taxonomy.tre"
            if not csv_file.exists() or not tre_file.exists():
                raise FileNotFoundError(
                    f"Percomorphaceae dataset files not found in {self.examples_dir}. "
                    "Please provide Percomorphaceae.csv and Percomorphaceae.tre files."
                )
        else:
            raise ValueError(f"Unknown dataset: {self.dataset}")

        if not csv_file.exists():
            raise FileNotFoundError(f"CSV file not found: {csv_file}")
        if not tre_file.exists():
            raise FileNotFoundError(f"Tree file not found: {tre_file}")

        return tre_file, csv_file, taxonomy_file, output_base

    def build_taxonomy(
        self, csv_file: Path, taxonomy_file: Path, run_cmd: list[str], env: dict[str, str] | None = None
    ) -> float:
        """Build the taxonomy tree from CSV.

        Args:
            csv_file: Path to CSV file
            taxonomy_file: Path to output taxonomy file
            run_cmd: Command to run (e.g., ['uv', 'run', '--python', '3.11', '--with', 'tact'])
            env: Optional environment variables to set

        Returns:
            Execution time in seconds
        """
        cmd = [*run_cmd, "tact_build_taxonomic_tree", str(csv_file), "--output", str(taxonomy_file)]

        # Merge with current environment
        process_env = os.environ.copy()
        if env:
            process_env.update(env)

        start = time.perf_counter()
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=True,
            env=process_env,
        )
        elapsed = time.perf_counter() - start

        if not taxonomy_file.exists():
            raise RuntimeError(f"Taxonomy file not created: {taxonomy_file}\n{result.stderr}")

        return elapsed

    def run_tact_add_taxa(
        self,
        backbone: Path,
        taxonomy: Path,
        output_base: Path,
        run_cmd: list[str],
        env: dict[str, str] | None = None,
    ) -> float:
        """Run tact_add_taxa command.

        Args:
            backbone: Path to backbone tree
            taxonomy: Path to taxonomy tree
            output_base: Base name for output files
            run_cmd: Command to run (e.g., ['uv', 'run', '--python', '3.11', '--with', 'tact'])
            env: Optional environment variables to set

        Returns:
            Execution time in seconds
        """
        cmd = [
            *run_cmd,
            "tact_add_taxa",
            "--backbone",
            str(backbone),
            "--taxonomy",
            str(taxonomy),
            "--output",
            str(output_base),
        ]

        # Merge with current environment
        process_env = os.environ.copy()
        if env:
            process_env.update(env)

        start = time.perf_counter()
        subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=True,
            env=process_env,
        )
        elapsed = time.perf_counter() - start

        return elapsed

    def ensure_python_version(self, name: str, impl_config: dict[str, Any]) -> bool:
        """Ensure a Python version is installed via uv and verify it works.

        Args:
            name: Name of the implementation
            impl_config: Implementation configuration dict with install_cmd and run_cmd

        Returns:
            True if available or successfully installed and verified, False otherwise
        """
        # Check if uv is available
        try:
            subprocess.run(
                ["uv", "--version"],
                capture_output=True,
                text=True,
                check=True,
            )
        except (subprocess.CalledProcessError, FileNotFoundError):
            print("Error: uv is not installed. Please install uv first:")
            print("  curl -LsSf https://astral.sh/uv/install.sh | sh")
            return False

        # Install Python version if needed
        print(f"Ensuring Python version is available: {impl_config['python_version']}")
        try:
            result = subprocess.run(
                impl_config["install_cmd"],
                capture_output=True,
                text=True,
                check=True,
            )
        except subprocess.CalledProcessError as e:
            print(f"Failed to install Python version {impl_config['python_version']}: {e.stderr}")
            return False

        # Verify the interpreter can run by checking if tact is available
        print(f"Verifying {name} can run tact commands...", end=" ", flush=True)
        try:
            # Test with --help to verify the command works
            verify_cmd = impl_config["run_cmd"] + ["tact_add_taxa", "--help"]
            process_env = os.environ.copy()
            if impl_config.get("env"):
                process_env.update(impl_config["env"])
            result = subprocess.run(
                verify_cmd,
                capture_output=True,
                text=True,
                check=True,
                env=process_env,
            )
            print("✓")
        except subprocess.CalledProcessError as e:
            print(f"✗ Failed: {e.stderr}")
            return False

        # For Python 3.14 with JIT, verify JIT is enabled
        if name == "uv (python3.14-jit)":
            print("Verifying Python 3.14 JIT is enabled...", end=" ", flush=True)
            try:
                code = "import sys;j=getattr(sys,'_jit',None);print('True' if j and j.is_enabled() else 'False')"
                process_env = os.environ.copy()
                process_env["PYTHON_JIT"] = "1"
                result = subprocess.run(
                    ["uv", "run", "--python", "3.14", "python", "-c", code],
                    capture_output=True,
                    text=True,
                    check=True,
                    env=process_env,
                )
                if result.stdout.strip() != "True":
                    print(f"✗ JIT not enabled. Output: {result.stdout.strip()}")
                    return False
                print("✓")
            except subprocess.CalledProcessError as e:
                print(f"✗ Failed to verify JIT: {e.stderr}")
                return False

        return True

    def benchmark_implementation(
        self,
        name: str,
        run_cmd: list[str],
        backbone: Path,
        taxonomy: Path,
        output_base: Path,
        env: dict[str, str] | None = None,
    ) -> dict[str, float]:
        """Benchmark a single Python implementation.

        Args:
            name: Name of the implementation
            run_cmd: Command to run (e.g., ['uv', 'run', '--python', '3.11', '--with', 'tact'])
            backbone: Path to backbone tree
            taxonomy: Path to taxonomy tree
            output_base: Base name for output files
            env: Optional environment variables to set

        Returns:
            Dictionary with timing statistics
        """
        print(f"\n{'=' * 60}")
        print(f"Benchmarking: {name}")
        print(f"{'=' * 60}")

        times = []

        # Warmup runs
        if self.warmup > 0:
            print(f"Running {self.warmup} warmup run(s)...")
            for i in range(self.warmup):
                print(f"  Warmup {i + 1}/{self.warmup}...", end=" ", flush=True)
                try:
                    self.run_tact_add_taxa(backbone, taxonomy, output_base, run_cmd, env)
                    print("✓")
                except subprocess.CalledProcessError as e:
                    print(f"✗ Failed: {e}")
                    return {"error": str(e)}

        # Actual benchmark runs
        print(f"\nRunning {self.runs} benchmark run(s)...")
        for i in range(self.runs):
            print(f"  Run {i + 1}/{self.runs}...", end=" ", flush=True)
            try:
                elapsed = self.run_tact_add_taxa(backbone, taxonomy, output_base, run_cmd, env)
                times.append(elapsed)
                print(f"✓ ({elapsed:.2f}s)")
            except subprocess.CalledProcessError as e:
                print(f"✗ Failed: {e.stderr}")
                return {"error": str(e)}

        if not times:
            return {"error": "No successful runs"}

        # Calculate statistics
        mean_time = mean(times)
        min_time = min(times)
        max_time = max(times)
        std_time = stdev(times) if len(times) > 1 else 0.0

        return {
            "mean": mean_time,
            "min": min_time,
            "max": max_time,
            "std": std_time,
            "runs": times,
        }

    def run(self) -> dict[str, dict[str, float]]:
        """Run all benchmarks.

        Returns:
            Dictionary mapping implementation names to their timing statistics
        """
        print(f"Benchmarking TACT with dataset: {self.dataset}")
        print(f"Runs: {self.runs}, Warmup: {self.warmup}")

        # Set up dataset
        backbone, csv_file, taxonomy_file, output_base = self.setup_dataset()

        results = {}

        # Ensure all Python versions are installed
        print("\nEnsuring Python versions are available...")
        for name, impl_config in self.implementations.items():
            if not self.ensure_python_version(name, impl_config):
                print(f"\nSkipping {name} (failed to install/verify)")
                results[name] = {"error": "Implementation not available"}
                continue

        # Build taxonomy once (we'll reuse it for all implementations)
        print("\nBuilding taxonomy tree...")
        # Use python3.14 (without JIT) for building taxonomy (should be fast and available)
        try:
            build_cmd = self.implementations["uv (python3.14)"]["run_cmd"]
            build_env = self.implementations["uv (python3.14)"]["env"]
            build_time = self.build_taxonomy(csv_file, taxonomy_file, build_cmd, build_env)
            print(f"Taxonomy built in {build_time:.2f}s")
        except subprocess.CalledProcessError as e:
            print(f"Failed to build taxonomy: {e.stderr}")
            return {"error": "Failed to build taxonomy"}

        # Benchmark each implementation
        for name, impl_config in self.implementations.items():
            if name in results and "error" in results[name]:
                # Already marked as unavailable
                continue

            # Create a unique output base for each implementation
            safe_name = name.replace(" ", "_").replace("(", "").replace(")", "").replace(".", "_")
            impl_output_base = output_base.parent / f"{output_base.name}_{safe_name}"

            result = self.benchmark_implementation(
                name,
                impl_config["run_cmd"],
                backbone,
                taxonomy_file,
                impl_output_base,
                impl_config["env"],
            )
            results[name] = result

        return results

    def cleanup(self):
        """Clean up temporary files."""
        if self.temp_dir and self.temp_dir.exists():
            shutil.rmtree(self.temp_dir)
            print(f"\nCleaned up temporary directory: {self.temp_dir}")

    def format_results(self, results: dict[str, dict[str, float]]) -> str:
        """Format results as a markdown table.

        Args:
            results: Dictionary of benchmark results

        Returns:
            Formatted markdown table string
        """
        # Find baseline (pypy) for relative calculations
        baseline = None
        if "uv (pypy)" in results and "mean" in results["uv (pypy)"]:
            baseline = results["uv (pypy)"]["mean"]
        elif "uv (python3.14)" in results and "mean" in results["uv (python3.14)"]:
            baseline = results["uv (python3.14)"]["mean"]

        lines = []
        lines.append("| python | Mean [s] | Min [s] | Max [s] | Relative |")
        lines.append("|:---|---:|---:|---:|---:|")

        # Order implementations for consistent output
        order = ["uv (pypy)", "uv (python3.14)", "uv (python3.14-jit)"]
        for name in order:
            if name not in results:
                continue

            result = results[name]
            if "error" in result:
                error_msg = result["error"][:50]  # Truncate long error messages
                lines.append(f"| `{name}` | Error: {error_msg} | | | |")
                continue

            mean_time = result["mean"]
            min_time = result["min"]
            max_time = result["max"]
            std_time = result["std"]

            # Calculate relative performance
            if baseline:
                relative = mean_time / baseline
                relative_std = std_time / baseline if std_time > 0 else 0
                relative_str = f"{relative:.2f} ± {relative_std:.2f}" if relative_std > 0 else f"{relative:.2f}"
            else:
                relative_str = "—"

            # Format mean with std dev (matching hyperfine format)
            mean_str = f"{mean_time:.3f} ± {std_time:.3f}" if std_time > 0 else f"{mean_time:.3f}"

            lines.append(f"| `{name}` | {mean_str} | {min_time:.3f} | {max_time:.3f} | {relative_str} |")

        return "\n".join(lines)

    def __enter__(self):
        """Enter the context."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Exit the context."""
        self.cleanup()


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description="Benchmark TACT performance across different Python implementations")
    parser.add_argument(
        "--dataset",
        choices=["carangaria", "percomorphaceae"],
        default="carangaria",
        help="Dataset to use for benchmarking (default: carangaria)",
    )
    parser.add_argument(
        "--runs",
        type=int,
        default=5,
        help="Number of benchmark runs (default: 5)",
    )
    parser.add_argument(
        "--warmup",
        type=int,
        default=1,
        help="Number of warmup runs (default: 1)",
    )
    parser.add_argument(
        "--examples-dir",
        type=Path,
        help="Path to examples directory (default: ./examples)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output results as JSON instead of markdown table",
    )

    args = parser.parse_args()

    with BenchmarkRunner(
        dataset=args.dataset,
        runs=args.runs,
        warmup=args.warmup,
        examples_dir=args.examples_dir,
    ) as runner:
        results = runner.run()

        if args.json:
            print("\n" + json.dumps(results, indent=2))
        else:
            print("\n" + "=" * 60)
            print("RESULTS")
            print("=" * 60)
            print(runner.format_results(results))


if __name__ == "__main__":
    main()