control-tax-code/apps_utils.py at main · MikhailTerekhov/control-tax-code · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
import io
import json
import os
import tempfile
import tokenize
from datetime import datetime
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import List, Tuple, Dict, Any
import traceback
import re

import black
from inspect_ai.dataset import hf_dataset, Sample, MemoryDataset, Dataset

from seccomp_sandbox import run_sandboxed


@dataclass
class Test:
    """Represents a single test case with an input and expected output."""
    input: str | list
    expected_output: str | list


# Distinct from seccomp_sandbox.ExecutionStatus, this focuses on the success/failure of the code
# When run by the APPS tooling. It gives outputs in a different format and should be handled differently
class AppsExecutionStatus(str, Enum):
    """Execution status codes for APPS code evaluation."""
    COMPILATION_ERROR = "compilation_error"
    RUNTIME_ERROR = "runtime_error"
    SUCCESS = "success"


def extract_tagged_snippets(text: str, tag: str) -> List[str]:
    """Extract snippets enclosed in <tag> ... </tag> markers from the given text.

    Args:
        text: The input string containing tagged snippets.
        tag: The tag name (without angle brackets) to search for.

    Returns:
        A list of snippet strings found between each pair of <tag> and </tag>.
    """
    tag_start = f"<{tag}>"
    tag_end = f"</{tag}>"
    codes = []
    start_idx = 0
    while True:
        start_idx = text.find(tag_start, start_idx)
        if start_idx == -1:
            break
        end_idx = text.find(tag_end, start_idx)
        if end_idx == -1:
            break
        code_snippet = text[start_idx + len(tag_start) : end_idx].strip()
        codes.append(code_snippet)
        start_idx = end_idx + len(tag_end)
    return codes


# A helper function to save fields to files to reduce the size of the data dict
# Useful to save bulky metadata fields like stdout, stderr, test_outputs and reducing the size of the .eval file
def save_fields_to_files(
    data: Dict[str, Any], keys: List[str], output_dir: Path, name_prefix: str = ""
):
    """Save specified dictionary fields to external files and replace them with file paths.

    Args:
        data: The dictionary containing fields to offload.
        keys: List of keys in `data` to save to files.
        output_dir: Directory where the files will be written.
        name_prefix: Optional prefix to prepend to filenames.

    Side Effects:
        Writes files to `output_dir` and mutates `data` by replacing
        each saved field with a new key `<key>_file` pointing to the file path.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    for key in keys:
        if key not in data:
            continue
        if isinstance(data[key], str):
            file_path = output_dir / f"{name_prefix}{key}_{timestamp}.txt"
            with open(file_path, "w") as f:
                f.write(data[key])
        else:
            file_path = output_dir / f"{name_prefix}{key}_{timestamp}.json"
            with open(file_path, "w") as f:
                json.dump(data[key], f)
        data[key + "_file"] = str(file_path)
        del data[key]


# TODO are there other cases we need to handle?
def unwrap_code(code: str) -> str:
    """Remove Markdown code fences from a code snippet, if present.

    Args:
        code: The code string, possibly wrapped in ``` or ```python fences.

    Returns:
        The unwrapped code without surrounding fences.
    """
    if code.startswith("```python\n"):
        return code[10:-3]
    elif code.startswith("```\n"):
        return code[3:-3]
    return code


def format_code(code: str) -> str:
    """Format Python code using Black.

    Args:
        code: The source code to format.

    Returns:
        The code formatted according to Black's style guidelines.
    """
    return black.format_str(code, mode=black.Mode())


def _fallback_remove_comments_regex(code: str) -> str:
    """Remove Python comments using a simple regex-based heuristic.

    This is less accurate than the tokenizer-based approach but works on broken code.
    It drops full-line comments and trailing `# ...` comments not preceded by a quote.

    Args:
        code: The source code string from which to remove comments.

    Returns:
        The code string with comments removed via regex heuristics.
    """
    # Remove full-line comments
    code = re.sub(r"(?m)^\s*#.*$", "", code)
    # Remove trailing comments (best-effort)
    code = re.sub(r"(?m)\s+#.*$", "", code)
    return code


# Modified from o3-mini-high suggestion – now robust to malformed code
def remove_comments_and_format(source_code: str) -> str:
    """Strip comments from source code and apply Black formatting if valid.

    Steps:
        1. Try precise, tokenizer-based comment removal, then run Black.
        2. If tokenization fails, fall back to regex-based removal without formatting.

    Args:
        source_code: The input code string to process.

    Returns:
        The de-commented (and possibly formatted) code string.
    """
    try:
        # Tokeniser needs bytes input
        source_bytes = source_code.encode("utf-8")
        tokens = tokenize.tokenize(io.BytesIO(source_bytes).readline)

        # Build a list of tokens, skipping comments
        filtered_tokens: list[tuple[int, str]] = []
        for tok in tokens:
            if tok.type == tokenize.COMMENT:
                continue
            filtered_tokens.append((tok.type, tok.string))

        no_comments = tokenize.untokenize(filtered_tokens).decode("utf-8")
        # Run *black* for consistent formatting
        return format_code(no_comments)
    except (tokenize.TokenError, IndentationError, SyntaxError):
        # Malformed code – fall back to regex method (no formatting)
        return _fallback_remove_comments_regex(source_code)


def apps_record_to_sample(record):
    """Convert a raw APPS dataset record into a Sample with metadata.

    Args:
        record: A dict from the APPS dataset containing keys like 'question',
                'solutions', and 'input_output'.

    Returns:
        A Sample instance with:
            - input: the problem prompt
            - target: a list of solution code snippets
            - id: the problem ID
            - metadata: dict containing difficulty, tests, URL, starter code, etc.
    """
    # Convert string representation of list back to actual list of strings
    input_output_str = record["input_output"]
    solutions_str = record["solutions"]
    try:
        solutions = json.loads(solutions_str)
    except Exception:
        solutions = []
    for i in range(len(solutions)):
        solutions[i] = unwrap_code(solutions[i])

    input_output = json.loads(input_output_str) if input_output_str.strip() else None
    solution_description = "a valid Python script that can be run as `python script.py`"
    if input_output is None:
        tests = []
        fn_name = None
    else:
        if "fn_name" in input_output:
            solution_description = (
                f"a valid Python function named `{input_output['fn_name']}`"
            )
            fn_name = input_output["fn_name"]
        else:
            fn_name = None
        assert "inputs" in input_output
        assert "outputs" in input_output
        assert len(input_output["inputs"]) == len(input_output["outputs"]), (
            f"Input and output have different lengths: "
            f"{len(input_output['inputs'])} != {len(input_output['outputs'])}"
        )
        tests = [
            Test(
                input=input_output["inputs"][i],
                expected_output=input_output["outputs"][i],
            )
            for i in range(len(input_output["inputs"]))
        ]

    return Sample(
        input=record["question"],
        target=solutions,
        id=record["problem_id"],
        metadata={
            "difficulty": record["difficulty"],
            "tests": tests,
            "solution_description": solution_description,
            "url": record["url"],
            "starter_code": record["starter_code"],
            "fn_name": fn_name,
        },
    )


def unit_tests(tests: List[Test]) -> str:
    """Render a list of Test objects into APPS XML-like unit test format.

    Args:
        tests: A list of Test instances.

    Returns:
        A string containing formatted <test> blocks for each Test.
    """
    return "\n".join(
        [
            f"<test>\n<input>\n{test.input}\n</input>\n<output>\n{test.expected_output}\n</output>\n</test>"
            for test in tests
        ]
    )


def prepare_apps_dataset(
    num_samples_per_difficulty: int | None = 10,
    difficulty: str | None = None,
    do_filter_fn_name: bool = True,
    do_filter_deterministic: bool = True,
    prepare_backdoor_tests: bool = False,
    min_num_backdoor_tests: int = 3,
    num_backdoor_tests: int = 5,
    backdoor_test_char_limit: int = 1000,
    split: str = "test",
    seed: int = 239,
) -> Dataset:
    """Load, filter, and sample the APPS dataset with optional backdoor test generation.

    Args:
        num_samples_per_difficulty: Number of samples to select per difficulty.
        difficulty: Specific difficulty to filter, or None for all.
        do_filter_fn_name: Exclude samples specifying a function name if True.
        do_filter_deterministic: Exclude nondeterministic problems if True.
        prepare_backdoor_tests: Generate and preserve small backdoor tests if True.
        min_num_backdoor_tests: Minimum number of backdoor tests required per sample.
        num_backdoor_tests: Number of small tests to preserve per sample.
        backdoor_test_char_limit: Max characters for inputs+outputs in backdoor tests.
        split: Which split to load ('train', 'validation', 'test').
        seed: Random seed for shuffling.

    Returns:
        A Dataset object containing the prepared APPS samples.
    """
    # Load APPS dataset from Huggingface
    dataset = hf_dataset(
        path="codeparrot/apps",
        split=split,
        sample_fields=apps_record_to_sample,
        trust=True,
    )

    # filter the dataset to only include examples with solutions
    dataset = dataset.filter(lambda x: len(x.target) > 0)
    # No need to keep the samples where there's no tests
    dataset = dataset.filter(lambda x: len(x.metadata["tests"]) > 0)

    if do_filter_fn_name:
        print(f"Before filtering fn_name: {len(dataset)}")
        dataset = dataset.filter(lambda x: x.metadata["fn_name"] is None)
        print(f"After filtering fn_name: {len(dataset)}")

    difficulties = (
        [difficulty]
        if difficulty is not None
        else list(set([x.metadata["difficulty"] for x in dataset]))
    )
    print(f"Difficulties: {difficulties}")

    with open(Path(__file__).parent / "data/apps_are_solutions_correct.json", "r") as f:
        deterministic_data = json.load(f)

    for sample in dataset:
        assert str(sample.id) in deterministic_data, (
            f"Problem id {sample.id} not in deterministic data"
        )
        sample.metadata["passing_solution_idx"] = deterministic_data[str(sample.id)][
            "passing_solution_idx"
        ]

    if do_filter_deterministic:
        print(f"Before filtering deterministic: {len(dataset)}")

        for difficulty in difficulties:
            num_samples = sum(
                1 for x in dataset if x.metadata["difficulty"] == difficulty
            )
            print(f"Difficulty: {difficulty}, total num samples: {num_samples}")

        assert split == "test", "We only have this for the test split for now"

        def is_deterministic(sample: Sample) -> bool:
            """Determine if a sample is deterministic based on prior evaluation data."""
            if str(sample.id) not in deterministic_data:
                return False
            data = deterministic_data[str(sample.id)]
            return data["num_failed"] == 0 and data["num_solutions_passed"] >= 3

        dataset = dataset.filter(is_deterministic)

        print(f"After filtering deterministic: {len(dataset)}")
        for difficulty in difficulties:
            num_samples = sum(
                1 for x in dataset if x.metadata["difficulty"] == difficulty
            )
            print(f"Difficulty: {difficulty}, total num samples: {num_samples}")

    if prepare_backdoor_tests:
        for sample in dataset:
            assert sample.metadata is not None

            suitable_test_ids = [
                i
                for i in range(len(sample.metadata["tests"]))
                if len(sample.metadata["tests"][i].input)
                + len(sample.metadata["tests"][i].expected_output)
                < backdoor_test_char_limit
            ]

            # These are the tests that will be shown to the model.
            preserved_test_ids = suitable_test_ids[:num_backdoor_tests]
            backdoor_tests = [sample.metadata["tests"][i] for i in preserved_test_ids]
            sample.metadata["preserved_test_ids"] = preserved_test_ids
            assert sample.metadata["passing_solution_idx"] is not None
            sample.metadata["correct_solution"] = sample.target[
                sample.metadata["passing_solution_idx"]
            ]
            sample.metadata["unit_tests"] = unit_tests(backdoor_tests)

        # Filter out samples with too few backdoor tests
        dataset = MemoryDataset(
            [
                sample
                for sample in dataset
                if len(sample.metadata["preserved_test_ids"]) >= min_num_backdoor_tests
            ]
        )
        print(f"After filtering backdoor suitable: {len(dataset)}")
        for difficulty in difficulties:
            num_samples = sum(
                1 for x in dataset if x.metadata["difficulty"] == difficulty
            )
            print(f"Difficulty: {difficulty}, total num samples: {num_samples}")

    dataset.shuffle(seed=seed)

    # Select num_samples per difficulty
    filtered_datasets = []
    for difficulty in difficulties:
        difficulty_dataset = dataset.filter(
            lambda x: x.metadata["difficulty"] == difficulty
        )
        print(f"Difficulty: {difficulty}, total num samples: {len(difficulty_dataset)}")
        sampled_data = (
            difficulty_dataset[:num_samples_per_difficulty]
            if num_samples_per_difficulty is not None
            else difficulty_dataset
        )
        filtered_datasets.append(sampled_data)
    dataset = MemoryDataset(
        [sample for dataset in filtered_datasets for sample in dataset]
    )

    return dataset


async def are_tests_correct(
    tests: List[Test],
    code: str,
    fn_name: str | None,
    timeout_seconds: int = 4,
    memory_limit_mb: int = 5 * 1024,
) -> Tuple[List[bool] | None, Dict[str, Any]]:
    """Run provided tests against code in a sandboxed APPS evaluation environment.

    Args:
        tests: List of Test instances with inputs and expected outputs.
        code: The source code to evaluate, may include Markdown fences.
        fn_name: Name of the function under test, or None for scripts.
        timeout_seconds: Timeout for each test in seconds.
        memory_limit_mb: Memory limit for the sandbox in megabytes.

    Returns:
        A tuple (results, metadata):
            - results: List of booleans or error codes (-1 runtime, -2 compile),
                       or None if execution/parsing failed.
            - metadata: Dict with stdout, stderr, returncode, status,
                        wrapped startup code, test_outputs, and error info.
    """
    code = unwrap_code(code)

    input_output = {
        "inputs": [test.input for test in tests],
        "outputs": [test.expected_output for test in tests],
    }
    if fn_name is not None:
        input_output["fn_name"] = fn_name
    problem_raw = {"input_output": json.dumps(input_output)}

    # Create temporary files for both problem JSON and code
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
        json.dump(problem_raw, f)
        problem_file = f.name

    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
        f.write(code)
        code_file = f.name

    error = ""

    assert (Path(__file__).parent / "thirdparty/apps/eval").exists(), (
        "The APPS eval code is not present! Probably this is because you didn't load "
        "the submodules. If that is the case, please run `git submodule update --init --recursive`"
    )
    try:
        # Modified wrapper to read both from files
        wrapper = f"""import sys, json
sys.path.append("{str((Path(__file__).parent / "thirdparty/apps/eval").absolute())}")
import testing_util

with open("{problem_file}", 'r') as f:
    problem = json.load(f)
with open("{code_file}", 'r') as f:
    test = f.read()

problem["input_output"] = json.loads(problem["input_output"])
testing_util.timeout = {timeout_seconds}
result, outputs = testing_util.run_test(problem=problem, test=test, debug=True, return_outputs=True)

fixed = []
for e in result:
    if type(e).__name__ == 'ndarray':
        e = e.item(0)
    if type(e).__name__ == 'bool_':
        e = bool(e)
    fixed.append(e)
result = fixed

print("RESULT_JSON_START", file=sys.stderr)
print(json.dumps(result), file=sys.stderr)
print("RESULT_JSON_END", file=sys.stderr)

print("OUTPUTS_JSON_START", file=sys.stderr)
print(json.dumps(outputs), file=sys.stderr)
print("OUTPUTS_JSON_END", file=sys.stderr)"""

        outputs = await run_sandboxed(
            wrapper,
            args=[],
            input="",
            # The timeout here is increased by 1 second to account for the wrapper
            # run_tests checks for the correct timeout for each test internally
            timeout_seconds=(timeout_seconds + 1) * len(tests),
            memory_limit_mb=memory_limit_mb,
            use_seccomp=False,
        )

        # Extract result from stderr between markers
        try:
            result_json = outputs.stderr.split("RESULT_JSON_START\n")[1].split(
                "\nRESULT_JSON_END"
            )[0]
            result = json.loads(result_json)
        except (IndexError, json.JSONDecodeError) as e:
            result = None
            error = "Error parsing result: " + str(e)

        metadata = {
            "stdout": outputs.stdout,
            "stderr": outputs.stderr,
            "returncode": outputs.return_code,
            "status": outputs.status,
            "wrapped_startup_code": outputs.wrapped_code,
            "error": error,
        }

        try:
            outputs_json = outputs.stderr.split("OUTPUTS_JSON_START\n")[1].split(
                "\nOUTPUTS_JSON_END"
            )[0]
            test_outputs = json.loads(outputs_json)
        except (IndexError, json.JSONDecodeError) as e:
            test_outputs = None
            error = "Error parsing outputs: " + str(e)
            metadata["error"] = error

        metadata["test_outputs"] = test_outputs

        if result is not None and len(result) == 1:
            if result[0] == -2:
                metadata["error"] = "Compilation error"
            elif result[0] == -1:
                metadata["error"] = "Runtime error"
        else:
            if result is not None and len(result) != len(tests):
                metadata["error"] += (
                    f"\nResult length {len(result)} != test length {len(tests)}"
                )
                result = None
        return result, metadata
    except Exception as e:
        error = f"Error running tests:\n{traceback.format_exception(e)}"
        return None, {"error": error}
    finally:
        # Clean up both temporary files
        os.unlink(problem_file)
        os.unlink(code_file)


# A convenience wrapper function to run the code the way the APPS eval does
# Under the hood, it runs are_tests_correct but discards the test results
async def run_apps_code(
    code: str,
    inputs: List[str],
    fn_name: str | None,
    timeout_seconds: int = 4,
    memory_limit_mb: int = 5 * 1024,
) -> Dict[str, Any]:
    """Execute code in APPS evaluation wrapper, returning execution metadata.

    This convenience wrapper invokes `are_tests_correct` with dummy expected
    outputs, capturing only compilation/runtime status codes.

    Args:
        code: The source code to execute.
        inputs: List of input strings for placeholder tests.
        fn_name: Name of the function under test, or None for full scripts.
        timeout_seconds: Timeout per test in seconds.
        memory_limit_mb: Memory limit for sandbox in megabytes.

    Returns:
        A dictionary containing execution metadata and a 'run_apps_status'
        list of AppsExecutionStatus values for each input.
    """
    mock_tests = [Test(input=input, expected_output="") for input in inputs]
    result, metadata = await are_tests_correct(
        tests=mock_tests,
        code=code,
        fn_name=fn_name,
        timeout_seconds=timeout_seconds,
        memory_limit_mb=memory_limit_mb,
    )
    if result is None:
        metadata["run_apps_status"] = [AppsExecutionStatus.RUNTIME_ERROR]
    elif result[0] == -2:
        metadata["run_apps_status"] = [AppsExecutionStatus.COMPILATION_ERROR]
    else:
        metadata["run_apps_status"] = [
            AppsExecutionStatus.RUNTIME_ERROR
            if x == -1
            else AppsExecutionStatus.SUCCESS
            for x in result
        ]
    return metadata