-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathextract_rust_test_data.py
More file actions
151 lines (121 loc) · 5.58 KB
/
extract_rust_test_data.py
File metadata and controls
151 lines (121 loc) · 5.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import re
import sys
import os
from typing import Tuple, List
from compare_mathml_in_csv import setMathCATPreferences, setMathMLForMathCAT
# Emit UTF-8 on stdout so printed text containing non-ASCII characters (for
# example braille or MathML shown in warnings and debug prints) does not fail
# on consoles whose default encoding is narrower.
# `reconfigure` only exists on io.TextIOWrapper; stdout may have been replaced
# by another stream type (or be None), so only call it when it is available.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding='utf-8')
# Matches either a Rust string literal (captured in group 1 and kept) or a
# `//` line comment (dropped). Strings are matched first so that a `//`
# inside a string literal is not mistaken for the start of a comment.
_RUST_COMMENT_PATTERN = re.compile(
    r'('                         # Start Group 1 (stuff to keep)
    r'r(#*)"[\s\S]*?"\2'         # Raw strings: r#""# or r""
    r'|'                         # OR
    r'"(?:\\[\s\S]|[^"\\])*"'    # Standard strings, including backslash-newline
    r')'                         # End Group 1
    r'|'                         # OR (stuff to delete)
    r'//.*',                     # The comment
    re.MULTILINE
)

# Matches `let expr = <string literal>;` immediately followed by a
# `test_braille(...)` / `test_braille_prefs(...)` call whose last two arguments
# are `expr` and the expected braille string.
_EXPR_BRAILLE_PATTERN = re.compile(
    r'let\s+expr\s*=\s*'
    r'(?:'
    # Raw string: the content stops at the first `"` followed by the same
    # number of `#` that opened the literal, so it cannot run into later code.
    r'r(?P<hashes>#*)"(?P<raw>(?:(?!"(?P=hashes)).)*)"(?P=hashes)'
    r'|'
    # Standard string: escape-aware, so `\"` does not end the literal.
    r'"(?P<std>(?:\\.|[^"\\])*)"'
    r')'
    r'\s*;'
    r'\s*test_braille(?:_prefs)?\s*\(\s*.*?\s*expr\s*,\s*"(?P<braille>[^"]*)"\s*\)\s*;',
    re.MULTILINE | re.DOTALL
)

# One backslash escape inside a standard (non-raw) Rust string literal.
_RUST_ESCAPE_PATTERN = re.compile(
    r'\\(?:u\{(?P<uni>[0-9A-Fa-f_]{1,8})\}'
    r'|x(?P<hex>[0-9A-Fa-f]{2})'
    r'|(?P<cont>\r?\n\s*)'
    r'|(?P<ch>.))',
    re.DOTALL
)

_RUST_SIMPLE_ESCAPES = {
    'n': '\n',
    'r': '\r',
    't': '\t',
    '0': '\0',
    '\\': '\\',
    '"': '"',
    "'": "'",
}


def _unescape_rust_string(text: str) -> str:
    """Decode the backslash escapes of a standard (non-raw) Rust string literal body."""
    def _decode(m):
        if m.group('uni') is not None:
            try:
                return chr(int(m.group('uni').replace('_', ''), 16))
            except (ValueError, OverflowError):
                return m.group(0)  # not a valid code point: leave untouched
        if m.group('hex') is not None:
            return chr(int(m.group('hex'), 16))
        if m.group('cont') is not None:
            # Backslash-newline: the line break and following whitespace are skipped
            return ""
        return _RUST_SIMPLE_ESCAPES.get(m.group('ch'), m.group(0))

    return _RUST_ESCAPE_PATTERN.sub(_decode, text)


def extract_from_file(filename: str) -> Tuple[List[str], List[str], List[str]]:
    """
    Extract expr (MathML) strings, braille strings, and canonical MathML from a Rust test file.

    Only a `let expr = ...;` statement that is directly followed by a
    `test_braille(...)` or `test_braille_prefs(...)` call is extracted, so each
    expr is paired with its own expected braille. `//` comments are removed
    before matching. Escapes in standard (non-raw) string literals are decoded,
    and whitespace runs in the expr are collapsed to single spaces.

    Args:
        filename: Path to the Rust test file

    Returns:
        tuple: (expr_list, braille_list, canonical_list) - Three lists of equal
        length. A canonical entry is "" when the expr does not start with
        "<math" and end with "</math>", or when canonicalization raised.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()

    # Strip comments from content (string literals are kept verbatim)
    content = _RUST_COMMENT_PATTERN.sub(lambda m: m.group(1) or "", content)

    expr_list: List[str] = []
    braille_list: List[str] = []
    canonical_list: List[str] = []

    for pair in _EXPR_BRAILLE_PATTERN.finditer(content):
        raw_content = pair.group('raw')
        if raw_content is not None:
            expr_content = raw_content
        else:
            # Standard string: turn \" and friends back into the real characters
            expr_content = _unescape_rust_string(pair.group('std'))

        # Normalize whitespace so every expr occupies a single output line
        expr_content = " ".join(expr_content.split())
        expr_list.append(expr_content)
        braille_list.append(pair.group('braille'))

        # Generate canonical MathML
        if expr_content.startswith("<math") and expr_content.endswith("</math>"):
            try:
                canonical = setMathMLForMathCAT(expr_content)
                canonical_list.append(" ".join(canonical.split()))
            except Exception as e:
                # Best effort: report the failure and keep going
                print(f"Warning: Canonicalization error for test in {filename}: {e}")
                print(f"MathML={expr_content[:100]}...")
                canonical_list.append("")  # Add empty string to keep lists aligned
        else:
            canonical_list.append("")  # Add empty string if not valid MathML

    return expr_list, braille_list, canonical_list
def extract_from_files(file_list: List[str], expr_output: str, braille_output: str, canonical_output: str) -> None:
    """
    Extract expr, braille strings, and canonical MathML from multiple Rust test files and write to output files.

    Each output file is opened once in write mode, so any previous content is
    replaced and the file exists afterwards even when no input file could be
    processed. Every extracted string is written on its own line, and line N
    of the three files belongs to the same test. Inputs that are not existing
    files are reported and skipped. A summary is printed at the end.

    Args:
        file_list: List of paths to Rust test files
        expr_output: Path to output file for expr strings
        braille_output: Path to output file for braille strings
        canonical_output: Path to output file for canonical MathML
    """
    # Initialize MathCAT; a failure here is reported but does not stop extraction
    try:
        setMathCATPreferences({})
    except Exception as e:
        print(f"Warning: Can't set MathCAT preferences: {e}")

    total_expr_count = 0
    total_braille_count = 0
    total_canonical_count = 0

    # Opening with 'w' truncates stale output and guarantees the files are created
    with open(expr_output, 'w', encoding='utf-8') as expr_file, \
            open(braille_output, 'w', encoding='utf-8') as braille_file, \
            open(canonical_output, 'w', encoding='utf-8') as canonical_file:
        for filename in file_list:
            if not os.path.isfile(filename):
                print(f"Skipping: '{filename}' (File not found)")
                continue

            expr_list, braille_list, canonical_list = extract_from_file(filename)

            expr_file.writelines(f"{expr}\n" for expr in expr_list)
            braille_file.writelines(f"{braille}\n" for braille in braille_list)
            canonical_file.writelines(f"{canonical}\n" for canonical in canonical_list)

            total_expr_count += len(expr_list)
            total_braille_count += len(braille_list)
            total_canonical_count += len(canonical_list)
            print(f"Processed {filename}: Found {len(expr_list)} pairs.")

    print(f"\nTotal extracted: {total_expr_count} expr strings, "
          f"{total_braille_count} braille strings, "
          f"and {total_canonical_count} canonical MathML strings")
    print(f"Written to {expr_output}, {braille_output}, and {canonical_output}")
if __name__ == "__main__":
    # Command line: three output paths followed by one or more Rust test files.
    cli_args = sys.argv[1:]
    if len(cli_args) < 4:
        print("Usage: python extract_rust_test_data.py <expr_output_file> <braille_output_file> <canonical_output_file> <input_file1> [input_file2] ...")
        sys.exit(1)

    # The first three arguments name the outputs; everything after is an input.
    expr_output, braille_output, canonical_output, *input_files = cli_args
    extract_from_files(input_files, expr_output, braille_output, canonical_output)