-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path preprocess.py
More file actions
41 lines (32 loc) · 1.43 KB
/
preprocess.py
File metadata and controls
41 lines (32 loc) · 1.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import json
from data_loader import get_combined_verilog_data
def format_for_prolog_agent(example):
    """Transform a raw Verilog dataset entry into a training prompt for Agent 1.

    Args:
        example: A dataset record; must contain an 'instruction' key holding
            the English hardware description.

    Returns:
        A dict with a single 'text' key: the prompt formatted as
        "### System: ... / ### Instruction: ... / ### Response: [PROLOG_START]".
        Note: in a real training set the manual Prolog target would follow
        the [PROLOG_START] marker.
    """
    instruction = example['instruction']
    # SYSTEM PROMPT: tells the LLM how to act.
    system_message = (
        "You are a Hardware Verification Agent. "
        "Translate the following hardware description into formal Prolog constraints."
    )
    # FORMAT the LLM will learn: User English text -> Assistant Prolog code.
    formatted_text = f"### System: {system_message}\n"
    formatted_text += f"### Instruction: {instruction}\n"
    # Plain string here: the original used an f-string with no placeholders (Ruff F541).
    formatted_text += "### Response: [PROLOG_START]\n"
    return {"text": formatted_text}
def prepare_training_file(output_file="training_data.jsonl"):
    """Build a JSONL fine-tuning file from the combined Verilog dataset.

    Loads the dataset via get_combined_verilog_data(), maps each entry
    through format_for_prolog_agent, and writes one JSON object per line
    (the standard format for Hugging Face fine-tuning).

    Args:
        output_file: Path of the JSONL file to write.
            Defaults to "training_data.jsonl".
    """
    print("Loading datasets for preprocessing...")
    dataset = get_combined_verilog_data()
    print(f"Processing {len(dataset)} entries...")
    processed_data = dataset.map(format_for_prolog_agent)
    # Explicit UTF-8: the original relied on the platform default encoding
    # (e.g. cp1252 on Windows), which is not guaranteed to match what JSONL
    # consumers expect.
    with open(output_file, "w", encoding="utf-8") as f:
        for entry in processed_data:
            f.write(json.dumps(entry) + "\n")
    print(f"✅ Preprocessing complete. Data saved to {output_file}")
if __name__ == "__main__":
prepare_training_file()