-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_loader.py
More file actions
38 lines (31 loc) · 1.6 KB
/
data_loader.py
File metadata and controls
38 lines (31 loc) · 1.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from datasets import load_dataset, concatenate_datasets
def get_combined_verilog_data():
    """Load and merge two Verilog instruction-tuning datasets from the HF Hub.

    Pulls the train splits of MG-Verilog (GaTech-EIC/MG-Verilog) and RTL-Coder
    (official repo, with a small community mirror as fallback), normalizes both
    to a common schema, concatenates them, and returns the result shuffled with
    a fixed seed for reproducibility.

    Returns:
        datasets.Dataset: rows with columns "instruction" (str), "output"
        (str, the Verilog code), and "source" ("mg_verilog" or "rtl_coder").

    Raises:
        Whatever ``load_dataset`` raises if MG-Verilog cannot be fetched, or
        if both RTL-Coder sources fail.
    """
    print("--- Loading MG-Verilog ---")
    mg_dataset = load_dataset("GaTech-EIC/MG-Verilog", split="train")
    # DEBUG: See what columns actually exist
    print(f"Available columns in MG-Verilog: {mg_dataset.column_names}")
    # The MG-Verilog column names have changed between releases; use .get()
    # with fallbacks so a rename degrades gracefully instead of crashing.
    mg_cleaned = mg_dataset.map(lambda x: {
        "instruction": x.get("block_summary") or x.get("detailed_global_summary") or "No description",
        "output": x.get("verilog_code") or x.get("code") or "",
        "source": "mg_verilog"
    }, remove_columns=mg_dataset.column_names)
    print("--- Loading RTLCoder ---")
    # Prefer the official dataset; fall back to the small community mirror only
    # on a real failure (gated repo, rename, network error, ...).
    try:
        rtl_dataset = load_dataset("hkust-zhiyao/RTL-Coder", split="train")
    except Exception as exc:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still
        # propagate, and the reason for the fallback is no longer silent.
        print(f"Official RTL-Coder unavailable ({exc!r}); using community mirror")
        rtl_dataset = load_dataset("Nellyw888/RTL-Coder_small", split="train")
    # Normalize RTL-Coder to the same (instruction, output, source) schema.
    rtl_cleaned = rtl_dataset.map(lambda x: {
        "instruction": x.get("instruction") or "No instruction",
        "output": x.get("output") or x.get("verilog_code") or "",
        "source": "rtl_coder"
    }, remove_columns=rtl_dataset.column_names)
    print("--- Merging Datasets ---")
    combined_dataset = concatenate_datasets([mg_cleaned, rtl_cleaned])
    # Fixed seed keeps the shuffle deterministic across runs.
    return combined_dataset.shuffle(seed=42)
if __name__ == "__main__":
    # Smoke test: build the merged dataset and show its size plus a sample.
    combined = get_combined_verilog_data()
    size = len(combined)
    preview = combined[0]["instruction"][:100]
    print(f"Success! Combined Dataset Size: {size}")
    print(f"First Sample Instruction: {preview}...")