-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_loader.py
More file actions
38 lines (31 loc) · 1.6 KB
/
data_loader.py
File metadata and controls
38 lines (31 loc) · 1.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from datasets import load_dataset, concatenate_datasets
def get_combined_verilog_data():
    """Load and merge two Verilog instruction-tuning datasets from the HF Hub.

    Pulls the train splits of MG-Verilog (GaTech-EIC/MG-Verilog) and RTL-Coder
    (official repo, with a small community mirror as fallback), normalizes both
    to a common schema, concatenates them, and returns the result shuffled with
    a fixed seed for reproducibility.

    Returns:
        datasets.Dataset: rows with columns "instruction" (str), "output"
        (str, the Verilog code), and "source" ("mg_verilog" or "rtl_coder").

    Raises:
        Whatever ``load_dataset`` raises if MG-Verilog cannot be fetched, or
        if both RTL-Coder sources fail.
    """
    print("--- Loading MG-Verilog ---")
    mg_dataset = load_dataset("GaTech-EIC/MG-Verilog", split="train")
    # DEBUG: See what columns actually exist
    print(f"Available columns in MG-Verilog: {mg_dataset.column_names}")
    # The MG-Verilog column names have changed between releases; use .get()
    # with fallbacks so a rename degrades gracefully instead of crashing.
    mg_cleaned = mg_dataset.map(lambda x: {
        "instruction": x.get("block_summary") or x.get("detailed_global_summary") or "No description",
        "output": x.get("verilog_code") or x.get("code") or "",
        "source": "mg_verilog"
    }, remove_columns=mg_dataset.column_names)
    print("--- Loading RTLCoder ---")
    # Prefer the official dataset; fall back to the small community mirror only
    # on a real failure (gated repo, rename, network error, ...).
    try:
        rtl_dataset = load_dataset("hkust-zhiyao/RTL-Coder", split="train")
    except Exception as exc:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still
        # propagate, and the reason for the fallback is no longer silent.
        print(f"Official RTL-Coder unavailable ({exc!r}); using community mirror")
        rtl_dataset = load_dataset("Nellyw888/RTL-Coder_small", split="train")
    # Normalize RTL-Coder to the same (instruction, output, source) schema.
    rtl_cleaned = rtl_dataset.map(lambda x: {
        "instruction": x.get("instruction") or "No instruction",
        "output": x.get("output") or x.get("verilog_code") or "",
        "source": "rtl_coder"
    }, remove_columns=rtl_dataset.column_names)
    print("--- Merging Datasets ---")
    combined_dataset = concatenate_datasets([mg_cleaned, rtl_cleaned])
    # Fixed seed keeps the shuffle deterministic across runs.
    return combined_dataset.shuffle(seed=42)
if __name__ == "__main__":
    # Smoke test: build the merged dataset and show its size plus a sample.
    combined = get_combined_verilog_data()
    size = len(combined)
    preview = combined[0]["instruction"][:100]
    print(f"Success! Combined Dataset Size: {size}")
    print(f"First Sample Instruction: {preview}...")