assemble_dataset.py
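"""Assemble token/label dataset files by joining label offsets (JSONL) with
their source queries from the ESCI examples parquet file.

Each output file contains one "token<TAB>label" line per query token, with a
blank line between examples (CoNLL-style).
"""
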
import json
import os
from argparse import ArgumentParser
from typing import Dict, Optional

import pandas as pd

EXAMPLE_ID = "example_id"
QUERY = "query"
LABELS = "labels"


def assemble_file(filename: str, offsets_dir: str, outdir: str, id_to_query_map: Dict[int, str]):
    offset_path = os.path.join(offsets_dir, f"{filename}.jsonl")
    outpath = os.path.join(outdir, f"{filename}.txt")
    with open(outpath, 'w', encoding='utf8') as outfile, open(offset_path, 'r', encoding='utf8') as infile:
        for line in infile:
            fields = json.loads(line.strip())
            idx = int(fields[EXAMPLE_ID])
            labels = fields[LABELS]
            query = id_to_query_map.get(idx)
            if query is None:
                raise ValueError(f"No query found for example id: {idx}")
            # Labels are aligned to whitespace tokens, so the query must split
            # into exactly as many tokens as there are labels.
            tokens = query.strip().split()
            assert len(tokens) == len(labels), f"Mismatch in queries and tokens\n{idx}\n{tokens}\n{labels}"
            for token, label in zip(tokens, labels):
                print(f"{token}\t{label}", file=outfile)
            print(file=outfile)  # blank line between examples
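
# Illustrative input and output for assemble_file (the label values below are
# hypothetical; the real tag set comes from the offsets files):
#   offsets line: {"example_id": 42, "labels": ["B-brand", "O", "O"]}
#   query for 42: "nike running shoes"
#   written out:  "nike\tB-brand", "running\tO", "shoes\tO", then a blank line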


def assemble_dataset():
    parser = ArgumentParser()
    parser.add_argument("esci_path")
    parser.add_argument("offsets_dir")
    parser.add_argument("out_dir")
    parser.add_argument("--individual-annotators", action="store_true")
    args = parser.parse_args()
    id_to_query_map = read_parquet_to_id_map(args.esci_path)
    if args.individual_annotators:
        # Build one file per annotator instead of the train/dev/test splits.
        for annotator in ('annotator-1', 'annotator-2', 'annotator-3'):
            print(f"Building {annotator}...")
            assemble_file(annotator, args.offsets_dir, args.out_dir, id_to_query_map)
    else:
        for split in ('train', 'dev', 'test'):
            print(f"Building {split} dataset...")
            assemble_file(split, args.offsets_dir, args.out_dir, id_to_query_map)


def read_parquet_to_id_map(esci_path: str, product_locale: Optional[str] = None):
    """Build an example_id -> query map, optionally filtered to one product_locale."""
    print("Reading ESCI dataset and building id to query mapping...")
    df_examples = pd.read_parquet(esci_path)
    id_to_query_map = {}
    for i in df_examples.index:
        example_id = df_examples[EXAMPLE_ID][i]
        query = df_examples[QUERY][i]
        if product_locale and df_examples["product_locale"][i] != product_locale:
            continue  # keep only the requested locale, e.g. "us", "jp", "es"
        id_to_query_map[example_id] = query
    return id_to_query_map


if __name__ == "__main__":
    assemble_dataset()
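
# Example invocations (paths are hypothetical):
#   python assemble_dataset.py shopping_queries_dataset_examples.parquet offsets/ out/
#   python assemble_dataset.py shopping_queries_dataset_examples.parquet offsets/ out/ --individual-annotators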