-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataset2.py
More file actions
151 lines (125 loc) · 4.4 KB
/
dataset2.py
File metadata and controls
151 lines (125 loc) · 4.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import random
from db import firestore_db, firestore
def retrieve_text(text_id):
    """
    Fetch one text record from the "texts2" Firestore collection.

    Args:
    - text_id (str): ID of the text document to fetch.

    Returns:
    - (dict): The document's fields as returned by the snapshot's
      ``to_dict()`` (Firestore documents this as ``None`` when the
      document does not exist).
    """
    snapshot = firestore_db.collection("texts2").document(text_id).get()
    return snapshot.to_dict()
def count_passages():
    """
    Count the documents in the "passages" Firestore collection.

    Returns:
    - The ``value`` of the first aggregation result produced by the
      collection's count query.
    """
    aggregation = firestore_db.collection("passages").count()
    results = aggregation.get()
    # The count sits in the first entry of the first result row.
    return results[0][0].value
def id_with_zeros(number):
    """
    Render ``number`` as a string left-padded with zeros to 10 characters.

    Args:
    - number: Value to convert; it is passed through ``str()`` first.

    Returns:
    - (str): The zero-padded string. Values whose string form is already
      10 characters or longer are returned unpadded.
    """
    id_width = 10
    unpadded = str(number)
    return unpadded.zfill(id_width)
def retrieve_passage(passage_id, with_original_text_split=False):
    """
    Retrieve a specific passage record from Firestore.

    Args:
    - passage_id (str): ID of the passage document in the "passages"
      collection.
    - with_original_text_split (bool): Unless this is exactly ``False``, the
      passage is enriched with data from its source text, which is looked up
      through the passage's "text_id" field.

    Returns:
    - passage (dict | None): The passage's fields, or ``None`` when no
      passage document with that ID exists. When the original text is
      requested, the dict additionally carries "original_text_split" and
      "original_text_title".
    """
    passage_ref = firestore_db.collection("passages").document(passage_id)
    passage = passage_ref.get().to_dict()

    # A missing document yields None from to_dict(). Return it in both modes
    # instead of failing on passage["text_id"] below, so callers get the same
    # "not found" signal whether or not they asked for the original text.
    if passage is None or with_original_text_split is False:
        return passage

    # Enrich the passage with the split and title of the text it came from.
    original_text = retrieve_text(passage["text_id"])
    original_text_split = original_text["text_split"]
    original_text_title = original_text["title"]
    passage["original_text_split"] = original_text_split
    passage["original_text_title"] = original_text_title
    return passage
def retrieve_random_passage():
    """
    Load one random passage that is still eligible for dataset 2.

    A passage is eligible when it has no "is_accepted_dataset2_datapoint"
    field yet and its "url" does not contain "https://transcripts.cnn.com".
    Up to 100 random IDs are tried.

    Returns:
    - passage (dict | None): The eligible passage, including
      "original_text_split" and "original_text_title", or ``None`` when the
      collection is empty or no eligible passage was found within the
      attempt limit.
    """
    passage_count = count_passages()  # Counted once, not per attempt
    if passage_count < 1:
        # random.randint(1, 0) would raise ValueError on an empty collection.
        return None

    max_attempts = 100  # Upper bound so the search always terminates
    for _ in range(max_attempts):
        passage_id = id_with_zeros(random.randint(1, passage_count))

        # Fetch the bare passage first: rejected candidates then cost a single
        # read, and a missing document comes back as None instead of crashing
        # while the original text is being attached.
        candidate = retrieve_passage(passage_id)
        if candidate is None:
            continue
        if "is_accepted_dataset2_datapoint" in candidate:
            continue
        if "https://transcripts.cnn.com" in candidate["url"]:
            continue

        # Only the accepted candidate pays for the original-text lookup.
        return retrieve_passage(passage_id, with_original_text_split=True)
    return None
def update_passage(slots):
    """
    Update a passage record in Firestore.

    Every entry of ``slots`` is written to the passage document identified
    by ``slots["id"]``. Entries whose value is an empty string or an empty
    list are deleted from the document instead of being stored.

    Args:
    - slots (dict): Dictionary containing the updated passage record
      attributes; must contain "id". The dictionary is not modified.

    Returns:
    - True (bool): Returned once the update call has completed.
    """
    passage_ref = firestore_db.collection(u'passages').document(slots["id"])

    # Build a separate payload so the caller's dict never ends up holding the
    # DELETE_FIELD sentinel in place of its own empty values.
    payload = {
        key: firestore.DELETE_FIELD if value == "" or value == [] else value
        for key, value in slots.items()
    }
    passage_ref.update(payload)
    return True
def retrieve_dataset_2(_annotator_id=None):
    """
    Retrieve dataset 2 entries from the "passages" collection.

    Args:
    - _annotator_id: Number of the annotator. When given, only passages
      whose "annotator" equals "IE-<_annotator_id>" are considered.

    Returns:
    - dataset2 (list): Without an annotator ID, one entry per passage whose
      "is_accepted_dataset2_datapoint" is True, holding "id", "text" (a list
      of {"role", "sentence"} dicts) and "metadata" ("text_id", "source",
      "annotator" and, when present and non-empty, "publication-date").
      With an annotator ID, one {"id", "is_accepted"} entry per matching
      passage, ordered by "is_accepted_dataset2_datapoint".
    """
    passages = firestore_db.collection('passages')

    if _annotator_id is not None:
        # Per-annotator overview: just the ID and acceptance flag.
        query = (passages.where("annotator", "==", f"IE-{_annotator_id}")
                 .order_by("is_accepted_dataset2_datapoint"))
        records = [snapshot.to_dict() for snapshot in query.stream()]
        return [
            {
                "id": record["id"],
                "is_accepted": record["is_accepted_dataset2_datapoint"]
            }
            for record in records
        ]

    # Full export: every accepted datapoint with its sentences and metadata.
    query = passages.where("is_accepted_dataset2_datapoint", "==", True)
    records = [snapshot.to_dict() for snapshot in query.stream()]

    dataset2 = []
    for record in records:
        sentences = [
            {"role": sentence["role"], "sentence": sentence["sentence"]}
            for sentence in record["dataset2_datapoint"]
        ]
        entry = {
            "id": record["id"],
            "text": sentences,
            "metadata": {
                "text_id": record["text_id"],
                "source": record["url"],
                "annotator": record["annotator"]
            },
        }
        publication_date = record.get("publication-date")
        if publication_date:
            entry["metadata"]["publication-date"] = publication_date
        dataset2.append(entry)
    return dataset2