annotationNLP/dataset2.py at main · pacoreyes/annotationNLP · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import random
from db import firestore_db, firestore


def retrieve_text(text_id):
  """
  Fetch a single text record from the "texts2" Firestore collection.

  Args:
  - text_id (str): Document ID of the text record to fetch.

  Returns:
  - dict: The fetched document converted with the snapshot's to_dict().
  """
  snapshot = firestore_db.collection("texts2").document(text_id).get()
  return snapshot.to_dict()


def count_passages():
  """
  Count the documents in the "passages" Firestore collection.

  Returns:
  - The value of the first entry of the first result returned by the
    collection's count aggregation query.
  """
  aggregation_results = firestore_db.collection("passages").count().get()
  first_result = aggregation_results[0]
  return first_result[0].value


def id_with_zeros(number):
  """
  Render a number as a string left-padded with zeros to 10 characters.

  Args:
  - number: Value to render; it is converted with str() before padding.

  Returns:
  - str: The zero-padded text; values already 10 characters or longer are
    returned unpadded.
  """
  as_text = str(number)
  return as_text.zfill(10)


def retrieve_passage(passage_id, with_original_text_split=False):
  """
  Retrieve a specific passage record from Firestore.

  Args:
  - passage_id (str): ID of the passage record to retrieve.
  - with_original_text_split (bool): When not False, the text record
    referenced by the passage's "text_id" is also loaded and its
    "text_split" and "title" are attached to the passage as
    "original_text_split" and "original_text_title".

  Returns:
  - passage (dict | None): Dictionary representing the retrieved passage
    record, or None when the snapshot holds no data for that ID.
  """
  passage_ref = firestore_db.collection("passages").document(passage_id)
  passage = passage_ref.get().to_dict()

  # A missing document yields None; return it as-is instead of failing on
  # passage["text_id"] below, so both modes report "not found" the same way.
  if passage is None or with_original_text_split is False:
    return passage

  # Attach the split and the title of the text this passage was taken from
  original_text = retrieve_text(passage["text_id"])
  passage["original_text_split"] = original_text["text_split"]
  passage["original_text_title"] = original_text["title"]
  return passage


def retrieve_random_passage():
  """
  Load one random passage that is still eligible for dataset 2.

  Random IDs between 1 and the current passage count are drawn (zero-padded
  with id_with_zeros) for a limited number of attempts. A passage is eligible
  when it has no "is_accepted_dataset2_datapoint" field and its "url" does
  not contain "https://transcripts.cnn.com".

  Returns:
  - passage (dict | None): The first eligible passage found, loaded with its
    original text split and title, or None when the collection is empty or
    no eligible passage was drawn within the attempt limit.
  """
  passage_count = count_passages()  # Call this function only once
  if passage_count < 1:
    # random.randint(1, 0) would raise ValueError on an empty collection
    return None

  max_attempts = 100  # Upper bound on random draws before giving up
  for _ in range(max_attempts):
    passage_id = id_with_zeros(random.randint(1, passage_count))
    passage = retrieve_passage(passage_id, with_original_text_split=True)
    # Skip IDs for which no passage data came back
    if passage is None:
      continue
    # Check for the desired condition and return the passage if met
    if "is_accepted_dataset2_datapoint" not in passage and "https://transcripts.cnn.com" not in passage["url"]:
      return passage
  return None


def update_passage(slots):
  """
  Update a passage record in Firestore.

  Args:
  - slots (dict): Dictionary containing the updated passage record
    attributes. It must include "id", which selects the document to update.
    Attributes whose value is "" or [] are deleted from the record; all
    other attributes are written as given. The dictionary is not modified.

  Returns:
  - True (bool): Indicates a successful update.
  """
  passage_ref = firestore_db.collection("passages").document(slots["id"])
  # Build a separate payload so the caller's dict is not overwritten with
  # DELETE_FIELD sentinels. The comparison is explicit (not truthiness) so
  # that False and 0 are stored rather than deleted.
  payload = {
    key: firestore.DELETE_FIELD if value == "" or value == [] else value
    for key, value in slots.items()
  }
  passage_ref.update(payload)
  return True


def retrieve_dataset_2(_annotator_id=None):
  """
  Retrieve dataset 2 records from the "passages" collection.

  Args:
    _annotator_id: Number of the annotator. When None, every passage whose
      "is_accepted_dataset2_datapoint" is True is exported. Otherwise only
      the passages whose "annotator" equals "IE-<_annotator_id>" are listed,
      ordered by "is_accepted_dataset2_datapoint".

  Returns:
    A list of dicts. Without an annotator ID each dict holds "id", "text"
    (a list of {"role", "sentence"} dicts built from "dataset2_datapoint")
    and "metadata" ("text_id", "source", "annotator" and, when the record
    has a non-empty one, "publication-date"). With an annotator ID each dict
    holds only "id" and "is_accepted".
  """
  passages = firestore_db.collection('passages')

  if _annotator_id is not None:
    # Per-annotator overview: only the ID and the acceptance flag
    annotator_query = (passages.where("annotator", "==", f"IE-{_annotator_id}")
                       .order_by("is_accepted_dataset2_datapoint"))
    records = [snapshot.to_dict() for snapshot in annotator_query.stream()]
    return [
      {
        "id": record["id"],
        "is_accepted": record["is_accepted_dataset2_datapoint"]
      }
      for record in records
    ]

  # Full export: every accepted datapoint together with its metadata
  accepted_query = passages.where("is_accepted_dataset2_datapoint", "==", True)
  records = [snapshot.to_dict() for snapshot in accepted_query.stream()]

  dataset2 = []
  for record in records:
    sentences = [
      {"role": item["role"], "sentence": item["sentence"]}
      for item in record["dataset2_datapoint"]
    ]
    entry = {
      "id": record["id"],
      "text": sentences,
      "metadata": {
        "text_id": record["text_id"],
        "source": record["url"],
        "annotator": record["annotator"]
      },
    }
    publication_date = record.get("publication-date")
    if publication_date:
      entry["metadata"]["publication-date"] = publication_date
    dataset2.append(entry)
  return dataset2