Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -175,10 +175,10 @@ polymarket-update-questions:
wikipedia: wikipedia-fetch wikipedia-update-questions

wikipedia-fetch:
$(MAKE) -C src/questions/wikipedia/fetch || echo "* $@" >> $(MAKE_FAILURE_LOG)
$(MAKE) -C src/orchestration/func_wikipedia_fetch || echo "* $@" >> $(MAKE_FAILURE_LOG)

wikipedia-update-questions:
$(MAKE) -C src/questions/wikipedia/update_questions || echo "* $@" >> $(MAKE_FAILURE_LOG)
$(MAKE) -C src/orchestration/func_wikipedia_update || echo "* $@" >> $(MAKE_FAILURE_LOG)

fred: fred-fetch fred-update-questions

Expand Down
5 changes: 5 additions & 0 deletions src/_fb_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ class SourceQuestionBank:
QuestionBank = dict[str, SourceQuestionBank]


# Wikipedia's fetch() returns one DataFrame per page, keyed by id_root (columns vary per page).
# Shared between WikipediaSource.fetch()/update() and the orchestration fetch IO.
WikipediaFetchResult = dict[str, pd.DataFrame]


@dataclass
class UpdateResult:
"""Return value of a source's update() method.
Expand Down
296 changes: 14 additions & 282 deletions src/helpers/wikipedia.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,20 @@
# -*- coding: utf-8 -*-
"""Wikipedia constants."""
"""Wikipedia shared helpers.

Light home of Wikipedia's naive-forecast computation (scipy/numpy/pandas) plus the hash-mapping
and identity access used by the still-unrefactored ``base_eval`` naive forecaster and by
``question_curation``. Hash-mapping access routes through a lazily-instantiated ``WikipediaSource``
(see ``_get_source``); ``sources.wikipedia`` lazy-imports its scraping deps (requests/bs4) inside
fetch, so importing this module — and the many modules that import it — stays light.

When ``base_eval`` is refactored to call ``WikipediaSource.get_naive_forecast()`` this computation
can move onto the source class (Phase 1 plan) and this module shrinks to a metadata-only shim.
"""

import logging
import os
import sys
from datetime import datetime, timedelta
from datetime import timedelta

import numpy as np
import pandas as pd
Expand All @@ -14,10 +24,11 @@

from sources._metadata import SOURCE_METADATA # noqa: E402
from sources.wikipedia import _IDS_TO_NULLIFY as IDS_TO_NULLIFY # noqa: F401, E402
from sources.wikipedia import _PAGES as PAGES # noqa: E402
from sources.wikipedia import ( # noqa: F401, E402
_TRANSFORM_ID_MAPPING as transform_id_mapping,
)
from sources.wikipedia import QuestionType # noqa: F401, E402
from sources.wikipedia import QuestionType # noqa: E402

from . import constants # noqa: E402

Expand All @@ -27,8 +38,6 @@
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

HEADERS = {"User-Agent": constants.BENCHMARK_USER_AGENT}

WIKIPEDIA_QUESTION_BANK_DATA_STORAGE_START_DATETIME = (
constants.QUESTION_BANK_DATA_STORAGE_START_DATETIME - timedelta(days=360 * 4)
)
Expand All @@ -38,8 +47,6 @@

source = "wikipedia"

fetch_directory = f"{source}/fetch"

# Lazy import to avoid circular imports at module level
_source = None

Expand Down Expand Up @@ -79,11 +86,6 @@ def ffill_dfr(dfr):
return _get_source()._ffill_dfr(dfr)


def get_fetch_filename(question_id_root: str) -> str:
    """Return the fetch file name that belongs to ``question_id_root``.

    The name is the id_root followed by the ``.jsonl`` extension.
    """
    filename_template = "{}.jsonl"
    return filename_template.format(question_id_root)


def id_hash(id_root: str, id_field_value: str) -> str:
    """Encode a Wikipedia question id.

    Thin wrapper: forwards both arguments, by keyword, to the ``_id_hash`` method of the
    object returned by ``_get_source()`` and returns its result unchanged.
    """
    wikipedia_source = _get_source()
    return wikipedia_source._id_hash(
        id_root=id_root,
        id_field_value=id_field_value,
    )
Expand All @@ -94,23 +96,6 @@ def id_unhash(hash_key: str) -> tuple:
return _get_source()._id_unhash(hash_key)


def clean_FIDE_rankings(df):
    """Clean fetched data for `FIDE_rankings`.

    * Drop rows whose `Player` cell contains "Change from the previous month".
    * Fix inconsistent player names.

    Rows with a missing `Player` value are kept (they simply do not match the filter)
    instead of making the boolean mask invalid.

    Args:
        df: DataFrame with a `Player` column of strings.

    Returns:
        A cleaned copy of `df`; the input frame is not modified.
    """
    # `regex=False`: the text is a literal, not a pattern.
    # `na=False`: a missing name counts as "does not contain", keeping the mask boolean.
    is_change_note = df["Player"].str.contains(
        "Change from the previous month", regex=False, na=False
    )
    df = df[~is_change_note].copy()
    replacements = {
        "Gukesh D.": "Gukesh Dommaraju",
        "Gukesh D": "Gukesh Dommaraju",
        "Leinier Dominguez": "Leinier Domínguez Pérez",
        "Leinier Dominguez Pérez": "Leinier Domínguez Pérez",
        "Nana Dzagnidze]": "Nana Dzagnidze",
    }
    # Whole-value replacement (not substring), so only exact spellings are rewritten.
    df["Player"] = df["Player"].replace(replacements)
    return df


def get_probability_forecast(mid, comparison_value, forecast_mean, forecast_std):
"""Get forecast based on question type.

Expand Down Expand Up @@ -174,55 +159,6 @@ def get_min_max_possible_value(mid):
raise ValueError(f"Could not find min/max for {id_root}.")


def clean_List_of_world_records_in_swimming(df):
    """Clean fetched data for `List_of_world_records_in_swimming`.

    Drop any row whose `Name` cell contains a parenthesis, or the text `eventsort` or
    `recordinfo`. Rows with a missing `Name` are kept rather than breaking the mask.

    Args:
        df: DataFrame with a `Name` column of strings.

    Returns:
        A new DataFrame with the unwanted rows removed and a fresh 0..n-1 index.
    """
    # One pass with an alternation replaces three filter + reset_index rounds.
    # `na=False` treats a missing name as "no match" so the mask stays boolean.
    unwanted = df["Name"].str.contains(r"[()]|eventsort|recordinfo", na=False)
    return df[~unwanted].reset_index(drop=True)


def clean_List_of_infectious_diseases(df):
    """Clean fetched data for `List_of_infectious_diseases`.

    * Remove rows with multiple answers (same `date` and `Common name`).
    * Keep only rows dated after 2021-07-07.
    * Drop rows with any missing value.
    * Change all `Under research[x]` / `Under Development[x]` to `No`
    * Change all `No` to 0
    * Change all `Yes` to 1

    Args:
        df: DataFrame with `date`, `Common name` and `Vaccine(s)` columns.

    Returns:
        A new DataFrame with an integer `Vaccine(s)` column and a fresh 0..n-1 index.
        The input frame is not modified.

    Raises:
        ValueError: if a `Vaccine(s)` value remains that cannot be converted to int.
    """
    vaccine_col = "Vaccine(s)"

    # Boolean mask instead of drop-by-label: with a non-unique index, dropping by label
    # would also remove unrelated rows that happen to share an index value.
    has_multiple_answers = df.duplicated(subset=["date", "Common name"], keep=False)
    df = df[~has_multiple_answers].reset_index(drop=True)

    # On and before this date the `"Vaccine(s)"` field had other info in it.
    df = df[df["date"] > pd.Timestamp("2021-07-07")]

    # Drop incomplete rows *before* the integer cast (which cannot represent missing
    # values), and copy so the column assignments below act on an independent frame.
    df = df.dropna(ignore_index=True).copy()

    df[vaccine_col] = df[vaccine_col].replace(
        {
            r"Under research.*": "No",
            r"Under Development.*": "No",
            r"Yes.*": "Yes",
            r"No.*": "No",
        },
        regex=True,
    )
    df.loc[df[vaccine_col] == "No", vaccine_col] = 0
    df.loc[df[vaccine_col] == "Yes", vaccine_col] = 1
    df[vaccine_col] = df[vaccine_col].astype(int)
    return df


def is_resolved_List_of_infectious_diseases(value):
    """Tell whether `value` marks a developed vaccine.

    A value counts as resolved when it equals 1 or when its string form, compared
    case-insensitively, is "yes".
    """
    equals_one = value == 1
    return equals_one or str(value).lower() == "yes"


def get_value_List_of_infectious_diseases(value):
    """Map a truthy `value` to "Yes" and a falsy one to "No" (instead of 1/0)."""
    if value:
        return "Yes"
    return "No"


def get_question_type(mid):
"""Return the question type given mid."""
d = id_unhash(mid)
Expand Down Expand Up @@ -275,207 +211,3 @@ def backfill_for_forecast(mid, dfr):
dfr = pd.concat([fill_df, dfr]).sort_values("date")

return dfr


# Background text used by both `FIDE_rankings_*` entries in `PAGES` (their "background"
# value). It is a 2-tuple with the same shape as the other text fields in `PAGES`:
# (text, tuple of names). Presumably the names are the placeholders to substitute into
# the text; here the tuple is empty and the text contains no `{...}` placeholders.
FIDE_BACKGROUND = (
    (
        "The International Chess Federation (FIDE) governs international chess "
        "competition. Each month, FIDE publishes the lists 'Top 100 Players', 'Top 100 "
        "Women', 'Top 100 Juniors' and 'Top 100 Girls' and rankings of countries according "
        "to the average rating of their top 10 players and top 10 female players.\n"
        "To create the rankings, FIDE uses the Elo rating system, which is a method for "
        "calculating the relative skill levels of players in zero-sum games such as chess. "
        "The difference in the ratings between two players serves as a predictor of the "
        "outcome of a match. Two players with equal ratings who play against each other "
        "are expected to score an equal number of wins. A player whose rating is 100 "
        "points greater than their opponent's is expected to score 64%; if the difference "
        "is 200 points, then the expected score for the stronger player is 76%.\n"
        "A player's Elo rating is a number which may change depending on the outcome of "
        "rated games played. After every game, the winning player takes points from the "
        "losing one. The difference between the ratings of the winner and loser determines "
        "the total number of points gained or lost after a game. If the higher-rated "
        "player wins, then only a few rating points will be taken from the lower-rated "
        "player. However, if the lower-rated player scores an upset win, many rating "
        "points will be transferred. The lower-rated player will also gain a few points "
        "from the higher rated player in the event of a draw. This means that this rating "
        "system is self-correcting. Players whose ratings are too low or too high should, "
        "in the long run, do better or worse correspondingly than the rating system "
        "predicts and thus gain or lose rating points until the ratings reflect their true "
        "playing strength.\n"
        "Elo ratings are comparative only, and are valid only within the rating pool in "
        "which they were calculated, rather than being an absolute measure of a player's "
        "strength."
    ),
    tuple(),
)

# One configuration dict per Wikipedia-derived question family. Keys used below:
#   id_root / page_title   - identifier of the family and the page it comes from.
#   table_index            - list of {"start_date", "table_index"} entries; sorted by
#                            "start_date" by the loop that follows this literal.
#   question_type          - a `QuestionType` member.
#   key                    - set drawn from {"id", "value"}.
#   fields                 - maps "id" / "value" to column names of the page's table.
#   resolution_file_value_column_dtype - Python type for the value column.
#   question / background / freeze_datetime_value_explanation
#                          - (text, names) pairs; the names match the `{...}`
#                            placeholders from `fields` that appear in the text.
#   clean_func / is_resolved_func / value_func
#                          - function names given as strings; how they are looked up
#                            is not visible here (presumably by name - confirm).
PAGES = [
    # FIDE rankings: Elo rating per player.
    {
        "id_root": "FIDE_rankings_elo_rating",
        "page_title": "FIDE_rankings",
        "table_index": [
            {
                "start_date": WIKIPEDIA_QUESTION_BANK_DATA_STORAGE_START_DATE,
                "table_index": [1, 3],
            },
        ],
        "question_type": QuestionType.ONE_PERCENT_MORE,
        "key": {
            "id",
        },
        "fields": {
            "id": "Player",
            "value": "Rating",
        },
        "resolution_file_value_column_dtype": int,
        "question": (
            (
                "According to Wikipedia, will {id} have an Elo rating on {resolution_date} that's "
                "at least 1% higher than on {forecast_due_date}?"
            ),
            ("id",),
        ),
        "background": FIDE_BACKGROUND,
        "freeze_datetime_value_explanation": (
            "{id}'s ELO rating.",
            ("id",),
        ),
        "clean_func": "clean_FIDE_rankings",
    },
    # FIDE rankings: rank per player (same page and tables as the entry above).
    {
        "id_root": "FIDE_rankings_ranking",
        "page_title": "FIDE_rankings",
        "table_index": [
            {
                "start_date": WIKIPEDIA_QUESTION_BANK_DATA_STORAGE_START_DATE,
                "table_index": [1, 3],
            },
        ],
        "question_type": QuestionType.SAME_OR_LESS,
        "key": {
            "id",
        },
        "fields": {
            "id": "Player",
            "value": "Rank",
        },
        "resolution_file_value_column_dtype": int,
        "question": (
            (
                "According to Wikipedia, will {id} have a FIDE ranking on {resolution_date} as "
                "high or higher than their ranking on {forecast_due_date}?"
            ),
            ("id",),
        ),
        "background": FIDE_BACKGROUND,
        "freeze_datetime_value_explanation": (
            "{id}'s FIDE ranking.",
            ("id",),
        ),
        "clean_func": "clean_FIDE_rankings",
    },
    # Swimming world records: keyed on both holder name and event.
    {
        "id_root": "List_of_world_records_in_swimming",
        "page_title": "List_of_world_records_in_swimming",
        "table_index": [
            {
                "start_date": WIKIPEDIA_QUESTION_BANK_DATA_STORAGE_START_DATE,
                "table_index": [0, 2],
            },
            # A different pair of table indices applies from this date on.
            {
                "start_date": datetime(2025, 5, 4).date(),
                "table_index": [0, 1],
            },
        ],
        "question_type": QuestionType.SAME,
        "key": {
            "id",
            "value",
        },
        "fields": {
            "id": "Name",
            "value": "Event",
        },
        "resolution_file_value_column_dtype": str,
        "question": (
            (
                "According to Wikipedia, will {id} still hold the world record for {value} in "
                "long course (50 metres) swimming pools on {resolution_date}?"
            ),
            ("id", "value"),
        ),
        "background": (
            (
                "The world records in swimming are ratified by World Aquatics (formerly known as FINA), "
                "the international governing body of swimming. Records can be set in long course (50 "
                "metres) or short course (25 metres) swimming pools.\n"
                "The ratification process is described in FINA Rule SW12, and involves submission of "
                "paperwork certifying the accuracy of the timing system and the length of the pool, "
                "satisfaction of FINA rules regarding swimwear and a negative doping test by the "
                "swimmer(s) involved. Records can be set at intermediate distances in an individual "
                "race and for the first leg of a relay race. Records which have not yet been fully "
                "ratified are marked with a '#' symbol in these lists."
            ),
            tuple(),
        ),
        "freeze_datetime_value_explanation": (
            "{id} is a record holder in the {value}.",
            (
                "id",
                "value",
            ),
        ),
        "clean_func": "clean_List_of_world_records_in_swimming",
    },
    # Infectious diseases: vaccine status per disease. The only entry that also names
    # an `is_resolved_func` and a `value_func`.
    {
        "id_root": "List_of_infectious_diseases",
        "page_title": "List_of_infectious_diseases",
        "table_index": [
            {
                "start_date": WIKIPEDIA_QUESTION_BANK_DATA_STORAGE_START_DATE,
                # NOTE(review): a bare int here, while every other entry uses a list of
                # ints - confirm the consumer accepts both forms.
                "table_index": 0,
            },
        ],
        "question_type": QuestionType.MORE,
        "key": {
            "id",
        },
        "fields": {
            "id": "Common name",
            "value": "Vaccine(s)",
        },
        "resolution_file_value_column_dtype": str,
        "question": (
            (
                "According to Wikipedia, will a vaccine have been developed for {id} by "
                "{resolution_date}?"
            ),
            ("id",),
        ),
        "background": (
            (
                "According to Wikipedia, {id} is the common name of an infectious disease. A vaccine "
                "is a biological preparation that provides active acquired immunity to a particular "
                "infectious or malignant disease. The safety and effectiveness of vaccines has "
                "been widely studied and verified. A vaccine typically contains an agent that "
                "resembles a disease-causing microorganism and is often made from weakened or killed "
                "forms of the microbe, its toxins, or one of its surface proteins. The agent "
                "stimulates the body's immune system to recognize the agent as a threat, destroy it, "
                "and recognize further and destroy any of the microorganisms associated with that "
                "agent that it may encounter in the future."
            ),
            ("id",),
        ),
        "freeze_datetime_value_explanation": (
            "Vaccine status for {id}. 'No' means that a vaccine has not yet been created. "
            "'Yes' means that it has.",
            ("id",),
        ),
        "clean_func": "clean_List_of_infectious_diseases",
        "is_resolved_func": "is_resolved_List_of_infectious_diseases",
        "value_func": "get_value_List_of_infectious_diseases",
    },
]

# Sort each page's table_index entries in place, ascending by their "start_date".
for page in PAGES:
    page["table_index"].sort(key=lambda e: e["start_date"])
Loading
Loading