Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,10 @@ infer-update-questions:
acled: acled-fetch acled-update-questions

acled-fetch:
$(MAKE) -C src/questions/acled/fetch || echo "* $@" >> $(MAKE_FAILURE_LOG)
$(MAKE) -C src/orchestration/func_acled_fetch || echo "* $@" >> $(MAKE_FAILURE_LOG)

acled-update-questions:
$(MAKE) -C src/questions/acled/update_questions || echo "* $@" >> $(MAKE_FAILURE_LOG)
$(MAKE) -C src/orchestration/func_acled_update || echo "* $@" >> $(MAKE_FAILURE_LOG)

yfinance: yfinance-fetch yfinance-update-questions

Expand Down
20 changes: 20 additions & 0 deletions src/_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,26 @@ class Config:
coerce = True


class AcledFetchFrame(pa.DataFrameModel):
"""Output of AcledSource.fetch(). Raw per-event rows from the ACLED API."""

event_id_cnty: Series[str]
event_date: Series[str]
iso: Series[int]
region: Series[str]
country: Series[str]
admin1: Series[str]
event_type: Series[str]
fatalities: Series[int]
timestamp: Series[str]

class Config:
"""Schema configuration."""

strict = False
coerce = True


class AcledResolutionFrame(pa.DataFrameModel):
"""ACLED-specific: aggregated events by country and date.

Expand Down
1 change: 1 addition & 0 deletions src/base_eval/naive_and_dummy_forecasters/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
google-cloud-storage
google-cloud-secret-manager
gcsfs==2024.3.1
backoff
pandas>=2.2.2,<3.0
pandas-market-calendars
tqdm
Expand Down
216 changes: 15 additions & 201 deletions src/helpers/acled.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,26 @@
"""ACLED-specific variables."""
"""ACLED shared helpers.

Light home of ACLED's pure computation (aggregations + naive-forecast helpers): the module
imports only numpy/pandas and the lightweight metadata layer at load time. Hash-mapping access
routes through a lazily-instantiated ``AcledSource`` (see ``_get_source``), so importing this
module stays light — only *calling* the hash funcs pulls the (now heavy) ``sources.acled``.

The sole caller of those hash funcs is the unrefactored ``base_eval`` naive forecaster, which
therefore declares ``backoff`` in its requirements. ``question_curation`` imports this module only
for the ``SOURCE_INTRO``/``RESOLUTION_CRITERIA`` constants (served from ``_metadata``); it never
triggers the lazy import, so it and its many consumers stay light.

When ``base_eval`` is refactored to call ``AcledSource.get_naive_forecast()`` this computation can
move onto the source class (Phase 1 plan) and this module shrinks to a metadata-only shim.
"""

from datetime import timedelta
from enum import Enum

import numpy as np
import pandas as pd

from sources._metadata import SOURCE_METADATA

from . import data_utils

SOURCE_INTRO = SOURCE_METADATA["acled"]["source_intro"]
RESOLUTION_CRITERIA = SOURCE_METADATA["acled"]["resolution_criteria"]

Expand Down Expand Up @@ -54,134 +65,6 @@ def upload_hash_mapping():
_upload(raw_json, source)


FETCH_COLUMN_DTYPE = {
"event_id_cnty": str,
"event_date": str,
"iso": int,
"region": str,
"country": str,
"admin1": str,
"event_type": str,
"fatalities": int,
"timestamp": str,
}
FETCH_COLUMNS = list(FETCH_COLUMN_DTYPE.keys())

BACKGROUND = """
ACLED classifies events into six distinct categories:

1. Battles: violent interactions between two organized armed groups at a particular time and
location;
2. Protests: in-person public demonstrations of three or more participants in which the participants
do not engage in violence, though violence may be used against them;
3. Riots: violent events where demonstrators or mobs of three or more engage in violent or
destructive acts, including but not limited to physical fights, rock throwing, property
destruction, etc.;
4. Explosions/Remote violence: incidents in which one side uses weapon types that, by their nature,
are at range and widely destructive;
5. Violence against civilians: violent events where an organized armed group inflicts violence upon
unarmed non-combatants; and
6. Strategic developments: contextually important information regarding incidents and activities of
groups that are not recorded as any of the other event types, yet may trigger future events or
contribute to political dynamics within and across states.

Detailed information about the categories can be found at:
https://acleddata.com/knowledge-base/codebook/#acled-events
"""


def read_dff(local_question_bank_dir=None) -> pd.DataFrame:
"""
Read fetch file and create dfr.

Args:
local_question_bank_dir (str): the location where the question bank was unzipped.

Returns:
df (pd.DataFrame): parsed and formatted fetch file.
dfr (pd.DataFrame): the ACLED resolution values.
"""
if local_question_bank_dir is None:
filenames = data_utils.generate_filenames(source=source)
df = data_utils.download_and_read(
filename=filenames["jsonl_fetch"],
local_filename=filenames["local_fetch"],
df_tmp=pd.DataFrame(columns=FETCH_COLUMNS),
dtype=FETCH_COLUMN_DTYPE,
)
else:
filenames = data_utils.generate_filenames(source=source)
source_fetch_file = filenames.get("jsonl_fetch")
local_filename = f"{local_question_bank_dir}/{source_fetch_file}"

df = pd.read_json(
local_filename,
lines=True,
dtype=FETCH_COLUMN_DTYPE,
convert_dates=False,
)

# The values for the `event_date` field in the following entries are incorrect.
# They are "0025-" for "2025" and "0024-" for "2024-"
#
# Bug reported to ACLED on 26 Sept 2025
#
# 2025:
# * https://acleddata.com/api/acled/read?_format=json&event_id_cnty=ABW24
# * https://acleddata.com/api/acled/read?_format=json&event_id_cnty=YEM104718
# * https://acleddata.com/api/acled/read?_format=json&event_id_cnty=YEM99604
#
# 2024:
# * https://acleddata.com/api/acled/read?_format=json&event_id_cnty=NCL346
# * https://acleddata.com/api/acled/read?_format=json&event_id_cnty=NCL351
# * https://acleddata.com/api/acled/read?_format=json&event_id_cnty=PYF127
def fix_year_prefix(date_str):
if isinstance(date_str, str):
if date_str.startswith("0025-"):
return "2025-" + date_str[5:]
if date_str.startswith("0024-"):
return "2024-" + date_str[5:]
return date_str

df["event_date"] = df["event_date"].apply(fix_year_prefix)
# End fix bug with ACLED data

df["event_date"] = pd.to_datetime(df["event_date"])

df = df[["country", "event_date", "event_type", "fatalities"]].copy()

dfr = (
pd.get_dummies(df, columns=["event_type"], prefix="", prefix_sep="")
.groupby(["country", "event_date"])
.sum()
.reset_index()
)

return df, dfr


def download_dff_and_prepare_dfr(local_question_bank_dir: str = None) -> tuple:
"""Prepare ACLED data for resolution."""
df, dfr = read_dff(local_question_bank_dir=local_question_bank_dir)
countries = df["country"].unique()
event_types = list(df["event_type"].unique()) + ["fatalities"]
return (
dfr,
countries,
event_types,
)


class QuestionType(Enum):
"""Types of questions.

These will determine how a given question is resolved.
"""

N_30_DAYS_GT_30_DAY_AVG_OVER_PAST_360_DAYS = 0
N_30_DAYS_X_10_GT_30_DAY_AVG_OVER_PAST_360_DAYS_PLUS_1 = 1


def get_forecast(comparison_value, dfr, country, col, ref_date):
"""Retrun the LHS of the comparison for the question.

Expand Down Expand Up @@ -233,17 +116,6 @@ def get_base_comparison_value(key, dfr, country, col, ref_date):
raise ValueError("Invalid key.")


def get_freeze_value(key, dfr, country, event_type, today):
"""Return the freeze value given the key."""
if key == "last30Days.gt.30DayAvgOverPast360Days":
return thirty_day_avg_over_past_360_days(dfr, country, event_type, today)

if key == "last30DaysTimes10.gt.30DayAvgOverPast360DaysPlus1":
return thirty_day_avg_over_past_360_days_plus_1(dfr, country, event_type, today)

raise Exception("Invalid key.")


def sum_over_past_30_days(dfr, country, col, ref_date):
"""Sum over the 30 days before the ref_date."""
dfc = dfr[dfr["country"] == country].copy()
Expand All @@ -269,61 +141,3 @@ def thirty_day_avg_over_past_360_days(dfr, country, col, ref_date):
def thirty_day_avg_over_past_360_days_plus_1(dfr, country, col, ref_date):
"""Get 1 plus the 30 day average over the 360 days before the ref_date."""
return 1 + thirty_day_avg_over_past_360_days(dfr, country, col, ref_date)


QUESTIONS = {
"last30Days.gt.30DayAvgOverPast360Days": {
"question_type": QuestionType.N_30_DAYS_GT_30_DAY_AVG_OVER_PAST_360_DAYS,
"question": (
(
"Will there be more {event_type} in {country} for the 30 days before "
"{resolution_date} compared to the 30-day average of {event_type} over the 360 "
"days preceding {forecast_due_date}?"
"\n\n"
"e.g. If the forecast due date is 2024-01-01 and we have the following data:\n"
"Date,{event_type}\n"
"2023-11-11,1\n"
"2023-10-10,2\n"
"to calculate the 30-day average of {event_type} over the preceding 360 "
"days, we’d have: (1+2)/12=0.25.\n\n"
"In this example, for the question to resolve positively, 1 or more "
"{event_type} would need to occur in the 30 days leading up to the resolution."
),
("event_type", "country"),
),
"freeze_datetime_value_explanation": (
(
"The 30-day average of {event_type} over the past 360 days in {country}. "
"This reference value will potentially change as ACLED updates its dataset."
),
("event_type", "country"),
),
},
"last30DaysTimes10.gt.30DayAvgOverPast360DaysPlus1": {
"question_type": QuestionType.N_30_DAYS_X_10_GT_30_DAY_AVG_OVER_PAST_360_DAYS_PLUS_1,
"question": (
(
"Will there be more than ten times as many {event_type} in {country} for the 30 "
"days before {resolution_date} compared to one plus the 30-day average of "
"{event_type} over the 360 days preceding {forecast_due_date}?"
"\n\n"
"e.g. If the forecast due date is 2024-01-01 and we have the following data:\n"
"Date,{event_type}\n"
"2023-11-11,1\n"
"2023-10-10,2\n"
"to calculate one plus the 30-day average of {event_type} over the preceding 360 "
"days, we’d have: 1+(1+2)/12=1.25.\n\n"
"In this example, for the question to resolve positively, 13 (10 x 1.25) or more "
"{event_type} would need to occur in the 30 days leading up to the resolution."
),
("event_type", "country"),
),
"freeze_datetime_value_explanation": (
(
"One plus the 30-day average of {event_type} over the past 360 days in {country}. "
"This reference value will potentially change as ACLED updates its dataset."
),
("event_type", "country"),
),
},
}
54 changes: 13 additions & 41 deletions src/orchestration/_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from termcolor import colored

from _fb_types import QuestionBank, SourceQuestionBank
from _schemas import AcledResolutionFrame, QuestionFrame, ResolutionFrame
from _schemas import QuestionFrame, ResolutionFrame
from helpers import constants, data_utils, dates, env, keys
from sources import ALL_SOURCE_NAMES, MARKET_SOURCE_NAMES
from sources._base import BaseSource
Expand All @@ -34,51 +34,23 @@


def _read_acled_dfr(local_question_bank_dir: str) -> pd.DataFrame:
"""Read ACLED fetch file and prepare the resolution DataFrame."""
acled_fetch_column_dtype = {
"event_id_cnty": str,
"event_date": str,
"iso": int,
"region": str,
"country": str,
"admin1": str,
"event_type": str,
"fatalities": int,
"timestamp": str,
}
filenames = data_utils.generate_filenames("acled")
source_fetch_file = filenames.get("jsonl_fetch")
local_filename = f"{local_question_bank_dir}/{source_fetch_file}"
"""Read the ACLED fetch file and build the resolution DataFrame.

The dff -> dfr transform (year-prefix fix, one-hot encode, aggregate) is owned by the
domain layer; this is just the IO wrapper that reads the fetch file off the unzipped
question bank and delegates to it, so the transform lives in exactly one place.
"""
from sources.acled import FETCH_COLUMN_DTYPE, AcledSource

df = pd.read_json(
filenames = data_utils.generate_filenames("acled")
local_filename = f"{local_question_bank_dir}/{filenames['jsonl_fetch']}"
dff = pd.read_json(
local_filename,
lines=True,
dtype=acled_fetch_column_dtype,
dtype=FETCH_COLUMN_DTYPE,
convert_dates=False,
)

# Fix year prefix bug in ACLED data
def fix_year_prefix(date_str):
if isinstance(date_str, str):
if date_str.startswith("0025-"):
return "2025-" + date_str[5:]
if date_str.startswith("0024-"):
return "2024-" + date_str[5:]
return date_str

df["event_date"] = df["event_date"].apply(fix_year_prefix)
df = AcledResolutionFrame.validate(df)
df["event_date"] = pd.to_datetime(df["event_date"])

df = df[["country", "event_date", "event_type", "fatalities"]].copy()

dfr = (
pd.get_dummies(df, columns=["event_type"], prefix="", prefix_sep="")
.groupby(["country", "event_date"])
.sum()
.reset_index()
)

dfr, _, _ = AcledSource._prepare_resolution_data(dff)
return dfr


Expand Down
Loading
Loading