Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,10 @@ acled-update-questions:
yfinance: yfinance-fetch yfinance-update-questions

yfinance-fetch:
$(MAKE) -C src/questions/yfinance/fetch || echo "* $@" >> $(MAKE_FAILURE_LOG)
$(MAKE) -C src/orchestration/func_yfinance_fetch || echo "* $@" >> $(MAKE_FAILURE_LOG)

yfinance-update-questions:
$(MAKE) -C src/questions/yfinance/update_questions || echo "* $@" >> $(MAKE_FAILURE_LOG)
$(MAKE) -C src/orchestration/func_yfinance_update || echo "* $@" >> $(MAKE_FAILURE_LOG)

polymarket: polymarket-fetch polymarket-update-questions

Expand Down
7 changes: 7 additions & 0 deletions src/_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,13 @@ class PolymarketFetchFrame(QuestionFrame):
historical_prices: Series[object] # list[dict] with {date, value} per question


class YfinanceFetchFrame(QuestionFrame):
"""Output of YfinanceSource.fetch(). QuestionFrame plus transient fields for update()."""

fetch_datetime: Series[str]
probability: Series[object] = pa.Field(nullable=True)


class ManifoldFetchFrame(pa.DataFrameModel):
"""Output of ManifoldSource.fetch(). Just market IDs from search-markets endpoint."""

Expand Down
25 changes: 19 additions & 6 deletions src/orchestration/_source_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,24 @@ def write_fetch_output(source: str, dff: pd.DataFrame) -> None:
)


def list_existing_resolution_ids(source: str) -> set[str]:
"""Return the set of bare question IDs that already have a resolution file.

Lists <source>/*.jsonl and strips to bare IDs. This is a pure existence
listing: it includes files whose contents are empty, mirroring the old
inline list_with_prefix check. Do NOT derive this from
load_existing_resolution_files, which drops empty frames.

Args:
source (str): Source name (e.g. "infer").

Returns:
set of bare question IDs present in storage for this source.
"""
paths = gcp.storage.list_with_prefix(bucket_name=env.QUESTION_BANK_BUCKET, prefix=f"{source}/")
return {os.path.basename(p).removesuffix(".jsonl") for p in paths if p.endswith(".jsonl")}


def load_existing_resolution_files(
source: str,
ids: Iterable[str] | None = None,
Expand All @@ -52,12 +70,7 @@ def load_existing_resolution_files(
dict mapping question_id to its resolution DataFrame.
"""
if ids is None:
paths = gcp.storage.list_with_prefix(
bucket_name=env.QUESTION_BANK_BUCKET, prefix=f"{source}/"
)
question_ids = [
os.path.basename(p).removesuffix(".jsonl") for p in paths if p.endswith(".jsonl")
]
question_ids = list(list_existing_resolution_ids(source))
else:
question_ids = [str(qid) for qid in ids]

Expand Down
9 changes: 3 additions & 6 deletions src/orchestration/func_infer_fetch/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@
import logging
from typing import Any

from helpers import data_utils, decorator, env, keys
from helpers import data_utils, decorator, keys
from orchestration import _source_io
from sources.infer import InferSource
from utils import gcp

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
Expand All @@ -23,11 +22,9 @@ def driver(_: Any) -> None:
source.api_key = keys.API_KEY_INFER

dfq = data_utils.get_data_from_cloud_storage(SOURCE, return_question_data=True)
files_in_storage = gcp.storage.list_with_prefix(
bucket_name=env.QUESTION_BANK_BUCKET, prefix=SOURCE
)
existing_resolution_ids = _source_io.list_existing_resolution_ids(SOURCE)

dff = source.fetch(dfq=dfq, files_in_storage=files_in_storage)
dff = source.fetch(dfq=dfq, existing_resolution_ids=existing_resolution_ids)

_source_io.write_fetch_output(SOURCE, dff)
logger.info("Done.")
Expand Down
9 changes: 3 additions & 6 deletions src/orchestration/func_manifold_update/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@
import logging
from typing import Any

from helpers import data_utils, decorator, env
from helpers import data_utils, decorator
from orchestration import _source_io
from sources.manifold import ManifoldSource
from utils import gcp

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
Expand All @@ -32,15 +31,13 @@ def driver(_: Any) -> None:
existing_resolution_files = _source_io.load_existing_resolution_files(SOURCE)
logger.info(f"Loaded {len(existing_resolution_files)} resolution files")

files_in_storage = gcp.storage.list_with_prefix(
bucket_name=env.QUESTION_BANK_BUCKET, prefix=SOURCE
)
existing_resolution_ids = _source_io.list_existing_resolution_ids(SOURCE)

result = source.update(
dfq,
dff,
existing_resolution_files=existing_resolution_files,
files_in_storage=files_in_storage,
existing_resolution_ids=existing_resolution_ids,
)

logger.info("Uploading to GCP...")
Expand Down
9 changes: 3 additions & 6 deletions src/orchestration/func_metaculus_update/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@
import logging
from typing import Any

from helpers import data_utils, decorator, env, keys
from helpers import data_utils, decorator, keys
from orchestration import _source_io
from sources.metaculus import MetaculusSource
from utils import gcp

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
Expand All @@ -26,14 +25,12 @@ def driver(_: Any) -> None:
SOURCE, return_question_data=True, return_fetch_data=True
)

files_in_storage = gcp.storage.list_with_prefix(
bucket_name=env.QUESTION_BANK_BUCKET, prefix=SOURCE
)
existing_resolution_ids = _source_io.list_existing_resolution_ids(SOURCE)

result = source.update(
dfq,
dff,
files_in_storage=files_in_storage,
existing_resolution_ids=existing_resolution_ids,
)

logger.info("Uploading to GCP...")
Expand Down
4 changes: 3 additions & 1 deletion src/orchestration/func_resolve/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,6 @@ slack_sdk
pandera
pytz
python-dateutil
backoff
backoff
beautifulsoup4
yfinance
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,23 @@ UPLOAD_DIR = upload
.gcloudignore:
cp -r $(ROOT_DIR)src/helpers/.gcloudignore .

Procfile:
cp -r $(ROOT_DIR)src/helpers/Procfile .
Dockerfile: $(ROOT_DIR)src/helpers/Dockerfile.template
sed \
-e 's/REGION/$(CLOUD_DEPLOY_REGION)/g' \
-e 's/STACK/google-22-full/g' \
-e 's/PYTHON_VERSION/python312/g' \
$< > Dockerfile

deploy : main.py .gcloudignore requirements.txt Procfile
deploy : main.py .gcloudignore requirements.txt Dockerfile
mkdir -p $(UPLOAD_DIR)
cp -r $(ROOT_DIR)utils $(UPLOAD_DIR)/
cp -r $(ROOT_DIR)src/helpers $(UPLOAD_DIR)/
cp -r $(ROOT_DIR)src/sources $(UPLOAD_DIR)/
cp $(ROOT_DIR)src/_fb_types.py $(UPLOAD_DIR)/
cp $(ROOT_DIR)src/_schemas.py $(UPLOAD_DIR)/
cp -r $(ROOT_DIR)src/helpers $(UPLOAD_DIR)/helpers
cp -r $(ROOT_DIR)src/sources $(UPLOAD_DIR)/sources
mkdir -p $(UPLOAD_DIR)/orchestration
cp $(ROOT_DIR)src/orchestration/__init__.py $(UPLOAD_DIR)/orchestration/
cp $(ROOT_DIR)src/orchestration/_io.py $(UPLOAD_DIR)/orchestration/
cp $(ROOT_DIR)src/orchestration/_source_io.py $(UPLOAD_DIR)/orchestration/
cp $(ROOT_DIR)src/_fb_types.py $(UPLOAD_DIR)/
cp $(ROOT_DIR)src/_schemas.py $(UPLOAD_DIR)/
cp $^ $(UPLOAD_DIR)/
gcloud run jobs deploy \
func-data-yfinance-fetch \
Expand All @@ -37,4 +41,4 @@ deploy : main.py .gcloudignore requirements.txt Procfile
--source $(UPLOAD_DIR)

clean :
rm -rf $(UPLOAD_DIR) .gcloudignore Procfile
rm -rf $(UPLOAD_DIR) .gcloudignore Dockerfile
32 changes: 32 additions & 0 deletions src/orchestration/func_yfinance_fetch/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Yfinance fetch entry point."""

from __future__ import annotations

import logging
from typing import Any

from helpers import data_utils, decorator
from orchestration import _source_io
from sources.yfinance import YfinanceSource

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

SOURCE = "yfinance"


@decorator.log_runtime
def driver(_: Any) -> None:
"""Fetch Yahoo Finance stock data and upload to question bank."""
source = YfinanceSource()

dfq = data_utils.get_data_from_cloud_storage(SOURCE, return_question_data=True)

dff = source.fetch(dfq=dfq)

_source_io.write_fetch_output(SOURCE, dff)
logger.info("Done.")


if __name__ == "__main__":
driver(None)
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
google-cloud-storage
google-cloud-secret-manager
pandas>=2.2.2,<3.0
yfinance
pandera
termcolor
requests
numpy
yfinance
beautifulsoup4
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,23 @@ UPLOAD_DIR = upload
.gcloudignore:
cp -r $(ROOT_DIR)src/helpers/.gcloudignore .

Procfile:
cp -r $(ROOT_DIR)src/helpers/Procfile .
Dockerfile: $(ROOT_DIR)src/helpers/Dockerfile.template
sed \
-e 's/REGION/$(CLOUD_DEPLOY_REGION)/g' \
-e 's/STACK/google-22-full/g' \
-e 's/PYTHON_VERSION/python312/g' \
$< > Dockerfile

deploy : main.py .gcloudignore requirements.txt Procfile
deploy : main.py .gcloudignore requirements.txt Dockerfile
mkdir -p $(UPLOAD_DIR)
cp -r $(ROOT_DIR)utils $(UPLOAD_DIR)/
cp -r $(ROOT_DIR)src/helpers $(UPLOAD_DIR)/
cp -r $(ROOT_DIR)src/sources $(UPLOAD_DIR)/
cp $(ROOT_DIR)src/_fb_types.py $(UPLOAD_DIR)/
cp $(ROOT_DIR)src/_schemas.py $(UPLOAD_DIR)/
cp -r $(ROOT_DIR)src/helpers $(UPLOAD_DIR)/helpers
cp -r $(ROOT_DIR)src/sources $(UPLOAD_DIR)/sources
mkdir -p $(UPLOAD_DIR)/orchestration
cp $(ROOT_DIR)src/orchestration/__init__.py $(UPLOAD_DIR)/orchestration/
cp $(ROOT_DIR)src/orchestration/_io.py $(UPLOAD_DIR)/orchestration/
cp $(ROOT_DIR)src/orchestration/_source_io.py $(UPLOAD_DIR)/orchestration/
cp $(ROOT_DIR)src/_fb_types.py $(UPLOAD_DIR)/
cp $(ROOT_DIR)src/_schemas.py $(UPLOAD_DIR)/
cp $^ $(UPLOAD_DIR)/
gcloud run jobs deploy \
func-data-yfinance-update-questions \
Expand All @@ -37,4 +41,4 @@ deploy : main.py .gcloudignore requirements.txt Procfile
--source $(UPLOAD_DIR)

clean :
rm -rf $(UPLOAD_DIR) .gcloudignore Procfile
rm -rf $(UPLOAD_DIR) .gcloudignore Dockerfile
53 changes: 53 additions & 0 deletions src/orchestration/func_yfinance_update/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""Yfinance update entry point."""

from __future__ import annotations

import logging
import os
from typing import Any

from helpers import data_utils, decorator
from orchestration import _source_io
from sources.yfinance import YfinanceSource

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

SOURCE = "yfinance"


@decorator.log_runtime
def driver(_: Any) -> None:
"""Update Yahoo Finance questions and resolution files."""
overwrite_price_history = os.environ.get("OVERWRITE_PRICE_HISTORY", "").lower() in ("1", "true")
if overwrite_price_history:
logger.info("OVERWRITE_PRICE_HISTORY is set. Re-fetching all resolution data.")

source = YfinanceSource()

dfq, dff = data_utils.get_data_from_cloud_storage(
SOURCE, return_question_data=True, return_fetch_data=True
)

# Load existing resolution files for fetched tickers plus the renamed-ticker originals, whose
# files are rebuilt from their replacement symbols inside update().
rename_originals = [entry["original_ticker"] for entry in source.ticker_renames]
ids_to_load = sorted(set(dff["id"].astype(str)) | set(rename_originals))
existing_resolution_files = _source_io.load_existing_resolution_files(SOURCE, ids=ids_to_load)

result = source.update(
dfq,
dff,
existing_resolution_files=existing_resolution_files,
overwrite_price_history=overwrite_price_history,
)

logger.info("Uploading to GCP...")
data_utils.upload_questions(result.dfq, SOURCE)
if result.resolution_files:
_source_io.upload_resolution_files(SOURCE, result.resolution_files)
logger.info("Done.")


if __name__ == "__main__":
driver(None)
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
google-cloud-storage
google-cloud-secret-manager
yfinance
pandas>=2.2.2,<3.0
tqdm
requests
bs4
pandera
termcolor
requests
numpy
yfinance
beautifulsoup4
Loading
Loading