diff --git a/Makefile b/Makefile index b65e2f67..35c9b03e 100644 --- a/Makefile +++ b/Makefile @@ -159,10 +159,10 @@ acled-update-questions: yfinance: yfinance-fetch yfinance-update-questions yfinance-fetch: - $(MAKE) -C src/questions/yfinance/fetch || echo "* $@" >> $(MAKE_FAILURE_LOG) + $(MAKE) -C src/orchestration/func_yfinance_fetch || echo "* $@" >> $(MAKE_FAILURE_LOG) yfinance-update-questions: - $(MAKE) -C src/questions/yfinance/update_questions || echo "* $@" >> $(MAKE_FAILURE_LOG) + $(MAKE) -C src/orchestration/func_yfinance_update || echo "* $@" >> $(MAKE_FAILURE_LOG) polymarket: polymarket-fetch polymarket-update-questions diff --git a/src/_schemas.py b/src/_schemas.py index 76495e69..e8ac7064 100644 --- a/src/_schemas.py +++ b/src/_schemas.py @@ -100,6 +100,13 @@ class PolymarketFetchFrame(QuestionFrame): historical_prices: Series[object] # list[dict] with {date, value} per question +class YfinanceFetchFrame(QuestionFrame): + """Output of YfinanceSource.fetch(). QuestionFrame plus transient fields for update().""" + + fetch_datetime: Series[str] + probability: Series[object] = pa.Field(nullable=True) + + class ManifoldFetchFrame(pa.DataFrameModel): """Output of ManifoldSource.fetch(). Just market IDs from search-markets endpoint.""" diff --git a/src/orchestration/_source_io.py b/src/orchestration/_source_io.py index 10a7d436..a28a6a7f 100644 --- a/src/orchestration/_source_io.py +++ b/src/orchestration/_source_io.py @@ -34,6 +34,24 @@ def write_fetch_output(source: str, dff: pd.DataFrame) -> None: ) +def list_existing_resolution_ids(source: str) -> set[str]: + """Return the set of bare question IDs that already have a resolution file. + + Lists /*.jsonl and strips to bare IDs. This is a pure existence + listing: it includes files whose contents are empty, mirroring the old + inline list_with_prefix check. Do NOT derive this from + load_existing_resolution_files, which drops empty frames. + + Args: + source (str): Source name (e.g. "infer"). + + Returns: + set of bare question IDs present in storage for this source. + """ + paths = gcp.storage.list_with_prefix(bucket_name=env.QUESTION_BANK_BUCKET, prefix=f"{source}/") + return {os.path.basename(p).removesuffix(".jsonl") for p in paths if p.endswith(".jsonl")} + + def load_existing_resolution_files( source: str, ids: Iterable[str] | None = None, @@ -52,12 +70,7 @@ def load_existing_resolution_files( dict mapping question_id to its resolution DataFrame. """ if ids is None: - paths = gcp.storage.list_with_prefix( - bucket_name=env.QUESTION_BANK_BUCKET, prefix=f"{source}/" - ) - question_ids = [ - os.path.basename(p).removesuffix(".jsonl") for p in paths if p.endswith(".jsonl") - ] + question_ids = list(list_existing_resolution_ids(source)) else: question_ids = [str(qid) for qid in ids] diff --git a/src/orchestration/func_infer_fetch/main.py b/src/orchestration/func_infer_fetch/main.py index f39baf1e..968feb87 100644 --- a/src/orchestration/func_infer_fetch/main.py +++ b/src/orchestration/func_infer_fetch/main.py @@ -5,10 +5,9 @@ import logging from typing import Any -from helpers import data_utils, decorator, env, keys +from helpers import data_utils, decorator, keys from orchestration import _source_io from sources.infer import InferSource -from utils import gcp logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -23,11 +22,9 @@ def driver(_: Any) -> None: source.api_key = keys.API_KEY_INFER dfq = data_utils.get_data_from_cloud_storage(SOURCE, return_question_data=True) - files_in_storage = gcp.storage.list_with_prefix( - bucket_name=env.QUESTION_BANK_BUCKET, prefix=SOURCE - ) + existing_resolution_ids = _source_io.list_existing_resolution_ids(SOURCE) - dff = source.fetch(dfq=dfq, files_in_storage=files_in_storage) + dff = source.fetch(dfq=dfq, existing_resolution_ids=existing_resolution_ids) _source_io.write_fetch_output(SOURCE, dff) logger.info("Done.") diff --git a/src/orchestration/func_manifold_update/main.py b/src/orchestration/func_manifold_update/main.py index a152a0dc..8279dc19 100644 --- a/src/orchestration/func_manifold_update/main.py +++ b/src/orchestration/func_manifold_update/main.py @@ -5,10 +5,9 @@ import logging from typing import Any -from helpers import data_utils, decorator, env +from helpers import data_utils, decorator from orchestration import _source_io from sources.manifold import ManifoldSource -from utils import gcp logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -32,15 +31,13 @@ def driver(_: Any) -> None: existing_resolution_files = _source_io.load_existing_resolution_files(SOURCE) logger.info(f"Loaded {len(existing_resolution_files)} resolution files") - files_in_storage = gcp.storage.list_with_prefix( - bucket_name=env.QUESTION_BANK_BUCKET, prefix=SOURCE - ) + existing_resolution_ids = _source_io.list_existing_resolution_ids(SOURCE) result = source.update( dfq, dff, existing_resolution_files=existing_resolution_files, - files_in_storage=files_in_storage, + existing_resolution_ids=existing_resolution_ids, ) logger.info("Uploading to GCP...") diff --git a/src/orchestration/func_metaculus_update/main.py b/src/orchestration/func_metaculus_update/main.py index b013829a..63baa0e3 100644 --- a/src/orchestration/func_metaculus_update/main.py +++ b/src/orchestration/func_metaculus_update/main.py @@ -5,10 +5,9 @@ import logging from typing import Any -from helpers import data_utils, decorator, env, keys +from helpers import data_utils, decorator, keys from orchestration import _source_io from sources.metaculus import MetaculusSource -from utils import gcp logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -26,14 +25,12 @@ def driver(_: Any) -> None: SOURCE, return_question_data=True, return_fetch_data=True ) - files_in_storage = gcp.storage.list_with_prefix( - bucket_name=env.QUESTION_BANK_BUCKET, prefix=SOURCE - ) + existing_resolution_ids = _source_io.list_existing_resolution_ids(SOURCE) result = source.update( dfq, dff, - files_in_storage=files_in_storage, + existing_resolution_ids=existing_resolution_ids, ) logger.info("Uploading to GCP...") diff --git a/src/orchestration/func_resolve/requirements.txt b/src/orchestration/func_resolve/requirements.txt index 4c503990..24f56996 100644 --- a/src/orchestration/func_resolve/requirements.txt +++ b/src/orchestration/func_resolve/requirements.txt @@ -10,4 +10,6 @@ slack_sdk pandera pytz python-dateutil -backoff \ No newline at end of file +backoff +beautifulsoup4 +yfinance \ No newline at end of file diff --git a/src/questions/yfinance/fetch/Makefile b/src/orchestration/func_yfinance_fetch/Makefile similarity index 62% rename from src/questions/yfinance/fetch/Makefile rename to src/orchestration/func_yfinance_fetch/Makefile index d4a68038..002c84bb 100644 --- a/src/questions/yfinance/fetch/Makefile +++ b/src/orchestration/func_yfinance_fetch/Makefile @@ -9,19 +9,23 @@ UPLOAD_DIR = upload .gcloudignore: cp -r $(ROOT_DIR)src/helpers/.gcloudignore . -Procfile: - cp -r $(ROOT_DIR)src/helpers/Procfile . +Dockerfile: $(ROOT_DIR)src/helpers/Dockerfile.template + sed \ + -e 's/REGION/$(CLOUD_DEPLOY_REGION)/g' \ + -e 's/STACK/google-22-full/g' \ + -e 's/PYTHON_VERSION/python312/g' \ + $< > Dockerfile -deploy : main.py .gcloudignore requirements.txt Procfile +deploy : main.py .gcloudignore requirements.txt Dockerfile mkdir -p $(UPLOAD_DIR) cp -r $(ROOT_DIR)utils $(UPLOAD_DIR)/ - cp -r $(ROOT_DIR)src/helpers $(UPLOAD_DIR)/ - cp -r $(ROOT_DIR)src/sources $(UPLOAD_DIR)/ - cp $(ROOT_DIR)src/_fb_types.py $(UPLOAD_DIR)/ - cp $(ROOT_DIR)src/_schemas.py $(UPLOAD_DIR)/ + cp -r $(ROOT_DIR)src/helpers $(UPLOAD_DIR)/helpers + cp -r $(ROOT_DIR)src/sources $(UPLOAD_DIR)/sources mkdir -p $(UPLOAD_DIR)/orchestration cp $(ROOT_DIR)src/orchestration/__init__.py $(UPLOAD_DIR)/orchestration/ - cp $(ROOT_DIR)src/orchestration/_io.py $(UPLOAD_DIR)/orchestration/ + cp $(ROOT_DIR)src/orchestration/_source_io.py $(UPLOAD_DIR)/orchestration/ + cp $(ROOT_DIR)src/_fb_types.py $(UPLOAD_DIR)/ + cp $(ROOT_DIR)src/_schemas.py $(UPLOAD_DIR)/ cp $^ $(UPLOAD_DIR)/ gcloud run jobs deploy \ func-data-yfinance-fetch \ @@ -37,4 +41,4 @@ deploy : main.py .gcloudignore requirements.txt Procfile --source $(UPLOAD_DIR) clean : - rm -rf $(UPLOAD_DIR) .gcloudignore Procfile + rm -rf $(UPLOAD_DIR) .gcloudignore Dockerfile diff --git a/src/orchestration/func_yfinance_fetch/main.py b/src/orchestration/func_yfinance_fetch/main.py new file mode 100644 index 00000000..9e473a6c --- /dev/null +++ b/src/orchestration/func_yfinance_fetch/main.py @@ -0,0 +1,32 @@ +"""Yfinance fetch entry point.""" + +from __future__ import annotations + +import logging +from typing import Any + +from helpers import data_utils, decorator +from orchestration import _source_io +from sources.yfinance import YfinanceSource + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +SOURCE = "yfinance" + + +@decorator.log_runtime +def driver(_: Any) -> None: + """Fetch Yahoo Finance stock data and upload to question bank.""" + source = YfinanceSource() + + dfq = data_utils.get_data_from_cloud_storage(SOURCE, return_question_data=True) + + dff = source.fetch(dfq=dfq) + + _source_io.write_fetch_output(SOURCE, dff) + logger.info("Done.") + + +if __name__ == "__main__": + driver(None) diff --git a/src/questions/yfinance/update_questions/requirements.txt b/src/orchestration/func_yfinance_fetch/requirements.txt similarity index 60% rename from src/questions/yfinance/update_questions/requirements.txt rename to src/orchestration/func_yfinance_fetch/requirements.txt index ab0d59a5..a52665cf 100644 --- a/src/questions/yfinance/update_questions/requirements.txt +++ b/src/orchestration/func_yfinance_fetch/requirements.txt @@ -1,6 +1,7 @@ google-cloud-storage -google-cloud-secret-manager pandas>=2.2.2,<3.0 -yfinance pandera -termcolor +requests +numpy +yfinance +beautifulsoup4 diff --git a/src/questions/yfinance/update_questions/Makefile b/src/orchestration/func_yfinance_update/Makefile similarity index 62% rename from src/questions/yfinance/update_questions/Makefile rename to src/orchestration/func_yfinance_update/Makefile index e5a9ceab..49712c83 100644 --- a/src/questions/yfinance/update_questions/Makefile +++ b/src/orchestration/func_yfinance_update/Makefile @@ -9,19 +9,23 @@ UPLOAD_DIR = upload .gcloudignore: cp -r $(ROOT_DIR)src/helpers/.gcloudignore . -Procfile: - cp -r $(ROOT_DIR)src/helpers/Procfile . +Dockerfile: $(ROOT_DIR)src/helpers/Dockerfile.template + sed \ + -e 's/REGION/$(CLOUD_DEPLOY_REGION)/g' \ + -e 's/STACK/google-22-full/g' \ + -e 's/PYTHON_VERSION/python312/g' \ + $< > Dockerfile -deploy : main.py .gcloudignore requirements.txt Procfile +deploy : main.py .gcloudignore requirements.txt Dockerfile mkdir -p $(UPLOAD_DIR) cp -r $(ROOT_DIR)utils $(UPLOAD_DIR)/ - cp -r $(ROOT_DIR)src/helpers $(UPLOAD_DIR)/ - cp -r $(ROOT_DIR)src/sources $(UPLOAD_DIR)/ - cp $(ROOT_DIR)src/_fb_types.py $(UPLOAD_DIR)/ - cp $(ROOT_DIR)src/_schemas.py $(UPLOAD_DIR)/ + cp -r $(ROOT_DIR)src/helpers $(UPLOAD_DIR)/helpers + cp -r $(ROOT_DIR)src/sources $(UPLOAD_DIR)/sources mkdir -p $(UPLOAD_DIR)/orchestration cp $(ROOT_DIR)src/orchestration/__init__.py $(UPLOAD_DIR)/orchestration/ - cp $(ROOT_DIR)src/orchestration/_io.py $(UPLOAD_DIR)/orchestration/ + cp $(ROOT_DIR)src/orchestration/_source_io.py $(UPLOAD_DIR)/orchestration/ + cp $(ROOT_DIR)src/_fb_types.py $(UPLOAD_DIR)/ + cp $(ROOT_DIR)src/_schemas.py $(UPLOAD_DIR)/ cp $^ $(UPLOAD_DIR)/ gcloud run jobs deploy \ func-data-yfinance-update-questions \ @@ -37,4 +41,4 @@ deploy : main.py .gcloudignore requirements.txt Procfile --source $(UPLOAD_DIR) clean : - rm -rf $(UPLOAD_DIR) .gcloudignore Procfile + rm -rf $(UPLOAD_DIR) .gcloudignore Dockerfile diff --git a/src/orchestration/func_yfinance_update/main.py b/src/orchestration/func_yfinance_update/main.py new file mode 100644 index 00000000..f49bee99 --- /dev/null +++ b/src/orchestration/func_yfinance_update/main.py @@ -0,0 +1,53 @@ +"""Yfinance update entry point.""" + +from __future__ import annotations + +import logging +import os +from typing import Any + +from helpers import data_utils, decorator +from orchestration import _source_io +from sources.yfinance import YfinanceSource + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +SOURCE = "yfinance" + + +@decorator.log_runtime +def driver(_: Any) -> None: + """Update Yahoo Finance questions and resolution files.""" + overwrite_price_history = os.environ.get("OVERWRITE_PRICE_HISTORY", "").lower() in ("1", "true") + if overwrite_price_history: + logger.info("OVERWRITE_PRICE_HISTORY is set. Re-fetching all resolution data.") + + source = YfinanceSource() + + dfq, dff = data_utils.get_data_from_cloud_storage( + SOURCE, return_question_data=True, return_fetch_data=True + ) + + # Load existing resolution files for fetched tickers plus the renamed-ticker originals, whose + # files are rebuilt from their replacement symbols inside update(). + rename_originals = [entry["original_ticker"] for entry in source.ticker_renames] + ids_to_load = sorted(set(dff["id"].astype(str)) | set(rename_originals)) + existing_resolution_files = _source_io.load_existing_resolution_files(SOURCE, ids=ids_to_load) + + result = source.update( + dfq, + dff, + existing_resolution_files=existing_resolution_files, + overwrite_price_history=overwrite_price_history, + ) + + logger.info("Uploading to GCP...") + data_utils.upload_questions(result.dfq, SOURCE) + if result.resolution_files: + _source_io.upload_resolution_files(SOURCE, result.resolution_files) + logger.info("Done.") + + +if __name__ == "__main__": + driver(None) diff --git a/src/questions/yfinance/fetch/requirements.txt b/src/orchestration/func_yfinance_update/requirements.txt similarity index 58% rename from src/questions/yfinance/fetch/requirements.txt rename to src/orchestration/func_yfinance_update/requirements.txt index 85bd11bb..a52665cf 100644 --- a/src/questions/yfinance/fetch/requirements.txt +++ b/src/orchestration/func_yfinance_update/requirements.txt @@ -1,9 +1,7 @@ google-cloud-storage -google-cloud-secret-manager -yfinance pandas>=2.2.2,<3.0 -tqdm -requests -bs4 pandera -termcolor +requests +numpy +yfinance +beautifulsoup4 diff --git a/src/questions/yfinance/fetch/main.py b/src/questions/yfinance/fetch/main.py deleted file mode 100644 index dfe5aa77..00000000 --- a/src/questions/yfinance/fetch/main.py +++ /dev/null @@ -1,188 +0,0 @@ -"""Yfinance fetch new questions script.""" - -import json -import logging -import os -import sys -import time -from datetime import timedelta - -import pandas as pd -import requests -import yfinance as yf -from bs4 import BeautifulSoup -from tqdm import tqdm - -sys.path.append(os.path.join(os.path.dirname(__file__), "../../..")) -from helpers import constants, data_utils, dates, decorator, env # noqa: E402 - -sys.path.append(os.path.join(os.path.dirname(__file__), "../../../..")) -from utils import gcp # noqa: E402 - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -SOURCE = "yfinance" - - -def get_sp500_tickers(): - """ - Retrieve the list of S&P 500 index constituent tickers. - - Access the S&P 500 Wikipedia page and parse the HTML content to extract - the tickers of the constituents. Return a list of the tickers found - in the designated table on the page. - - Returns: - list of str: A list containing the tickers of the S&P 500 index constituents. - """ - try: - url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies" - headers = {"User-Agent": constants.BENCHMARK_USER_AGENT} - response = requests.get(url, headers=headers) - response.raise_for_status() - soup = BeautifulSoup(response.content, "html.parser") - table = soup.find("table", {"id": "constituents"}) - tickers = [ - row.find_all("td")[0].text.strip() for row in table.find_all("tr")[1:] - ] # Skip header row - logger.info(f"Retrieved S&P 500 stock tickers: {len(tickers)} tickers") - return tickers - except Exception as e: - logger.error(f"Failed to retrieve stock tickers due to: {e}") - return [] # Return an empty list if there's any error - - -def fetch_one_stock(ticker): - """ - Fetch the company name and the latest historical stock data for a given stock ticker. - - This function attempts to retrieve the company name and its most recent stock data for - the specified ticker using the yfinance library. It returns the company name and a - pandas DataFrame containing the historical stock data. If the ticker is invalid or - data retrieval fails, it returns None for both. - - Parameters: - - ticker (str): The stock ticker for which to fetch the data. - - Returns: - - tuple: A tuple containing the company name (str) and the historical stock data - (pandas.DataFrame) for the given ticker. Returns (None, None) if unable to fetch data. - """ - try: - ticker = yf.Ticker(ticker) - company_name = ticker.info["longName"] - hist = ticker.history(period="5d", auto_adjust=False).reset_index() - yesterday = dates.get_date_today() - timedelta(days=1) - hist["Date"] = pd.to_datetime(hist["Date"]) - hist = hist[hist["Date"].dt.date <= yesterday].tail(1) - return company_name, hist - - except Exception: - return None, None - - -def fetch_all_stock(dfq): - """ - Fetch and compile stock information for a list of stock tickers. - - Iterates over a provided list of stock tickers, fetching the company name and the most - recent stock data for each. Constructs a detailed output string for each stock ticker, - along with additional metadata, and appends it to a list. This list is returned at the - end of the function. If data for a specific stock cannot be fetched, it is skipped. - - Parameters: - - dfq (list of dict): A list of dicts for all stocks information in questions.jsonl. - - Returns: - - pd.DataFrame: Stock data including delisted stocks marked with resolved=True. - """ - stock_list = [] - - top_500_stocks = get_sp500_tickers() - current_stocks = dfq["id"].unique() if "id" in dfq.columns else [] - set_top_500 = set(top_500_stocks) - set_current = set(current_stocks) - union_stocks_list = list(set_top_500.union(set_current)) - logger.info( - f"Stock tickers not in top 500 but in current stocks: {set_current.difference(set_top_500)}" - ) - - for ticker in tqdm(union_stocks_list, desc="Fetching stock data"): - # Avoid YFRateLimitError - time.sleep(1) - company_name, hist = fetch_one_stock(ticker) - current_time = dates.get_datetime_now() - - if company_name and not hist.empty: - current_price = round(hist["Close"].iloc[-1], 2) - background = yf.Ticker(ticker).info.get("longBusinessSummary", "N/A") - - stock_list.append( - { - "id": ticker, - "question": ( - f"Will {ticker}'s market close price on " - "{resolution_date} be higher than its market close price on " - "{forecast_due_date}?\n\n" - "Stock splits and reverse splits will be accounted for in resolving this " - "question. Forecasts on questions about companies that have been delisted " - "(through mergers or bankruptcy) will resolve to their final close price." - ), - "background": background, - "market_info_resolution_criteria": "N/A", - "market_info_open_datetime": "N/A", - "market_info_close_datetime": "N/A", - "url": f"https://finance.yahoo.com/quote/{ticker}", - "resolved": False, - "market_info_resolution_datetime": "N/A", - "fetch_datetime": current_time, - "probability": current_price, - "forecast_horizons": constants.FORECAST_HORIZONS_IN_DAYS, - "freeze_datetime_value": current_price, - "freeze_datetime_value_explanation": f"The latest market close price of {ticker}.", - } - ) - - logger.info(company_name) - elif company_name is None and ticker in set_current and ticker not in set_top_500: - existing = dfq[dfq["id"] == ticker].iloc[0].to_dict() - existing.update( - { - "resolved": True, - "fetch_datetime": current_time, - "probability": float("nan"), - "freeze_datetime_value": "N/A", - } - ) - stock_list.append(existing) - logger.warning(f"{ticker} detected as delisted (not in S&P 500 and fetch failed)") - - return pd.DataFrame(stock_list) - - -@decorator.log_runtime -def driver(_): - """Fetch all stock and then upload to gcp.""" - dfq = data_utils.get_data_from_cloud_storage(SOURCE, return_question_data=True) - - all_stock = fetch_all_stock(dfq) - filenames = data_utils.generate_filenames(SOURCE) - - # Save and upload - with open(filenames["local_fetch"], "w", encoding="utf-8") as f: - # can't use `dfq.to_json` because we don't want escape chars - for record in all_stock.to_dict("records"): - json_str = json.dumps(record, ensure_ascii=False) - f.write(json_str + "\n") - - logger.info("Uploading to GCP...") - gcp.storage.upload( - bucket_name=env.QUESTION_BANK_BUCKET, - local_filename=filenames["local_fetch"], - ) - logger.info("Done.") - - -if __name__ == "__main__": - driver(None) diff --git a/src/questions/yfinance/update_questions/main.py b/src/questions/yfinance/update_questions/main.py deleted file mode 100644 index 9d953eb6..00000000 --- a/src/questions/yfinance/update_questions/main.py +++ /dev/null @@ -1,340 +0,0 @@ -"""Yfinance update question script.""" - -import logging -import os -import sys -from datetime import timedelta - -import pandas as pd -import yfinance as yf - -sys.path.append(os.path.join(os.path.dirname(__file__), "../../..")) # noqa: E402 -from helpers import constants, data_utils, dates, decorator, env # noqa: E402 - -sys.path.append(os.path.join(os.path.dirname(__file__), "../../../..")) -from sources._metadata import SOURCE_METADATA # noqa: E402 -from utils import gcp # noqa: E402 - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -SOURCE = "yfinance" -TICKER_RENAMES = SOURCE_METADATA["yfinance"]["ticker_renames"] - - -def select_time_range(days_difference): - """ - Select the appropriate time range based on days_difference. - - Possible time ranges in: - ['1d', '5d', '1mo', '3mo', '6mo', '1y', '2y', '5y', '10y', 'ytd', 'max'] - """ - if days_difference <= 1: - return "1d" - elif days_difference <= 5: - return "5d" - elif days_difference <= 30: - return "1mo" - elif days_difference <= 90: - return "3mo" - elif days_difference <= 180: - return "6mo" - elif days_difference <= 365: - return "1y" - elif days_difference <= 365 * 2: - return "2y" - elif days_difference <= 365 * 5: - return "5y" - elif days_difference <= 365 * 10: - return "10y" - else: - return "max" - - -def fetch_one_stock(ticker, period): - """ - Fetch historical stock price data for a given ticker. - - Retrieve the closing prices of a stock from the Yahoo Finance API. The function handles any - exceptions during the data retrieval process and returns an empty DataFrame if the data fetch - fails. - - Parameters: - - ticker (str): The stock symbol for which to retrieve price data. - - period (str): One of: '1d', '5d', '1mo', '3mo', '6mo', '1y', '2y', '5y', '10y', 'ytd', 'max' - - Returns: - - DataFrame: A pandas DataFrame containing the historical closing prices of the stock with - columns 'date' and 'value', where 'date' is the date of the closing price and - 'value' is the closing price itself. If the data fetch fails, returns an empty DataFrame. - """ - try: - ticker = yf.Ticker(ticker) - hist = ticker.history(period=period, auto_adjust=False) - return hist[["Close"]].reset_index().rename(columns={"Date": "date", "Close": "value"}) - except Exception as e: - logger.error(f"Failed to fetch data for {ticker}: {e}") - return pd.DataFrame() - - -def get_historical_prices(current_df, ticker, period): - """ - Update a DataFrame with the latest historical stock prices for a given ticker. - - Determine the period to fetch based on the most recent date in the provided DataFrame and fill - any missing days, including weekends and holidays, by carrying forward the last available price. - If the input DataFrame is empty, initialize it and fetch data for at least the last day. The - resulting DataFrame includes a complete series of dates and corresponding stock prices, - indexed daily from the earliest date available in the input or from the last day if the DataFrame - was initially empty. - - Parameters: - - current_df (DataFrame): A pandas DataFrame containing columns ['id', 'date', 'value'], - where 'id' is the stock ticker, 'date' is the date, and 'value' is the stock price. - - ticker (str): The stock symbol for which to retrieve and update prices. - - period (str): One of: '1d', '5d', '1mo', '3mo', '6mo', '1y', '2y', '5y', '10y', 'ytd', 'max' - - Returns: - - DataFrame: A pandas DataFrame updated with daily historical stock prices from the most recent - date in 'current_df' to the present day. The DataFrame is sorted by 'date' and includes - the columns ['id', 'date', 'value']. - """ - if current_df.empty: - current_df = pd.DataFrame(columns=constants.RESOLUTION_FILE_COLUMNS) - - df = fetch_one_stock(ticker, period) - if df.empty: - return current_df - - yesterday = dates.get_date_today() - timedelta(days=1) - df["date"] = pd.to_datetime(df["date"]).dt.date - df = df[ - (df["date"] >= constants.QUESTION_BANK_DATA_STORAGE_START_DATE) & (df["date"] <= yesterday) - ] - - # forward fill for weekends - full_date_range = pd.date_range(start=df["date"].min(), end=yesterday) - df = df.set_index("date").reindex(full_date_range).ffill().rename_axis("date").reset_index() - df["id"] = ticker - return df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) - - -def create_resolution_file(question, period, force=False): - """ - Create or update a resolution file based on the question ID provided. Download the existing file, if any. - - Check the last entry date, and update with new data if there's no entry for today. Upload the updated file - back to the specified Google Cloud Platform bucket. - Parameters: - - question (dict): A dictionary containing at least the 'id' of the question. - - period (str): One of: '1d', '5d', '1mo', '3mo', '6mo', '1y', '2y', '5y', '10y', 'ytd', 'max' - - force (bool): If True, re-fetch data even if the resolution file is already up-to-date. - - Returns: - - DataFrame: Return the current state of the resolution file as a DataFrame if no update is needed. - If an update occurs, the function returns None after uploading the updated file. - """ - basename = f"{question['id']}.jsonl" - remote_filename = f"{SOURCE}/{basename}" - local_filename = "/tmp/tmp.jsonl" - - if os.path.exists(local_filename): - os.remove(local_filename) - gcp.storage.download_no_error_message_on_404( - bucket_name=env.QUESTION_BANK_BUCKET, - filename=remote_filename, - local_filename=local_filename, - ) - if os.path.exists(local_filename): - df = pd.read_json( - local_filename, - lines=True, - dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE, - convert_dates=False, - ) - else: - df = pd.DataFrame( - { - col: pd.Series( - dtype=( - constants.RESOLUTION_FILE_COLUMN_DTYPE[col] - if col in constants.RESOLUTION_FILE_COLUMN_DTYPE - else "object" - ) - ) - for col in constants.RESOLUTION_FILE_COLUMNS - } - ) - - is_resolved = question.get("resolved", False) - - yesterday = dates.get_date_today() - timedelta(days=1) - if ( - not force - and not is_resolved - and not df.empty - and pd.to_datetime(df["date"].iloc[-1]).date() >= yesterday - ): - logger.info(f"{question['id']} is skipped because it's already up-to-date!") - # Check last date to see if we've already gotten the resolution value for today - # If we have it already, return to avoid unnecessary API calls - return - - df_new = get_historical_prices(df, question["id"], period) - - if is_resolved: - df_new = finalize_resolution_file(df_new) - - if not df.equals(df_new): - # Only upload dataframes that changed. - logger.info(f"Uploading resolution file for {question['id']}") - df_new.to_json(local_filename, orient="records", lines=True, date_format="iso") - gcp.storage.upload( - bucket_name=env.QUESTION_BANK_BUCKET, - local_filename=local_filename, - filename=remote_filename, - ) - - -def update_questions(dfq, dff, overwrite_price_history=False): - """ - Update the dataframes with new or modified question data and new community predictions. - - Parameters: - - dfq (pd.DataFrame): DataFrame containing all existing questions. - - dff (pd.DataFrame): DataFrame containing all newly fetched questions. - - overwrite_price_history (bool): If True, re-fetch all resolution data even if up-to-date. - - The function updates dfq by either replacing existing questions with new data or adding new questions. - It also appends new community predictions to dfr for each question in all_questions_to_add. - """ - dff_list = dff.to_dict("records") - day_diff = (dates.get_date_today() - constants.QUESTION_BANK_DATA_STORAGE_START_DATE).days - period = select_time_range(day_diff) - - renamed_tickers = {e["original_ticker"] for e in TICKER_RENAMES} - - for question in dff_list: - if question["id"] in renamed_tickers: - logger.info(f"Skipping {question['id']} (renamed ticker, handled separately)") - else: - create_resolution_file(question, period, force=overwrite_price_history) - - del question["fetch_datetime"] - del question["probability"] - - # Check if the question exists in dfq - if question["id"] in dfq["id"].values: - # Case 1: Update existing question - dfq_index = dfq.index[dfq["id"] == question["id"]].tolist()[0] - for key, value in question.items(): - dfq.at[dfq_index, key] = value - else: - # Case 2: Append new question - new_q_row = pd.DataFrame([question]) - new_q_row = new_q_row.astype(constants.QUESTION_FILE_COLUMN_DTYPE) - dfq = pd.concat([dfq, new_q_row], ignore_index=True) - - create_renamed_ticker_resolution_files(period) - - return dfq - - -def finalize_resolution_file(df): - """Forward-fill a resolution file to yesterday. - - Args: - df (pd.DataFrame): Resolution data with columns [id, date, value]. - - Returns: - pd.DataFrame: Forward-filled to yesterday. - """ - if df.empty: - return df - - end_date = dates.get_date_today() - timedelta(days=1) - - df = df.copy() - ticker_id = df["id"].iloc[0] - df["date"] = pd.to_datetime(df["date"]) - df = df.set_index("date") - - full_range = pd.date_range(start=df.index.min(), end=end_date) - df = df.reindex(full_range).ffill().rename_axis("date").reset_index() - df["id"] = ticker_id - - return df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) - - -def create_renamed_ticker_resolution_files(period): - """Fetch resolution data for renamed tickers using the replacement ticker. - - For each entry in TICKER_RENAMES, fetch price history under the replacement - ticker and write it to a resolution file named after the original ticker. - - Args: - period (str): yfinance period string for history lookback. - """ - local_filename = "/tmp/tmp.jsonl" - - for entry in TICKER_RENAMES: - original = entry["original_ticker"] - replacement = entry["replacement_ticker"] - remote_filename = f"{SOURCE}/{original}.jsonl" - - if os.path.exists(local_filename): - os.remove(local_filename) - gcp.storage.download_no_error_message_on_404( - bucket_name=env.QUESTION_BANK_BUCKET, - filename=remote_filename, - local_filename=local_filename, - ) - if os.path.exists(local_filename): - current_df = pd.read_json( - local_filename, - lines=True, - dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE, - convert_dates=False, - ) - else: - current_df = pd.DataFrame(columns=constants.RESOLUTION_FILE_COLUMNS) - - df_new = get_historical_prices(current_df, replacement, period) - if df_new.empty: - logger.warning(f"No data for replacement ticker {replacement} (original: {original})") - continue - - df_new["id"] = original - - if not current_df.equals(df_new): - logger.info(f"Uploading resolution file for {original} (via {replacement})") - df_new.to_json(local_filename, orient="records", lines=True, date_format="iso") - gcp.storage.upload( - bucket_name=env.QUESTION_BANK_BUCKET, - local_filename=local_filename, - filename=remote_filename, - ) - - -@decorator.log_runtime -def driver(_): - """Execute the main workflow of fetching, processing, and uploading questions.""" - overwrite_price_history = os.environ.get("OVERWRITE_PRICE_HISTORY", "").lower() in ("1", "true") - if overwrite_price_history: - logger.info("OVERWRITE_PRICE_HISTORY is set. Re-fetching all resolution data.") - - # Download existing questions from cloud storage - dfq, dff = data_utils.get_data_from_cloud_storage( - SOURCE, return_question_data=True, return_fetch_data=True - ) - - # Update the existing questions - dfq = update_questions(dfq, dff, overwrite_price_history=overwrite_price_history) - - logger.info("Uploading to GCP...") - data_utils.upload_questions(dfq, SOURCE) - logger.info("Done.") - - -if __name__ == "__main__": - driver(None) diff --git a/src/sources/infer.py b/src/sources/infer.py index 7cf88c10..1a8826e9 100644 --- a/src/sources/infer.py +++ b/src/sources/infer.py @@ -40,16 +40,17 @@ def fetch( self, *, dfq: DataFrame[QuestionFrame] | None = None, - files_in_storage: list[str] | None = None, + existing_resolution_ids: set[str] | None = None, ) -> DataFrame[InferFetchFrame]: """Fetch questions from the INFER API. Args: dfq (DataFrame[QuestionFrame] | None): Existing question bank. - files_in_storage (list[str] | None): Existing resolution file paths. + existing_resolution_ids (set[str] | None): Bare IDs that already have a resolution + file in storage. """ self._require_api_key() - files_in_storage = files_in_storage or [] + existing_resolution_ids = existing_resolution_ids or set() # Determine which existing questions need re-fetching resolved_ids: list[str] = [] @@ -62,7 +63,7 @@ def fetch( logger.info(f"Number unresolved_ids: {len(unresolved_ids)}") resolved_ids_without_files = [ - id for id in resolved_ids if f"{self.name}/{id}.jsonl" not in files_in_storage + id for id in resolved_ids if str(id) not in existing_resolution_ids ] logger.info(f"resolved_ids_without_resolution_files: {resolved_ids_without_files}") @@ -128,7 +129,7 @@ def update( # Build/update resolution file existing_df = existing_resolution_files.get(question_id) - df_res = self._build_resolution_file( + df_res = self._build_resolution_df( question=question, resolved=question["resolved"], existing_df=existing_df, @@ -319,7 +320,7 @@ def _get_historical_forecasts( # Private: resolution file building # ------------------------------------------------------------------ - def _build_resolution_file( + def _build_resolution_df( self, question: dict, resolved: bool, @@ -386,14 +387,14 @@ def _build_resolution_file( @staticmethod def _finalize_resolution_df(df: pd.DataFrame) -> DataFrame[ResolutionFrame]: - """Apply date filtering and return as validated ResolutionFrame. + """Apply date filtering and select resolution columns. Args: df (pd.DataFrame): Raw resolution data with id, date, value columns. """ df["date"] = pd.to_datetime(df["date"]) df = df[df["date"].dt.date >= constants.BENCHMARK_START_DATE_DATETIME_DATE] - return ResolutionFrame.validate(df[["id", "date", "value"]]) + return df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) # ------------------------------------------------------------------ # Private: question transformation diff --git a/src/sources/manifold.py b/src/sources/manifold.py index 9adbdf01..68d19891 100644 --- a/src/sources/manifold.py +++ b/src/sources/manifold.py @@ -112,7 +112,7 @@ def update( dff: DataFrame[ManifoldFetchFrame], *, existing_resolution_files: dict[str, DataFrame[ResolutionFrame]] | None = None, - files_in_storage: list[str] | None = None, + existing_resolution_ids: set[str] | None = None, ) -> UpdateResult: """Process fetched IDs into updated questions and resolution files. @@ -124,10 +124,11 @@ def update( dfq (DataFrame[QuestionFrame]): Existing questions. dff (DataFrame[ManifoldFetchFrame]): Freshly fetched market IDs. existing_resolution_files (dict | None): Per-question existing resolution data. - files_in_storage (list[str] | None): Existing resolution file paths in storage. + existing_resolution_ids (set[str] | None): Bare IDs that already have a resolution + file in storage. """ existing_resolution_files = existing_resolution_files or {} - files_in_storage = files_in_storage or [] + existing_resolution_ids = existing_resolution_ids or set() resolution_files: dict[str, pd.DataFrame] = {} # --- Append new IDs from dff to dfq --- @@ -166,7 +167,7 @@ def update( # Build resolution file existing_df = existing_resolution_files.get(row["id"]) - df_res = self._build_resolution_file( + df_res = self._build_resolution_df( market=market, market_info_resolution_datetime=dfq.at[index, "market_info_resolution_datetime"], existing_df=existing_df, @@ -182,10 +183,9 @@ def update( # --- Regenerate missing resolution files for resolved questions --- for _index, row in dfq[dfq["resolved"]].iterrows(): - filename = f"{self.name}/{row['id']}.jsonl" - if filename not in files_in_storage and row["id"] not in resolution_files: + if str(row["id"]) not in existing_resolution_ids and row["id"] not in resolution_files: market = self._get_market(row["id"]) - df_res = self._build_resolution_file( + df_res = self._build_resolution_df( market=market, market_info_resolution_datetime=row["market_info_resolution_datetime"], existing_df=None, @@ -331,7 +331,7 @@ def _get_market_bets(self, market_id: str) -> list[dict]: # Private: resolution file building # ------------------------------------------------------------------ - def _build_resolution_file( + def _build_resolution_df( self, market: dict, market_info_resolution_datetime: str, @@ -403,15 +403,9 @@ def _build_resolution_file( df = df.ffill() df["id"] = market_id - return self._finalize_resolution_df(df) - - @staticmethod - def _finalize_resolution_df(df: pd.DataFrame) -> DataFrame[ResolutionFrame]: - """Filter to benchmark period and validate as ResolutionFrame.""" df["date"] = pd.to_datetime(df["date"]) df = df[df["date"].dt.date >= constants.BENCHMARK_START_DATE_DATETIME_DATE] - df = df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) - return ResolutionFrame.validate(df) + return df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) @staticmethod def _get_resolved_market_value(market: dict) -> float: diff --git a/src/sources/metaculus.py b/src/sources/metaculus.py index c0eec80d..d7aea463 100644 --- a/src/sources/metaculus.py +++ b/src/sources/metaculus.py @@ -93,7 +93,7 @@ def update( dfq: DataFrame[QuestionFrame], dff: DataFrame[MetaculusFetchFrame], *, - files_in_storage: list[str] | None = None, + existing_resolution_ids: set[str] | None = None, ) -> UpdateResult: """Fetch full question data and build resolution files. @@ -106,8 +106,8 @@ def update( Args: dfq (DataFrame[QuestionFrame]): Existing questions. dff (DataFrame[MetaculusFetchFrame]): Discovered question IDs from fetch(). - files_in_storage (list[str] | None): Existing resolution file paths in storage, - used to decide which resolved questions need regenerating. + existing_resolution_ids (set[str] | None): Bare IDs that already have a resolution + file in storage, used to decide which resolved questions need regenerating. """ self._require_api_key() resolution_files: dict[str, pd.DataFrame] = {} @@ -157,7 +157,7 @@ def update( dfq.at[index, "forecast_horizons"] = "N/A" # Build resolution file - df_res = self._create_resolution_file(dfq, index, market) + df_res = self._build_resolution_df(dfq, index, market) if df_res is not None: resolution_files[str(row["id"])] = df_res dfq.at[index, "freeze_datetime_value"] = ( @@ -167,13 +167,12 @@ def update( dfq.at[index, "freeze_datetime_value"] = "N/A" # Regenerate missing resolution files for resolved questions - files_in_storage = files_in_storage or [] + existing_resolution_ids = existing_resolution_ids or set() for index, row in dfq[dfq["resolved"]].iterrows(): question_id = str(row["id"]) - filename = f"{self.name}/{question_id}.jsonl" - if filename not in files_in_storage and question_id not in resolution_files: + if question_id not in existing_resolution_ids and question_id not in resolution_files: market = self._get_market(row["id"]) - df_res = self._create_resolution_file(dfq, index, market) + df_res = self._build_resolution_df(dfq, index, market) if df_res is not None: resolution_files[question_id] = df_res @@ -325,12 +324,12 @@ def _get_resolved_market_value(market: dict) -> float: return 0 return np.nan - def _create_resolution_file( + def _build_resolution_df( self, dfq: pd.DataFrame, index: int, market: dict, - ) -> pd.DataFrame | None: + ) -> DataFrame[ResolutionFrame] | None: """Build the resolution file for a market from its aggregation history. Overwrites the resolution file entirely on each run (Metaculus returns the @@ -440,20 +439,5 @@ def set_date(end_datetime): } df["id"] = str(market["id"]) - df = df[["id", "date", "value"]] - return self._finalize_resolution_df(df) - - @staticmethod - def _finalize_resolution_df(df: pd.DataFrame) -> DataFrame[ResolutionFrame]: - """Cast types and return as a validated ResolutionFrame. - - Unlike infer/manifold, Metaculus does not filter to the benchmark start date: - the aggregation history is already bounded by the question's open window, and - this preserves the legacy job's output exactly. - - Args: - df (pd.DataFrame): Raw resolution data with id, date, value columns. - """ - df = df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) - return ResolutionFrame.validate(df) + return df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) diff --git a/src/sources/polymarket.py b/src/sources/polymarket.py index 647a12dd..d98081a0 100644 --- a/src/sources/polymarket.py +++ b/src/sources/polymarket.py @@ -201,7 +201,7 @@ def update( question_id = str(question["id"]) # Build the resolution file from the embedded historical prices. - resolution_files[question_id] = self._build_resolution_file(question) + resolution_files[question_id] = self._build_resolution_df(question) # Strip transient fields (not part of QuestionFrame). del question["fetch_datetime"] @@ -538,7 +538,7 @@ def _transform_question( # Private: resolution file building # ------------------------------------------------------------------ - def _build_resolution_file(self, question: dict) -> DataFrame[ResolutionFrame]: + def _build_resolution_df(self, question: dict) -> DataFrame[ResolutionFrame]: """Build a resolution file from a fetched question's embedded historical prices. Args: @@ -546,5 +546,4 @@ def _build_resolution_file(self, question: dict) -> DataFrame[ResolutionFrame]: """ df = pd.DataFrame(question["historical_prices"]) df["id"] = question["id"] - df = df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) - return ResolutionFrame.validate(df) + return df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) diff --git a/src/sources/yfinance.py b/src/sources/yfinance.py index 5d0eee13..e545a1d1 100644 --- a/src/sources/yfinance.py +++ b/src/sources/yfinance.py @@ -2,20 +2,462 @@ from __future__ import annotations +import logging +import time +from datetime import date, timedelta from typing import ClassVar +import pandas as pd +import pandera.pandas as pa +import requests +import yfinance as yf +from bs4 import BeautifulSoup +from pandera.typing import DataFrame + +from _fb_types import UpdateResult +from _schemas import QuestionFrame, ResolutionFrame, YfinanceFetchFrame +from helpers import constants, dates + from ._dataset import DatasetSource +logger = logging.getLogger(__name__) + class YfinanceSource(DatasetSource): """Yahoo Finance financial data source.""" name: ClassVar[str] = "yfinance" - def fetch(self, **kwargs): - """Fetch Yahoo Finance data from external API.""" - raise NotImplementedError + # Pinned at the start of fetch()/update() so every downstream helper (via self.get_date_today()) + # observes one consistent date for the whole run, even if it straddles midnight. + _today: date | None = None + + def get_date_today(self) -> date: + """Return the date pinned for this run, or the live date if none is pinned. + + fetch() and update() pin ``self._today`` once at the start; downstream helpers call this + instead of ``dates.get_date_today()`` so they all see the same date. + """ + return self._today if self._today is not None else dates.get_date_today() + + # ------------------------------------------------------------------ + # Public: fetch + # ------------------------------------------------------------------ + + @pa.check_types + def fetch( + self, + *, + dfq: DataFrame[QuestionFrame] | None = None, + ) -> DataFrame[YfinanceFetchFrame]: + """Fetch S&P 500 stock data from Yahoo Finance. + + The ticker universe is the union of the current S&P 500 constituents and any tickers + already in the question bank. Tickers that are still in the question pool but have dropped + out of the S&P 500 and can no longer be fetched are marked resolved (delisted) using their + existing question row. + + Args: + dfq (DataFrame[QuestionFrame] | None): Existing question bank. + """ + top_500 = self._get_sp500_tickers() + set_top_500 = set(top_500) + set_current = set(dfq["id"].unique()) if dfq is not None and "id" in dfq.columns else set() + all_tickers = list(set_top_500 | set_current) + + logger.info( + f"Stock tickers not in top 500 but in current stocks: {set_current - set_top_500}" + ) + + # Pin 'today' once for this run so all downstream date logic is consistent. + self._today = dates.get_date_today() + current_time = dates.get_datetime_now() + + rows = [] + for ticker_symbol in all_tickers: + time.sleep(1) # Avoid YFRateLimitError + company_name, hist = self._fetch_one_stock(ticker_symbol) + + if company_name and not hist.empty: + current_price = round(hist["Close"].iloc[-1], 2) + background = yf.Ticker(ticker_symbol).info.get("longBusinessSummary", "N/A") + rows.append( + { + "id": ticker_symbol, + "question": ( + f"Will {ticker_symbol}'s market close price on " + "{resolution_date} be higher than its market close price on " + "{forecast_due_date}?\n\n" + "Stock splits and reverse splits will be accounted for in resolving " + "this question. Forecasts on questions about companies that have been " + "delisted (through mergers or bankruptcy) will resolve to their final " + "close price." + ), + "background": background, + "market_info_resolution_criteria": "N/A", + "market_info_open_datetime": "N/A", + "market_info_close_datetime": "N/A", + "url": f"https://finance.yahoo.com/quote/{ticker_symbol}", + "resolved": False, + "market_info_resolution_datetime": "N/A", + "fetch_datetime": current_time, + "probability": current_price, + "forecast_horizons": constants.FORECAST_HORIZONS_IN_DAYS, + "freeze_datetime_value": current_price, + "freeze_datetime_value_explanation": ( + f"The latest market close price of {ticker_symbol}." + ), + } + ) + logger.info(company_name) + elif ( + company_name is None + and ticker_symbol in set_current + and ticker_symbol not in set_top_500 + ): + # Delisted: still in the question pool but no longer fetchable and out of the + # S&P 500. Carry forward the existing question row, marked resolved. + existing = dfq[dfq["id"] == ticker_symbol].iloc[0].to_dict() + existing.update( + { + "resolved": True, + "fetch_datetime": current_time, + "probability": float("nan"), + "freeze_datetime_value": "N/A", + } + ) + rows.append(existing) + logger.warning( + f"{ticker_symbol} detected as delisted (not in S&P 500 and fetch failed)" + ) + + return pd.DataFrame(rows) + + # ------------------------------------------------------------------ + # Public: update + # ------------------------------------------------------------------ + + @pa.check_types + def update( + self, + dfq: DataFrame[QuestionFrame], + dff: DataFrame[YfinanceFetchFrame], + *, + existing_resolution_files: dict[str, DataFrame[ResolutionFrame]] | None = None, + overwrite_price_history: bool = False, + ) -> UpdateResult: + """Process fetched stock data into updated questions and resolution files. + + Args: + dfq (DataFrame[QuestionFrame]): Existing questions. + dff (DataFrame[YfinanceFetchFrame]): Freshly fetched data. + existing_resolution_files (dict | None): Per-question existing resolution data. Must + include any renamed-ticker originals so their files can be refreshed. + overwrite_price_history (bool): If True, re-fetch all resolution data even if a file is + already up-to-date. + """ + existing_resolution_files = existing_resolution_files or {} + resolution_files: dict[str, pd.DataFrame] = {} + + # Pin 'today' once for this run so all downstream date logic is consistent. + self._today = dates.get_date_today() + period = self._select_time_range( + (self._today - constants.QUESTION_BANK_DATA_STORAGE_START_DATE).days + ) + + renamed_tickers = {entry["original_ticker"] for entry in self.ticker_renames} + + for question in dff.to_dict("records"): + question_id = str(question["id"]) + + if question_id in renamed_tickers: + # Resolution file is rebuilt from the replacement ticker below. + logger.info(f"Skipping {question_id} (renamed ticker, handled separately)") + else: + df_res = self._build_resolution_df( + question=question, + period=period, + existing_df=existing_resolution_files.get(question_id), + force=overwrite_price_history, + ) + if df_res is not None: + resolution_files[question_id] = df_res + + # Strip transient fetch-only fields (not part of QuestionFrame) + del question["fetch_datetime"] + del question["probability"] + + # Upsert into dfq + if question["id"] in dfq["id"].values: + dfq_index = dfq.index[dfq["id"] == question["id"]].tolist()[0] + for key, value in question.items(): + dfq.at[dfq_index, key] = value + else: + new_q_row = pd.DataFrame([question]) + new_q_row = new_q_row.astype(constants.QUESTION_FILE_COLUMN_DTYPE) + dfq = pd.concat([dfq, new_q_row], ignore_index=True) + + # Renamed tickers: fetch under the replacement, write under the original ticker. + resolution_files.update( + self._build_renamed_ticker_resolution_files(period, existing_resolution_files) + ) + + return UpdateResult( + dfq=dfq, + resolution_files=resolution_files, + ) + + # ------------------------------------------------------------------ + # Private: S&P 500 tickers + # ------------------------------------------------------------------ + + @staticmethod + def _get_sp500_tickers() -> list[str]: + """Scrape S&P 500 constituent tickers from Wikipedia.""" + try: + url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies" + headers = {"User-Agent": constants.BENCHMARK_USER_AGENT} + response = requests.get(url, headers=headers) + response.raise_for_status() + soup = BeautifulSoup(response.content, "html.parser") + table = soup.find("table", {"id": "constituents"}) + tickers = [row.find_all("td")[0].text.strip() for row in table.find_all("tr")[1:]] + logger.info(f"Retrieved S&P 500 stock tickers: {len(tickers)} tickers") + return tickers + except Exception as e: + logger.error(f"Failed to retrieve stock tickers due to: {e}") + return [] + + # ------------------------------------------------------------------ + # Private: single stock fetch + # ------------------------------------------------------------------ + + def _fetch_one_stock(self, ticker_symbol: str) -> tuple[str | None, pd.DataFrame | None]: + """Fetch company name and the latest historical row for one ticker. + + Args: + ticker_symbol (str): Stock ticker symbol. + + Returns: + Tuple of (company_name, hist_df) or (None, None) on failure. + """ + try: + ticker = yf.Ticker(ticker_symbol) + company_name = ticker.info["longName"] + hist = ticker.history(period="5d", auto_adjust=False).reset_index() + yesterday = self.get_date_today() - timedelta(days=1) + hist["Date"] = pd.to_datetime(hist["Date"]) + hist = hist[hist["Date"].dt.date <= yesterday].tail(1) + return company_name, hist + except Exception: + return None, None + + # ------------------------------------------------------------------ + # Private: resolution file building + # ------------------------------------------------------------------ + + @staticmethod + def _select_time_range(days_difference: int) -> str: + """Map days since data storage start to a yfinance period parameter. + + Possible time ranges in: + ['1d', '5d', '1mo', '3mo', '6mo', '1y', '2y', '5y', '10y', 'ytd', 'max'] + + Args: + days_difference (int): Days since QUESTION_BANK_DATA_STORAGE_START_DATE. + """ + if days_difference <= 1: + return "1d" + elif days_difference <= 5: + return "5d" + elif days_difference <= 30: + return "1mo" + elif days_difference <= 90: + return "3mo" + elif days_difference <= 180: + return "6mo" + elif days_difference <= 365: + return "1y" + elif days_difference <= 365 * 2: + return "2y" + elif days_difference <= 365 * 5: + return "5y" + elif days_difference <= 365 * 10: + return "10y" + else: + return "max" + + @staticmethod + def _fetch_historical_prices(ticker_symbol: str, period: str) -> pd.DataFrame: + """Fetch historical closing prices for a ticker. + + Args: + ticker_symbol (str): Stock ticker symbol. + period (str): yfinance period string. + + Returns: + DataFrame with columns [date, value], or an empty DataFrame on failure. + """ + try: + ticker = yf.Ticker(ticker_symbol) + hist = ticker.history(period=period, auto_adjust=False) + return hist[["Close"]].reset_index().rename(columns={"Date": "date", "Close": "value"}) + except Exception as e: + logger.error(f"Failed to fetch data for {ticker_symbol}: {e}") + return pd.DataFrame() + + def _get_historical_prices( + self, + existing_df: pd.DataFrame | None, + ticker_symbol: str, + period: str, + ) -> pd.DataFrame | None: + """Build a resolution DataFrame of daily prices for a ticker. + + Args: + existing_df (pd.DataFrame | None): Existing resolution data, used as the fallback when + the fetch returns nothing. + ticker_symbol (str): Stock ticker symbol. + period (str): yfinance period string. + + Returns: + DataFrame with columns [id, date, value]; the existing data unchanged when the fetch + returns nothing; or None when the fetch returns nothing and there is no existing data. + """ + df = self._fetch_historical_prices(ticker_symbol, period) + if df.empty: + # Fetch returned nothing: keep the existing file unchanged; if there is none, there is + # nothing to write. + if existing_df is None or existing_df.empty: + return None + return existing_df + + yesterday = self.get_date_today() - timedelta(days=1) + df["date"] = pd.to_datetime(df["date"]).dt.date + df = df[ + (df["date"] >= constants.QUESTION_BANK_DATA_STORAGE_START_DATE) + & (df["date"] <= yesterday) + ] + + # Forward fill for weekends/holidays + full_date_range = pd.date_range(start=df["date"].min(), end=yesterday) + df = df.set_index("date").reindex(full_date_range).ffill().rename_axis("date").reset_index() + df["id"] = ticker_symbol + return df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) + + def _finalize_resolution_file(self, df: pd.DataFrame) -> pd.DataFrame: + """Forward-fill a resolved ticker's resolution file to yesterday. + + Args: + df (pd.DataFrame): Resolution data with columns [id, date, value]. + + Returns: + DataFrame forward-filled through yesterday. + """ + if df.empty: + return df + + end_date = self.get_date_today() - timedelta(days=1) + + df = df.copy() + ticker_id = df["id"].iloc[0] + df["date"] = pd.to_datetime(df["date"]) + df = df.set_index("date") + + full_range = pd.date_range(start=df.index.min(), end=end_date) + df = df.reindex(full_range).ffill().rename_axis("date").reset_index() + df["id"] = ticker_id + + return df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) + + def _build_resolution_df( + self, + question: dict, + period: str, + existing_df: DataFrame[ResolutionFrame] | None = None, + force: bool = False, + ) -> DataFrame[ResolutionFrame] | None: + """Build or update a resolution file for a single stock ticker. + + Args: + question (dict): Must have 'id'; 'resolved' marks a delisted ticker. + period (str): yfinance period string. + existing_df (DataFrame[ResolutionFrame] | None): Existing resolution data. + force (bool): If True, re-fetch even when the file is already up-to-date. + + Returns: + The updated DataFrame, or None when no upload is needed (already up-to-date or + unchanged). + """ + is_resolved = question.get("resolved", False) + yesterday = self.get_date_today() - timedelta(days=1) + + # Already up-to-date check — skip the API call entirely. Resolved (delisted) tickers are + # always rebuilt so the final close price is forward-filled. + if ( + not force + and not is_resolved + and existing_df is not None + and not existing_df.empty + and pd.to_datetime(existing_df["date"].iloc[-1]).date() >= yesterday + ): + logger.info(f"{question['id']} is skipped because it's already up-to-date!") + return None + + df_new = self._get_historical_prices(existing_df, question["id"], period) + if df_new is None: + return None + + if is_resolved: + df_new = self._finalize_resolution_file(df_new) + + # Only upload dataframes that changed. + if existing_df is not None and not existing_df.empty and existing_df.equals(df_new): + return None + + return df_new + + def _build_renamed_ticker_resolution_files( + self, + period: str, + existing_resolution_files: dict[str, pd.DataFrame], + ) -> dict[str, pd.DataFrame]: + """Build resolution files for renamed tickers using their replacement symbols. + + For each entry in ``self.ticker_renames``, fetch price history under the replacement + ticker and write it to a resolution file keyed by the original ticker. + + Args: + period (str): yfinance period string. + existing_resolution_files (dict): Existing resolution data, keyed by question id; must + include the original tickers. + + Returns: + Mapping of original ticker -> resolution DataFrame, only for files that changed. + """ + resolution_files: dict[str, pd.DataFrame] = {} + for entry in self.ticker_renames: + original = entry["original_ticker"] + replacement = entry["replacement_ticker"] + + existing_df = existing_resolution_files.get(original) + + df_new = self._get_historical_prices(existing_df, replacement, period) + if df_new is None: + logger.warning( + f"No data for replacement ticker {replacement} (original: {original})" + ) + continue + + # df_new may be `existing_df` (fetch returned nothing); copy before relabelling so the + # caller's resolution file is never mutated in place. + df_new = df_new.copy() + df_new["id"] = original + + if existing_df is not None and not existing_df.empty and existing_df.equals(df_new): + continue + + logger.info(f"Built resolution file for {original} (via {replacement})") + resolution_files[original] = df_new - def update(self, dfq, dff, **kwargs): - """Process fetched Yahoo Finance data into questions and resolution files.""" - raise NotImplementedError + return resolution_files diff --git a/src/tests/conftest.py b/src/tests/conftest.py index 48da008d..0af0d240 100644 --- a/src/tests/conftest.py +++ b/src/tests/conftest.py @@ -13,6 +13,7 @@ from sources.manifold import ManifoldSource from sources.metaculus import MetaculusSource from sources.polymarket import PolymarketSource +from sources.yfinance import YfinanceSource # --------------------------------------------------------------------------- # Time-freezing fixture @@ -99,6 +100,12 @@ def polymarket_source(): return PolymarketSource() +@pytest.fixture() +def yfinance_source(): + """Return a YfinanceSource instance.""" + return YfinanceSource() + + # --------------------------------------------------------------------------- # DataFrame factories # --------------------------------------------------------------------------- @@ -276,6 +283,38 @@ def make_infer_fetch_df(rows): return df +# --------------------------------------------------------------------------- +# Yfinance-specific factories +# --------------------------------------------------------------------------- + + +def make_yfinance_fetch_df(rows): + """Build a DataFrame matching YfinanceFetchFrame schema. + + Each row should have at least 'id'. Missing columns get defaults. + """ + defaults = { + "question": "Will {id} go up?", + "background": "N/A", + "url": "N/A", + "resolved": False, + "forecast_horizons": "N/A", + "freeze_datetime_value": "100.0", + "freeze_datetime_value_explanation": "N/A", + "market_info_resolution_criteria": "N/A", + "market_info_open_datetime": "N/A", + "market_info_close_datetime": "N/A", + "market_info_resolution_datetime": "N/A", + "fetch_datetime": "2026-03-18T00:00:00+00:00", + "probability": 100.0, + } + df = pd.DataFrame(rows) + for col, default in defaults.items(): + if col not in df.columns: + df[col] = default + return df + + # --------------------------------------------------------------------------- # Manifold-specific factories # --------------------------------------------------------------------------- diff --git a/src/tests/test_infer.py b/src/tests/test_infer.py index e382c146..c562a526 100644 --- a/src/tests/test_infer.py +++ b/src/tests/test_infer.py @@ -190,12 +190,12 @@ def test_only_keeps_id_date_value(self): # --------------------------------------------------------------------------- -# _build_resolution_file (mock _get_historical_forecasts) +# _build_resolution_df (mock _get_historical_forecasts) # --------------------------------------------------------------------------- -class TestBuildResolutionFile: - """Tests for InferSource._build_resolution_file.""" +class TestBuildResolutionDf: + """Tests for InferSource._build_resolution_df.""" def _question(self, **overrides): base = { @@ -212,7 +212,7 @@ def test_nullified_no_existing(self, mock_hist, infer_source, freeze_today): """Nullified question with no existing data returns single NaN row.""" freeze_today(date(2026, 1, 15)) q = self._question(nullify_question=True) - df = infer_source._build_resolution_file(q, resolved=False, existing_df=None) + df = infer_source._build_resolution_df(q, resolved=False, existing_df=None) assert len(df) == 1 assert np.isnan(df["value"].iloc[0]) @@ -229,7 +229,7 @@ def test_nullified_with_existing(self, mock_hist, infer_source, freeze_today): ] ) q = self._question(nullify_question=True) - df = infer_source._build_resolution_file(q, resolved=False, existing_df=existing) + df = infer_source._build_resolution_df(q, resolved=False, existing_df=existing) assert df["value"].isna().all() mock_hist.assert_not_called() @@ -245,7 +245,7 @@ def test_already_up_to_date(self, mock_hist, infer_source, freeze_today): ] ) q = self._question() - df = infer_source._build_resolution_file(q, resolved=False, existing_df=existing) + df = infer_source._build_resolution_df(q, resolved=False, existing_df=existing) assert df.equals(existing) mock_hist.assert_not_called() @@ -262,7 +262,7 @@ def test_fetches_when_stale(self, mock_hist, infer_source, freeze_today): ) existing = make_resolution_df([{"id": "200", "date": "2024-06-01", "value": 0.5}]) q = self._question() - df = infer_source._build_resolution_file(q, resolved=False, existing_df=existing) + df = infer_source._build_resolution_df(q, resolved=False, existing_df=existing) assert not df.empty mock_hist.assert_called_once() @@ -282,7 +282,7 @@ def test_resolved_truncates_and_appends(self, mock_hist, infer_source, freeze_to market_info_resolution_datetime="2026-01-11T00:00:00+00:00", probability=1.0, ) - df = infer_source._build_resolution_file(q, resolved=True, existing_df=None) + df = infer_source._build_resolution_df(q, resolved=True, existing_df=None) # Should have rows up to resolution date assert not df.empty @@ -337,7 +337,7 @@ def test_deduplication_active_wins(self, mock_api, infer_source): [make_infer_api_question(id=100, state="active")], # active fetch ] dfq = make_question_df([{"id": "100", "resolved": False}]) - dff = infer_source.fetch(dfq=dfq, files_in_storage=[]) + dff = infer_source.fetch(dfq=dfq, existing_resolution_ids=set()) assert len(dff) == 1 @@ -350,7 +350,7 @@ def test_resolved_without_files_refetched(self, mock_api, infer_source): ] dfq = make_question_df([{"id": "100", "resolved": True}]) # No resolution file in storage → should re-fetch - dff = infer_source.fetch(dfq=dfq, files_in_storage=[]) + dff = infer_source.fetch(dfq=dfq, existing_resolution_ids=set()) assert len(dff) == 1 mock_api.assert_any_call(status="all", question_ids=["100"]) @@ -361,7 +361,7 @@ def test_empty_dfq(self, mock_api, infer_source): mock_api.side_effect = [ [make_infer_api_question(id=300)], ] - dff = infer_source.fetch(dfq=None, files_in_storage=[]) + dff = infer_source.fetch(dfq=None, existing_resolution_ids=set()) assert len(dff) == 1 def test_api_key_required(self): @@ -372,14 +372,14 @@ def test_api_key_required(self): # --------------------------------------------------------------------------- -# update() (mock _build_resolution_file) +# update() (mock _build_resolution_df) # --------------------------------------------------------------------------- class TestUpdate: """Tests for InferSource.update.""" - @patch.object(InferSource, "_build_resolution_file") + @patch.object(InferSource, "_build_resolution_df") def test_basic_update(self, mock_build, infer_source): """Returns UpdateResult with valid dfq and resolution files.""" mock_build.return_value = make_resolution_df( @@ -394,7 +394,7 @@ def test_basic_update(self, mock_build, infer_source): assert "200" in result.resolution_files QuestionFrame.validate(result.dfq) - @patch.object(InferSource, "_build_resolution_file") + @patch.object(InferSource, "_build_resolution_df") def test_new_question_inserted(self, mock_build, infer_source): """Question not in dfq gets appended.""" mock_build.return_value = make_resolution_df( @@ -407,7 +407,7 @@ def test_new_question_inserted(self, mock_build, infer_source): assert len(result.dfq) == 2 assert set(result.dfq["id"].tolist()) == {"100", "300"} - @patch.object(InferSource, "_build_resolution_file") + @patch.object(InferSource, "_build_resolution_df") def test_existing_question_updated(self, mock_build, infer_source): """Existing question fields are updated in place.""" mock_build.return_value = make_resolution_df( @@ -420,7 +420,7 @@ def test_existing_question_updated(self, mock_build, infer_source): assert len(result.dfq) == 1 assert result.dfq.iloc[0]["question"] == "New text" - @patch.object(InferSource, "_build_resolution_file") + @patch.object(InferSource, "_build_resolution_df") def test_nullified_marked_resolved(self, mock_build, infer_source): """Nullified questions are marked as resolved in dfq.""" mock_build.return_value = make_resolution_df( @@ -433,7 +433,7 @@ def test_nullified_marked_resolved(self, mock_build, infer_source): row = result.dfq[result.dfq["id"] == "200"].iloc[0] assert bool(row["resolved"]) is True - @patch.object(InferSource, "_build_resolution_file") + @patch.object(InferSource, "_build_resolution_df") def test_transient_fields_stripped(self, mock_build, infer_source): """fetch_datetime, probability, nullify_question not in output dfq.""" mock_build.return_value = make_resolution_df( diff --git a/src/tests/test_manifold.py b/src/tests/test_manifold.py index 58332a2e..4f5d0005 100644 --- a/src/tests/test_manifold.py +++ b/src/tests/test_manifold.py @@ -53,59 +53,12 @@ def test_unknown_resolution(self): # --------------------------------------------------------------------------- -# _finalize_resolution_df (pure, no mocking) +# _build_resolution_df (mock _get_market_bets) # --------------------------------------------------------------------------- -class TestFinalizeResolutionDf: - """Tests for ManifoldSource._finalize_resolution_df static method.""" - - def test_filters_before_benchmark_start(self): - """Rows before BENCHMARK_START_DATE are dropped.""" - df = pd.DataFrame( - { - "id": ["A", "A", "A"], - "date": pd.to_datetime(["2020-01-01", "2024-06-01", "2024-07-01"]), - "value": [0.1, 0.2, 0.3], - } - ) - result = ManifoldSource._finalize_resolution_df(df) - assert len(result) == 2 - assert result["value"].tolist() == [0.2, 0.3] - - def test_validates_schema(self): - """Output is a valid ResolutionFrame.""" - df = pd.DataFrame( - { - "id": ["A"], - "date": pd.to_datetime(["2024-06-01"]), - "value": [0.5], - } - ) - result = ManifoldSource._finalize_resolution_df(df) - ResolutionFrame.validate(result) - - def test_only_keeps_id_date_value(self): - """Extra columns are stripped.""" - df = pd.DataFrame( - { - "id": ["A"], - "date": pd.to_datetime(["2024-06-01"]), - "value": [0.5], - "extra": ["junk"], - } - ) - result = ManifoldSource._finalize_resolution_df(df) - assert list(result.columns) == ["id", "date", "value"] - - -# --------------------------------------------------------------------------- -# _build_resolution_file (mock _get_market_bets) -# --------------------------------------------------------------------------- - - -class TestBuildResolutionFile: - """Tests for ManifoldSource._build_resolution_file.""" +class TestBuildResolutionDf: + """Tests for ManifoldSource._build_resolution_df.""" @patch.object(ManifoldSource, "_get_market_bets") def test_already_up_to_date(self, mock_bets, manifold_source, freeze_today): @@ -118,7 +71,7 @@ def test_already_up_to_date(self, mock_bets, manifold_source, freeze_today): ] ) market = make_manifold_api_market() - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="N/A", existing_df=existing ) @@ -134,7 +87,7 @@ def test_basic_unresolved_market(self, mock_bets, manifold_source, freeze_today) make_manifold_bet(id="b2", createdTime=1768226400000, probAfter=0.6), # 2026-01-12 ] market = make_manifold_api_market() - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="N/A", existing_df=None ) @@ -151,7 +104,7 @@ def test_empty_bets_returns_none(self, mock_bets, manifold_source, freeze_today) freeze_today(date(2026, 1, 15)) mock_bets.return_value = [] market = make_manifold_api_market() - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="N/A", existing_df=None ) assert result is None @@ -165,7 +118,7 @@ def test_no_filled_bets_returns_none(self, mock_bets, manifold_source, freeze_to make_manifold_bet(isFilled=False, createdTime=1768226400000), ] market = make_manifold_api_market() - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="N/A", existing_df=None ) assert result is None @@ -179,7 +132,7 @@ def test_forward_fills_gaps(self, mock_bets, manifold_source, freeze_today): make_manifold_bet(id="b2", createdTime=1768384800000, probAfter=0.8), # 2026-01-14 ] market = make_manifold_api_market() - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="N/A", existing_df=None ) @@ -203,7 +156,7 @@ def test_resolved_truncates_at_resolution(self, mock_bets, manifold_source, free resolution="YES", resolutionTime=1768310400000, # 2026-01-13T12:00:00Z ) - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="2026-01-13T12:00:00+00:00", existing_df=None, @@ -231,7 +184,7 @@ def test_resolved_cancel_nan_last_row(self, mock_bets, manifold_source, freeze_t resolution="CANCEL", resolutionTime=1768310400000, # 2026-01-13T12:00:00Z ) - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="2026-01-13T12:00:00+00:00", existing_df=None, @@ -250,7 +203,7 @@ def test_filters_future_bets(self, mock_bets, manifold_source, freeze_today): make_manifold_bet(id="b2", createdTime=1768464000000, probAfter=0.9), # 2026-01-15 ] market = make_manifold_api_market() - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="N/A", existing_df=None ) @@ -377,14 +330,14 @@ def test_empty_results(self, mock_search, manifold_source): # --------------------------------------------------------------------------- -# update() (mock _get_market + _build_resolution_file) +# update() (mock _get_market + _build_resolution_df) # --------------------------------------------------------------------------- class TestUpdate: """Tests for ManifoldSource.update.""" - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_new_id_appended(self, mock_market, mock_build, manifold_source): """IDs in dff not in dfq get appended with defaults.""" @@ -402,7 +355,7 @@ def test_new_id_appended(self, mock_market, mock_build, manifold_source): new_row = result.dfq[result.dfq["id"] == "new_001"].iloc[0] assert new_row["freeze_datetime_value_explanation"] == "The market value." - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_existing_unresolved_updated(self, mock_market, mock_build, manifold_source): """Unresolved question fields are updated from market details.""" @@ -425,7 +378,7 @@ def test_existing_unresolved_updated(self, mock_market, mock_build, manifold_sou assert row["background"] == "New background" assert row["url"] == "https://manifold.markets/updated" - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_market_becomes_resolved(self, mock_market, mock_build, manifold_source): """Market with isResolved=True marks dfq row as resolved.""" @@ -447,10 +400,10 @@ def test_market_becomes_resolved(self, mock_market, mock_build, manifold_source) assert bool(row["resolved"]) is True assert row["market_info_resolution_datetime"] != "N/A" - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_resolution_file_stored(self, mock_market, mock_build, manifold_source): - """Resolution file from _build_resolution_file is in result.""" + """Resolution file from _build_resolution_df is in result.""" res_df = make_resolution_df([{"id": "mkt_001", "date": "2024-06-01", "value": 0.5}]) mock_market.return_value = make_manifold_api_market(id="mkt_001") mock_build.return_value = res_df @@ -461,7 +414,7 @@ def test_resolution_file_stored(self, mock_market, mock_build, manifold_source): assert "mkt_001" in result.resolution_files - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_freeze_datetime_value_set(self, mock_market, mock_build, manifold_source): """freeze_datetime_value is set to last value of resolution df.""" @@ -480,10 +433,10 @@ def test_freeze_datetime_value_set(self, mock_market, mock_build, manifold_sourc row = result.dfq[result.dfq["id"] == "mkt_001"].iloc[0] assert str(row["freeze_datetime_value"]) == "0.75" - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_build_resolution_returns_none(self, mock_market, mock_build, manifold_source): - """_build_resolution_file returning None: no resolution file stored.""" + """_build_resolution_df returning None: no resolution file stored.""" mock_market.return_value = make_manifold_api_market(id="mkt_001") mock_build.return_value = None dfq = make_question_df([{"id": "mkt_001", "resolved": False}]) @@ -493,7 +446,7 @@ def test_build_resolution_returns_none(self, mock_market, mock_build, manifold_s assert "mkt_001" not in (result.resolution_files or {}) - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_regenerates_missing_resolved_files(self, mock_market, mock_build, manifold_source): """Resolved questions missing from storage get resolution files regenerated.""" @@ -514,11 +467,11 @@ def test_regenerates_missing_resolved_files(self, mock_market, mock_build, manif ) dff = make_manifold_fetch_df([{"id": "mkt_001"}]) - result = manifold_source.update(dfq, dff, files_in_storage=[]) + result = manifold_source.update(dfq, dff, existing_resolution_ids=set()) assert "mkt_001" in result.resolution_files - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_skips_resolved_already_in_storage(self, mock_market, mock_build, manifold_source): """Resolved questions with files in storage are not re-fetched.""" @@ -533,13 +486,13 @@ def test_skips_resolved_already_in_storage(self, mock_market, mock_build, manifo ) dff = make_manifold_fetch_df([{"id": "mkt_001"}]) - result = manifold_source.update(dfq, dff, files_in_storage=["manifold/mkt_001.jsonl"]) + result = manifold_source.update(dfq, dff, existing_resolution_ids={"mkt_001"}) # _get_market should not be called for the resolved question mock_market.assert_not_called() assert "mkt_001" not in (result.resolution_files or {}) - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_output_schema_valid(self, mock_market, mock_build, manifold_source): """Output dfq passes QuestionFrame validation.""" diff --git a/src/tests/test_metaculus.py b/src/tests/test_metaculus.py index 26f4448b..8c34602a 100644 --- a/src/tests/test_metaculus.py +++ b/src/tests/test_metaculus.py @@ -85,82 +85,12 @@ def test_non_integer(self): # --------------------------------------------------------------------------- -# _finalize_resolution_df (pure, no mocking) +# _build_resolution_df # --------------------------------------------------------------------------- -class TestFinalizeResolutionDf: - """Tests for MetaculusSource._finalize_resolution_df static method.""" - - def test_validates_schema(self): - """Output is a valid ResolutionFrame.""" - df = pd.DataFrame( - { - "id": ["42472"], - "date": [date(2025, 6, 1)], - "value": [0.5], - } - ) - result = MetaculusSource._finalize_resolution_df(df) - ResolutionFrame.validate(result) - - def test_keeps_all_historical_data(self): - """Unlike Infer, rows before BENCHMARK_START_DATE are NOT filtered.""" - df = pd.DataFrame( - { - "id": ["A", "A", "A"], - "date": [date(2018, 1, 1), date(2020, 6, 1), date(2025, 7, 1)], - "value": [0.1, 0.2, 0.3], - } - ) - result = MetaculusSource._finalize_resolution_df(df) - assert len(result) == 3 - - def test_date_coerced_to_string(self): - """datetime.date objects become 'YYYY-MM-DD' strings.""" - df = pd.DataFrame( - { - "id": ["A"], - "date": [date(2025, 6, 1)], - "value": [0.5], - } - ) - result = MetaculusSource._finalize_resolution_df(df) - assert result["date"].iloc[0] == "2025-06-01" - - def test_id_coerced_to_string(self): - """Integer IDs are coerced to string.""" - df = pd.DataFrame( - { - "id": [42472], - "date": [date(2025, 6, 1)], - "value": [0.5], - } - ) - result = MetaculusSource._finalize_resolution_df(df) - assert result["id"].iloc[0] == "42472" - - def test_only_keeps_id_date_value(self): - """Extra columns are stripped.""" - df = pd.DataFrame( - { - "id": ["A"], - "date": [date(2025, 6, 1)], - "value": [0.5], - "extra": ["junk"], - } - ) - result = MetaculusSource._finalize_resolution_df(df) - assert list(result.columns) == ["id", "date", "value"] - - -# --------------------------------------------------------------------------- -# _create_resolution_file -# --------------------------------------------------------------------------- - - -class TestCreateResolutionFile: - """Tests for MetaculusSource._create_resolution_file.""" +class TestBuildResolutionDf: + """Tests for MetaculusSource._build_resolution_df.""" def _dfq_row(self, resolved=False, resolution_datetime="N/A"): return make_question_df( @@ -179,7 +109,7 @@ def test_empty_history_returns_none(self, metaculus_source): question={"aggregations": {"recency_weighted": {"history": []}}} ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is None def test_single_day_entries_filtered(self, metaculus_source): @@ -201,7 +131,7 @@ def test_single_day_entries_filtered(self, metaculus_source): } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is None def test_single_day_last_millisecond_kept(self, metaculus_source): @@ -224,7 +154,7 @@ def test_single_day_last_millisecond_kept(self, metaculus_source): } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None assert len(result) >= 1 @@ -252,7 +182,7 @@ def test_backfill_gaps(self, metaculus_source): } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None # Should have continuous dates from start through last date dates_in_result = result["date"].tolist() @@ -282,7 +212,7 @@ def test_deduplication_keeps_last(self, metaculus_source): } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None # Both entries map to date 2025-01-01 (end_date - 1 day) # The second (0.9) should be kept @@ -304,7 +234,7 @@ def test_resolved_market_truncates_and_appends(self, metaculus_source): resolved=True, resolution_datetime="2025-01-03T00:00:00+00:00", ) - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None # Last row should be the resolution value (yes -> 1) assert float(result.iloc[-1]["value"]) == 1.0 @@ -335,7 +265,7 @@ def test_null_end_time_uses_today(self, metaculus_source, freeze_today): } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None # Should have dates from Dec 31 through Jan 4 (today - 1 day) assert len(result) >= 2 @@ -364,7 +294,7 @@ def test_future_dates_dropped(self, metaculus_source, freeze_today): } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None # No date may exceed yesterday (2025-01-04); the future 2025-01-09 entry is dropped. assert max(result["date"].tolist()) <= "2025-01-04" @@ -389,7 +319,7 @@ def test_future_only_unresolved_market_returns_none(self, metaculus_source, free } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is None def test_date_assignment_subtracts_day(self, metaculus_source): @@ -412,7 +342,7 @@ def test_date_assignment_subtracts_day(self, metaculus_source): } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None # end_date 2025-01-02 minus 1 day = 2025-01-01 assert "2025-01-01" in result["date"].tolist() @@ -421,7 +351,7 @@ def test_id_is_string(self, metaculus_source): """Output id column contains string values.""" market = make_metaculus_market() dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None assert all(isinstance(v, str) for v in result["id"].tolist()) @@ -429,7 +359,7 @@ def test_output_validated_as_resolution_frame(self, metaculus_source): """Output passes ResolutionFrame.validate().""" market = make_metaculus_market() dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None ResolutionFrame.validate(result) @@ -682,7 +612,7 @@ def test_api_key_required(self): # --------------------------------------------------------------------------- -# update() (mock _get_market and _create_resolution_file) +# update() (mock _get_market and _build_resolution_df) # --------------------------------------------------------------------------- @@ -699,7 +629,7 @@ def _resolution_df(self, question_id="42472", value=0.6): } ) - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_new_question_appended(self, mock_market, mock_res, metaculus_source): """ID in dff not in dfq gets appended with defaults.""" @@ -715,7 +645,7 @@ def test_new_question_appended(self, mock_market, mock_res, metaculus_source): row = result.dfq[result.dfq["id"] == "200"].iloc[0] assert row["freeze_datetime_value_explanation"] == "The community prediction." - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_existing_question_not_duplicated(self, mock_market, mock_res, metaculus_source): """ID already in dfq does not add a new row.""" @@ -728,7 +658,7 @@ def test_existing_question_not_duplicated(self, mock_market, mock_res, metaculus result = metaculus_source.update(dfq, dff) assert len(result.dfq) == 1 - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_question_fields_updated(self, mock_market, mock_res, metaculus_source): """Unresolved question fields are updated from market data.""" @@ -753,7 +683,7 @@ def test_question_fields_updated(self, mock_market, mock_res, metaculus_source): assert row["market_info_resolution_criteria"] == "New criteria" assert "metaculus.com/questions/42472" in row["url"] - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_background_empty_string_kept(self, mock_market, mock_res, metaculus_source): """Empty description string is stored as-is, not converted to 'N/A'.""" @@ -767,7 +697,7 @@ def test_background_empty_string_kept(self, mock_market, mock_res, metaculus_sou result = metaculus_source.update(dfq, dff) assert result.dfq.iloc[0]["background"] == "" - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_background_missing_key_becomes_na(self, mock_market, mock_res, metaculus_source): """Missing description key becomes 'N/A'.""" @@ -782,7 +712,7 @@ def test_background_missing_key_becomes_na(self, mock_market, mock_res, metaculu result = metaculus_source.update(dfq, dff) assert result.dfq.iloc[0]["background"] == "N/A" - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_resolved_market_sets_resolution_datetime( self, mock_market, mock_res, metaculus_source @@ -808,10 +738,10 @@ def test_resolved_market_sets_resolution_datetime( # min(close=March 1, resolve=Feb 15) = Feb 15 assert "2026-02-15" in str(row["market_info_resolution_datetime"]) - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_resolution_file_stored(self, mock_market, mock_res, metaculus_source): - """Resolution file from _create_resolution_file is stored in result.""" + """Resolution file from _build_resolution_df is stored in result.""" mock_market.return_value = make_metaculus_market() res_df = self._resolution_df() mock_res.return_value = res_df @@ -822,7 +752,7 @@ def test_resolution_file_stored(self, mock_market, mock_res, metaculus_source): result = metaculus_source.update(dfq, dff) assert "42472" in result.resolution_files - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_freeze_value_from_last_resolution(self, mock_market, mock_res, metaculus_source): """freeze_datetime_value is the last value in the resolution file.""" @@ -836,10 +766,10 @@ def test_freeze_value_from_last_resolution(self, mock_market, mock_res, metaculu # QuestionFrame coerces freeze_datetime_value to str assert str(result.dfq.iloc[0]["freeze_datetime_value"]) == "0.75" - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_freeze_value_na_when_no_resolution(self, mock_market, mock_res, metaculus_source): - """freeze_datetime_value is 'N/A' when _create_resolution_file returns None.""" + """freeze_datetime_value is 'N/A' when _build_resolution_df returns None.""" mock_market.return_value = make_metaculus_market() mock_res.return_value = None @@ -849,7 +779,7 @@ def test_freeze_value_na_when_no_resolution(self, mock_market, mock_res, metacul result = metaculus_source.update(dfq, dff) assert result.dfq.iloc[0]["freeze_datetime_value"] == "N/A" - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_market_api_failure_propagates(self, mock_market, mock_res, metaculus_source): """A persistent _get_market failure propagates (fail loudly), not silently skipped.""" @@ -862,7 +792,7 @@ def test_market_api_failure_propagates(self, mock_market, mock_res, metaculus_so metaculus_source.update(dfq, dff) mock_res.assert_not_called() - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_resolved_missing_resolution_regenerated(self, mock_market, mock_res, metaculus_source): """Resolved question without existing resolution file triggers regeneration.""" @@ -895,7 +825,7 @@ def test_resolved_missing_resolution_regenerated(self, mock_market, mock_res, me mock_market.assert_called_once_with("42472") assert "42472" in result.resolution_files - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_resolved_with_existing_file_not_regenerated( self, mock_market, mock_res, metaculus_source @@ -912,12 +842,12 @@ def test_resolved_with_existing_file_not_regenerated( ) dff = make_metaculus_fetch_df([]) - metaculus_source.update(dfq, dff, files_in_storage=["metaculus/42472.jsonl"]) + metaculus_source.update(dfq, dff, existing_resolution_ids={"42472"}) mock_market.assert_not_called() mock_res.assert_not_called() - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_cap_new_questions(self, mock_market, mock_res, metaculus_source): """New IDs exceeding _QUESTION_LIMIT - unresolved are capped.""" @@ -942,7 +872,7 @@ def test_api_key_required(self): with pytest.raises(RuntimeError, match="api_key must be set"): src.update(dfq, dff) - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_forecast_horizons_always_na(self, mock_market, mock_res, metaculus_source): """Every updated row has forecast_horizons = 'N/A'.""" @@ -955,7 +885,7 @@ def test_forecast_horizons_always_na(self, mock_market, mock_res, metaculus_sour result = metaculus_source.update(dfq, dff) assert result.dfq.iloc[0]["forecast_horizons"] == "N/A" - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_valid_question_frame_output(self, mock_market, mock_res, metaculus_source): """result.dfq passes QuestionFrame.validate().""" diff --git a/src/tests/test_polymarket.py b/src/tests/test_polymarket.py index 02e444bf..9f21635e 100644 --- a/src/tests/test_polymarket.py +++ b/src/tests/test_polymarket.py @@ -151,12 +151,12 @@ def test_preserves_time_component(self): # --------------------------------------------------------------------------- -# _build_resolution_file (pure, no mocking) +# _build_resolution_df (pure, no mocking) # --------------------------------------------------------------------------- -class TestBuildResolutionFile: - """Tests for PolymarketSource._build_resolution_file.""" +class TestBuildResolutionDf: + """Tests for PolymarketSource._build_resolution_df.""" def test_basic(self, polymarket_source): """Extracts resolution df from historical_prices and validates schema.""" @@ -167,7 +167,7 @@ def test_basic(self, polymarket_source): {"date": "2024-06-02", "value": 0.6}, ], } - result = polymarket_source._build_resolution_file(question) + result = polymarket_source._build_resolution_df(question) assert len(result) == 2 assert (result["id"] == "0xabc123").all() assert list(result.columns) == ["id", "date", "value"] @@ -182,7 +182,7 @@ def test_no_benchmark_filter(self, polymarket_source): {"date": "2024-06-01", "value": 0.5}, ], } - result = polymarket_source._build_resolution_file(question) + result = polymarket_source._build_resolution_df(question) assert len(result) == 2 @@ -860,14 +860,14 @@ def _row(condition_id): # --------------------------------------------------------------------------- -# update() (mock _build_resolution_file) +# update() (mock _build_resolution_df) # --------------------------------------------------------------------------- class TestUpdate: """Tests for PolymarketSource.update.""" - @patch.object(PolymarketSource, "_build_resolution_file") + @patch.object(PolymarketSource, "_build_resolution_df") def test_new_id_appended(self, mock_build, polymarket_source): """An ID in dff not in dfq gets appended.""" mock_build.return_value = make_resolution_df( @@ -881,7 +881,7 @@ def test_new_id_appended(self, mock_build, polymarket_source): assert "0xnew" in result.dfq["id"].values assert len(result.dfq) == 2 - @patch.object(PolymarketSource, "_build_resolution_file") + @patch.object(PolymarketSource, "_build_resolution_df") def test_existing_updated(self, mock_build, polymarket_source): """An existing ID's fields are updated from dff.""" mock_build.return_value = make_resolution_df( @@ -896,9 +896,9 @@ def test_existing_updated(self, mock_build, polymarket_source): assert row["question"] == "Updated question" assert len(result.dfq) == 1 - @patch.object(PolymarketSource, "_build_resolution_file") + @patch.object(PolymarketSource, "_build_resolution_df") def test_resolution_file_stored(self, mock_build, polymarket_source): - """The resolution file from _build_resolution_file is in the result.""" + """The resolution file from _build_resolution_df is in the result.""" mock_build.return_value = make_resolution_df( [{"id": "0xabc123", "date": "2024-06-01", "value": 0.5}] ) @@ -909,7 +909,7 @@ def test_resolution_file_stored(self, mock_build, polymarket_source): assert "0xabc123" in result.resolution_files - @patch.object(PolymarketSource, "_build_resolution_file") + @patch.object(PolymarketSource, "_build_resolution_df") def test_strips_transient_fields(self, mock_build, polymarket_source): """Transient fields are not present in the output dfq.""" mock_build.return_value = make_resolution_df( @@ -923,7 +923,7 @@ def test_strips_transient_fields(self, mock_build, polymarket_source): for col in ["fetch_datetime", "probability", "historical_prices"]: assert col not in result.dfq.columns - @patch.object(PolymarketSource, "_build_resolution_file") + @patch.object(PolymarketSource, "_build_resolution_df") def test_output_schema_valid(self, mock_build, polymarket_source): """The output dfq passes QuestionFrame validation.""" mock_build.return_value = make_resolution_df( diff --git a/src/tests/test_yfinance.py b/src/tests/test_yfinance.py index 29c047fb..de4324ec 100644 --- a/src/tests/test_yfinance.py +++ b/src/tests/test_yfinance.py @@ -1,20 +1,21 @@ """Tests for yfinance source, fetch, and update logic.""" from datetime import date -from unittest.mock import patch +from unittest.mock import MagicMock, Mock, patch import pandas as pd import pytest +from _schemas import YfinanceFetchFrame from helpers import constants -from questions.yfinance.fetch.main import fetch_all_stock -from questions.yfinance.update_questions.main import ( - finalize_resolution_file, - update_questions, -) from sources._metadata import SOURCE_METADATA from sources.yfinance import YfinanceSource -from tests.conftest import make_forecast_df, make_question_df +from tests.conftest import ( + make_forecast_df, + make_question_df, + make_resolution_df, + make_yfinance_fetch_df, +) DELISTED_STOCKS = SOURCE_METADATA["yfinance"]["nullified_questions"] TICKER_RENAMES = SOURCE_METADATA["yfinance"]["ticker_renames"] @@ -153,380 +154,411 @@ def test_resolve_pre_delisting_question_resolves_normally(self, source): assert bool(jnpr_row["resolved"]) is True -class TestDetectDelistedStocks: - """Test detection of delisted stocks during fetch.""" - - @patch("questions.yfinance.fetch.main.time.sleep") - @patch("questions.yfinance.fetch.main.get_sp500_tickers") - @patch("questions.yfinance.fetch.main.fetch_one_stock") - def test_stock_not_in_sp500_and_fetch_fails_is_resolved( - self, mock_fetch, mock_sp500, mock_sleep +# =========================================================================== +# Refactored YfinanceSource (sources/yfinance.py) — fetch/update behaviour. +# The classes above test the metadata and base-class nullification; the classes +# below test the refactored source's own fetch/update logic. +# =========================================================================== + + +class TestSourceGetDateToday: + """Tests for YfinanceSource.get_date_today (pinned-date accessor).""" + + def test_falls_back_to_live_date_when_unpinned(self, yfinance_source, freeze_today): + """With no run pinned, returns the live date.""" + freeze_today(date(2026, 3, 18)) + assert yfinance_source.get_date_today() == date(2026, 3, 18) + + def test_returns_pinned_date(self, yfinance_source, freeze_today): + """Once pinned, returns the pinned date regardless of the live clock.""" + freeze_today(date(2026, 3, 18)) + yfinance_source._today = date(2025, 1, 1) + assert yfinance_source.get_date_today() == date(2025, 1, 1) + + +class TestSourceGetSp500Tickers: + """Tests for YfinanceSource._get_sp500_tickers.""" + + _HTML = """ + + + + + + +
SymbolSecurity
AAPLApple Inc.
MSFTMicrosoft Corp.
GOOGLAlphabet Inc.
+ + """ + + @patch("sources.yfinance.requests.get") + def test_parses_wikipedia_table(self, mock_get): + """Returns the ticker list from the Wikipedia constituents table.""" + mock_resp = Mock() + mock_resp.content = self._HTML.encode() + mock_resp.raise_for_status = Mock() + mock_get.return_value = mock_resp + + assert YfinanceSource._get_sp500_tickers() == ["AAPL", "MSFT", "GOOGL"] + + @patch("sources.yfinance.requests.get") + def test_sends_benchmark_user_agent(self, mock_get): + """Sends the benchmark User-Agent header (regression for the dropped/Mozilla UA).""" + mock_resp = Mock() + mock_resp.content = self._HTML.encode() + mock_resp.raise_for_status = Mock() + mock_get.return_value = mock_resp + + YfinanceSource._get_sp500_tickers() + + _, kwargs = mock_get.call_args + assert kwargs["headers"] == {"User-Agent": constants.BENCHMARK_USER_AGENT} + + @patch("sources.yfinance.requests.get") + def test_returns_empty_on_error(self, mock_get): + """Returns an empty list when the request fails (legacy-faithful swallow).""" + mock_get.side_effect = Exception("Network error") + assert YfinanceSource._get_sp500_tickers() == [] + + +class TestSourceSelectTimeRange: + """Tests for YfinanceSource._select_time_range.""" + + @pytest.mark.parametrize( + "days,expected", + [ + (0, "1d"), + (1, "1d"), + (5, "5d"), + (30, "1mo"), + (90, "3mo"), + (180, "6mo"), + (365, "1y"), + (730, "2y"), + (1825, "5y"), + (3650, "10y"), + (4000, "max"), + ], + ) + def test_time_range_mapping(self, days, expected): + """Correct yfinance period for each day range.""" + assert YfinanceSource._select_time_range(days) == expected + + +class TestSourceFetchOneStock: + """Tests for YfinanceSource._fetch_one_stock.""" + + @patch("sources.yfinance.yf.Ticker") + def test_uses_unadjusted_close(self, mock_ticker_cls, yfinance_source, freeze_today): + """History is requested with auto_adjust=False (raw close prices).""" + freeze_today(date(2026, 3, 18)) + mock_ticker = MagicMock() + mock_ticker.info = {"longName": "Apple Inc."} + hist = pd.DataFrame( + {"Date": pd.to_datetime(["2026-03-16", "2026-03-17"]), "Close": [253.0, 254.23]} + ).set_index("Date") + mock_ticker.history.return_value = hist + mock_ticker_cls.return_value = mock_ticker + + name, out = yfinance_source._fetch_one_stock("AAPL") + + mock_ticker.history.assert_called_once_with(period="5d", auto_adjust=False) + assert name == "Apple Inc." + assert out["Close"].iloc[-1] == 254.23 # capped at yesterday, last row + + @patch("sources.yfinance.yf.Ticker") + def test_returns_none_on_error(self, mock_ticker_cls, yfinance_source): + """Returns (None, None) when the ticker lookup fails (legacy-faithful swallow).""" + mock_ticker_cls.return_value.info.__getitem__.side_effect = KeyError("longName") + assert yfinance_source._fetch_one_stock("INVALID") == (None, None) + + +class TestSourceFetch: + """Tests for YfinanceSource.fetch.""" + + @staticmethod + def _ticker_with(close, name="Apple Inc.", summary="A company."): + ticker = MagicMock() + ticker.info = {"longName": name, "longBusinessSummary": summary} + ticker.history.return_value = pd.DataFrame( + {"Date": pd.to_datetime(["2026-03-17"]), "Close": [close]} + ).set_index("Date") + return ticker + + @patch("sources.yfinance.yf.Ticker") + @patch.object(YfinanceSource, "_get_sp500_tickers", return_value=["AAPL"]) + def test_builds_valid_fetch_frame( + self, _mock_tickers, mock_ticker_cls, yfinance_source, freeze_today + ): + """A fetched ticker produces a schema-valid YfinanceFetchFrame row.""" + freeze_today(date(2026, 3, 18)) + mock_ticker_cls.return_value = self._ticker_with(254.23) + + dff = yfinance_source.fetch(dfq=make_question_df([{"id": "AAPL"}])) + + YfinanceFetchFrame.validate(dff) + row = dff[dff["id"] == "AAPL"].iloc[0] + assert bool(row["resolved"]) is False + assert row["url"] == "https://finance.yahoo.com/quote/AAPL" + assert float(row["probability"]) == 254.23 + + @patch("sources.yfinance.yf.Ticker") + @patch.object(YfinanceSource, "_get_sp500_tickers", return_value=[]) + def test_delisted_ticker_marked_resolved( + self, _mock_tickers, mock_ticker_cls, yfinance_source, freeze_today ): - """A stock not in S&P 500 that fails to fetch appears as resolved in the result.""" - mock_sp500.return_value = ["AAPL", "MSFT"] - mock_fetch.return_value = (None, None) + """A pool ticker that left the S&P 500 and fails to fetch is carried forward as resolved.""" + freeze_today(date(2026, 3, 18)) + mock_ticker_cls.return_value.info.__getitem__.side_effect = KeyError("longName") + + dfq = make_question_df([{"id": "OLDCO", "question": "legacy question"}]) + dff = yfinance_source.fetch(dfq=dfq) + + row = dff[dff["id"] == "OLDCO"].iloc[0] + assert bool(row["resolved"]) is True + assert row["freeze_datetime_value"] == "N/A" + assert pd.isna(row["probability"]) + assert row["question"] == "legacy question" # original question text preserved + + @patch("sources.yfinance.yf.Ticker") + @patch.object(YfinanceSource, "_fetch_one_stock") + @patch.object(YfinanceSource, "_get_sp500_tickers", return_value=["AAPL", "FAILS"]) + def test_in_sp500_fetch_failure_is_dropped( + self, _mock_tickers, mock_fetch_one, mock_ticker_cls, yfinance_source, freeze_today + ): + """A ticker still in the S&P 500 that fails to fetch is dropped (not delisted).""" + freeze_today(date(2026, 3, 18)) + hist = pd.DataFrame({"Close": [254.23], "Date": pd.to_datetime(["2026-03-17"])}) + mock_fetch_one.side_effect = lambda sym: ( + ("Apple Inc.", hist) if sym == "AAPL" else (None, None) + ) + mock_ticker_cls.return_value.info.get.return_value = "N/A" - from tests.conftest import make_question_df + dff = yfinance_source.fetch(dfq=make_question_df([{"id": "AAPL"}])) - dfq = make_question_df( - [ - {"id": "AAPL"}, - {"id": "MSFT"}, - {"id": "HES", "question": "Will HES go up?", "background": "Hess Corp."}, - ] - ) - result_df = fetch_all_stock(dfq) - hes_rows = result_df[result_df["id"] == "HES"] - assert len(hes_rows) == 1 - hes = hes_rows.iloc[0] - assert bool(hes["resolved"]) is True - assert hes["question"] == "Will HES go up?" - assert hes["background"] == "Hess Corp." - assert hes["freeze_datetime_value"] == "N/A" + assert "FAILS" not in dff["id"].values + assert "AAPL" in dff["id"].values - @patch("questions.yfinance.fetch.main.time.sleep") - @patch("questions.yfinance.fetch.main.get_sp500_tickers") - @patch("questions.yfinance.fetch.main.fetch_one_stock") - def test_stock_in_sp500_and_fetch_fails_not_in_result(self, mock_fetch, mock_sp500, mock_sleep): - """A stock in S&P 500 that fails to fetch doesn't appear in results.""" - mock_sp500.return_value = ["AAPL", "MSFT"] - mock_fetch.return_value = (None, None) - - dfq = pd.DataFrame({"id": ["AAPL"]}) - result_df = fetch_all_stock(dfq) - assert result_df.empty or "AAPL" not in result_df["id"].values - - @patch("questions.yfinance.fetch.main.time.sleep") - @patch("questions.yfinance.fetch.main.dates") - @patch("questions.yfinance.fetch.main.yf") - @patch("questions.yfinance.fetch.main.get_sp500_tickers") - @patch("questions.yfinance.fetch.main.fetch_one_stock") - def test_stock_not_in_sp500_but_fetch_succeeds_not_resolved( - self, mock_fetch, mock_sp500, mock_yf, mock_dates, mock_sleep + @patch("sources.yfinance.yf.Ticker") + @patch.object(YfinanceSource, "_fetch_one_stock") + @patch.object(YfinanceSource, "_get_sp500_tickers", return_value=["AAPL"]) + def test_not_in_sp500_but_fetch_succeeds_not_resolved( + self, _mock_tickers, mock_fetch_one, mock_ticker_cls, yfinance_source, freeze_today ): - """A stock not in S&P 500 that returns data has resolved=False.""" - mock_sp500.return_value = ["AAPL"] - hist = pd.DataFrame({"Close": [100.0], "Date": [pd.Timestamp("2026-04-12")]}) - mock_fetch.return_value = ("Hess Corporation", hist) - mock_yf.Ticker.return_value.info.get.return_value = "N/A" - mock_dates.get_datetime_now.return_value = "2026-04-13T00:00:00Z" - - dfq = pd.DataFrame({"id": ["AAPL", "HES"]}) - result_df = fetch_all_stock(dfq) - hes_rows = result_df[result_df["id"] == "HES"] - assert len(hes_rows) == 1 - assert bool(hes_rows.iloc[0]["resolved"]) is False - - -class TestUpdateQuestionsDelistedStock: - """Test that delisted stocks don't wipe out existing question data.""" - - @patch("questions.yfinance.update_questions.main.create_renamed_ticker_resolution_files") - @patch("questions.yfinance.update_questions.main.create_resolution_file") - def test_delisted_stock_preserves_existing_fields(self, mock_create_res, mock_create_renamed): - """When a delisted stock is updated, existing fields like question are preserved.""" - dfq = pd.DataFrame( - [ - { - "id": "AAPL", - "question": "Will AAPL's market close price go up?", - "background": "Apple Inc.", - "url": "https://finance.yahoo.com/quote/AAPL", - "resolved": False, - "forecast_horizons": [7, 30, 90], - "freeze_datetime_value": "200.0", - "freeze_datetime_value_explanation": "The latest market close price of AAPL.", - "market_info_resolution_criteria": "N/A", - "market_info_open_datetime": "N/A", - "market_info_close_datetime": "N/A", - "market_info_resolution_datetime": "N/A", - }, - { - "id": "HES", - "question": "Will HES's market close price go up?", - "background": "Hess Corporation is an oil company.", - "url": "https://finance.yahoo.com/quote/HES", - "resolved": False, - "forecast_horizons": [7, 30, 90], - "freeze_datetime_value": "150.0", - "freeze_datetime_value_explanation": "The latest market close price of HES.", - "market_info_resolution_criteria": "N/A", - "market_info_open_datetime": "N/A", - "market_info_close_datetime": "N/A", - "market_info_resolution_datetime": "N/A", - }, - ] - ) + """A pool ticker no longer in the S&P 500 that still returns data is not marked resolved.""" + freeze_today(date(2026, 3, 18)) + hist = pd.DataFrame({"Close": [100.0], "Date": pd.to_datetime(["2026-03-17"])}) + mock_fetch_one.return_value = ("Some Co", hist) + mock_ticker_cls.return_value.info.get.return_value = "N/A" - # Simulate fetch output: both entries are complete records. - dff = pd.DataFrame( - [ - { - "id": "AAPL", - "question": "Will AAPL's market close price go up?", - "background": "Apple Inc.", - "url": "https://finance.yahoo.com/quote/AAPL", - "resolved": False, - "forecast_horizons": [7, 30, 90], - "freeze_datetime_value": "200.0", - "freeze_datetime_value_explanation": "The latest market close price of AAPL.", - "market_info_resolution_criteria": "N/A", - "market_info_open_datetime": "N/A", - "market_info_close_datetime": "N/A", - "market_info_resolution_datetime": "N/A", - "fetch_datetime": "2026-04-16T00:00:00Z", - "probability": 200.0, - }, - { - "id": "HES", - "question": "Will HES's market close price go up?", - "background": "Hess Corporation is an oil company.", - "url": "https://finance.yahoo.com/quote/HES", - "resolved": True, - "forecast_horizons": [7, 30, 90], - "freeze_datetime_value": "N/A", - "freeze_datetime_value_explanation": "The latest market close price of HES.", - "market_info_resolution_criteria": "N/A", - "market_info_open_datetime": "N/A", - "market_info_close_datetime": "N/A", - "market_info_resolution_datetime": "N/A", - "fetch_datetime": "2026-04-16T00:00:00Z", - "probability": float("nan"), - }, - ] - ) + dff = yfinance_source.fetch(dfq=make_question_df([{"id": "AAPL"}, {"id": "OUTCO"}])) - result = update_questions(dfq, dff) - hes = result[result["id"] == "HES"].iloc[0] - assert bool(hes["resolved"]) is True - assert hes["question"] == "Will HES's market close price go up?" - assert hes["background"] == "Hess Corporation is an oil company." - assert hes["freeze_datetime_value"] == "N/A" + outco = dff[dff["id"] == "OUTCO"].iloc[0] + assert bool(outco["resolved"]) is False - @patch("questions.yfinance.update_questions.main.create_renamed_ticker_resolution_files") - @patch("questions.yfinance.update_questions.main.create_resolution_file") - def test_active_stock_updates_all_fields(self, mock_create_res, mock_create_renamed): - """When an active stock is updated, all fields from fetch are applied.""" - dfq = pd.DataFrame( - [ - { - "id": "AAPL", - "question": "Old question", - "background": "Old background", - "url": "https://finance.yahoo.com/quote/AAPL", - "resolved": False, - "forecast_horizons": [7, 30], - "freeze_datetime_value": "100.0", - "freeze_datetime_value_explanation": "The latest market close price of AAPL.", - "market_info_resolution_criteria": "N/A", - "market_info_open_datetime": "N/A", - "market_info_close_datetime": "N/A", - "market_info_resolution_datetime": "N/A", - } - ] - ) - dff = pd.DataFrame( - [ - { - "id": "AAPL", - "question": "New question", - "background": "New background", - "url": "https://finance.yahoo.com/quote/AAPL", - "resolved": False, - "forecast_horizons": constants.FORECAST_HORIZONS_IN_DAYS, - "freeze_datetime_value": "200.0", - "freeze_datetime_value_explanation": "The latest market close price of AAPL.", - "market_info_resolution_criteria": "N/A", - "market_info_open_datetime": "N/A", - "market_info_close_datetime": "N/A", - "market_info_resolution_datetime": "N/A", - "fetch_datetime": "2026-04-16T00:00:00Z", - "probability": 200.0, - } - ] - ) +class TestSourceBuildResolutionDf: + """Tests for YfinanceSource._build_resolution_df.""" - result = update_questions(dfq, dff) - aapl = result[result["id"] == "AAPL"].iloc[0] - assert aapl["question"] == "New question" - assert aapl["background"] == "New background" + @staticmethod + def _prices(dates_, values): + return pd.DataFrame({"date": pd.to_datetime(dates_), "value": values}) + def test_skips_when_up_to_date(self, yfinance_source, freeze_today): + """Returns None (no upload) when the existing file already reaches yesterday.""" + freeze_today(date(2026, 3, 18)) + existing = make_resolution_df([{"id": "AAPL", "date": "2026-03-17", "value": 250.0}]) + existing["date"] = existing["date"].astype(str) + out = yfinance_source._build_resolution_df( + {"id": "AAPL", "resolved": False}, period="1mo", existing_df=existing + ) + assert out is None + + @patch.object(YfinanceSource, "_fetch_historical_prices") + def test_force_rebuilds_when_up_to_date(self, mock_fetch, yfinance_source, freeze_today): + """force=True re-fetches even when the existing file is current.""" + freeze_today(date(2026, 3, 18)) + mock_fetch.return_value = self._prices(["2026-03-16", "2026-03-17"], [248.0, 251.0]) + existing = make_resolution_df([{"id": "AAPL", "date": "2026-03-17", "value": 250.0}]) + existing["date"] = existing["date"].astype(str) + + out = yfinance_source._build_resolution_df( + {"id": "AAPL", "resolved": False}, period="1mo", existing_df=existing, force=True + ) + assert out is not None + mock_fetch.assert_called_once() -class TestTickerRenameResolution: - """Test that renamed tickers are handled correctly in update_questions.""" + @patch.object(YfinanceSource, "_fetch_historical_prices") + def test_resolved_forward_fills_to_yesterday(self, mock_fetch, yfinance_source, freeze_today): + """A resolved (delisted) ticker is forward-filled through yesterday.""" + freeze_today(date(2026, 3, 18)) + mock_fetch.return_value = self._prices(["2026-03-13"], [99.5]) - @patch("questions.yfinance.update_questions.main.create_renamed_ticker_resolution_files") - @patch("questions.yfinance.update_questions.main.create_resolution_file") - def test_renamed_ticker_skipped_in_main_loop(self, mock_create_res, mock_create_renamed): - """create_resolution_file should not be called for original tickers in TICKER_RENAMES.""" - dfq = pd.DataFrame( - [ - { - "id": "AAPL", - "question": "Will AAPL go up?", - "background": "Apple Inc.", - "url": "https://finance.yahoo.com/quote/AAPL", - "resolved": False, - "forecast_horizons": [7, 30, 90], - "freeze_datetime_value": "200.0", - "freeze_datetime_value_explanation": "The latest market close price of AAPL.", - "market_info_resolution_criteria": "N/A", - "market_info_open_datetime": "N/A", - "market_info_close_datetime": "N/A", - "market_info_resolution_datetime": "N/A", - }, - { - "id": "FI", - "question": "Will FI go up?", - "background": "Fiserv.", - "url": "https://finance.yahoo.com/quote/FI", - "resolved": False, - "forecast_horizons": [7, 30, 90], - "freeze_datetime_value": "N/A", - "freeze_datetime_value_explanation": "The latest market close price of FI.", - "market_info_resolution_criteria": "N/A", - "market_info_open_datetime": "N/A", - "market_info_close_datetime": "N/A", - "market_info_resolution_datetime": "N/A", - }, - ] + out = yfinance_source._build_resolution_df( + {"id": "GONE", "resolved": True}, period="1mo", existing_df=None ) - - dff = pd.DataFrame( - [ - { - "id": "AAPL", - "question": "Will AAPL go up?", - "background": "Apple Inc.", - "url": "https://finance.yahoo.com/quote/AAPL", - "resolved": False, - "forecast_horizons": [7, 30, 90], - "freeze_datetime_value": "200.0", - "freeze_datetime_value_explanation": "The latest market close price of AAPL.", - "market_info_resolution_criteria": "N/A", - "market_info_open_datetime": "N/A", - "market_info_close_datetime": "N/A", - "market_info_resolution_datetime": "N/A", - "fetch_datetime": "2026-04-16T00:00:00Z", - "probability": 200.0, - }, - { - "id": "FI", - "question": "Will FI go up?", - "background": "Fiserv.", - "url": "https://finance.yahoo.com/quote/FI", - "resolved": True, - "forecast_horizons": [7, 30, 90], - "freeze_datetime_value": "N/A", - "freeze_datetime_value_explanation": "The latest market close price of FI.", - "market_info_resolution_criteria": "N/A", - "market_info_open_datetime": "N/A", - "market_info_close_datetime": "N/A", - "market_info_resolution_datetime": "N/A", - "fetch_datetime": "2026-04-16T00:00:00Z", - "probability": float("nan"), - }, - ] + assert out is not None + assert pd.to_datetime(out["date"]).max().date() == date(2026, 3, 17) # yesterday + assert float(out["value"].iloc[-1]) == 99.5 # final close carried forward + assert (out["id"] == "GONE").all() + + @patch.object(YfinanceSource, "_fetch_historical_prices") + def test_new_ticker_failed_fetch_returns_none(self, mock_fetch, yfinance_source, freeze_today): + """A brand-new ticker whose fetch returns nothing writes no file (no empty upload).""" + freeze_today(date(2026, 3, 18)) + mock_fetch.return_value = pd.DataFrame() # fetch failure / no data + out = yfinance_source._build_resolution_df( + {"id": "NEWCO", "resolved": False}, period="1mo", existing_df=None ) + assert out is None + + @patch.object(YfinanceSource, "_fetch_historical_prices") + def test_failed_fetch_keeps_existing_file(self, mock_fetch, yfinance_source, freeze_today): + """When fetch fails but a file exists, the existing data is kept (returned unchanged).""" + freeze_today(date(2026, 3, 18)) + mock_fetch.return_value = pd.DataFrame() # fetch failure + existing = make_resolution_df([{"id": "AAPL", "date": "2026-03-10", "value": 250.0}]) + existing["date"] = existing["date"].astype(str) + # Not up-to-date (last date 03-10 < yesterday 03-17), so it doesn't early-skip; fetch fails. + out = yfinance_source._build_resolution_df( + {"id": "AAPL", "resolved": False}, period="1mo", existing_df=existing, force=True + ) + # Existing equals the fallback -> no change -> None (existing file left as-is). + assert out is None + + @patch.object(YfinanceSource, "_fetch_historical_prices") + def test_unchanged_returns_none(self, mock_fetch, yfinance_source, freeze_today): + """If the rebuilt file equals the existing one, returns None (no upload).""" + freeze_today(date(2026, 3, 18)) + mock_fetch.return_value = self._prices(["2026-03-16", "2026-03-17"], [248.0, 251.0]) + built = yfinance_source._build_resolution_df( + {"id": "AAPL", "resolved": False}, period="1mo", existing_df=None + ) + out = yfinance_source._build_resolution_df( + {"id": "AAPL", "resolved": False}, period="1mo", existing_df=built, force=True + ) + assert out is None - result = update_questions(dfq, dff) - - # create_resolution_file should only be called for AAPL, not FI - called_ids = [c.args[0]["id"] for c in mock_create_res.call_args_list] - assert "AAPL" in called_ids - assert "FI" not in called_ids - # But FI should still be updated in dfq (resolved=True flows through) - fi_row = result[result["id"] == "FI"].iloc[0] - assert bool(fi_row["resolved"]) is True +class TestSourceUpdate: + """Tests for YfinanceSource.update.""" - @patch("questions.yfinance.update_questions.main.gcp.storage.upload") - @patch("questions.yfinance.update_questions.main.gcp.storage.download_no_error_message_on_404") - @patch("questions.yfinance.update_questions.main.get_historical_prices") - def test_renamed_ticker_fetches_replacement_data( - self, mock_get_hist, mock_download, mock_upload + @patch.object(YfinanceSource, "_fetch_historical_prices") + def test_appends_new_question_and_strips_transient( + self, mock_fetch, yfinance_source, freeze_today ): - """Data is fetched using replacement_ticker and written with original_ticker as ID.""" - from questions.yfinance.update_questions.main import ( - create_renamed_ticker_resolution_files, + """A new fetched ticker is added to dfq without transient fetch columns.""" + freeze_today(date(2026, 3, 18)) + mock_fetch.return_value = pd.DataFrame( + {"date": pd.to_datetime(["2026-03-16", "2026-03-17"]), "value": [10.0, 11.0]} ) + dfq = make_question_df([{"id": "OLD"}]) + dff = make_yfinance_fetch_df([{"id": "NEW"}]) - mock_download.return_value = None # no existing file + result = yfinance_source.update(dfq, dff) - mock_get_hist.return_value = pd.DataFrame( - { - "id": ["FISV", "FISV"], - "date": ["2026-04-14", "2026-04-15"], - "value": [220.0, 221.0], - } - ) + assert "NEW" in result.dfq["id"].values + assert "fetch_datetime" not in result.dfq.columns + assert "probability" not in result.dfq.columns + assert "NEW" in result.resolution_files - create_renamed_ticker_resolution_files("5y") + @patch.object(YfinanceSource, "_fetch_historical_prices") + def test_renamed_ticker_resolution_built_from_replacement( + self, mock_fetch, yfinance_source, freeze_today + ): + """Renamed tickers resolve under the original id using the replacement's price history. - # get_historical_prices should be called with replacement tickers - call_tickers = [c.args[1] for c in mock_get_hist.call_args_list] - assert "FISV" in call_tickers - assert "MRSH" in call_tickers + Regression for the delisted/renamed-ticker handling (prod fix 5042c68). + """ + freeze_today(date(2026, 3, 18)) + renames = yfinance_source.ticker_renames + assert renames, "yfinance metadata should declare ticker_renames" + original = renames[0]["original_ticker"] + replacement = renames[0]["replacement_ticker"] - # Upload should write to yfinance/{original}.jsonl - upload_filenames = [c[1]["filename"] for c in mock_upload.call_args_list] - assert "yfinance/FI.jsonl" in upload_filenames - assert "yfinance/MMC.jsonl" in upload_filenames + seen = [] - @patch("questions.yfinance.update_questions.main.finalize_resolution_file") - @patch("questions.yfinance.update_questions.main.gcp.storage.upload") - @patch("questions.yfinance.update_questions.main.gcp.storage.download_no_error_message_on_404") - @patch("questions.yfinance.update_questions.main.get_historical_prices") - def test_renamed_ticker_not_finalized( - self, mock_get_hist, mock_download, mock_upload, mock_finalize - ): - """finalize_resolution_file should not be called for renamed tickers.""" - from questions.yfinance.update_questions.main import ( - create_renamed_ticker_resolution_files, - ) + def fake_fetch(symbol, period): + seen.append(symbol) + return pd.DataFrame( + {"date": pd.to_datetime(["2026-03-16", "2026-03-17"]), "value": [5.0, 6.0]} + ) - mock_download.return_value = None + mock_fetch.side_effect = fake_fetch - mock_get_hist.return_value = pd.DataFrame( - { - "id": ["FISV", "FISV"], - "date": ["2026-04-14", "2026-04-15"], - "value": [220.0, 221.0], - } - ) + dfq = make_question_df([{"id": original}]) + # The original ticker is NOT in dff (yfinance serves no data under it). + dff = make_yfinance_fetch_df([{"id": "AAPL"}]) - create_renamed_ticker_resolution_files("5y") + result = yfinance_source.update(dfq, dff) - mock_finalize.assert_not_called() + assert original in result.resolution_files + assert (result.resolution_files[original]["id"] == original).all() + assert replacement in seen + @patch.object(YfinanceSource, "_fetch_historical_prices") + def test_renamed_original_skipped_in_main_loop(self, mock_fetch, yfinance_source, freeze_today): + """A renamed original in dff is built via its replacement, never fetched directly.""" + freeze_today(date(2026, 3, 18)) + original = yfinance_source.ticker_renames[0]["original_ticker"] + replacement = yfinance_source.ticker_renames[0]["replacement_ticker"] -class TestFinalizeResolutionFile: - """Test forward-filling a resolution file to cover resolution dates up to yesterday.""" + seen = [] - def test_forward_fills_to_yesterday(self, freeze_today): - """The finalized resolution file extends to yesterday, not into the future.""" - freeze_today(date(2026, 4, 13)) + def fake_fetch(symbol, period): + seen.append(symbol) + return pd.DataFrame({"date": pd.to_datetime(["2026-03-17"]), "value": [6.0]}) - df = pd.DataFrame( - { - "id": ["HES", "HES", "HES"], - "date": ["2025-05-01", "2025-05-02", "2025-05-03"], - "value": [150.0, 151.0, 149.0], - } + mock_fetch.side_effect = fake_fetch + + dfq = make_question_df([{"id": original}]) + dff = make_yfinance_fetch_df([{"id": original}]) + + result = yfinance_source.update(dfq, dff) + + assert original in result.resolution_files + assert original not in seen # original symbol never fetched directly + assert replacement in seen + + @patch.object(YfinanceSource, "_fetch_historical_prices") + def test_delisted_stock_preserves_existing_question_fields( + self, mock_fetch, yfinance_source, freeze_today + ): + """Updating a delisted (resolved) ticker keeps its existing question text/background.""" + freeze_today(date(2026, 3, 18)) + mock_fetch.return_value = pd.DataFrame( + {"date": pd.to_datetime(["2026-03-13"]), "value": [149.0]} + ) + dfq = make_question_df( + [{"id": "HES", "question": "Will HES go up?", "background": "Hess Corporation."}] + ) + dff = make_yfinance_fetch_df( + [ + { + "id": "HES", + "question": "Will HES go up?", + "background": "Hess Corporation.", + "resolved": True, + "freeze_datetime_value": "N/A", + "probability": float("nan"), + } + ] ) - result = finalize_resolution_file(df) + result = yfinance_source.update(dfq, dff) + + hes = result.dfq[result.dfq["id"] == "HES"].iloc[0] + assert bool(hes["resolved"]) is True + assert hes["question"] == "Will HES go up?" + assert hes["background"] == "Hess Corporation." + assert hes["freeze_datetime_value"] == "N/A" + # Resolved tickers are forward-filled and uploaded. + assert "HES" in result.resolution_files - last_date = pd.to_datetime(result["date"]).max().date() - assert last_date == date(2026, 4, 12) # yesterday - # All forward-filled values should be the last known price - final_rows = result[pd.to_datetime(result["date"]).dt.date > date(2025, 5, 3)] - assert (final_rows["value"].astype(float) == 149.0).all() +class TestSourceFinalizeResolutionFile: + """Tests for YfinanceSource._finalize_resolution_file.""" - def test_empty_df_returns_empty(self): + def test_empty_df_returns_empty(self, yfinance_source): """Empty input returns empty output.""" df = pd.DataFrame(columns=["id", "date", "value"]) - result = finalize_resolution_file(df) - assert result.empty + assert yfinance_source._finalize_resolution_file(df).empty