From 0f091ee85960b6171725aeac65d2ce0679d1f084 Mon Sep 17 00:00:00 2001 From: Nikolay Petrov Date: Fri, 5 Jun 2026 21:19:42 +0300 Subject: [PATCH 1/3] refactor: harmonize resolution-file existence checks across market sources --- src/orchestration/_source_io.py | 25 ++++++++++++++----- src/orchestration/func_infer_fetch/main.py | 9 +++---- .../func_manifold_update/main.py | 9 +++---- .../func_metaculus_update/main.py | 9 +++---- src/sources/infer.py | 9 ++++--- src/sources/manifold.py | 10 ++++---- src/sources/metaculus.py | 11 ++++---- src/tests/test_infer.py | 6 ++--- src/tests/test_manifold.py | 4 +-- src/tests/test_metaculus.py | 2 +- 10 files changed, 49 insertions(+), 45 deletions(-) diff --git a/src/orchestration/_source_io.py b/src/orchestration/_source_io.py index 10a7d436..a28a6a7f 100644 --- a/src/orchestration/_source_io.py +++ b/src/orchestration/_source_io.py @@ -34,6 +34,24 @@ def write_fetch_output(source: str, dff: pd.DataFrame) -> None: ) +def list_existing_resolution_ids(source: str) -> set[str]: + """Return the set of bare question IDs that already have a resolution file. + + Lists /*.jsonl and strips to bare IDs. This is a pure existence + listing: it includes files whose contents are empty, mirroring the old + inline list_with_prefix check. Do NOT derive this from + load_existing_resolution_files, which drops empty frames. + + Args: + source (str): Source name (e.g. "infer"). + + Returns: + set of bare question IDs present in storage for this source. + """ + paths = gcp.storage.list_with_prefix(bucket_name=env.QUESTION_BANK_BUCKET, prefix=f"{source}/") + return {os.path.basename(p).removesuffix(".jsonl") for p in paths if p.endswith(".jsonl")} + + def load_existing_resolution_files( source: str, ids: Iterable[str] | None = None, @@ -52,12 +70,7 @@ def load_existing_resolution_files( dict mapping question_id to its resolution DataFrame. """ if ids is None: - paths = gcp.storage.list_with_prefix( - bucket_name=env.QUESTION_BANK_BUCKET, prefix=f"{source}/" - ) - question_ids = [ - os.path.basename(p).removesuffix(".jsonl") for p in paths if p.endswith(".jsonl") - ] + question_ids = list(list_existing_resolution_ids(source)) else: question_ids = [str(qid) for qid in ids] diff --git a/src/orchestration/func_infer_fetch/main.py b/src/orchestration/func_infer_fetch/main.py index f39baf1e..968feb87 100644 --- a/src/orchestration/func_infer_fetch/main.py +++ b/src/orchestration/func_infer_fetch/main.py @@ -5,10 +5,9 @@ import logging from typing import Any -from helpers import data_utils, decorator, env, keys +from helpers import data_utils, decorator, keys from orchestration import _source_io from sources.infer import InferSource -from utils import gcp logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -23,11 +22,9 @@ def driver(_: Any) -> None: source.api_key = keys.API_KEY_INFER dfq = data_utils.get_data_from_cloud_storage(SOURCE, return_question_data=True) - files_in_storage = gcp.storage.list_with_prefix( - bucket_name=env.QUESTION_BANK_BUCKET, prefix=SOURCE - ) + existing_resolution_ids = _source_io.list_existing_resolution_ids(SOURCE) - dff = source.fetch(dfq=dfq, files_in_storage=files_in_storage) + dff = source.fetch(dfq=dfq, existing_resolution_ids=existing_resolution_ids) _source_io.write_fetch_output(SOURCE, dff) logger.info("Done.") diff --git a/src/orchestration/func_manifold_update/main.py b/src/orchestration/func_manifold_update/main.py index a152a0dc..8279dc19 100644 --- a/src/orchestration/func_manifold_update/main.py +++ b/src/orchestration/func_manifold_update/main.py @@ -5,10 +5,9 @@ import logging from typing import Any -from helpers import data_utils, decorator, env +from helpers import data_utils, decorator from orchestration import _source_io from sources.manifold import ManifoldSource -from utils import gcp logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -32,15 +31,13 @@ def driver(_: Any) -> None: existing_resolution_files = _source_io.load_existing_resolution_files(SOURCE) logger.info(f"Loaded {len(existing_resolution_files)} resolution files") - files_in_storage = gcp.storage.list_with_prefix( - bucket_name=env.QUESTION_BANK_BUCKET, prefix=SOURCE - ) + existing_resolution_ids = _source_io.list_existing_resolution_ids(SOURCE) result = source.update( dfq, dff, existing_resolution_files=existing_resolution_files, - files_in_storage=files_in_storage, + existing_resolution_ids=existing_resolution_ids, ) logger.info("Uploading to GCP...") diff --git a/src/orchestration/func_metaculus_update/main.py b/src/orchestration/func_metaculus_update/main.py index b013829a..63baa0e3 100644 --- a/src/orchestration/func_metaculus_update/main.py +++ b/src/orchestration/func_metaculus_update/main.py @@ -5,10 +5,9 @@ import logging from typing import Any -from helpers import data_utils, decorator, env, keys +from helpers import data_utils, decorator, keys from orchestration import _source_io from sources.metaculus import MetaculusSource -from utils import gcp logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -26,14 +25,12 @@ def driver(_: Any) -> None: SOURCE, return_question_data=True, return_fetch_data=True ) - files_in_storage = gcp.storage.list_with_prefix( - bucket_name=env.QUESTION_BANK_BUCKET, prefix=SOURCE - ) + existing_resolution_ids = _source_io.list_existing_resolution_ids(SOURCE) result = source.update( dfq, dff, - files_in_storage=files_in_storage, + existing_resolution_ids=existing_resolution_ids, ) logger.info("Uploading to GCP...") diff --git a/src/sources/infer.py b/src/sources/infer.py index 7cf88c10..29c4fbd4 100644 --- a/src/sources/infer.py +++ b/src/sources/infer.py @@ -40,16 +40,17 @@ def fetch( self, *, dfq: DataFrame[QuestionFrame] | None = None, - files_in_storage: list[str] | None = None, + existing_resolution_ids: set[str] | None = None, ) -> DataFrame[InferFetchFrame]: """Fetch questions from the INFER API. Args: dfq (DataFrame[QuestionFrame] | None): Existing question bank. - files_in_storage (list[str] | None): Existing resolution file paths. + existing_resolution_ids (set[str] | None): Bare IDs that already have a resolution + file in storage. """ self._require_api_key() - files_in_storage = files_in_storage or [] + existing_resolution_ids = existing_resolution_ids or set() # Determine which existing questions need re-fetching resolved_ids: list[str] = [] @@ -62,7 +63,7 @@ def fetch( logger.info(f"Number unresolved_ids: {len(unresolved_ids)}") resolved_ids_without_files = [ - id for id in resolved_ids if f"{self.name}/{id}.jsonl" not in files_in_storage + id for id in resolved_ids if str(id) not in existing_resolution_ids ] logger.info(f"resolved_ids_without_resolution_files: {resolved_ids_without_files}") diff --git a/src/sources/manifold.py b/src/sources/manifold.py index 9adbdf01..d2aefdf7 100644 --- a/src/sources/manifold.py +++ b/src/sources/manifold.py @@ -112,7 +112,7 @@ def update( dff: DataFrame[ManifoldFetchFrame], *, existing_resolution_files: dict[str, DataFrame[ResolutionFrame]] | None = None, - files_in_storage: list[str] | None = None, + existing_resolution_ids: set[str] | None = None, ) -> UpdateResult: """Process fetched IDs into updated questions and resolution files. @@ -124,10 +124,11 @@ def update( dfq (DataFrame[QuestionFrame]): Existing questions. dff (DataFrame[ManifoldFetchFrame]): Freshly fetched market IDs. existing_resolution_files (dict | None): Per-question existing resolution data. - files_in_storage (list[str] | None): Existing resolution file paths in storage. + existing_resolution_ids (set[str] | None): Bare IDs that already have a resolution + file in storage. """ existing_resolution_files = existing_resolution_files or {} - files_in_storage = files_in_storage or [] + existing_resolution_ids = existing_resolution_ids or set() resolution_files: dict[str, pd.DataFrame] = {} # --- Append new IDs from dff to dfq --- @@ -182,8 +183,7 @@ def update( # --- Regenerate missing resolution files for resolved questions --- for _index, row in dfq[dfq["resolved"]].iterrows(): - filename = f"{self.name}/{row['id']}.jsonl" - if filename not in files_in_storage and row["id"] not in resolution_files: + if str(row["id"]) not in existing_resolution_ids and row["id"] not in resolution_files: market = self._get_market(row["id"]) df_res = self._build_resolution_file( market=market, diff --git a/src/sources/metaculus.py b/src/sources/metaculus.py index c0eec80d..6504a578 100644 --- a/src/sources/metaculus.py +++ b/src/sources/metaculus.py @@ -93,7 +93,7 @@ def update( dfq: DataFrame[QuestionFrame], dff: DataFrame[MetaculusFetchFrame], *, - files_in_storage: list[str] | None = None, + existing_resolution_ids: set[str] | None = None, ) -> UpdateResult: """Fetch full question data and build resolution files. @@ -106,8 +106,8 @@ def update( Args: dfq (DataFrame[QuestionFrame]): Existing questions. dff (DataFrame[MetaculusFetchFrame]): Discovered question IDs from fetch(). - files_in_storage (list[str] | None): Existing resolution file paths in storage, - used to decide which resolved questions need regenerating. + existing_resolution_ids (set[str] | None): Bare IDs that already have a resolution + file in storage, used to decide which resolved questions need regenerating. """ self._require_api_key() resolution_files: dict[str, pd.DataFrame] = {} @@ -167,11 +167,10 @@ def update( dfq.at[index, "freeze_datetime_value"] = "N/A" # Regenerate missing resolution files for resolved questions - files_in_storage = files_in_storage or [] + existing_resolution_ids = existing_resolution_ids or set() for index, row in dfq[dfq["resolved"]].iterrows(): question_id = str(row["id"]) - filename = f"{self.name}/{question_id}.jsonl" - if filename not in files_in_storage and question_id not in resolution_files: + if question_id not in existing_resolution_ids and question_id not in resolution_files: market = self._get_market(row["id"]) df_res = self._create_resolution_file(dfq, index, market) if df_res is not None: diff --git a/src/tests/test_infer.py b/src/tests/test_infer.py index e382c146..3443a467 100644 --- a/src/tests/test_infer.py +++ b/src/tests/test_infer.py @@ -337,7 +337,7 @@ def test_deduplication_active_wins(self, mock_api, infer_source): [make_infer_api_question(id=100, state="active")], # active fetch ] dfq = make_question_df([{"id": "100", "resolved": False}]) - dff = infer_source.fetch(dfq=dfq, files_in_storage=[]) + dff = infer_source.fetch(dfq=dfq, existing_resolution_ids=set()) assert len(dff) == 1 @@ -350,7 +350,7 @@ def test_resolved_without_files_refetched(self, mock_api, infer_source): ] dfq = make_question_df([{"id": "100", "resolved": True}]) # No resolution file in storage → should re-fetch - dff = infer_source.fetch(dfq=dfq, files_in_storage=[]) + dff = infer_source.fetch(dfq=dfq, existing_resolution_ids=set()) assert len(dff) == 1 mock_api.assert_any_call(status="all", question_ids=["100"]) @@ -361,7 +361,7 @@ def test_empty_dfq(self, mock_api, infer_source): mock_api.side_effect = [ [make_infer_api_question(id=300)], ] - dff = infer_source.fetch(dfq=None, files_in_storage=[]) + dff = infer_source.fetch(dfq=None, existing_resolution_ids=set()) assert len(dff) == 1 def test_api_key_required(self): diff --git a/src/tests/test_manifold.py b/src/tests/test_manifold.py index 58332a2e..9c1f85b9 100644 --- a/src/tests/test_manifold.py +++ b/src/tests/test_manifold.py @@ -514,7 +514,7 @@ def test_regenerates_missing_resolved_files(self, mock_market, mock_build, manif ) dff = make_manifold_fetch_df([{"id": "mkt_001"}]) - result = manifold_source.update(dfq, dff, files_in_storage=[]) + result = manifold_source.update(dfq, dff, existing_resolution_ids=set()) assert "mkt_001" in result.resolution_files @@ -533,7 +533,7 @@ def test_skips_resolved_already_in_storage(self, mock_market, mock_build, manifo ) dff = make_manifold_fetch_df([{"id": "mkt_001"}]) - result = manifold_source.update(dfq, dff, files_in_storage=["manifold/mkt_001.jsonl"]) + result = manifold_source.update(dfq, dff, existing_resolution_ids={"mkt_001"}) # _get_market should not be called for the resolved question mock_market.assert_not_called() diff --git a/src/tests/test_metaculus.py b/src/tests/test_metaculus.py index 26f4448b..8e0df704 100644 --- a/src/tests/test_metaculus.py +++ b/src/tests/test_metaculus.py @@ -912,7 +912,7 @@ def test_resolved_with_existing_file_not_regenerated( ) dff = make_metaculus_fetch_df([]) - metaculus_source.update(dfq, dff, files_in_storage=["metaculus/42472.jsonl"]) + metaculus_source.update(dfq, dff, existing_resolution_ids={"42472"}) mock_market.assert_not_called() mock_res.assert_not_called() From 247222b2d126e5356d7a7d961b29b716539baeb7 Mon Sep 17 00:00:00 2001 From: Nikolay Petrov Date: Wed, 10 Jun 2026 13:51:31 +0300 Subject: [PATCH 2/3] fix: harmonize sources resolution df generation semantics Supersedes PR #209 and #208. Fixes issue #206 --- src/sources/infer.py | 4 +- src/sources/manifold.py | 6 +- src/sources/metaculus.py | 8 +-- src/sources/polymarket.py | 4 +- src/tests/test_infer.py | 28 ++++---- src/tests/test_manifold.py | 93 ++++++------------------ src/tests/test_metaculus.py | 136 +++++++++-------------------------- src/tests/test_polymarket.py | 24 +++---- 8 files changed, 93 insertions(+), 210 deletions(-) diff --git a/src/sources/infer.py b/src/sources/infer.py index 29c4fbd4..40c94106 100644 --- a/src/sources/infer.py +++ b/src/sources/infer.py @@ -129,7 +129,7 @@ def update( # Build/update resolution file existing_df = existing_resolution_files.get(question_id) - df_res = self._build_resolution_file( + df_res = self._build_resolution_df( question=question, resolved=question["resolved"], existing_df=existing_df, @@ -320,7 +320,7 @@ def _get_historical_forecasts( # Private: resolution file building # ------------------------------------------------------------------ - def _build_resolution_file( + def _build_resolution_df( self, question: dict, resolved: bool, diff --git a/src/sources/manifold.py b/src/sources/manifold.py index d2aefdf7..ec6ea64e 100644 --- a/src/sources/manifold.py +++ b/src/sources/manifold.py @@ -167,7 +167,7 @@ def update( # Build resolution file existing_df = existing_resolution_files.get(row["id"]) - df_res = self._build_resolution_file( + df_res = self._build_resolution_df( market=market, market_info_resolution_datetime=dfq.at[index, "market_info_resolution_datetime"], existing_df=existing_df, @@ -185,7 +185,7 @@ def update( for _index, row in dfq[dfq["resolved"]].iterrows(): if str(row["id"]) not in existing_resolution_ids and row["id"] not in resolution_files: market = self._get_market(row["id"]) - df_res = self._build_resolution_file( + df_res = self._build_resolution_df( market=market, market_info_resolution_datetime=row["market_info_resolution_datetime"], existing_df=None, @@ -331,7 +331,7 @@ def _get_market_bets(self, market_id: str) -> list[dict]: # Private: resolution file building # ------------------------------------------------------------------ - def _build_resolution_file( + def _build_resolution_df( self, market: dict, market_info_resolution_datetime: str, diff --git a/src/sources/metaculus.py b/src/sources/metaculus.py index 6504a578..cf0bce26 100644 --- a/src/sources/metaculus.py +++ b/src/sources/metaculus.py @@ -157,7 +157,7 @@ def update( dfq.at[index, "forecast_horizons"] = "N/A" # Build resolution file - df_res = self._create_resolution_file(dfq, index, market) + df_res = self._build_resolution_df(dfq, index, market) if df_res is not None: resolution_files[str(row["id"])] = df_res dfq.at[index, "freeze_datetime_value"] = ( @@ -172,7 +172,7 @@ def update( question_id = str(row["id"]) if question_id not in existing_resolution_ids and question_id not in resolution_files: market = self._get_market(row["id"]) - df_res = self._create_resolution_file(dfq, index, market) + df_res = self._build_resolution_df(dfq, index, market) if df_res is not None: resolution_files[question_id] = df_res @@ -324,12 +324,12 @@ def _get_resolved_market_value(market: dict) -> float: return 0 return np.nan - def _create_resolution_file( + def _build_resolution_df( self, dfq: pd.DataFrame, index: int, market: dict, - ) -> pd.DataFrame | None: + ) -> DataFrame[ResolutionFrame] | None: """Build the resolution file for a market from its aggregation history. Overwrites the resolution file entirely on each run (Metaculus returns the diff --git a/src/sources/polymarket.py b/src/sources/polymarket.py index 647a12dd..ed00e611 100644 --- a/src/sources/polymarket.py +++ b/src/sources/polymarket.py @@ -201,7 +201,7 @@ def update( question_id = str(question["id"]) # Build the resolution file from the embedded historical prices. - resolution_files[question_id] = self._build_resolution_file(question) + resolution_files[question_id] = self._build_resolution_df(question) # Strip transient fields (not part of QuestionFrame). del question["fetch_datetime"] @@ -538,7 +538,7 @@ def _transform_question( # Private: resolution file building # ------------------------------------------------------------------ - def _build_resolution_file(self, question: dict) -> DataFrame[ResolutionFrame]: + def _build_resolution_df(self, question: dict) -> DataFrame[ResolutionFrame]: """Build a resolution file from a fetched question's embedded historical prices. Args: diff --git a/src/tests/test_infer.py b/src/tests/test_infer.py index 3443a467..c562a526 100644 --- a/src/tests/test_infer.py +++ b/src/tests/test_infer.py @@ -190,12 +190,12 @@ def test_only_keeps_id_date_value(self): # --------------------------------------------------------------------------- -# _build_resolution_file (mock _get_historical_forecasts) +# _build_resolution_df (mock _get_historical_forecasts) # --------------------------------------------------------------------------- -class TestBuildResolutionFile: - """Tests for InferSource._build_resolution_file.""" +class TestBuildResolutionDf: + """Tests for InferSource._build_resolution_df.""" def _question(self, **overrides): base = { @@ -212,7 +212,7 @@ def test_nullified_no_existing(self, mock_hist, infer_source, freeze_today): """Nullified question with no existing data returns single NaN row.""" freeze_today(date(2026, 1, 15)) q = self._question(nullify_question=True) - df = infer_source._build_resolution_file(q, resolved=False, existing_df=None) + df = infer_source._build_resolution_df(q, resolved=False, existing_df=None) assert len(df) == 1 assert np.isnan(df["value"].iloc[0]) @@ -229,7 +229,7 @@ def test_nullified_with_existing(self, mock_hist, infer_source, freeze_today): ] ) q = self._question(nullify_question=True) - df = infer_source._build_resolution_file(q, resolved=False, existing_df=existing) + df = infer_source._build_resolution_df(q, resolved=False, existing_df=existing) assert df["value"].isna().all() mock_hist.assert_not_called() @@ -245,7 +245,7 @@ def test_already_up_to_date(self, mock_hist, infer_source, freeze_today): ] ) q = self._question() - df = infer_source._build_resolution_file(q, resolved=False, existing_df=existing) + df = infer_source._build_resolution_df(q, resolved=False, existing_df=existing) assert df.equals(existing) mock_hist.assert_not_called() @@ -262,7 +262,7 @@ def test_fetches_when_stale(self, mock_hist, infer_source, freeze_today): ) existing = make_resolution_df([{"id": "200", "date": "2024-06-01", "value": 0.5}]) q = self._question() - df = infer_source._build_resolution_file(q, resolved=False, existing_df=existing) + df = infer_source._build_resolution_df(q, resolved=False, existing_df=existing) assert not df.empty mock_hist.assert_called_once() @@ -282,7 +282,7 @@ def test_resolved_truncates_and_appends(self, mock_hist, infer_source, freeze_to market_info_resolution_datetime="2026-01-11T00:00:00+00:00", probability=1.0, ) - df = infer_source._build_resolution_file(q, resolved=True, existing_df=None) + df = infer_source._build_resolution_df(q, resolved=True, existing_df=None) # Should have rows up to resolution date assert not df.empty @@ -372,14 +372,14 @@ def test_api_key_required(self): # --------------------------------------------------------------------------- -# update() (mock _build_resolution_file) +# update() (mock _build_resolution_df) # --------------------------------------------------------------------------- class TestUpdate: """Tests for InferSource.update.""" - @patch.object(InferSource, "_build_resolution_file") + @patch.object(InferSource, "_build_resolution_df") def test_basic_update(self, mock_build, infer_source): """Returns UpdateResult with valid dfq and resolution files.""" mock_build.return_value = make_resolution_df( @@ -394,7 +394,7 @@ def test_basic_update(self, mock_build, infer_source): assert "200" in result.resolution_files QuestionFrame.validate(result.dfq) - @patch.object(InferSource, "_build_resolution_file") + @patch.object(InferSource, "_build_resolution_df") def test_new_question_inserted(self, mock_build, infer_source): """Question not in dfq gets appended.""" mock_build.return_value = make_resolution_df( @@ -407,7 +407,7 @@ def test_new_question_inserted(self, mock_build, infer_source): assert len(result.dfq) == 2 assert set(result.dfq["id"].tolist()) == {"100", "300"} - @patch.object(InferSource, "_build_resolution_file") + @patch.object(InferSource, "_build_resolution_df") def test_existing_question_updated(self, mock_build, infer_source): """Existing question fields are updated in place.""" mock_build.return_value = make_resolution_df( @@ -420,7 +420,7 @@ def test_existing_question_updated(self, mock_build, infer_source): assert len(result.dfq) == 1 assert result.dfq.iloc[0]["question"] == "New text" - @patch.object(InferSource, "_build_resolution_file") + @patch.object(InferSource, "_build_resolution_df") def test_nullified_marked_resolved(self, mock_build, infer_source): """Nullified questions are marked as resolved in dfq.""" mock_build.return_value = make_resolution_df( @@ -433,7 +433,7 @@ def test_nullified_marked_resolved(self, mock_build, infer_source): row = result.dfq[result.dfq["id"] == "200"].iloc[0] assert bool(row["resolved"]) is True - @patch.object(InferSource, "_build_resolution_file") + @patch.object(InferSource, "_build_resolution_df") def test_transient_fields_stripped(self, mock_build, infer_source): """fetch_datetime, probability, nullify_question not in output dfq.""" mock_build.return_value = make_resolution_df( diff --git a/src/tests/test_manifold.py b/src/tests/test_manifold.py index 9c1f85b9..4f5d0005 100644 --- a/src/tests/test_manifold.py +++ b/src/tests/test_manifold.py @@ -53,59 +53,12 @@ def test_unknown_resolution(self): # --------------------------------------------------------------------------- -# _finalize_resolution_df (pure, no mocking) +# _build_resolution_df (mock _get_market_bets) # --------------------------------------------------------------------------- -class TestFinalizeResolutionDf: - """Tests for ManifoldSource._finalize_resolution_df static method.""" - - def test_filters_before_benchmark_start(self): - """Rows before BENCHMARK_START_DATE are dropped.""" - df = pd.DataFrame( - { - "id": ["A", "A", "A"], - "date": pd.to_datetime(["2020-01-01", "2024-06-01", "2024-07-01"]), - "value": [0.1, 0.2, 0.3], - } - ) - result = ManifoldSource._finalize_resolution_df(df) - assert len(result) == 2 - assert result["value"].tolist() == [0.2, 0.3] - - def test_validates_schema(self): - """Output is a valid ResolutionFrame.""" - df = pd.DataFrame( - { - "id": ["A"], - "date": pd.to_datetime(["2024-06-01"]), - "value": [0.5], - } - ) - result = ManifoldSource._finalize_resolution_df(df) - ResolutionFrame.validate(result) - - def test_only_keeps_id_date_value(self): - """Extra columns are stripped.""" - df = pd.DataFrame( - { - "id": ["A"], - "date": pd.to_datetime(["2024-06-01"]), - "value": [0.5], - "extra": ["junk"], - } - ) - result = ManifoldSource._finalize_resolution_df(df) - assert list(result.columns) == ["id", "date", "value"] - - -# --------------------------------------------------------------------------- -# _build_resolution_file (mock _get_market_bets) -# --------------------------------------------------------------------------- - - -class TestBuildResolutionFile: - """Tests for ManifoldSource._build_resolution_file.""" +class TestBuildResolutionDf: + """Tests for ManifoldSource._build_resolution_df.""" @patch.object(ManifoldSource, "_get_market_bets") def test_already_up_to_date(self, mock_bets, manifold_source, freeze_today): @@ -118,7 +71,7 @@ def test_already_up_to_date(self, mock_bets, manifold_source, freeze_today): ] ) market = make_manifold_api_market() - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="N/A", existing_df=existing ) @@ -134,7 +87,7 @@ def test_basic_unresolved_market(self, mock_bets, manifold_source, freeze_today) make_manifold_bet(id="b2", createdTime=1768226400000, probAfter=0.6), # 2026-01-12 ] market = make_manifold_api_market() - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="N/A", existing_df=None ) @@ -151,7 +104,7 @@ def test_empty_bets_returns_none(self, mock_bets, manifold_source, freeze_today) freeze_today(date(2026, 1, 15)) mock_bets.return_value = [] market = make_manifold_api_market() - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="N/A", existing_df=None ) assert result is None @@ -165,7 +118,7 @@ def test_no_filled_bets_returns_none(self, mock_bets, manifold_source, freeze_to make_manifold_bet(isFilled=False, createdTime=1768226400000), ] market = make_manifold_api_market() - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="N/A", existing_df=None ) assert result is None @@ -179,7 +132,7 @@ def test_forward_fills_gaps(self, mock_bets, manifold_source, freeze_today): make_manifold_bet(id="b2", createdTime=1768384800000, probAfter=0.8), # 2026-01-14 ] market = make_manifold_api_market() - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="N/A", existing_df=None ) @@ -203,7 +156,7 @@ def test_resolved_truncates_at_resolution(self, mock_bets, manifold_source, free resolution="YES", resolutionTime=1768310400000, # 2026-01-13T12:00:00Z ) - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="2026-01-13T12:00:00+00:00", existing_df=None, @@ -231,7 +184,7 @@ def test_resolved_cancel_nan_last_row(self, mock_bets, manifold_source, freeze_t resolution="CANCEL", resolutionTime=1768310400000, # 2026-01-13T12:00:00Z ) - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="2026-01-13T12:00:00+00:00", existing_df=None, @@ -250,7 +203,7 @@ def test_filters_future_bets(self, mock_bets, manifold_source, freeze_today): make_manifold_bet(id="b2", createdTime=1768464000000, probAfter=0.9), # 2026-01-15 ] market = make_manifold_api_market() - result = manifold_source._build_resolution_file( + result = manifold_source._build_resolution_df( market=market, market_info_resolution_datetime="N/A", existing_df=None ) @@ -377,14 +330,14 @@ def test_empty_results(self, mock_search, manifold_source): # --------------------------------------------------------------------------- -# update() (mock _get_market + _build_resolution_file) +# update() (mock _get_market + _build_resolution_df) # --------------------------------------------------------------------------- class TestUpdate: """Tests for ManifoldSource.update.""" - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_new_id_appended(self, mock_market, mock_build, manifold_source): """IDs in dff not in dfq get appended with defaults.""" @@ -402,7 +355,7 @@ def test_new_id_appended(self, mock_market, mock_build, manifold_source): new_row = result.dfq[result.dfq["id"] == "new_001"].iloc[0] assert new_row["freeze_datetime_value_explanation"] == "The market value." - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_existing_unresolved_updated(self, mock_market, mock_build, manifold_source): """Unresolved question fields are updated from market details.""" @@ -425,7 +378,7 @@ def test_existing_unresolved_updated(self, mock_market, mock_build, manifold_sou assert row["background"] == "New background" assert row["url"] == "https://manifold.markets/updated" - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_market_becomes_resolved(self, mock_market, mock_build, manifold_source): """Market with isResolved=True marks dfq row as resolved.""" @@ -447,10 +400,10 @@ def test_market_becomes_resolved(self, mock_market, mock_build, manifold_source) assert bool(row["resolved"]) is True assert row["market_info_resolution_datetime"] != "N/A" - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_resolution_file_stored(self, mock_market, mock_build, manifold_source): - """Resolution file from _build_resolution_file is in result.""" + """Resolution file from _build_resolution_df is in result.""" res_df = make_resolution_df([{"id": "mkt_001", "date": "2024-06-01", "value": 0.5}]) mock_market.return_value = make_manifold_api_market(id="mkt_001") mock_build.return_value = res_df @@ -461,7 +414,7 @@ def test_resolution_file_stored(self, mock_market, mock_build, manifold_source): assert "mkt_001" in result.resolution_files - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_freeze_datetime_value_set(self, mock_market, mock_build, manifold_source): """freeze_datetime_value is set to last value of resolution df.""" @@ -480,10 +433,10 @@ def test_freeze_datetime_value_set(self, mock_market, mock_build, manifold_sourc row = result.dfq[result.dfq["id"] == "mkt_001"].iloc[0] assert str(row["freeze_datetime_value"]) == "0.75" - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_build_resolution_returns_none(self, mock_market, mock_build, manifold_source): - """_build_resolution_file returning None: no resolution file stored.""" + """_build_resolution_df returning None: no resolution file stored.""" mock_market.return_value = make_manifold_api_market(id="mkt_001") mock_build.return_value = None dfq = make_question_df([{"id": "mkt_001", "resolved": False}]) @@ -493,7 +446,7 @@ def test_build_resolution_returns_none(self, mock_market, mock_build, manifold_s assert "mkt_001" not in (result.resolution_files or {}) - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_regenerates_missing_resolved_files(self, mock_market, mock_build, manifold_source): """Resolved questions missing from storage get resolution files regenerated.""" @@ -518,7 +471,7 @@ def test_regenerates_missing_resolved_files(self, mock_market, mock_build, manif assert "mkt_001" in result.resolution_files - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_skips_resolved_already_in_storage(self, mock_market, mock_build, manifold_source): """Resolved questions with files in storage are not re-fetched.""" @@ -539,7 +492,7 @@ def test_skips_resolved_already_in_storage(self, mock_market, mock_build, manifo mock_market.assert_not_called() assert "mkt_001" not in (result.resolution_files or {}) - @patch.object(ManifoldSource, "_build_resolution_file") + @patch.object(ManifoldSource, "_build_resolution_df") @patch.object(ManifoldSource, "_get_market") def test_output_schema_valid(self, mock_market, mock_build, manifold_source): """Output dfq passes QuestionFrame validation.""" diff --git a/src/tests/test_metaculus.py b/src/tests/test_metaculus.py index 8e0df704..8c34602a 100644 --- a/src/tests/test_metaculus.py +++ b/src/tests/test_metaculus.py @@ -85,82 +85,12 @@ def test_non_integer(self): # --------------------------------------------------------------------------- -# _finalize_resolution_df (pure, no mocking) +# _build_resolution_df # --------------------------------------------------------------------------- -class TestFinalizeResolutionDf: - """Tests for MetaculusSource._finalize_resolution_df static method.""" - - def test_validates_schema(self): - """Output is a valid ResolutionFrame.""" - df = pd.DataFrame( - { - "id": ["42472"], - "date": [date(2025, 6, 1)], - "value": [0.5], - } - ) - result = MetaculusSource._finalize_resolution_df(df) - ResolutionFrame.validate(result) - - def test_keeps_all_historical_data(self): - """Unlike Infer, rows before BENCHMARK_START_DATE are NOT filtered.""" - df = pd.DataFrame( - { - "id": ["A", "A", "A"], - "date": [date(2018, 1, 1), date(2020, 6, 1), date(2025, 7, 1)], - "value": [0.1, 0.2, 0.3], - } - ) - result = MetaculusSource._finalize_resolution_df(df) - assert len(result) == 3 - - def test_date_coerced_to_string(self): - """datetime.date objects become 'YYYY-MM-DD' strings.""" - df = pd.DataFrame( - { - "id": ["A"], - "date": [date(2025, 6, 1)], - "value": [0.5], - } - ) - result = MetaculusSource._finalize_resolution_df(df) - assert result["date"].iloc[0] == "2025-06-01" - - def test_id_coerced_to_string(self): - """Integer IDs are coerced to string.""" - df = pd.DataFrame( - { - "id": [42472], - "date": [date(2025, 6, 1)], - "value": [0.5], - } - ) - result = MetaculusSource._finalize_resolution_df(df) - assert result["id"].iloc[0] == "42472" - - def test_only_keeps_id_date_value(self): - """Extra columns are stripped.""" - df = pd.DataFrame( - { - "id": ["A"], - "date": [date(2025, 6, 1)], - "value": [0.5], - "extra": ["junk"], - } - ) - result = MetaculusSource._finalize_resolution_df(df) - assert list(result.columns) == ["id", "date", "value"] - - -# --------------------------------------------------------------------------- -# _create_resolution_file -# --------------------------------------------------------------------------- - - -class TestCreateResolutionFile: - """Tests for MetaculusSource._create_resolution_file.""" +class TestBuildResolutionDf: + """Tests for MetaculusSource._build_resolution_df.""" def _dfq_row(self, resolved=False, resolution_datetime="N/A"): return make_question_df( @@ -179,7 +109,7 @@ def test_empty_history_returns_none(self, metaculus_source): question={"aggregations": {"recency_weighted": {"history": []}}} ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is None def test_single_day_entries_filtered(self, metaculus_source): @@ -201,7 +131,7 @@ def test_single_day_entries_filtered(self, metaculus_source): } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is None def test_single_day_last_millisecond_kept(self, metaculus_source): @@ -224,7 +154,7 @@ def test_single_day_last_millisecond_kept(self, metaculus_source): } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None assert len(result) >= 1 @@ -252,7 +182,7 @@ def test_backfill_gaps(self, metaculus_source): } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None # Should have continuous dates from start through last date dates_in_result = result["date"].tolist() @@ -282,7 +212,7 @@ def test_deduplication_keeps_last(self, metaculus_source): } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None # Both entries map to date 2025-01-01 (end_date - 1 day) # The second (0.9) should be kept @@ -304,7 +234,7 @@ def test_resolved_market_truncates_and_appends(self, metaculus_source): resolved=True, resolution_datetime="2025-01-03T00:00:00+00:00", ) - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None # Last row should be the resolution value (yes -> 1) assert float(result.iloc[-1]["value"]) == 1.0 @@ -335,7 +265,7 @@ def test_null_end_time_uses_today(self, metaculus_source, freeze_today): } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None # Should have dates from Dec 31 through Jan 4 (today - 1 day) assert len(result) >= 2 @@ -364,7 +294,7 @@ def test_future_dates_dropped(self, metaculus_source, freeze_today): } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None # No date may exceed yesterday (2025-01-04); the future 2025-01-09 entry is dropped. assert max(result["date"].tolist()) <= "2025-01-04" @@ -389,7 +319,7 @@ def test_future_only_unresolved_market_returns_none(self, metaculus_source, free } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is None def test_date_assignment_subtracts_day(self, metaculus_source): @@ -412,7 +342,7 @@ def test_date_assignment_subtracts_day(self, metaculus_source): } ) dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None # end_date 2025-01-02 minus 1 day = 2025-01-01 assert "2025-01-01" in result["date"].tolist() @@ -421,7 +351,7 @@ def test_id_is_string(self, metaculus_source): """Output id column contains string values.""" market = make_metaculus_market() dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None assert all(isinstance(v, str) for v in result["id"].tolist()) @@ -429,7 +359,7 @@ def test_output_validated_as_resolution_frame(self, metaculus_source): """Output passes ResolutionFrame.validate().""" market = make_metaculus_market() dfq = self._dfq_row() - result = metaculus_source._create_resolution_file(dfq, 0, market) + result = metaculus_source._build_resolution_df(dfq, 0, market) assert result is not None ResolutionFrame.validate(result) @@ -682,7 +612,7 @@ def test_api_key_required(self): # --------------------------------------------------------------------------- -# update() (mock _get_market and _create_resolution_file) +# update() (mock _get_market and _build_resolution_df) # --------------------------------------------------------------------------- @@ -699,7 +629,7 @@ def _resolution_df(self, question_id="42472", value=0.6): } ) - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_new_question_appended(self, mock_market, mock_res, metaculus_source): """ID in dff not in dfq gets appended with defaults.""" @@ -715,7 +645,7 @@ def test_new_question_appended(self, mock_market, mock_res, metaculus_source): row = result.dfq[result.dfq["id"] == "200"].iloc[0] assert row["freeze_datetime_value_explanation"] == "The community prediction." - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_existing_question_not_duplicated(self, mock_market, mock_res, metaculus_source): """ID already in dfq does not add a new row.""" @@ -728,7 +658,7 @@ def test_existing_question_not_duplicated(self, mock_market, mock_res, metaculus result = metaculus_source.update(dfq, dff) assert len(result.dfq) == 1 - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_question_fields_updated(self, mock_market, mock_res, metaculus_source): """Unresolved question fields are updated from market data.""" @@ -753,7 +683,7 @@ def test_question_fields_updated(self, mock_market, mock_res, metaculus_source): assert row["market_info_resolution_criteria"] == "New criteria" assert "metaculus.com/questions/42472" in row["url"] - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_background_empty_string_kept(self, mock_market, mock_res, metaculus_source): """Empty description string is stored as-is, not converted to 'N/A'.""" @@ -767,7 +697,7 @@ def test_background_empty_string_kept(self, mock_market, mock_res, metaculus_sou result = metaculus_source.update(dfq, dff) assert result.dfq.iloc[0]["background"] == "" - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_background_missing_key_becomes_na(self, mock_market, mock_res, metaculus_source): """Missing description key becomes 'N/A'.""" @@ -782,7 +712,7 @@ def test_background_missing_key_becomes_na(self, mock_market, mock_res, metaculu result = metaculus_source.update(dfq, dff) assert result.dfq.iloc[0]["background"] == "N/A" - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_resolved_market_sets_resolution_datetime( self, mock_market, mock_res, metaculus_source @@ -808,10 +738,10 @@ def test_resolved_market_sets_resolution_datetime( # min(close=March 1, resolve=Feb 15) = Feb 15 assert "2026-02-15" in str(row["market_info_resolution_datetime"]) - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_resolution_file_stored(self, mock_market, mock_res, metaculus_source): - """Resolution file from _create_resolution_file is stored in result.""" + """Resolution file from _build_resolution_df is stored in result.""" mock_market.return_value = make_metaculus_market() res_df = self._resolution_df() mock_res.return_value = res_df @@ -822,7 +752,7 @@ def test_resolution_file_stored(self, mock_market, mock_res, metaculus_source): result = metaculus_source.update(dfq, dff) assert "42472" in result.resolution_files - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_freeze_value_from_last_resolution(self, mock_market, mock_res, metaculus_source): """freeze_datetime_value is the last value in the resolution file.""" @@ -836,10 +766,10 @@ def test_freeze_value_from_last_resolution(self, mock_market, mock_res, metaculu # QuestionFrame coerces freeze_datetime_value to str assert str(result.dfq.iloc[0]["freeze_datetime_value"]) == "0.75" - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_freeze_value_na_when_no_resolution(self, mock_market, mock_res, metaculus_source): - """freeze_datetime_value is 'N/A' when _create_resolution_file returns None.""" + """freeze_datetime_value is 'N/A' when _build_resolution_df returns None.""" mock_market.return_value = make_metaculus_market() mock_res.return_value = None @@ -849,7 +779,7 @@ def test_freeze_value_na_when_no_resolution(self, mock_market, mock_res, metacul result = metaculus_source.update(dfq, dff) assert result.dfq.iloc[0]["freeze_datetime_value"] == "N/A" - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_market_api_failure_propagates(self, mock_market, mock_res, metaculus_source): """A persistent _get_market failure propagates (fail loudly), not silently skipped.""" @@ -862,7 +792,7 @@ def test_market_api_failure_propagates(self, mock_market, mock_res, metaculus_so metaculus_source.update(dfq, dff) mock_res.assert_not_called() - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_resolved_missing_resolution_regenerated(self, mock_market, mock_res, metaculus_source): """Resolved question without existing resolution file triggers regeneration.""" @@ -895,7 +825,7 @@ def test_resolved_missing_resolution_regenerated(self, mock_market, mock_res, me mock_market.assert_called_once_with("42472") assert "42472" in result.resolution_files - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_resolved_with_existing_file_not_regenerated( self, mock_market, mock_res, metaculus_source @@ -917,7 +847,7 @@ def test_resolved_with_existing_file_not_regenerated( mock_market.assert_not_called() mock_res.assert_not_called() - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_cap_new_questions(self, mock_market, mock_res, metaculus_source): """New IDs exceeding _QUESTION_LIMIT - unresolved are capped.""" @@ -942,7 +872,7 @@ def test_api_key_required(self): with pytest.raises(RuntimeError, match="api_key must be set"): src.update(dfq, dff) - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_forecast_horizons_always_na(self, mock_market, mock_res, metaculus_source): """Every updated row has forecast_horizons = 'N/A'.""" @@ -955,7 +885,7 @@ def test_forecast_horizons_always_na(self, mock_market, mock_res, metaculus_sour result = metaculus_source.update(dfq, dff) assert result.dfq.iloc[0]["forecast_horizons"] == "N/A" - @patch.object(MetaculusSource, "_create_resolution_file") + @patch.object(MetaculusSource, "_build_resolution_df") @patch.object(MetaculusSource, "_get_market") def test_valid_question_frame_output(self, mock_market, mock_res, metaculus_source): """result.dfq passes QuestionFrame.validate().""" diff --git a/src/tests/test_polymarket.py b/src/tests/test_polymarket.py index 02e444bf..9f21635e 100644 --- a/src/tests/test_polymarket.py +++ b/src/tests/test_polymarket.py @@ -151,12 +151,12 @@ def test_preserves_time_component(self): # --------------------------------------------------------------------------- -# _build_resolution_file (pure, no mocking) +# _build_resolution_df (pure, no mocking) # --------------------------------------------------------------------------- -class TestBuildResolutionFile: - """Tests for PolymarketSource._build_resolution_file.""" +class TestBuildResolutionDf: + """Tests for PolymarketSource._build_resolution_df.""" def test_basic(self, polymarket_source): """Extracts resolution df from historical_prices and validates schema.""" @@ -167,7 +167,7 @@ def test_basic(self, polymarket_source): {"date": "2024-06-02", "value": 0.6}, ], } - result = polymarket_source._build_resolution_file(question) + result = polymarket_source._build_resolution_df(question) assert len(result) == 2 assert (result["id"] == "0xabc123").all() assert list(result.columns) == ["id", "date", "value"] @@ -182,7 +182,7 @@ def test_no_benchmark_filter(self, polymarket_source): {"date": "2024-06-01", "value": 0.5}, ], } - result = polymarket_source._build_resolution_file(question) + result = polymarket_source._build_resolution_df(question) assert len(result) == 2 @@ -860,14 +860,14 @@ def _row(condition_id): # --------------------------------------------------------------------------- -# update() (mock _build_resolution_file) +# update() (mock _build_resolution_df) # --------------------------------------------------------------------------- class TestUpdate: """Tests for PolymarketSource.update.""" - @patch.object(PolymarketSource, "_build_resolution_file") + @patch.object(PolymarketSource, "_build_resolution_df") def test_new_id_appended(self, mock_build, polymarket_source): """An ID in dff not in dfq gets appended.""" mock_build.return_value = make_resolution_df( @@ -881,7 +881,7 @@ def test_new_id_appended(self, mock_build, polymarket_source): assert "0xnew" in result.dfq["id"].values assert len(result.dfq) == 2 - @patch.object(PolymarketSource, "_build_resolution_file") + @patch.object(PolymarketSource, "_build_resolution_df") def test_existing_updated(self, mock_build, polymarket_source): """An existing ID's fields are updated from dff.""" mock_build.return_value = make_resolution_df( @@ -896,9 +896,9 @@ def test_existing_updated(self, mock_build, polymarket_source): assert row["question"] == "Updated question" assert len(result.dfq) == 1 - @patch.object(PolymarketSource, "_build_resolution_file") + @patch.object(PolymarketSource, "_build_resolution_df") def test_resolution_file_stored(self, mock_build, polymarket_source): - """The resolution file from _build_resolution_file is in the result.""" + """The resolution file from _build_resolution_df is in the result.""" mock_build.return_value = make_resolution_df( [{"id": "0xabc123", "date": "2024-06-01", "value": 0.5}] ) @@ -909,7 +909,7 @@ def test_resolution_file_stored(self, mock_build, polymarket_source): assert "0xabc123" in result.resolution_files - @patch.object(PolymarketSource, "_build_resolution_file") + @patch.object(PolymarketSource, "_build_resolution_df") def test_strips_transient_fields(self, mock_build, polymarket_source): """Transient fields are not present in the output dfq.""" mock_build.return_value = make_resolution_df( @@ -923,7 +923,7 @@ def test_strips_transient_fields(self, mock_build, polymarket_source): for col in ["fetch_datetime", "probability", "historical_prices"]: assert col not in result.dfq.columns - @patch.object(PolymarketSource, "_build_resolution_file") + @patch.object(PolymarketSource, "_build_resolution_df") def test_output_schema_valid(self, mock_build, polymarket_source): """The output dfq passes QuestionFrame validation.""" mock_build.return_value = make_resolution_df( From 668fa4a51c422d537f55c222722cc9f466428bd6 Mon Sep 17 00:00:00 2001 From: Nikolay Petrov Date: Wed, 10 Jun 2026 13:52:06 +0300 Subject: [PATCH 3/3] fix: remove duplicate resolution df validation Supersedes PR #209 and #208. Fixes issue #206 --- src/sources/infer.py | 4 ++-- src/sources/manifold.py | 8 +------- src/sources/metaculus.py | 17 +---------------- src/sources/polymarket.py | 3 +-- 4 files changed, 5 insertions(+), 27 deletions(-) diff --git a/src/sources/infer.py b/src/sources/infer.py index 40c94106..1a8826e9 100644 --- a/src/sources/infer.py +++ b/src/sources/infer.py @@ -387,14 +387,14 @@ def _build_resolution_df( @staticmethod def _finalize_resolution_df(df: pd.DataFrame) -> DataFrame[ResolutionFrame]: - """Apply date filtering and return as validated ResolutionFrame. + """Apply date filtering and select resolution columns. Args: df (pd.DataFrame): Raw resolution data with id, date, value columns. """ df["date"] = pd.to_datetime(df["date"]) df = df[df["date"].dt.date >= constants.BENCHMARK_START_DATE_DATETIME_DATE] - return ResolutionFrame.validate(df[["id", "date", "value"]]) + return df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) # ------------------------------------------------------------------ # Private: question transformation diff --git a/src/sources/manifold.py b/src/sources/manifold.py index ec6ea64e..68d19891 100644 --- a/src/sources/manifold.py +++ b/src/sources/manifold.py @@ -403,15 +403,9 @@ def _build_resolution_df( df = df.ffill() df["id"] = market_id - return self._finalize_resolution_df(df) - - @staticmethod - def _finalize_resolution_df(df: pd.DataFrame) -> DataFrame[ResolutionFrame]: - """Filter to benchmark period and validate as ResolutionFrame.""" df["date"] = pd.to_datetime(df["date"]) df = df[df["date"].dt.date >= constants.BENCHMARK_START_DATE_DATETIME_DATE] - df = df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) - return ResolutionFrame.validate(df) + return df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) @staticmethod def _get_resolved_market_value(market: dict) -> float: diff --git a/src/sources/metaculus.py b/src/sources/metaculus.py index cf0bce26..d7aea463 100644 --- a/src/sources/metaculus.py +++ b/src/sources/metaculus.py @@ -439,20 +439,5 @@ def set_date(end_datetime): } df["id"] = str(market["id"]) - df = df[["id", "date", "value"]] - return self._finalize_resolution_df(df) - - @staticmethod - def _finalize_resolution_df(df: pd.DataFrame) -> DataFrame[ResolutionFrame]: - """Cast types and return as a validated ResolutionFrame. - - Unlike infer/manifold, Metaculus does not filter to the benchmark start date: - the aggregation history is already bounded by the question's open window, and - this preserves the legacy job's output exactly. - - Args: - df (pd.DataFrame): Raw resolution data with id, date, value columns. - """ - df = df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) - return ResolutionFrame.validate(df) + return df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) diff --git a/src/sources/polymarket.py b/src/sources/polymarket.py index ed00e611..d98081a0 100644 --- a/src/sources/polymarket.py +++ b/src/sources/polymarket.py @@ -546,5 +546,4 @@ def _build_resolution_df(self, question: dict) -> DataFrame[ResolutionFrame]: """ df = pd.DataFrame(question["historical_prices"]) df["id"] = question["id"] - df = df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE) - return ResolutionFrame.validate(df) + return df[["id", "date", "value"]].astype(dtype=constants.RESOLUTION_FILE_COLUMN_DTYPE)