diff --git a/run_simulation.py b/run_simulation.py index dcf8d6c..381f687 100644 --- a/run_simulation.py +++ b/run_simulation.py @@ -276,6 +276,36 @@ True, {"market_weight": 1.00}, ), + "Diff-Adj. Brier (w_mkt=0.00, fe_frac=0.50)": ( + rank_by_diff_adj_brier, + "avg_diff_adj_brier", + True, + {"market_weight": 0.00, "fe_models_frac": 0.5}, + ), + "Diff-Adj. Brier (w_mkt=0.25, fe_frac=0.50)": ( + rank_by_diff_adj_brier, + "avg_diff_adj_brier", + True, + {"market_weight": 0.25, "fe_models_frac": 0.5}, + ), + "Diff-Adj. Brier (w_mkt=0.50, fe_frac=0.50)": ( + rank_by_diff_adj_brier, + "avg_diff_adj_brier", + True, + {"market_weight": 0.50, "fe_models_frac": 0.5}, + ), + "Diff-Adj. Brier (w_mkt=0.75, fe_frac=0.50)": ( + rank_by_diff_adj_brier, + "avg_diff_adj_brier", + True, + {"market_weight": 0.75, "fe_models_frac": 0.5}, + ), + "Diff-Adj. Brier (w_mkt=1.00, fe_frac=0.50)": ( + rank_by_diff_adj_brier, + "avg_diff_adj_brier", + True, + {"market_weight": 1.00, "fe_models_frac": 0.5}, + ), "BSS (Pct.)": ( rank_by_bss, "avg_bss", diff --git a/src/ranking_sim.py b/src/ranking_sim.py index b2dba27..6810bc8 100644 --- a/src/ranking_sim.py +++ b/src/ranking_sim.py @@ -92,7 +92,7 @@ def rank_by_brier(df): return model_scores -def rank_by_diff_adj_brier(df, market_weight=0.0): +def rank_by_diff_adj_brier(df, market_weight=0.0, fe_models_frac=1.0): df = df.copy() # Return empty results for empty input @@ -130,14 +130,51 @@ def rank_by_diff_adj_brier(df, market_weight=0.0): f"Market weight should be in [0, 1] but instead equals {market_weight}" ) + if fe_models_frac <= 0.0 or fe_models_frac > 1.0: + raise ValueError( + f"Fraction of models used in 2FE estimation should be in (0, 1] " + f"but instead equals {fe_models_frac}" + ) + df["brier_score"] = brier_score(df) - mod = pf.feols("brier_score ~ 1 | question_id + model", data=df) + if fe_models_frac < 1.0: + # Differentiate between round-based and + # random sampling schemes + if "round_id" in df.columns: + # Round-based 
sampling: sample fraction per round + df_fe = pd.DataFrame() + for round_id in df["round_id"].unique(): + df_round_data = df[df["round_id"] == round_id].copy() + round_models = df_round_data["model"].unique() + + # Sample fraction of models for this round + n_fe_models = max(1, int(len(round_models) * fe_models_frac)) + fe_models = np.random.choice( + round_models, size=n_fe_models, replace=False + ) + mask = df_round_data["model"].isin(fe_models) + df_round_fe_data = df_round_data[mask] + + # Concatenate + df_fe = pd.concat([df_fe, df_round_fe_data], ignore_index=True) + else: + # Random sampling: sample fraction globally + models = df["model"].unique() + n_fe_models = max(1, int(len(models) * fe_models_frac)) + fe_models = np.random.choice(models, size=n_fe_models, replace=False) + mask = df["model"].isin(fe_models) + df_fe = df[mask] + else: + df_fe = df.copy() + + # Estimate the 2FE model + mod = pf.feols("brier_score ~ 1 | question_id + model", data=df_fe) dict_question_fe = mod.fixef()["C(question_id)"] if len(dict_question_fe) != len(df["question_id"].unique()): raise ValueError( - f"Estimated num. of question fixed effects ({len(dict_question_fe)}) \ - not equal to num. of questions ({len(df['question_id'].unique())})" + f"Estimated num. of question fixed effects ({len(dict_question_fe)}) " + f"not equal to num. 
def test_rank_by_diff_adj_fe_sampling():
    """Check that fitting the 2FE model on a model subset changes the scores.

    The same forecasts are ranked twice: once with ``fe_models_frac=1.0`` and
    once with ``fe_models_frac=0.25``. For every model, the two
    ``avg_diff_adj_brier`` values must differ by more than 1e-3 in absolute
    value.
    """
    # One entry per question:
    #   (question_id, resolved_to, market_forecast, {model: forecast})
    # Model_A: good on q1 and q2, bad on q3
    # Model_B: bad on q1 and q2, good on q3
    # Model_C: consistently mediocre
    # Model_D: identical to Model_C on q1/q2, mirrored around 0.5 on q3
    question_specs = [
        (
            "q1",
            1,
            0.6,
            {"Model_A": 0.9, "Model_B": 0.2, "Model_C": 0.5, "Model_D": 0.5},
        ),
        (
            "q2",
            0,
            0.4,
            {"Model_A": 0.1, "Model_B": 0.8, "Model_C": 0.5, "Model_D": 0.5},
        ),
        (
            "q3",
            1,
            0.5,
            {"Model_A": 0.3, "Model_B": 0.9, "Model_C": 0.6, "Model_D": 0.4},
        ),
    ]

    # Expand to one row per (question, model), questions outermost so the
    # row order is q1 x A..D, q2 x A..D, q3 x A..D.
    rows = []
    for question_id, outcome, market_forecast, forecasts in question_specs:
        rows.extend(
            {
                "model": model,
                "question_id": question_id,
                "forecast": forecast,
                "resolved_to": outcome,
                "question_type": "market",
                "market_forecast": market_forecast,
            }
            for model, forecast in forecasts.items()
        )
    df = pd.DataFrame(rows)

    # Baseline: every model takes part in the fixed-effects estimation.
    np.random.seed(42)
    full = rank_by_diff_adj_brier(df, fe_models_frac=1.0)

    # Sampled run under a different seed: only a quarter of the models are
    # requested for the fixed-effects estimation.
    np.random.seed(123)
    sampled = rank_by_diff_adj_brier(df, fe_models_frac=0.25)
    sampled = sampled.rename(
        columns={"avg_diff_adj_brier": "avg_diff_adj_brier_sampled"}
    )

    # Line both score columns up per model and compare them.
    comparison = pd.merge(full, sampled, on="model", how="left")
    brier_delta = (
        comparison["avg_diff_adj_brier"] - comparison["avg_diff_adj_brier_sampled"]
    )

    assert (brier_delta.abs() > 1e-3).all()