Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions run_simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,36 @@
True,
{"market_weight": 1.00},
),
"Diff-Adj. Brier (w_mkt=0.00, fe_frac=0.50)": (
rank_by_diff_adj_brier,
"avg_diff_adj_brier",
True,
{"market_weight": 0.00, "fe_models_frac": 0.5},
),
"Diff-Adj. Brier (w_mkt=0.25, fe_frac=0.50)": (
rank_by_diff_adj_brier,
"avg_diff_adj_brier",
True,
{"market_weight": 0.25, "fe_models_frac": 0.5},
),
"Diff-Adj. Brier (w_mkt=0.50, fe_frac=0.50)": (
rank_by_diff_adj_brier,
"avg_diff_adj_brier",
True,
{"market_weight": 0.50, "fe_models_frac": 0.5},
),
"Diff-Adj. Brier (w_mkt=0.75, fe_frac=0.50)": (
rank_by_diff_adj_brier,
"avg_diff_adj_brier",
True,
{"market_weight": 0.75, "fe_models_frac": 0.5},
),
"Diff-Adj. Brier (w_mkt=1.00, fe_frac=0.50)": (
rank_by_diff_adj_brier,
"avg_diff_adj_brier",
True,
{"market_weight": 1.00, "fe_models_frac": 0.5},
),
"BSS (Pct.)": (
rank_by_bss,
"avg_bss",
Expand Down
45 changes: 41 additions & 4 deletions src/ranking_sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def rank_by_brier(df):
return model_scores


def rank_by_diff_adj_brier(df, market_weight=0.0):
def rank_by_diff_adj_brier(df, market_weight=0.0, fe_models_frac=1.0):
df = df.copy()

# Return empty results for empty input
Expand Down Expand Up @@ -130,14 +130,51 @@ def rank_by_diff_adj_brier(df, market_weight=0.0):
f"Market weight should be in [0, 1] but instead equals {market_weight}"
)

if fe_models_frac <= 0.0 or fe_models_frac > 1.0:
raise ValueError(
f"Fraction of models used in 2FE estimation should be in (0, 1] "
f"but instead equals {fe_models_frac}"
)

df["brier_score"] = brier_score(df)

mod = pf.feols("brier_score ~ 1 | question_id + model", data=df)
if fe_models_frac < 1.0:
# Differentiate between round-based and
# random sampling schemes
if "round_id" in df.columns:
# Round-based sampling: sample fraction per round
df_fe = pd.DataFrame()
for round_id in df["round_id"].unique():
df_round_data = df[df["round_id"] == round_id].copy()
round_models = df_round_data["model"].unique()

# Sample fraction of models for this round
n_fe_models = max(1, int(len(round_models) * fe_models_frac))
fe_models = np.random.choice(
round_models, size=n_fe_models, replace=False
)
mask = df_round_data["model"].isin(fe_models)
df_round_fe_data = df_round_data[mask]

# Concatenate
df_fe = pd.concat([df_fe, df_round_fe_data], ignore_index=True)
else:
# Random sampling: sample fraction globally
models = df["model"].unique()
n_fe_models = max(1, int(len(models) * fe_models_frac))
fe_models = np.random.choice(models, size=n_fe_models, replace=False)
mask = df["model"].isin(fe_models)
df_fe = df[mask]
else:
df_fe = df.copy()

# Estimate the 2FE model
mod = pf.feols("brier_score ~ 1 | question_id + model", data=df_fe)
dict_question_fe = mod.fixef()["C(question_id)"]
if len(dict_question_fe) != len(df["question_id"].unique()):
raise ValueError(
f"Estimated num. of question fixed effects ({len(dict_question_fe)}) \
not equal to num. of questions ({len(df['question_id'].unique())})"
f"Estimated num. of question fixed effects ({len(dict_question_fe)}) "
f"not equal to num. of questions ({len(df['question_id'].unique())})"
)

# Merge question FE back to the original df
Expand Down
136 changes: 136 additions & 0 deletions tests/test_ranking_sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,142 @@ def test_rank_by_diff_adj_mkt_weighting():
assert (df_test["model"] == diff_adj_ranking["model"]).all()


def test_rank_by_diff_adj_fe_sampling():
    """Check that fitting the fixed effects on a model subset moves every score.

    The fixture has four models forecasting three market questions:

    * Model_A: good on q1 and q2, bad on q3
    * Model_B: bad on q1 and q2, good on q3
    * Model_C: consistently mediocre
    * Model_D: identical to Model_C on q1/q2, mirrored around 0.5 on q3

    The ranking is computed once with ``fe_models_frac=1.0`` and once with
    ``fe_models_frac=0.25``; each model's difficulty-adjusted Brier score must
    differ between the two runs by more than 1e-3 in absolute value.
    """
    # One entry per question:
    # (question_id, resolved_to, market_forecast, {model: forecast}).
    # Dict insertion order fixes the row order of the resulting frame.
    question_specs = [
        (
            "q1",
            1,
            0.6,
            {"Model_A": 0.9, "Model_B": 0.2, "Model_C": 0.5, "Model_D": 0.5},
        ),
        (
            "q2",
            0,
            0.4,
            {"Model_A": 0.1, "Model_B": 0.8, "Model_C": 0.5, "Model_D": 0.5},
        ),
        (
            "q3",
            1,
            0.5,
            {"Model_A": 0.3, "Model_B": 0.9, "Model_C": 0.6, "Model_D": 0.4},
        ),
    ]

    records = [
        {
            "model": model_name,
            "question_id": question_id,
            "forecast": forecast,
            "resolved_to": outcome,
            "question_type": "market",
            "market_forecast": market_forecast,
        }
        for question_id, outcome, market_forecast, forecasts in question_specs
        for model_name, forecast in forecasts.items()
    ]
    df = pd.DataFrame(records)

    # Baseline: every model contributes to the fixed-effects estimation.
    np.random.seed(42)
    full_scores = rank_by_diff_adj_brier(df, fe_models_frac=1.0)

    # Sub-sampled run; a different seed is set before the model draw.
    np.random.seed(123)
    sampled_scores = rank_by_diff_adj_brier(df, fe_models_frac=0.25)
    sampled_scores = sampled_scores.rename(
        columns={"avg_diff_adj_brier": "avg_diff_adj_brier_sampled"}
    )

    # Line the two score columns up per model and compare them.
    comparison = pd.merge(full_scores, sampled_scores, on="model", how="left")
    comparison["brier_delta"] = (
        comparison["avg_diff_adj_brier"] - comparison["avg_diff_adj_brier_sampled"]
    )

    assert (comparison["brier_delta"].abs() > 1e-3).all()


def test_rank_by_diff_adj_brier_empty_dataframe():
"""Test that rank_by_diff_adj_brier handles empty dataframe gracefully."""
df_empty = pd.DataFrame(columns=["model", "question_id", "forecast", "resolved_to"])
Expand Down