diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..32ef912 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +pandas +numpy +pyfixest +pytest +tqdm diff --git a/run_simulation.py b/run_simulation.py index 8502562..77c79f8 100644 --- a/run_simulation.py +++ b/run_simulation.py @@ -39,7 +39,7 @@ # Parameters for round-based sampling N_ROUNDS = 15 -QUESTIONS_PER_ROUND = 25 +QUESTIONS_PER_ROUND = 100 MODELS_PER_ROUND_MEAN = 40 DATASET_WEIGHT = 0.5 SIMULATION_METHOD = "round_based" @@ -177,7 +177,7 @@ def validate_processed_data(df): def main(): print("Loading data...") - df = process_raw_data(f"{INPUT_FOLDER}/leaderboard_human.pkl") + df = process_raw_data(f"{INPUT_FOLDER}/leaderboard_llm.pkl") df.to_csv(f"{PROCESSED_FOLDER}/processed_dataset.csv", index=False) # Load the processed dataset diff --git a/src/ranking_sim.py b/src/ranking_sim.py index cbb76f8..efb8af8 100644 --- a/src/ranking_sim.py +++ b/src/ranking_sim.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pyfixest as pf +from tqdm import tqdm # ================ # Data preparation @@ -15,9 +16,16 @@ def process_raw_data(input_name): df_temp = pkl[ii]["df"] df_temp["model"] = pkl[ii]["model"] df_temp["organization"] = pkl[ii]["organization"] + # drop combo questions + df_temp = df_temp[df_temp["direction"] == ()] df = pd.concat([df, df_temp]) df = df.reset_index(drop=True) + if "horizon" not in df.columns: + df["resolution_date"] = pd.to_datetime(df["resolution_date"]) + df["forecast_due_date"] = pd.to_datetime(df["forecast_due_date"]) + df["horizon"] = (df["resolution_date"] - df["forecast_due_date"]).dt.days + # Create a new column 'question_id' by concatenating 'source', 'id', # and 'horizon' columns. 
def simple_sample(df, n, random_state=None):
    """Draw roughly ``n`` question ids, split between market and dataset questions.

    About half of the budget goes to dataset questions: ``n // 2 // n_horizons``
    rows are sampled (with replacement) from the rows belonging to the first
    dataset horizon, and every sampled row is expanded to all question ids in
    ``df`` that share its ``source``, ``id`` and ``forecast_due_date``. The
    rest of the budget is filled with market question ids drawn uniformly
    (with replacement) from the unique market question ids.

    Args:
        df: DataFrame with the columns ``question_type``, ``horizon``,
            ``question_id``, ``source``, ``id`` and ``forecast_due_date``.
            ``question_type`` must contain both ``"dataset"`` and ``"market"``.
        n: Target number of question ids for the round.
        random_state: Optional seed or ``numpy.random.Generator``. When
            ``None`` (the default) the global NumPy random state is used.

    Returns:
        1-D numpy array of question ids: market questions first, then the
        expanded dataset questions.

    Raises:
        KeyError: If ``df`` has no ``"dataset"`` or no ``"market"`` questions.
    """
    key_cols = ["source", "id", "forecast_due_date"]

    groups = df.groupby(["question_type", "horizon"])["question_id"].unique()

    present_types = set(groups.index.get_level_values("question_type"))
    missing_types = {"dataset", "market"} - present_types
    if missing_types:
        raise KeyError(
            "simple_sample needs both 'dataset' and 'market' questions; "
            f"missing question_type(s): {sorted(missing_types)}"
        )

    dataset_groups = groups["dataset"]
    n_horizons = len(dataset_groups)

    # NOTE(review): the market budget assumes every sampled dataset question
    # exists at all ``n_horizons`` horizons; if some do not, the returned
    # array is shorter than ``n`` -- confirm this is acceptable.
    n_dataset_horizon = n // 2 // n_horizons
    n_market = n - n_dataset_horizon * n_horizons

    if random_state is None:
        # Default path: global NumPy state for both draws.
        choice = np.random.choice
        sample_state = None
    else:
        rng = np.random.default_rng(random_state)
        choice = rng.choice
        sample_state = rng

    # Market questions: choose randomly across all market questions.
    all_market_questions = np.concatenate(list(groups["market"]))
    market_questions = choice(all_market_questions, size=n_market, replace=True)

    # Dataset questions: choose randomly within the first horizon, then pick
    # up the same questions at every horizon.
    # NOTE(review): rows (not unique questions) are sampled, so a question
    # with more rows in ``df`` is more likely to be drawn -- confirm intended.
    first_horizon_ids = dataset_groups.iloc[0]
    pool = df.loc[df["question_id"].isin(first_horizon_ids), key_cols]
    sampled_keys = pool.sample(
        n=n_dataset_horizon, replace=True, random_state=sample_state
    )
    # A key containing NaN never compares equal to anything, so such a draw
    # contributes no questions; drop it before the merge (which would
    # otherwise match NaN to NaN).
    sampled_keys = sampled_keys.dropna()

    # One de-duplicated lookup table + a single left merge replaces a full
    # DataFrame scan per sampled row. The left merge keeps the sampled order
    # and, per key, the order in which question ids first appear in ``df``.
    lookup = df[key_cols + ["question_id"]].drop_duplicates()
    expanded = sampled_keys.merge(lookup, on=key_cols, how="left")
    dataset_questions = expanded["question_id"].to_numpy()

    return np.concatenate([market_questions, dataset_questions])
-405,7 +439,7 @@ def evaluate_ranking_methods( # Run simulations results_list = [] - for sim in range(n_simulations): + for sim in tqdm(range(n_simulations)): # Generate simulated dataset using the provided simulation function df_sim = simulation_func(df=df, ref_model=ref_model, **simulation_kwargs)