MLAgentBench/plot.py at main · gdmurphy/MLAgentBench · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script to parse experiment results and produce:
  - Tables 1.x and 2.x with mean and std
  - Figures (3, 4.1, 4.2, 4.3)
  - Scatter plot: avg. API cost vs avg. success
  - Correlation heatmaps: subjective vs. objective metrics
  - A new pipeline "human+single-agent" is included in tables and figures
  - Radar charts for table 2.1 & 2.2 (kept for reference),
    plus a new radar chart per task that:
      0) does not show an "Avg" chart,
      1) uses absolute performance (performance), runtime (runtime), and complexity (method_complexity),
      2) rescales subjective metrics from [1..5] to [20..100],
      3) normalizes the three objective metrics to [0..100] across pipeline-LM + baseline,
      4) adds a baseline line (dashed style) from MLAgentBench/benchmarks_base_exp/{TASK}/env/output/idea_evals.json.

Outputs (figures, LaTeX tables, captions, cost report) are saved to `results/`.
All numeric results are rounded to one decimal place for tables, but the radar charts use a scaling as described.

Requires:
  Python 3.8+
  pandas
  numpy
  matplotlib
  seaborn
"""

import os
import glob
import json
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from collections import defaultdict
from math import comb
from itertools import combinations
import matplotlib.colors as mcolors
import colorsys


from MLAgentBench.constants import *
##############################################################################
# Global config
##############################################################################

# Pipeline types
SINGLE_AGENT = "MLAB"
MULTI_AGENT = "CoI-Agent Idea + MLAB"
HUMAN_SINGLE_AGENT = "Human Idea + MLAB"
PIPELINES = [SINGLE_AGENT, MULTI_AGENT, HUMAN_SINGLE_AGENT]

# LMs
LMS = ["claude-3-5-sonnet-v2", "DeepSeek-R1", "gemini-exp-1206", "llama3-1-405b-instruct", "o3-mini", "gpt-4o"]
colors = ['#0173b2', '#029e73', '#cc78bc', '#ca9161', '#ece133', '#56b4e9']
LM_COLORS = {lm : c for lm, c in zip(LMS, colors)}
# Tasks
task_name_mapping = {
        "llm-merging" : "llm-merging",
        "backdoor-trigger" : "backdoor-trigger-recovery",
        "temporal-action-loc" : "perception_temporal_action_loc",
        "machine-unlearning" : "machine_unlearning",
        "meta-learning" : "meta-learning",
        }
TASKS = list(task_name_mapping.keys())
for k in TASKS:
    v = task_name_mapping[k]
    task_name_mapping[v] = v


# Idea indices
IDEA_IDXS = [0, 1, 2, 3]
IDEA_PROPOSAL_MODEL = "o1-preview"
adaptive_threshold = 0.05

# For human+single-agent
HUMAN_IDEA_IDX = "rag"
HUMAN_IDEA_PROPOSAL_MODEL = "human"


# For figure line styles
PIPELINE_LINESTYLES = {
    SINGLE_AGENT: "solid",
    MULTI_AGENT: "dashed",
    HUMAN_SINGLE_AGENT: "dotted",
}

HUMAN_PERFORMANCE = {
    # only for test
    "llm-merging": {"performance" : 0.83},
    "backdoor-trigger": {"performance" : 67.5732},
    "temporal-action-loc": {"performance" : 0.4859},
    "machine-unlearning": {"performance" : 0.0984971060},
    "meta-learning": {"performance" : 0.699},
}
all_task_improvement_perc = []
for task in HUMAN_PERFORMANCE:
    human_perf = HUMAN_PERFORMANCE[task]["performance"]
    base_perf = ALL_BASE_PERFORMANCE[task_name_mapping[task]]["test"]
    task_improvement_perc = 100 * (human_perf - base_perf) / base_perf
    HUMAN_PERFORMANCE[task]["improvement_perc"] = task_improvement_perc
    all_task_improvement_perc.append(task_improvement_perc)

HUMAN_PERFORMANCE["Average"] = {"improvement_perc" : sum(all_task_improvement_perc) / len(all_task_improvement_perc)}

# Consider success if improvement_perc > 5.0
# TASK_THRESHOLD[task] = 5.0
# Task adaptive threshold
TASK_THRESHOLD = {task : HUMAN_PERFORMANCE[task]["improvement_perc"]*adaptive_threshold for task in TASKS}
print("HUMAN_PERFORMANCE", HUMAN_PERFORMANCE)
print(f"task success threshold: {adaptive_threshold} of human improvement", TASK_THRESHOLD)

# Results directory
RESULTS_DIR = f"results/adaptive_threshold_{adaptive_threshold}/IDEA_PROPOSAL_MODEL_{IDEA_PROPOSAL_MODEL}"
os.makedirs(RESULTS_DIR, exist_ok=True)

##############################################################################
# Utility functions
##############################################################################

def extract_timestamp_from_dirname(_dirname):
    # Remove `_PID` if present
    dirname = _dirname.split('_')[0]  # Keep only the timestamp part

    pattern = r'^(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})$'
    m = re.match(pattern, dirname)
    if m:
        ts = tuple(int(x) for x in m.groups())
        if ts[0] >= 10: # we only use experiments conducted from January
            return None
        else:
            return ts
    return None

def load_json_safely(path):
    if not os.path.isfile(path):
        return None
    try:
        with open(path, "r") as f:
            return json.load(f)
    except:
        return None

def find_most_recent_8_runs_for_pipeline(_task, lm, pipeline, idea_idx=None):
    """
    Collect run dirs from workspace and logs, unify, then keep the last 8 by ascending time.
    """

    log_runs = []
    task = task_name_mapping[_task]
    if pipeline == SINGLE_AGENT:
        base_pattern_logs = f"logs/{task}/{lm}/*"
    elif pipeline == MULTI_AGENT:
        if idea_idx is None:
            raise ValueError("idea_idx must be specified for multi-agent pipeline.")
        base_pattern_logs = f"logs/{task}--{idea_idx}--{IDEA_PROPOSAL_MODEL}/{lm}/*"
    elif pipeline == HUMAN_SINGLE_AGENT:
        base_pattern_logs = f"logs/{task}--{HUMAN_IDEA_IDX}--{HUMAN_IDEA_PROPOSAL_MODEL}/{lm}/*"
    else:
        base_pattern_logs = ""

    if base_pattern_logs:
        for path in glob.glob(base_pattern_logs):
            if os.path.isdir(path):
                dirname = os.path.basename(path)
                ts = extract_timestamp_from_dirname(dirname)
                if ts is not None:
                    log_runs.append((dirname, ts))

    items = list(log_runs)
    items.sort(key=lambda x: x[1])  # ascending
    items = items[-8:]
    return [x[0] for x in items]

##############################################################################
# Dev/Test result helpers
##############################################################################

def get_dev_results(_task, lm, pipeline, run_id, idea_idx=None):
    task = task_name_mapping[_task]
    if pipeline == SINGLE_AGENT:
        dev_file =f"logs/{task}/{lm}/{run_id}/env_log/idea_evals.json"
    elif pipeline == MULTI_AGENT:
        if idea_idx is None:
            raise ValueError("idea_idx must be specified for multi-agent pipeline.")
        dev_file = f"logs/{task}--{idea_idx}--{IDEA_PROPOSAL_MODEL}/{lm}/{run_id}/env_log/idea_evals.json"
    else:
        dev_file = f"logs/{task}--{HUMAN_IDEA_IDX}--{HUMAN_IDEA_PROPOSAL_MODEL}/{lm}/{run_id}/env_log/idea_evals.json"

    data = load_json_safely(dev_file)
    if not data:
        return []
    out = []
    BASE_RUNTIME = ALL_BASE_RUNTIME[task]["dev"]
    BASE_PERFORMANCE = ALL_BASE_PERFORMANCE[task]["dev"]

    for imp in data.get("implementations", []):
        if imp.get("phase") == "dev" and imp["performance"] is not None: # performance should not be None
            out.append(
                (
                    100 * (imp["performance"] - BASE_PERFORMANCE) / BASE_PERFORMANCE, # updated with newest estimation
                    100 * (imp["runtime"] - BASE_RUNTIME) / BASE_RUNTIME,
                    imp["relative_complexity"],
                )
            )
    return out

def get_test_result(_task, lm, pipeline, run_id, idea_idx=None):
    task = task_name_mapping[_task]
    if pipeline == SINGLE_AGENT:
        test_file = f"logs/{task}/{lm}/{run_id}/env_log/test_idea_evals.json"
    elif pipeline == MULTI_AGENT:
        if idea_idx is None:
            raise ValueError("idea_idx must be specified for multi-agent pipeline.")
        test_file = f"logs/{task}--{idea_idx}--{IDEA_PROPOSAL_MODEL}/{lm}/{run_id}/env_log/test_idea_evals.json"
    else:
        test_file = f"logs/{task}--{HUMAN_IDEA_IDX}--{HUMAN_IDEA_PROPOSAL_MODEL}/{lm}/{run_id}/env_log/test_idea_evals.json"

    data = load_json_safely(test_file)
    if not data:
        return None
    BASE_RUNTIME = ALL_BASE_RUNTIME[task]["test"]
    BASE_PERFORMANCE = ALL_BASE_PERFORMANCE[task]["test"]

    for imp in data.get("implementations", []):
        if imp.get("phase") == "test" and imp["performance"] is not None : # performance should not be None
            if task == "machine_unlearning":
                # substitute with best dev's runtime
                dev_results = get_dev_results(_task, lm, pipeline, run_id, idea_idx)
                dev_results.sort(key=lambda x: x[0])
                best_dev_result = dev_results[-1]
                best_dev_runtime = best_dev_result[1]

            ret = (
                100 * (imp["performance"] - BASE_PERFORMANCE) / BASE_PERFORMANCE, # updated with newest estimation
                best_dev_runtime if task == "machine_unlearning" else 100 * (imp["runtime"] - BASE_RUNTIME) / BASE_RUNTIME,
                imp.get("relative_complexity", 0.0),
            )
            return ret
    return None

def load_api_cost(_task, lm, pipeline, run_id, idea_idx=None):
    task = task_name_mapping[_task]
    if pipeline == SINGLE_AGENT:
        cost_file = f"logs/{task}/{lm}/{run_id}/env_log/api_cost.json"
    elif pipeline == MULTI_AGENT:
        if idea_idx is None:
            raise ValueError("idea_idx must be specified for multi-agent pipeline.")
        cost_file = f"logs/{task}--{idea_idx}--{IDEA_PROPOSAL_MODEL}/{lm}/{run_id}/env_log/api_cost.json"
    else:
        cost_file = f"logs/{task}--{HUMAN_IDEA_IDX}--{HUMAN_IDEA_PROPOSAL_MODEL}/{lm}/{run_id}/env_log/api_cost.json"

    data = load_json_safely(cost_file)
    if not data:
        return 0.0
    return float(data.get("total_cost", 0.0))

##############################################################################
# Success rate calculations (Tables 1.1 / 1.2)
##############################################################################

def compute_success_rates_data(phase='test'):
    """
    Return dict[(task,pipeline,lm)] -> (mean_sr, std_sr)
    where mean_sr, std_sr in range [0,100].
    """
    result = {}
    for task in TASKS:
        for pipeline in PIPELINES:
            for lm in LMS:
                success_list = []
                if pipeline == SINGLE_AGENT:
                    run_ids = find_most_recent_8_runs_for_pipeline(task, lm, pipeline)
                    for rid in run_ids:
                        if phase=='test':
                            r = get_test_result(task, lm, pipeline, rid)
                            success_list.append(
                                1 if (r and r[0]>TASK_THRESHOLD[task]) else 0
                            )
                        else: # dev
                            dev_res = get_dev_results(task, lm, pipeline, rid)
                            if dev_res:
                                best_imp = max(x[0] for x in dev_res)
                                success_list.append(1 if (best_imp>TASK_THRESHOLD[task]) else 0)
                            else:
                                success_list.append(0)
                elif pipeline == MULTI_AGENT:
                    # accumulate across all ideas
                    agg = []
                    for idea_idx in IDEA_IDXS:
                        run_ids = find_most_recent_8_runs_for_pipeline(task, lm, pipeline, idea_idx)
                        for rid in run_ids:
                            if phase=='test':
                                r = get_test_result(task, lm, pipeline, rid, idea_idx)
                                agg.append(
                                    1 if (r and r[0]>TASK_THRESHOLD[task]) else 0
                                )
                            else:
                                dev_res = get_dev_results(task, lm, pipeline, rid, idea_idx)
                                if dev_res:
                                    best_imp = max(x[0] for x in dev_res)
                                    agg.append(1 if (best_imp>TASK_THRESHOLD[task]) else 0)
                                else:
                                    agg.append(0)
                    success_list = agg
                else: # human+single-agent
                    run_ids = find_most_recent_8_runs_for_pipeline(task, lm, pipeline)
                    for rid in run_ids:
                        if phase=='test':
                            r = get_test_result(task, lm, pipeline, rid)
                            success_list.append(
                                1 if (r and r[0]>TASK_THRESHOLD[task]) else 0
                            )
                        else:
                            dev_res = get_dev_results(task, lm, pipeline, rid)
                            if dev_res:
                                best_imp = max(x[0] for x in dev_res)
                                success_list.append(1 if (best_imp>TASK_THRESHOLD[task]) else 0)
                            else:
                                success_list.append(0)

                if success_list:
                    mean_sr = np.mean(success_list)*100
                    std_sr  = np.std(success_list, ddof=1)*100
                else:
                    mean_sr, std_sr = 0.0, 0.0
                result[(task,pipeline,lm)] = (mean_sr, std_sr)
    return result

def convert_table_1(df):
    """Converts the output of construct_table_1 to the desired transposed format."""

    tasks = df['Task'].unique()
    systems = df['System'].unique()
    lms = LMS

    new_rows = []
    for task in tasks:
        if task == "Avg": # handle the avg rows
            continue
        new_row = [task]
        for system in systems:
            if system == "Avg":
                continue
            for lm in lms:
                if system != SINGLE_AGENT and lm != "gpt-4o":
                    continue

                value = df[(df['Task'] == task) & (df['System'] == system)][lm].values
                if len(value) > 0:
                    new_row.append(value[0])
                else:
                    print(task, system, lm, "no value")
                    new_row.append("")
        new_rows.append(new_row)

    #Handle the avg rows
    new_row = ["Avg"]
    for system in systems:
        if system == "Avg":
            continue
        for lm in lms:
            if system != SINGLE_AGENT and lm != "gpt-4o":
                continue

            value = df[(df['Task'] == "Avg") & (df['System'] == system)][lm].values
            if len(value) > 0:
                new_row.append(value[0])
            else:
                new_row.append("")
    new_rows.append(new_row)

    new_columns = ["Task"]
    for system in systems:
        if system == "Avg":
            continue
        for lm in lms:
            if system != SINGLE_AGENT and lm != "gpt-4o":
                continue
            safe_system = system.replace("\n", "\\\\")
            new_columns.append(f"\makecell{{{safe_system}\\\\{lm}}}")

    new_df = pd.DataFrame(new_rows, columns=new_columns)
    return new_df

def construct_table_1(success_data, phase='test'):
    rows = []
    pipeline_lm_task_values = defaultdict(list)

    for task in TASKS:
        for i, pipeline in enumerate(PIPELINES):
            row_task = task
            row = [row_task, pipeline]
            for lm in LMS:
                mean_sr, std_sr = success_data.get((task,pipeline,lm),(0.0,0.0))
                row.append(f"{round(mean_sr,1)}")
                pipeline_lm_task_values[(pipeline,lm)].append(mean_sr)
            rows.append(row)

    # "Avg" row for each pipeline
    for pipeline in PIPELINES:
        row = ["Avg", pipeline]
        for lm in LMS:
            vals = pipeline_lm_task_values[(pipeline,lm)]
            if vals:
                avg_sr = np.mean(vals)
            else:
                avg_sr = 0.0
            row.append(f"{round(avg_sr,1)}")
        rows.append(row)

    cols = ["Task","System"]+LMS
    df = pd.DataFrame(rows, columns=cols)
    return convert_table_1(df)

##############################################################################
# Table 2.x average metrics (still uses improvement & relative values)
##############################################################################

def compute_test_llm_eval_metrics(_task, lm, pipeline, run_id, idea_idx=None):
    """
    Return the "with_code" portion of llm_eval if any.
    """
    task = task_name_mapping[_task]
    if pipeline == SINGLE_AGENT:
        test_file = f"logs/{task}/{lm}/{run_id}/env_log/test_idea_evals.json"
    elif pipeline == MULTI_AGENT:
        if idea_idx is None:
            return {}
        test_file = f"logs/{task}--{idea_idx}--{IDEA_PROPOSAL_MODEL}/{lm}/{run_id}/env_log/test_idea_evals.json"
    else:
        test_file = f"logs/{task}--{HUMAN_IDEA_IDX}--{HUMAN_IDEA_PROPOSAL_MODEL}/{lm}/{run_id}/env_log/test_idea_evals.json"

    data = load_json_safely(test_file)
    if not data:
        return {}
    for imp in data.get("implementations", []):
        if imp.get("phase")=="test" and imp["performance"] is not None: # performance should not be None
            llm_eval = imp.get("llm_eval", {})
            return llm_eval.get("with_code", {})
    return {}

def compute_average_metrics_data(phase='test', include_llm_eval=False):
    """
    returns dict[(task,pipeline,lm)] -> {
      'imp_mean', 'imp_std',
      'run_mean','run_std',
      'comp_mean','comp_std',
      'clarity_mean','clarity_std', ...
    }
    """
    result = {}
    for task in TASKS:
        for pipeline in PIPELINES:
            for lm in LMS:
                imp_vals, run_vals, comp_vals = [], [], []
                clarity_vals, validity_vals = [], []
                rigor_vals, innov_vals, gener_vals = [],[],[]
                if pipeline==SINGLE_AGENT:
                    run_ids = find_most_recent_8_runs_for_pipeline(task, lm, pipeline)
                    for rid in run_ids:
                        if phase=='test':
                            rr = get_test_result(task, lm, pipeline, rid)
                            if rr:
                                imp_vals.append(rr[0])
                                run_vals.append(rr[1])
                                comp_vals.append(rr[2])
                                if include_llm_eval:
                                    wc = compute_test_llm_eval_metrics(task, lm, pipeline, rid)
                                    for f, store in [
                                        ("Clarity", clarity_vals),
                                        ("Validity", validity_vals),
                                        ("Rigorousness", rigor_vals),
                                        ("Innovativeness", innov_vals),
                                        ("Generalizability", gener_vals),
                                    ]:
                                        rating = wc.get(f, {}).get("Rating", None)
                                        if rating is not None:
                                            store.append(rating)
                        else: # dev
                            dev_res = get_dev_results(task, lm, pipeline, rid)
                            if dev_res:
                                best_idx = np.argmax([x[0] for x in dev_res])
                                best = dev_res[best_idx]
                                imp_vals.append(best[0])
                                run_vals.append(best[1])
                                comp_vals.append(best[2])
                elif pipeline==MULTI_AGENT:
                    for idea_idx in IDEA_IDXS:
                        run_ids = find_most_recent_8_runs_for_pipeline(task, lm, pipeline, idea_idx)
                        for rid in run_ids:
                            if phase=='test':
                                rr = get_test_result(task, lm, pipeline, rid, idea_idx)
                                if rr:
                                    imp_vals.append(rr[0])
                                    run_vals.append(rr[1])
                                    comp_vals.append(rr[2])
                                    if include_llm_eval:
                                        wc = compute_test_llm_eval_metrics(task, lm, pipeline, rid, idea_idx)
                                        for f, store in [
                                            ("Clarity", clarity_vals),
                                            ("Validity", validity_vals),
                                            ("Rigorousness", rigor_vals),
                                            ("Innovativeness", innov_vals),
                                            ("Generalizability", gener_vals),
                                        ]:
                                            rating = wc.get(f, {}).get("Rating", None)
                                            if rating is not None:
                                                store.append(rating)
                            else:
                                dev_res = get_dev_results(task, lm, pipeline, rid, idea_idx)
                                if dev_res:
                                    best_idx = np.argmax([x[0] for x in dev_res])
                                    best = dev_res[best_idx]
                                    imp_vals.append(best[0])
                                    run_vals.append(best[1])
                                    comp_vals.append(best[2])
                else:
                    run_ids = find_most_recent_8_runs_for_pipeline(task, lm, pipeline)
                    for rid in run_ids:
                        if phase=='test':
                            rr = get_test_result(task, lm, pipeline, rid)
                            if rr:
                                imp_vals.append(rr[0])
                                run_vals.append(rr[1])
                                comp_vals.append(rr[2])
                                if include_llm_eval:
                                    wc = compute_test_llm_eval_metrics(task, lm, pipeline, rid)
                                    for f, store in [
                                        ("Clarity", clarity_vals),
                                        ("Validity", validity_vals),
                                        ("Rigorousness", rigor_vals),
                                        ("Innovativeness", innov_vals),
                                        ("Generalizability", gener_vals),
                                    ]:
                                        rating = wc.get(f, {}).get("Rating", None)
                                        if rating is not None:
                                            store.append(rating)
                        else:
                            dev_res = get_dev_results(task, lm, pipeline, rid)
                            if dev_res:
                                best_idx = np.argmax([x[0] for x in dev_res])
                                best = dev_res[best_idx]
                                imp_vals.append(best[0])
                                run_vals.append(best[1])
                                comp_vals.append(best[2])

                def mean_std(arr):
                    if not arr:
                        return (0.0, 0.0)
                    if len(arr) < 3:
                        return (np.mean(arr), 0.0)
                    return (np.mean(arr), np.std(arr, ddof=1))

                imp_m, imp_s = mean_std(imp_vals)
                run_m, run_s = mean_std(run_vals)
                comp_m,comp_s= mean_std(comp_vals)
                c_m,c_s      = mean_std(clarity_vals)
                v_m,v_s      = mean_std(validity_vals)
                r_m,r_s      = mean_std(rigor_vals)
                i_m,i_s      = mean_std(innov_vals)
                g_m,g_s      = mean_std(gener_vals)

                result[(task,pipeline,lm)] = {
                    'imp_mean':imp_m,'imp_std':imp_s,
                    'run_mean':run_m,'run_std':run_s,
                    'comp_mean':comp_m,'comp_std':comp_s,
                    'clarity_mean':c_m,'clarity_std':c_s,
                    'validity_mean':v_m,'validity_std':v_s,
                    'rigorous_mean':r_m,'rigorous_std':r_s,
                    'innov_mean':i_m,'innov_std':i_s,
                    'gener_mean':g_m,'gener_std':g_s,
                }
    return result

def convert_table_2(df):
    """Converts the output of build_table_2 to the desired transposed format."""

    tasks = df['Task'].unique()
    metrics = df['Metric'].unique()
    systems = df['System'].unique()
    lms = LMS  # Use the global LMS

    new_rows = []
    for task in tasks:
        for metric in metrics:
            new_row = [task, metric]
            for system in systems:
                for lm in lms:
                    if system != SINGLE_AGENT and lm != "gpt-4o":
                        continue
                    value = df[(df['Task'] == task) & (df['Metric'] == metric) & (df['System'] == system)][lm].values
                    if len(value) > 0:
                        new_row.append(value[0])
                    else:
                        assert 0
                        new_row.append("")  # Handle cases where data is missing
            new_rows.append(new_row)

    # Construct the columns for the new DataFrame
    new_columns = ["Task", "Metric"]
    for system in systems:
        for lm in lms:
            if system != SINGLE_AGENT and lm != "gpt-4o":
                continue

            safe_system = system.replace("\n", "\\\\")
            new_columns.append(f"\makecell{{{safe_system}\\\\{lm}}}")

    new_df = pd.DataFrame(new_rows, columns=new_columns)
    return new_df

def build_table_2(average_data, phase='test', include_llm_eval=False):
    """
    Builds a longer-style table with columns:
      Task | System | Metric | ...one column per LM...
    so that each metric (e.g. Imp, Run, Comp) appears in its own row.
    """
    # We'll define which metrics to include:
    base_metrics = [
        ("Imp",  "imp_mean",  "imp_std"),
        ("Run",  "run_mean",  "run_std"),
        ("Comp", "comp_mean", "comp_std"),
    ]
    llm_eval_metrics = [
        ("Clarity",        "clarity_mean",   "clarity_std"),
        ("Validity",       "validity_mean",  "validity_std"),
        ("Rigorousness",   "rigorous_mean",  "rigorous_std"),
        ("Innovativeness", "innov_mean",     "innov_std"),
        ("Generalizability","gener_mean",    "gener_std"),
    ]
    # If including LLM eval metrics, add them:
    metrics = base_metrics[:]
    if include_llm_eval and phase == 'test':
        metrics += llm_eval_metrics

    # Build the columns: "Task", "System", "Metric" plus each LM as a column
    columns = ["Task", "System", "Metric"] + LMS

    rows = []
    # Go through each task & pipeline & gather the metrics as separate rows
    for task in TASKS:
        for pipeline in PIPELINES:
            for (metric_label, mean_key, std_key) in metrics:
                # Prepare the row up to the LM values
                row = [task, pipeline, metric_label]
                # For each LM, pick up its mean±std for this metric
                for lm in LMS:
                    d = average_data.get((task, pipeline, lm), {})
                    m = d.get(mean_key, 0.0)
                    s = d.get(std_key, 0.0)
                    row.append(f"{round(m,1)}±{round(s,1)}")
                rows.append(row)

    df = pd.DataFrame(rows, columns=columns)
    return convert_table_2(df)

##############################################################################
# Figure 3: pass@k
##############################################################################

# change to test TODO
def get_test_improvement_success(task, lm, pipeline, run_id, threshold=5.0, idea_idx=None):
    """
    Return True if test results exist with improvement>threshold.
    """
    test_res = []
    if pipeline==SINGLE_AGENT:
        test_res = get_test_result(task, lm, pipeline, run_id)
    elif pipeline==MULTI_AGENT and idea_idx is not None:
        test_res = get_test_result(task, lm, pipeline, run_id, idea_idx)
    else:
        test_res = get_test_result(task, lm, pipeline, run_id)
    return test_res and test_res[0]>threshold

def compute_pass_at_k_data():
    pass_at_k_data = {lm: defaultdict(dict) for lm in LMS}
    m=8 # TODO: avoid hardcode 8
    kvals = range(1,m+1)

    def get_pass_at_k(kvals, c_impl):
        # pass@k
        arr_impl=[]
        for k in kvals:
            if k>m:
                pass_k = 1.0 if c_impl>0 else 0.0
            else:
                denom = comb(m,k)
                num = comb(m-c_impl,k)
                pass_k = 1.0 - num/denom if denom>0 else 0.0
            arr_impl.append(pass_k)
        return arr_impl

    for lm in LMS:
        for task in TASKS:
            # single-agent
            for N, pipeline in [(0, SINGLE_AGENT), (-1, HUMAN_SINGLE_AGENT)]:
                runs_sa = find_most_recent_8_runs_for_pipeline(task, lm, pipeline)
                sa_successes = 0
                for rid in runs_sa:
                    if get_test_improvement_success(task, lm, pipeline, rid, threshold=TASK_THRESHOLD[task]):
                        sa_successes+=1
                c_impl = sa_successes
                pass_at_k_data[lm][task][N] = get_pass_at_k(kvals, c_impl)

            # multi-agent
            # total_success_over_all_ideas = 0
            # for idea_idx in IDEA_IDXS:
            #     runs_ = find_most_recent_8_runs_for_pipeline(task, lm, MULTI_AGENT, idea_idx)
            #     for rid in runs_:
            #         if get_test_improvement_success(task, lm, MULTI_AGENT, rid, threshold=TASK_THRESHOLD[task], idea_idx=idea_idx):
            #             total_success_over_all_ideas+=1
            # pass_at_k_data[lm][task][1] = get_pass_at_k(range(1,m*len(IDEA_IDXS)+1), total_success_over_all_ideas)

            # multi-agent
            c_list=[]
            for idea_idx in IDEA_IDXS:
                runs_ = find_most_recent_8_runs_for_pipeline(task, lm, MULTI_AGENT, idea_idx)
                c_ = 0
                for rid in runs_:
                    if get_test_improvement_success(task, lm, MULTI_AGENT, rid, threshold=TASK_THRESHOLD[task], idea_idx=idea_idx):
                        c_+=1
                c_list.append(c_)
            for N in [1,2,4]:
                arrN=[]
                subsets = list(combinations(range(len(c_list)),N))
                for k in kvals:
                    numerator=0
                    denominator = comb(4,N)*(comb(m,k)**N)
                    for sub in subsets:
                        product=1
                        for idx in sub:
                            c_i = c_list[idx]
                            if m-c_i<k:
                                product=0
                                break
                            else:
                                product*=comb(m-c_i,k)
                        numerator+=product
                    pass_k=1.0-(numerator/denominator) if denominator>0 else 0.0
                    arrN.append(pass_k)
                pass_at_k_data[lm][task][N] = arrN

    # average
    averaged = {lm:{} for lm in LMS}
    for lm in LMS:
        for N in [-1,0,1,2,4]:
            sums=[0.0]*len(pass_at_k_data[lm][TASKS[0]][N])
            for task in TASKS:
                y_ = pass_at_k_data[lm][task][N]
                sums=[a+b for a,b in zip(sums,y_)]
            avg=[x/len(TASKS) for x in sums]
            averaged[lm][N]=avg
    return pass_at_k_data, averaged

def plot_figure_3(pass_at_k_data, averaged_pass_at_k):
    plt.rcParams['font.size'] = 10
    for lm in LMS:
        if lm != "gpt-4o":
            continue
        for task in TASKS+["Average"]:
            plt.figure(figsize=(6,4))
            for N in [0,-1,1,2,4]:
                if task=="Average":
                    y=averaged_pass_at_k[lm][N]
                    t_="Average Over All Tasks"
                else:
                    y=pass_at_k_data[lm][task][N]
                    t_=task
                if N==0:
                    label=f"{SINGLE_AGENT}"
                elif N==-1:
                    label=f"{HUMAN_SINGLE_AGENT}"
                else:
                    label=f"{MULTI_AGENT}\n# Ideas = {N}"
                xvals=range(1,1+len(y))
                plt.plot(xvals, y, marker='o', label=label)
            plt.title(f"{lm}, {t_}")
            plt.xlabel("Number of Trials (k)")
            plt.ylabel("pass@k")
            # plt.ylim([0,1.05])
            plt.xticks(list(xvals))
            plt.grid(True)
            plt.legend()
            if task=="Average":
                outfn = os.path.join(RESULTS_DIR,f"figure_3_{lm}_average.pdf")
            else:
                outfn = os.path.join(RESULTS_DIR,f"figure_3_{lm}_{task.replace(' ','_')}.pdf")
            plt.savefig(outfn,bbox_inches='tight')
            plt.close()
            caption=(
                f"Figure 3 for LM={lm}, Task={t_}. pass@k on dev set vs # of trials. "
                "N=0 => single-agent, N=1,2,4 => multi-agent. Probability of at least one success (>5% improvement)."
            )
            if task=="Average":
                capfn = os.path.join(RESULTS_DIR,f"figure_3_{lm}_average_caption.txt")
            else:
                capfn = os.path.join(RESULTS_DIR,f"figure_3_{lm}_{task.replace(' ','_')}_caption.txt")
            # with open(capfn,"w") as f:
            #     f.write(caption)

##############################################################################
# Figure 4: dev improvement vs i-th implementation
##############################################################################

def compute_figure_4_data():
    def expand_and_avg(list_of_arrays):
        if not list_of_arrays:
            return []
        max_len = max(len(a) for a in list_of_arrays)
        expanded=[]
        for arr in list_of_arrays:
            if len(arr)<max_len:
                arr=arr+[arr[-1]]*(max_len-len(arr))
            expanded.append(arr)
        return np.mean(expanded,axis=0).tolist()

    fig4_data=defaultdict(dict)
    for task in TASKS+["Average"]:
        for lm in LMS:
            for pipeline in PIPELINES:
                all_imps, all_runs, all_comps=[],[],[]
                tasks_ = TASKS if task=="Average" else [task]
                for t_ in tasks_:
                    if pipeline==SINGLE_AGENT:
                        rids = find_most_recent_8_runs_for_pipeline(t_, lm, pipeline)
                        for rid in rids:
                            dev_res = get_dev_results(t_, lm, pipeline, rid)
                            if dev_res:
                                im=[x[0] for x in dev_res]
                                ru=[x[1] for x in dev_res]
                                co=[x[2] for x in dev_res]
                                all_imps.append(im)
                                all_runs.append(ru)
                                all_comps.append(co)
                    elif pipeline==MULTI_AGENT:
                        for idx in IDEA_IDXS:
                            rids = find_most_recent_8_runs_for_pipeline(t_, lm, pipeline, idx)
                            for rid in rids:
                                dev_res = get_dev_results(t_, lm, pipeline, rid, idx)
                                if dev_res:
                                    im=[x[0] for x in dev_res]
                                    ru=[x[1] for x in dev_res]
                                    co=[x[2] for x in dev_res]
                                    all_imps.append(im)
                                    all_runs.append(ru)
                                    all_comps.append(co)
                    else:
                        rids = find_most_recent_8_runs_for_pipeline(t_, lm, pipeline)
                        for rid in rids:
                            dev_res = get_dev_results(t_, lm, pipeline, rid)
                            if dev_res:
                                im=[x[0] for x in dev_res]
                                ru=[x[1] for x in dev_res]
                                co=[x[2] for x in dev_res]
                                all_imps.append(im)
                                all_runs.append(ru)
                                all_comps.append(co)
                imp_means=expand_and_avg(all_imps)
                run_means=expand_and_avg(all_runs)
                comp_means=expand_and_avg(all_comps)
                fig4_data[task][(lm,pipeline)] = {
                    "improvement_perc": imp_means,
                    "relative_runtime": run_means,
                    "relative_complexity": comp_means,
                }
    return fig4_data

def plot_figure_4(fig4_data):
    plt.rcParams['font.size'] = 18
    metrics = ["improvement_perc", "relative_runtime", "relative_complexity"]
    titles = ["Performance Improvement (%, \u2191)", "Increased Runtime (%, \u2193)", "Increased Lines of Code (%, \u2193)"]
    nums = [1, 2, 3]

    if not os.path.exists(RESULTS_DIR):
        os.makedirs(RESULTS_DIR)

    for task in TASKS + ["Average"]:
        fig, axes = plt.subplots(1, 3, figsize=(24, 6))  # Create figure and subplots, adjust figsize as needed
        fig.suptitle(f"{task}" if task != "Average" else "Average over all tasks", fontsize=20, y=1.0) # Lower suptitle closer to plots

        handles, labels_for_legend = [], [] # To collect handles and labels for the centralized legend

        for i, met in enumerate(metrics):
            ax = axes[i] # Current subplot axis
            metric_handles = [] # Handles for this metric subplot
            metric_labels = [] # Labels for this metric subplot

            for lm in LMS:
                for pipeline in PIPELINES:
                    if pipeline != SINGLE_AGENT:
                        continue
                    arr = fig4_data[task][(lm, pipeline)][met]
                    if not arr:
                        continue
                    xvals = range(1, len(arr) + 1)
                    style_ = PIPELINE_LINESTYLES.get(pipeline, 'solid')
                    color_ = LM_COLORS.get(lm, 'black')
                    lab = f"{lm}"
                    line, = ax.plot(xvals, arr, marker='o', linestyle=style_, color=color_, label=lab) # Get line object

                    metric_handles.append(line) # Collect handles for legend
                    metric_labels.append(lab) # Collect labels for legend


            ax.set_title(titles[i]) # Set subplot title from titles list
            ax.set_xlabel("i-th implementation in a trial")
            # ax.set_ylabel(titles[i]) # Removed y-axis label in subplots
            ax.grid(True)
            ax.yaxis.label.set_visible(False) # alternative way to hide y label

            # Set x-axis ticks to integers only
            ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))


        # Centralized legend below the subplots
        # Get unique handles and labels to avoid duplicates in legend
        unique_labels = []
        unique_handles = []
        label_set = set()
        for h, l in zip(metric_handles, metric_labels):
            if l not in label_set:
                unique_handles.append(h)
                unique_labels.append(l)
                label_set.add(l)


        fig.legend(unique_handles, unique_labels, loc='lower center', ncol=len(unique_labels), bbox_to_anchor=(0.5, -0.05)) # Adjust bbox_to_anchor and ncol for best position

        plt.tight_layout(rect=[0, 0.05, 1, 0.95]) # Adjust layout to make space for suptitle and legend
        # Manually adjust subplot params to move subplots up if necessary
        plt.subplots_adjust(top=0.88) # Adjust top to move subplots down relative to suptitle

        outfn = os.path.join(RESULTS_DIR, f"figure_4_all_{task.replace(' ', '_')}.pdf") # Save single figure
        fig.savefig(outfn, bbox_inches='tight')
        plt.close(fig) # Close the figure

        cap = (
            f"Figure 4 for {task}. "
            f"Plots improvement_perc, relative_runtime, and relative_complexity (from left to right) vs implementation index. "
            f"Different colors represent different LMs."
        )
        capfn = os.path.join(RESULTS_DIR, f"figure_4_all_{task.replace(' ', '_')}_caption.txt")
        # with open(capfn, "w") as f:
        #     f.write(cap)


##############################################################################
# Scatter: cost vs success
##############################################################################

def load_idea_cost(_task,lm):
    task = task_name_mapping[_task]
    idea_costs = []
    for i in IDEA_IDXS:
        coi_idea_file = f"../CoI-Agent/results/{task}/{lm}/{i}/result.json"
        with open(coi_idea_file, 'r') as reader:
            items = json.load(reader)
        idea_costs.append(items["api_cost"])
    return sum(idea_costs) / len(idea_costs)

def compute_api_cost_and_success_for_scatter():
    data_points=[]
    for pipeline in PIPELINES:
        for lm in LMS:
            if lm == "gemini-exp-1206":
                continue
            costs=[]
            successes=[]
            for task in TASKS:
                if pipeline==MULTI_AGENT:
                    if lm != "gpt-4o":
                        continue
                    for idx in IDEA_IDXS:
                        rids = find_most_recent_8_runs_for_pipeline(task, lm, pipeline, idx)
                        for rid in rids:
                            c_=load_api_cost(task,lm,pipeline,rid,idx)
                            idea_cost=load_idea_cost(task,IDEA_PROPOSAL_MODEL)
                            costs.append(c_+idea_cost)
                            res=get_test_result(task,lm,pipeline,rid,idx)
                            s=1 if (res and res[0]>TASK_THRESHOLD[task]) else 0
                            successes.append(s)
                elif pipeline==SINGLE_AGENT:
                    rids = find_most_recent_8_runs_for_pipeline(task,lm,pipeline)
                    for rid in rids:
                        c_=load_api_cost(task,lm,pipeline,rid)
                        costs.append(c_)
                        res=get_test_result(task,lm,pipeline,rid)
                        s=1 if(res and res[0]>TASK_THRESHOLD[task]) else 0
                        successes.append(s)
                else:
                    if lm != "gpt-4o":
                        continue
                    rids=find_most_recent_8_runs_for_pipeline(task,lm,pipeline)
                    for rid in rids:
                        c_=load_api_cost(task,lm,pipeline,rid)