From 7c6a071778bfd63889293b8b66df10512ddb5c14 Mon Sep 17 00:00:00 2001 From: djriffle Date: Wed, 30 Jul 2025 15:43:14 -0400 Subject: [PATCH 01/14] renamed benchmarking to CLI --- benchmarking/create_agent_system.sh | 4 ---- benchmarking/run_automated.sh | 4 ---- benchmarking/run_interactive.sh | 4 ---- ...chmarking_sandbox_management.cpython-311.pyc | Bin 30536 -> 0 bytes {benchmarking => cli}/.gitignore | 0 {benchmarking => cli}/README.md | 0 {benchmarking => cli}/__init__.py | 0 {benchmarking => cli}/agents/AgentSystem.py | 2 +- {benchmarking => cli}/agents/__init__.py | 0 .../agents/create_agent_system.py | 4 ++-- .../agents/integration_system.json | 0 .../agents/system_blueprint.json | 0 .../auto_metrics/AutoMetric.py | 0 .../auto_metrics/CellCountMetric.py | 0 .../auto_metrics/CellTypingMetric.py | 0 .../auto_metrics/IntegrationMetrics.py | 0 .../code_samples/load_adata.py | 0 {benchmarking => cli}/core/__init__.py | 0 {benchmarking => cli}/core/io_helpers.py | 4 ++-- {benchmarking => cli}/core/ollama_wrapper.py | 0 .../core/sandbox_management.py | 6 +++--- cli/create_agent_system.sh | 4 ++++ {benchmarking => cli}/create_benchmark_env.sh | 0 ...transcriptomics_in_mouse_puck_191109_14.h5ad | Bin ...transcriptomics_in_mouse_puck_191109_14.json | 0 ...mus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad | Bin ...mus_scrna-seq_atlas_-_myeloid_p2_subset.json | 0 .../prompt_testing/MultiAgentAutoTester.py | 8 ++++---- .../prompt_testing/MultiAgentTester.py | 15 +++++++++------ .../prompt_testing/__init__.py | 0 .../prompt_testing/extra_tools/Evaluator.py | 0 .../extra_tools/InteractiveAgentTester.py | 4 ++-- .../extra_tools/OneShotAgentTester.py | 0 .../prompt_testing/extra_tools/PromptEvolver.py | 0 {benchmarking => cli}/requirements.txt | 0 cli/run_automated.sh | 4 ++++ cli/run_interactive.sh | 4 ++++ .../sample_prompt_library/Basic_scRNA_Agent.txt | 0 {benchmarking => cli}/sandbox/Dockerfile | 0 {benchmarking => cli}/sandbox/Singularity | 0 {benchmarking => cli}/sandbox/__init__.py | 0 .../sandbox/benchmarking_sandbox_management.py | 0 ...nchmarking_sandbox_management_singularity.py | 0 {benchmarking => cli}/sandbox/kernel_api.py | 0 {benchmarking => cli}/sandbox/offline_kernel.py | 0 {benchmarking => cli}/sandbox/requirements.txt | 0 {benchmarking => cli}/sandbox/start.sh | 0 {benchmarking => cli}/sandbox/start_kernel.py | 0 {benchmarking => cli}/tools/__init__.py | 0 {benchmarking => cli}/tools/czi_browser.py | 0 .../tools/output_to_notebook.py | 0 51 files changed, 35 insertions(+), 32 deletions(-) delete mode 100755 benchmarking/create_agent_system.sh delete mode 100755 benchmarking/run_automated.sh delete mode 100755 benchmarking/run_interactive.sh delete mode 100644 benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc rename {benchmarking => cli}/.gitignore (100%) rename {benchmarking => cli}/README.md (100%) rename {benchmarking => cli}/__init__.py (100%) rename {benchmarking => cli}/agents/AgentSystem.py (99%) rename {benchmarking => cli}/agents/__init__.py (100%) rename {benchmarking => cli}/agents/create_agent_system.py (98%) rename {benchmarking => cli}/agents/integration_system.json (100%) rename {benchmarking => cli}/agents/system_blueprint.json (100%) rename {benchmarking => cli}/auto_metrics/AutoMetric.py (100%) rename {benchmarking => cli}/auto_metrics/CellCountMetric.py (100%) rename {benchmarking => cli}/auto_metrics/CellTypingMetric.py (100%) rename {benchmarking => cli}/auto_metrics/IntegrationMetrics.py (100%) rename {benchmarking => 
cli}/code_samples/load_adata.py (100%) rename {benchmarking => cli}/core/__init__.py (100%) rename {benchmarking => cli}/core/io_helpers.py (98%) rename {benchmarking => cli}/core/ollama_wrapper.py (100%) rename {benchmarking => cli}/core/sandbox_management.py (96%) create mode 100755 cli/create_agent_system.sh rename {benchmarking => cli}/create_benchmark_env.sh (100%) rename {benchmarking => cli}/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.h5ad (100%) rename {benchmarking => cli}/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.json (100%) rename {benchmarking => cli}/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad (100%) rename {benchmarking => cli}/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json (100%) rename {benchmarking => cli}/prompt_testing/MultiAgentAutoTester.py (98%) rename {benchmarking => cli}/prompt_testing/MultiAgentTester.py (97%) rename {benchmarking => cli}/prompt_testing/__init__.py (100%) rename {benchmarking => cli}/prompt_testing/extra_tools/Evaluator.py (100%) rename {benchmarking => cli}/prompt_testing/extra_tools/InteractiveAgentTester.py (97%) rename {benchmarking => cli}/prompt_testing/extra_tools/OneShotAgentTester.py (100%) rename {benchmarking => cli}/prompt_testing/extra_tools/PromptEvolver.py (100%) rename {benchmarking => cli}/requirements.txt (100%) create mode 100755 cli/run_automated.sh create mode 100755 cli/run_interactive.sh rename {benchmarking => cli}/sample_prompt_library/Basic_scRNA_Agent.txt (100%) rename {benchmarking => cli}/sandbox/Dockerfile (100%) rename {benchmarking => cli}/sandbox/Singularity (100%) rename {benchmarking => cli}/sandbox/__init__.py (100%) rename {benchmarking => cli}/sandbox/benchmarking_sandbox_management.py (100%) rename {benchmarking => cli}/sandbox/benchmarking_sandbox_management_singularity.py (100%) rename {benchmarking => cli}/sandbox/kernel_api.py (100%) rename {benchmarking => cli}/sandbox/offline_kernel.py (100%) rename {benchmarking => cli}/sandbox/requirements.txt (100%) rename {benchmarking => cli}/sandbox/start.sh (100%) rename {benchmarking => cli}/sandbox/start_kernel.py (100%) rename {benchmarking => cli}/tools/__init__.py (100%) rename {benchmarking => cli}/tools/czi_browser.py (100%) rename {benchmarking => cli}/tools/output_to_notebook.py (100%) diff --git a/benchmarking/create_agent_system.sh b/benchmarking/create_agent_system.sh deleted file mode 100755 index f64997b..0000000 --- a/benchmarking/create_agent_system.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash -# move *out* of benchmarking/ into its parent (Olaf/) -cd "$(dirname "$0")"/.. -python -m benchmarking.agents.create_agent_system "$@" \ No newline at end of file diff --git a/benchmarking/run_automated.sh b/benchmarking/run_automated.sh deleted file mode 100755 index 6291c5e..0000000 --- a/benchmarking/run_automated.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash -# move *out* of benchmarking/ into its parent (Olaf/) -cd "$(dirname "$0")"/.. -python -m benchmarking.prompt_testing.MultiAgentAutoTester "$@" \ No newline at end of file diff --git a/benchmarking/run_interactive.sh b/benchmarking/run_interactive.sh deleted file mode 100755 index 0021c95..0000000 --- a/benchmarking/run_interactive.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash -# move *out* of benchmarking/ into its parent (Olaf/) -cd "$(dirname "$0")"/.. 
-python -m benchmarking.prompt_testing.MultiAgentTester "$@"
\ No newline at end of file
diff --git a/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc b/benchmarking/sandbox/__pycache__/benchmarking_sandbox_management.cpython-311.pyc
deleted file mode 100644
index 015092a5d2145a7f8fecc0efef2dee6f7d4b566b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
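The file removed above is a committed bytecode cache (`__pycache__/*.pyc`), which should never live in version control. A helper in the spirit of the sketch below can purge such caches before committing; the helper itself is illustrative and not part of this patch series. Adding `__pycache__/` to the package's .gitignore achieves the same end declaratively.

# purge_pycache.py -- illustrative helper, NOT part of this patch series.
# Removes every __pycache__ directory under a root so stale bytecode like the
# .pyc deleted above cannot be committed again.
import shutil
from pathlib import Path

def purge_pycache(root: Path) -> int:
    """Delete all __pycache__ directories below `root`; return how many."""
    removed = 0
    for cache_dir in root.rglob("__pycache__"):
        if cache_dir.is_dir():
            shutil.rmtree(cache_dir)
            removed += 1
    return removed

if __name__ == "__main__":
    print(purge_pycache(Path("cli")), "cache directories removed")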
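Every import path changes at once with this rename, so out-of-tree code that still does `import benchmarking...` breaks at import time. If a migration window were needed, a stub package along these lines could bridge the old name to the new one. This is a hypothetical sketch; only the package names `benchmarking` and `cli` come from this patch:

# benchmarking/__init__.py -- hypothetical transition shim, NOT part of this PR.
import sys
from importlib import import_module

# Re-point the old top-level name at the renamed package so legacy imports
# such as `import benchmarking.core.io_helpers` still resolve.
_renamed = import_module("cli")
sys.modules[__name__] = _renamed
# Caveat: submodules imported through the old name are loaded as separate
# module objects, so this is only suitable as a short-lived bridge.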
 def define_global_policy() -> str:
     """Asks the user to define a global policy for all agents."""
@@ -33,7 +33,7 @@ def define_global_policy() -> str:
 
 def get_output_directory() -> str:
     """Asks the user for an output directory, with a default option."""
-    default_dir = "benchmarking/agent_systems"
+    default_dir = "cli/agent_systems"
     dir_prompt = f"{Colors.WARNING}Enter the output directory (press Enter to use '{default_dir}'): {Colors.ENDC}"
     user_input = input(dir_prompt).strip()
     return user_input or default_dir
diff --git a/benchmarking/agents/integration_system.json b/cli/agents/integration_system.json
similarity index 100%
rename from benchmarking/agents/integration_system.json
rename to cli/agents/integration_system.json
diff --git a/benchmarking/agents/system_blueprint.json b/cli/agents/system_blueprint.json
similarity index 100%
rename from benchmarking/agents/system_blueprint.json
rename to cli/agents/system_blueprint.json
diff --git a/benchmarking/auto_metrics/AutoMetric.py b/cli/auto_metrics/AutoMetric.py
similarity index 100%
rename from benchmarking/auto_metrics/AutoMetric.py
rename to cli/auto_metrics/AutoMetric.py
diff --git a/benchmarking/auto_metrics/CellCountMetric.py b/cli/auto_metrics/CellCountMetric.py
similarity index 100%
rename from benchmarking/auto_metrics/CellCountMetric.py
rename to cli/auto_metrics/CellCountMetric.py
diff --git a/benchmarking/auto_metrics/CellTypingMetric.py b/cli/auto_metrics/CellTypingMetric.py
similarity index 100%
rename from
benchmarking/auto_metrics/CellTypingMetric.py rename to cli/auto_metrics/CellTypingMetric.py diff --git a/benchmarking/auto_metrics/IntegrationMetrics.py b/cli/auto_metrics/IntegrationMetrics.py similarity index 100% rename from benchmarking/auto_metrics/IntegrationMetrics.py rename to cli/auto_metrics/IntegrationMetrics.py diff --git a/benchmarking/code_samples/load_adata.py b/cli/code_samples/load_adata.py similarity index 100% rename from benchmarking/code_samples/load_adata.py rename to cli/code_samples/load_adata.py diff --git a/benchmarking/core/__init__.py b/cli/core/__init__.py similarity index 100% rename from benchmarking/core/__init__.py rename to cli/core/__init__.py diff --git a/benchmarking/core/io_helpers.py b/cli/core/io_helpers.py similarity index 98% rename from benchmarking/core/io_helpers.py rename to cli/core/io_helpers.py index 651d96e..2b9508f 100644 --- a/benchmarking/core/io_helpers.py +++ b/cli/core/io_helpers.py @@ -111,8 +111,8 @@ def load_bp_json(console) -> Path: If multiple are found, prompt user to choose or enter manual path. """ search_paths = [ - Path.home() / "Olaf" / "benchmarking" / "agents", - Path.cwd() / "benchmarking" / "agents", + Path.home() / "Olaf" / "cli" / "agents", + Path.cwd() / "cli" / "agents", Path.cwd() / "agents" ] diff --git a/benchmarking/core/ollama_wrapper.py b/cli/core/ollama_wrapper.py similarity index 100% rename from benchmarking/core/ollama_wrapper.py rename to cli/core/ollama_wrapper.py diff --git a/benchmarking/core/sandbox_management.py b/cli/core/sandbox_management.py similarity index 96% rename from benchmarking/core/sandbox_management.py rename to cli/core/sandbox_management.py index 0474b67..94dae13 100644 --- a/benchmarking/core/sandbox_management.py +++ b/cli/core/sandbox_management.py @@ -3,7 +3,7 @@ from pathlib import Path import json -from benchmarking.sandbox.benchmarking_sandbox_management import ( +from cli.sandbox.benchmarking_sandbox_management import ( SandboxManager as _BackendManager, CONTAINER_NAME as _SANDBOX_HANDLE, IMAGE_TAG as _SANDBOX_IMAGE, @@ -32,7 +32,7 @@ def COPY_CMD(src: str, dst: str): return _BackendManager, _SANDBOX_HANDLE, COPY_CMD, EXECUTE_ENDPOINT, STATUS_ENDPOINT def init_singularity(script_dir:str, subprocess, console, force_refresh:bool=False): - import benchmarking.sandbox.benchmarking_sandbox_management_singularity as sing + import cli.sandbox.benchmarking_sandbox_management_singularity as sing sandbox_dir = script_dir / "sandbox" # optional force‑refresh @@ -72,7 +72,7 @@ def COPY_CMD(src: str, dst: str): def init_singularity_exec(script_dir: str, sanbox_data_path, subprocess, console, force_refresh: bool = False): - import benchmarking.sandbox.benchmarking_sandbox_management_singularity as sing + import cli.sandbox.benchmarking_sandbox_management_singularity as sing sandbox_dir = script_dir / "sandbox" # optional force‑refresh diff --git a/cli/create_agent_system.sh b/cli/create_agent_system.sh new file mode 100755 index 0000000..bc95946 --- /dev/null +++ b/cli/create_agent_system.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +# move *out* of cli/ into its parent (Olaf/) +cd "$(dirname "$0")"/.. 
+python -m cli.agents.create_agent_system "$@" \ No newline at end of file diff --git a/benchmarking/create_benchmark_env.sh b/cli/create_benchmark_env.sh similarity index 100% rename from benchmarking/create_benchmark_env.sh rename to cli/create_benchmark_env.sh diff --git a/benchmarking/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.h5ad b/cli/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.h5ad similarity index 100% rename from benchmarking/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.h5ad rename to cli/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.h5ad diff --git a/benchmarking/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.json b/cli/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.json similarity index 100% rename from benchmarking/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.json rename to cli/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.json diff --git a/benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad b/cli/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad similarity index 100% rename from benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad rename to cli/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad diff --git a/benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json b/cli/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json similarity index 100% rename from benchmarking/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json rename to cli/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json diff --git a/benchmarking/prompt_testing/MultiAgentAutoTester.py b/cli/prompt_testing/MultiAgentAutoTester.py similarity index 98% rename from benchmarking/prompt_testing/MultiAgentAutoTester.py rename to cli/prompt_testing/MultiAgentAutoTester.py index 8cf557d..4d73ce0 100644 --- a/benchmarking/prompt_testing/MultiAgentAutoTester.py +++ b/cli/prompt_testing/MultiAgentAutoTester.py @@ -35,7 +35,7 @@ try: from dotenv import load_dotenv if BACKEND_CHOICE == "ollama": - from benchmarking.core.ollama_wrapper import OllamaClient as OpenAI + from cli.core.ollama_wrapper import OllamaClient as OpenAI APIError = Exception # Ollama does not have a specific APIError else: from openai import OpenAI, APIError @@ -47,13 +47,13 @@ # ── Agent framework --------------------------------------------------------- try: - from benchmarking.agents.AgentSystem import AgentSystem, Agent + from cli.agents.AgentSystem import AgentSystem, Agent except ImportError: print("[ERROR] Could not import backend.agents.agent_system", file=sys.stderr) raise # ── Local helpers ----------------------------------------------------------- -from benchmarking.core.io_helpers import ( +from cli.core.io_helpers import ( extract_python_code, display, select_dataset, @@ -62,7 +62,7 @@ format_execute_response, load_bp_json ) -from benchmarking.core.sandbox_management import ( +from cli.core.sandbox_management import ( init_docker, init_singularity, init_singularity_exec, diff --git a/benchmarking/prompt_testing/MultiAgentTester.py b/cli/prompt_testing/MultiAgentTester.py similarity index 97% rename from benchmarking/prompt_testing/MultiAgentTester.py rename to cli/prompt_testing/MultiAgentTester.py index 19a587d..78d8ef2 100644 --- a/benchmarking/prompt_testing/MultiAgentTester.py +++ b/cli/prompt_testing/MultiAgentTester.py @@ -47,22 +47,25 @@ # ── Dependencies ------------------------------------------------------------ try: from dotenv import load_dotenv - from openai import 
OpenAI, APIError + if BACKEND_CHOICE == "ollama": + from cli.core.ollama_wrapper import OllamaClient as OpenAI + APIError = Exception # Ollama does not have a specific APIError + else: + from openai import OpenAI, APIError import requests from rich.console import Console - except ImportError as e: print(f"Missing dependency: {e}", file=sys.stderr) sys.exit(1) # ── Agent framework --------------------------------------------------------- try: - from benchmarking.agents.AgentSystem import AgentSystem, Agent + from cli.agents.AgentSystem import AgentSystem, Agent except ImportError: print("[ERROR] Could not import backend.agents.agent_system", file=sys.stderr) raise # ── Local helpers ----------------------------------------------------------- -from benchmarking.core.io_helpers import ( +from cli.core.io_helpers import ( extract_python_code, display, select_dataset, @@ -71,7 +74,7 @@ format_execute_response, load_bp_json ) -from benchmarking.core.sandbox_management import ( +from cli.core.sandbox_management import ( init_docker, init_singularity, init_singularity_exec, @@ -187,7 +190,7 @@ def build_system(a: Agent) -> str: openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) else: # Local Ollama needs no key; model defaults to “llama2” - openai = OpenAI(host=OLLAMA_HOST) + openai = OpenAI(host=OLLAMA_HOST, model="deepseek-r1:70b") current_agent = agent turn = 0 diff --git a/benchmarking/prompt_testing/__init__.py b/cli/prompt_testing/__init__.py similarity index 100% rename from benchmarking/prompt_testing/__init__.py rename to cli/prompt_testing/__init__.py diff --git a/benchmarking/prompt_testing/extra_tools/Evaluator.py b/cli/prompt_testing/extra_tools/Evaluator.py similarity index 100% rename from benchmarking/prompt_testing/extra_tools/Evaluator.py rename to cli/prompt_testing/extra_tools/Evaluator.py diff --git a/benchmarking/prompt_testing/extra_tools/InteractiveAgentTester.py b/cli/prompt_testing/extra_tools/InteractiveAgentTester.py similarity index 97% rename from benchmarking/prompt_testing/extra_tools/InteractiveAgentTester.py rename to cli/prompt_testing/extra_tools/InteractiveAgentTester.py index 68edfcb..fb819e0 100644 --- a/benchmarking/prompt_testing/extra_tools/InteractiveAgentTester.py +++ b/cli/prompt_testing/extra_tools/InteractiveAgentTester.py @@ -46,8 +46,8 @@ sys.exit(1) # -- Local imports --------------------------------------------------------------- -from benchmarking.core.io_helpers import extract_python_code, display, select_dataset, collect_resources, get_initial_prompt, format_execute_response -from benchmarking.core.sandbox_management import init_docker, init_singularity, init_singularity_exec +from cli.core.io_helpers import extract_python_code, display, select_dataset, collect_resources, get_initial_prompt, format_execute_response +from cli.core.sandbox_management import init_docker, init_singularity, init_singularity_exec console = Console() diff --git a/benchmarking/prompt_testing/extra_tools/OneShotAgentTester.py b/cli/prompt_testing/extra_tools/OneShotAgentTester.py similarity index 100% rename from benchmarking/prompt_testing/extra_tools/OneShotAgentTester.py rename to cli/prompt_testing/extra_tools/OneShotAgentTester.py diff --git a/benchmarking/prompt_testing/extra_tools/PromptEvolver.py b/cli/prompt_testing/extra_tools/PromptEvolver.py similarity index 100% rename from benchmarking/prompt_testing/extra_tools/PromptEvolver.py rename to cli/prompt_testing/extra_tools/PromptEvolver.py diff --git a/benchmarking/requirements.txt b/cli/requirements.txt 
similarity index 100% rename from benchmarking/requirements.txt rename to cli/requirements.txt diff --git a/cli/run_automated.sh b/cli/run_automated.sh new file mode 100755 index 0000000..12aed66 --- /dev/null +++ b/cli/run_automated.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +# move *out* of cli/ into its parent (Olaf/) +cd "$(dirname "$0")"/.. +python -m cli.prompt_testing.MultiAgentAutoTester "$@" \ No newline at end of file diff --git a/cli/run_interactive.sh b/cli/run_interactive.sh new file mode 100755 index 0000000..6373db0 --- /dev/null +++ b/cli/run_interactive.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +# move *out* of cli/ into its parent (Olaf/) +cd "$(dirname "$0")"/.. +python -m cli.prompt_testing.MultiAgentTester "$@" \ No newline at end of file diff --git a/benchmarking/sample_prompt_library/Basic_scRNA_Agent.txt b/cli/sample_prompt_library/Basic_scRNA_Agent.txt similarity index 100% rename from benchmarking/sample_prompt_library/Basic_scRNA_Agent.txt rename to cli/sample_prompt_library/Basic_scRNA_Agent.txt diff --git a/benchmarking/sandbox/Dockerfile b/cli/sandbox/Dockerfile similarity index 100% rename from benchmarking/sandbox/Dockerfile rename to cli/sandbox/Dockerfile diff --git a/benchmarking/sandbox/Singularity b/cli/sandbox/Singularity similarity index 100% rename from benchmarking/sandbox/Singularity rename to cli/sandbox/Singularity diff --git a/benchmarking/sandbox/__init__.py b/cli/sandbox/__init__.py similarity index 100% rename from benchmarking/sandbox/__init__.py rename to cli/sandbox/__init__.py diff --git a/benchmarking/sandbox/benchmarking_sandbox_management.py b/cli/sandbox/benchmarking_sandbox_management.py similarity index 100% rename from benchmarking/sandbox/benchmarking_sandbox_management.py rename to cli/sandbox/benchmarking_sandbox_management.py diff --git a/benchmarking/sandbox/benchmarking_sandbox_management_singularity.py b/cli/sandbox/benchmarking_sandbox_management_singularity.py similarity index 100% rename from benchmarking/sandbox/benchmarking_sandbox_management_singularity.py rename to cli/sandbox/benchmarking_sandbox_management_singularity.py diff --git a/benchmarking/sandbox/kernel_api.py b/cli/sandbox/kernel_api.py similarity index 100% rename from benchmarking/sandbox/kernel_api.py rename to cli/sandbox/kernel_api.py diff --git a/benchmarking/sandbox/offline_kernel.py b/cli/sandbox/offline_kernel.py similarity index 100% rename from benchmarking/sandbox/offline_kernel.py rename to cli/sandbox/offline_kernel.py diff --git a/benchmarking/sandbox/requirements.txt b/cli/sandbox/requirements.txt similarity index 100% rename from benchmarking/sandbox/requirements.txt rename to cli/sandbox/requirements.txt diff --git a/benchmarking/sandbox/start.sh b/cli/sandbox/start.sh similarity index 100% rename from benchmarking/sandbox/start.sh rename to cli/sandbox/start.sh diff --git a/benchmarking/sandbox/start_kernel.py b/cli/sandbox/start_kernel.py similarity index 100% rename from benchmarking/sandbox/start_kernel.py rename to cli/sandbox/start_kernel.py diff --git a/benchmarking/tools/__init__.py b/cli/tools/__init__.py similarity index 100% rename from benchmarking/tools/__init__.py rename to cli/tools/__init__.py diff --git a/benchmarking/tools/czi_browser.py b/cli/tools/czi_browser.py similarity index 100% rename from benchmarking/tools/czi_browser.py rename to cli/tools/czi_browser.py diff --git a/benchmarking/tools/output_to_notebook.py b/cli/tools/output_to_notebook.py similarity index 100% rename from benchmarking/tools/output_to_notebook.py 
rename to cli/tools/output_to_notebook.py From 42831244d9cd257fdf994a7ce3570eeda7109d1e Mon Sep 17 00:00:00 2001 From: djriffle Date: Wed, 30 Jul 2025 16:00:41 -0400 Subject: [PATCH 02/14] merged auto and interactive to one script --- cli/prompt_testing/MultiAgentAutoTester.py | 449 ------------------ cli/prompt_testing/MultiAgentTester.py | 514 ++++++++++++++------- cli/run_automated.sh | 2 +- 3 files changed, 336 insertions(+), 629 deletions(-) delete mode 100644 cli/prompt_testing/MultiAgentAutoTester.py diff --git a/cli/prompt_testing/MultiAgentAutoTester.py b/cli/prompt_testing/MultiAgentAutoTester.py deleted file mode 100644 index 4d73ce0..0000000 --- a/cli/prompt_testing/MultiAgentAutoTester.py +++ /dev/null @@ -1,449 +0,0 @@ -#!/usr/bin/env python3 -""" -Interactive Auto Agent System Tester (v1.2-auto) -========================================== -""" -from __future__ import annotations - -import base64 -import json -import os -import re -import subprocess -import sys -import textwrap -import time -from datetime import datetime -from pathlib import Path -from typing import List, Tuple, Optional, Dict - -from rich.table import Table -# -- Pick LLM backend --------------------------------------------------- -from rich.prompt import Prompt -BACKEND_CHOICE = Prompt.ask( - "LLM backend", - choices=["chatgpt", "ollama"], - default="chatgpt", -) -OLLAMA_HOST = "http://localhost:11434" -if BACKEND_CHOICE == "ollama": - OLLAMA_HOST = Prompt.ask( - "Ollama base URL", - default="http://localhost:11434", - ) -# ── Dependencies ------------------------------------------------------------ -try: - from dotenv import load_dotenv - if BACKEND_CHOICE == "ollama": - from cli.core.ollama_wrapper import OllamaClient as OpenAI - APIError = Exception # Ollama does not have a specific APIError - else: - from openai import OpenAI, APIError - import requests - from rich.console import Console -except ImportError as e: - print(f"Missing dependency: {e}", file=sys.stderr) - sys.exit(1) - -# ── Agent framework --------------------------------------------------------- -try: - from cli.agents.AgentSystem import AgentSystem, Agent -except ImportError: - print("[ERROR] Could not import backend.agents.agent_system", file=sys.stderr) - raise - -# ── Local helpers ----------------------------------------------------------- -from cli.core.io_helpers import ( - extract_python_code, - display, - select_dataset, - collect_resources, - get_initial_prompt, - format_execute_response, - load_bp_json -) -from cli.core.sandbox_management import ( - init_docker, - init_singularity, - init_singularity_exec, -) - -console = Console() -SCRIPT_DIR = Path(__file__).resolve().parent -PARENT_DIR = SCRIPT_DIR.parent -DATASETS_DIR = PARENT_DIR / "datasets" -OUTPUTS_DIR = PARENT_DIR / "outputs" -ENV_FILE = PARENT_DIR / ".env" - -SANDBOX_DATA_PATH = "/workspace/dataset.h5ad" -SANDBOX_RESOURCES_DIR = "/workspace/resources" - -# ── Benchmark persistence -------------------------------------------------- -timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S") -_LEDGER_PATH = OUTPUTS_DIR / f"benchmark_history_{timestamp}.jsonl" -_SNIPPET_DIR = OUTPUTS_DIR / "snippets" -_SNIPPET_DIR.mkdir(exist_ok=True, parents=True) -_LEDGER_PATH.parent.mkdir(exist_ok=True, parents=True) - -def _dump_code_snippet(run_id: str, code: str) -> str: - """ - Write .py under outputs/snippets/ and return the relative path. 
- """ - snippet_path = _SNIPPET_DIR / f"{run_id}.py" - snippet_path.write_text(code, encoding="utf-8") - return str(snippet_path.relative_to(OUTPUTS_DIR)) - -def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str | None): - """ - Append a JSONL record containing timestamp, dataset metadata, metrics, and - a pointer to (or inline copy of) the integration code. - """ - record = { - "ts": datetime.utcnow().isoformat(timespec="seconds") + "Z", - "run": run_id, - "dataset": meta.get("name"), - "results": results, - } - if code: - record["code_path"] = _dump_code_snippet(run_id, code) - with _LEDGER_PATH.open("a") as fh: - fh.write(json.dumps(record) + "\n") - -# =========================================================================== -# 1 · Backend selection -# =========================================================================== -backend = Prompt.ask( - "Choose sandbox backend", choices=["docker", "singularity", "singularity-exec"], default="docker" -) -force_refresh = ( - Prompt.ask("Force refresh environment?", choices=["y", "n"], default="n").lower() == "y" -) -is_exec_mode = backend == "singularity-exec" - -if backend == "docker": - ( - _BackendManager, - _SANDBOX_HANDLE, - COPY_CMD, - EXECUTE_ENDPOINT, - STATUS_ENDPOINT, - ) = init_docker(SCRIPT_DIR, subprocess, console, force_refresh) - SANDBOX_DATA_PATH = "dataset.h5ad" -elif backend == "singularity": - ( - _BackendManager, - _SANDBOX_HANDLE, - COPY_CMD, - EXECUTE_ENDPOINT, - STATUS_ENDPOINT, - ) = init_singularity(SCRIPT_DIR, subprocess, console, force_refresh) -elif backend == "singularity-exec": - ( - _BackendManager, - _SANDBOX_HANDLE, - COPY_CMD, - EXECUTE_ENDPOINT, - STATUS_ENDPOINT, - ) = init_singularity_exec(SCRIPT_DIR, SANDBOX_DATA_PATH, subprocess, console, force_refresh) -else: - console.print("[red]Unknown backend.") - sys.exit(1) - -# =========================================================================== -# 2 · Agent helpers -# =========================================================================== -def load_agent_system() -> Tuple[AgentSystem, Agent, str]: - """Load the agent system from a JSON blueprint.""" - bp = load_bp_json(console) - if not bp.exists(): - console.print(f"[red]Blueprint {bp} not found.") - sys.exit(1) - system = AgentSystem.load_from_json(str(bp)) - driver_name = Prompt.ask("Driver agent", choices=list(system.agents.keys()), default=list(system.agents)[0]) - driver = system.get_agent(driver_name) - instr = system.get_instructions() - return system, driver, instr - -_DELEG_RE = re.compile(r"delegate_to_([A-Za-z0-9_]+)") - -def detect_delegation(msg: str) -> Optional[str]: - """Return the *full* command name (e.g. 
'delegate_to_coder') if present.""" - m = _DELEG_RE.search(msg) - return f"delegate_to_{m.group(1)}" if m else None - -def api_alive(url: str, tries: int = 10) -> bool: - """Check if the API is responsive.""" - if is_exec_mode: - return True - for _ in range(tries): - try: - if requests.get(url, timeout=2).json().get("status") == "ok": - return True - except Exception: - time.sleep(1.5) - return False - -# =========================================================================== -# 3 · Interactive *or* Automated loop -# =========================================================================== -def run( - agent_system: AgentSystem, - agent: Agent, - roster_instr: str, - dataset: Path, - metadata: dict, - resources: List[Tuple[Path, str]], - benchmark_module: Optional[Path] = None, - *, - initial_user_message: str, - tries: int = 0, -): - """Main driver""" - last_code_snippet: str | None = None - mgr = _BackendManager() - console.print(f"Launching sandbox ({backend})…") - - if is_exec_mode and hasattr(mgr, "set_data"): - mgr.set_data(dataset, resources) - if not mgr.start_container(): - console.print("[red]Failed to start sandbox") - return - if not api_alive(STATUS_ENDPOINT): - console.print("[red]Kernel API not responsive.") - return - - if not is_exec_mode: - COPY_CMD(str(dataset), f"{_SANDBOX_HANDLE}:{SANDBOX_DATA_PATH}") - for hp, cp in resources: - COPY_CMD(str(hp), f"{_SANDBOX_HANDLE}:{cp}") - - res_lines = [f"- {c} (from {h})" for h, c in resources] or ["- (none)"] - analysis_ctx = textwrap.dedent( - f"Dataset path: **{SANDBOX_DATA_PATH}**\nResources:\n" + "\n".join(res_lines) + "\n\nMetadata:\n" + json.dumps(metadata, indent=2) - ) - - def build_system(a: Agent) -> str: - return roster_instr + "\n\n" + a.get_full_prompt(agent_system.global_policy) + "\n\n" + analysis_ctx - - history = [{"role": "system", "content": build_system(agent)}] - history.append({"role": "user", "content": initial_user_message}) - display(console, "system", history[0]["content"]) - display(console, "user", initial_user_message) - - if BACKEND_CHOICE == "chatgpt": - if not os.getenv("OPENAI_API_KEY"): - console.print("[red]OPENAI_API_KEY not set in .env") - sys.exit(1) - openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) - else: - # Local Ollama needs no key; model defaults to “llama2” - openai = OpenAI(host=OLLAMA_HOST, model="deepseek-r1:70b") - current_agent = agent - turn = 0 - - tries_left = tries - - while True: - turn += 1 - console.print(f"\n[bold]OpenAI call (turn {turn})…") - try: - resp = openai.chat.completions.create(model="gpt-4o", messages=history, temperature=0.7) - except APIError as e: - console.print(f"[red]OpenAI error: {e}") - break - msg = resp.choices[0].message.content - history.append({"role": "assistant", "content": msg}) - display(console, f"assistant ({current_agent.name})", msg) - - # ── Delegation -------------------------------------------------------- - cmd = detect_delegation(msg) - if cmd and cmd in current_agent.commands: - tgt = current_agent.commands[cmd].target_agent - new_agent = agent_system.get_agent(tgt) - if new_agent: - console.print(f"[yellow]🔄 Routing to '{tgt}' via {cmd}") - history.append({"role": "assistant", "content": f"🔄 Routing to **{tgt}** (command `{cmd}`)"}) - - # INJECT LOADED CODE SAMPLES ON DELEGATION --- - if new_agent.code_samples: - sample_context = "Here are some relevant code samples for your task:" - for filename, code_content in new_agent.code_samples.items(): - sample_context += f"\n\n--- Sample from: {filename} ---\n" - sample_context 
+= f"```python\n{code_content.strip()}\n```" - - history.append({"role": "user", "content": sample_context}) - display(console, "user", sample_context) # Display for clarity - - current_agent = new_agent - history.insert(0, {"role": "system", "content": build_system(new_agent)}) - continue - - # ── Inline code execution ------------------------------------------- - code = extract_python_code(msg) - if code: - last_code_snippet = code - console.print("[cyan]Executing code…[/cyan]") - try: - if is_exec_mode: - exec_result = mgr.exec_code(code, timeout=300) - else: - exec_result = requests.post( - EXECUTE_ENDPOINT, json={"code": code, "timeout": 300}, timeout=310 - ).json() - feedback = format_execute_response(exec_result, OUTPUTS_DIR) - except Exception as exc: - feedback = f"Code execution result:\n[Execution error on host: {exc}]" - - history.append({"role": "user", "content": feedback}) - display(console, "user", feedback) - - # ── Automatic benchmarking (v1.2 addition) -------------------------- - if benchmark_module: - result_str = run_benchmark(mgr, benchmark_module, metadata, current_agent.name, last_code_snippet) - if result_str: - history.append({"role": "user", "content": result_str}) - display(console, "user", result_str) - tries_left -= 1 - if tries_left <= 0: - break - # Simulate blank *continue* from the user - history.append({"role": "user", "content": ""}) - continue - console.print("Stopping sandbox…") - mgr.stop_container() - -# =========================================================================== -# 4 · Benchmarking helpers (modified to *return* results) -# =========================================================================== -def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]: - """Prompt user to select a benchmark module.""" - benchmark_dir = parent_dir / "auto_metrics" - if not benchmark_dir.exists(): - console.print("[red]No benchmarks directory found.[/red]") - return None - - modules = [m for m in benchmark_dir.glob("*.py") if m.name != "AutoMetric.py"] - if not modules: - console.print("[red]No benchmark modules found.[/red]") - return None - - console.print("\n[bold]Available benchmark modules:[/bold]") - for i, mod in enumerate(modules, start=1): - console.print(f"{i}. {mod.name}") - - choice = Prompt.ask("Select a benchmark module by number (or press Enter to skip)", default="") - if not choice: - return None - - try: - index = int(choice) - 1 - if 0 <= index < len(modules): - return modules[index] - else: - console.print("[red]Invalid selection.[/red]") - return None - except ValueError: - console.print("[red]Invalid input. 
Please enter a number.[/red]") - return None - - -def run_benchmark(mgr, benchmark_module: Path, metadata: dict, - agent_name: str, code_snippet: str | None) -> str: - """Execute benchmark module and *return* a compact JSON string.""" - console.print(f"\n[bold cyan]Running benchmark module: {benchmark_module.name}[/bold cyan]") - autometric_base_path = benchmark_module.parent / "AutoMetric.py" - try: - with open(autometric_base_path, "r") as f: - autometric_code = f.read() - with open(benchmark_module, "r") as f: - benchmark_code = f.read() - except FileNotFoundError: - err = f"Benchmark module not found at: {benchmark_module}" - console.print(f"[red]{err}[/red]") - return err - - code_to_execute = f""" -# --- Code from AutoMetric.py --- -{autometric_code} -# --- Code from {benchmark_module.name} --- -{benchmark_code} -""" - console.print("[cyan]Executing benchmark code...[/cyan]") - try: - if is_exec_mode: - exec_result = mgr.exec_code(code_to_execute, timeout=300) - else: - exec_result = requests.post( - EXECUTE_ENDPOINT, json={"code": code_to_execute, "timeout": 300}, timeout=310 - ).json() - - table = Table(title="Benchmark Results") - table.add_column("Metric", style="cyan") - table.add_column("Value", style="magenta") - stdout = exec_result.get("stdout", "") - try: - result_dict = json.loads(stdout.strip().splitlines()[-1]) - except Exception as e: - console.print(f"[yellow]Warning: Could not parse JSON from stdout: {e}[/yellow]") - result_dict = {} - - if exec_result.get("status") == "ok" and isinstance(result_dict, dict): - for key, value in result_dict.items(): - table.add_row(str(key), str(value)) - _save_benchmark_record( - run_id=f"{benchmark_module.stem}:{agent_name}:{int(time.time())}", - results=result_dict, - meta=metadata, - code=code_snippet, - ) - else: - table.add_row("Error", exec_result.get("stderr") or "An unknown error occurred.") - console.print(table) - return "Benchmark results:\n" + json.dumps(result_dict or {"error": "see console"}) - except Exception as exc: - err_msg = f"Benchmark execution error: {exc}" - console.print(f"[red]{err_msg}[/red]") - return err_msg - -# =========================================================================== -# 5 · Entry point (collect *tries* & initial message) -# =========================================================================== -def main(): - load_dotenv(ENV_FILE) - if not os.getenv("OPENAI_API_KEY"): - console.print("[red]OPENAI_API_KEY not set in .env") - sys.exit(1) - - sys_, drv, roster = load_agent_system() - dp, meta = select_dataset(console, DATASETS_DIR) - benchmark_module = get_benchmark_module(console, PARENT_DIR) - res = collect_resources(console, SANDBOX_RESOURCES_DIR) - - initial_user_message = Prompt.ask( - "Initial user message", default="What should I do with this dataset?" 
- ) - try: - tries = int(Prompt.ask("Number of automatic tries", default="1")) - if tries < 0: - raise ValueError - except ValueError: - console.print("[yellow]Invalid number – defaulting to 1.[/yellow]") - tries = 1 - - run( - sys_, - drv, - roster, - dp, - meta, - res, - benchmark_module, - initial_user_message=initial_user_message, - tries=tries, - ) - -if __name__ == "__main__": - try: - main() - except KeyboardInterrupt: - console.print("\nInterrupted.") \ No newline at end of file diff --git a/cli/prompt_testing/MultiAgentTester.py b/cli/prompt_testing/MultiAgentTester.py index 78d8ef2..adcf003 100644 --- a/cli/prompt_testing/MultiAgentTester.py +++ b/cli/prompt_testing/MultiAgentTester.py @@ -1,24 +1,18 @@ #!/usr/bin/env python3 """ -Interactive Agent System Tester (v1.1) -====================================== -• **New in v1.1** – Smarter delegation detection. - The router now recognises any of the following patterns in an assistant reply - when deciding to switch agents: - - ```text - //delegate_to_coder - delegate_to_coder - `delegate_to_coder` - Executing command: `delegate_to_coder` - ``` - - No need to rigidly start the reply with the token – the regex scans the whole - message. Once detected, we alert the user ("🔄 Routing to …") and prepend the - new agent’s system prompt. +Interactive and Auto Agent System Tester (v1.4-refactored) +========================================================= +This script combines two execution modes: +- Interactive Mode: A standard chat-like interface for manual testing. +- Automated Mode: Runs the agent with a given prompt for a set number of turns + for benchmarking purposes. + +Use the --auto flag to enable automated mode. +This version has been refactored to reduce code duplication. """ from __future__ import annotations +import argparse import base64 import json import os @@ -29,10 +23,12 @@ import time from datetime import datetime from pathlib import Path -from typing import List, Tuple, Optional, Dict +from typing import Dict, List, Optional, Tuple + +from rich.prompt import Prompt from rich.table import Table +# -- Pick LLM backend --------------------------------------------------- from rich.prompt import Prompt - BACKEND_CHOICE = Prompt.ask( "LLM backend", choices=["chatgpt", "ollama"], @@ -47,32 +43,35 @@ # ── Dependencies ------------------------------------------------------------ try: from dotenv import load_dotenv + if BACKEND_CHOICE == "ollama": from cli.core.ollama_wrapper import OllamaClient as OpenAI APIError = Exception # Ollama does not have a specific APIError else: - from openai import OpenAI, APIError + from openai import APIError, OpenAI + import requests from rich.console import Console except ImportError as e: print(f"Missing dependency: {e}", file=sys.stderr) sys.exit(1) + # ── Agent framework --------------------------------------------------------- try: - from cli.agents.AgentSystem import AgentSystem, Agent + from cli.agents.AgentSystem import Agent, AgentSystem except ImportError: print("[ERROR] Could not import backend.agents.agent_system", file=sys.stderr) raise # ── Local helpers ----------------------------------------------------------- from cli.core.io_helpers import ( - extract_python_code, - display, - select_dataset, collect_resources, - get_initial_prompt, + display, + extract_python_code, format_execute_response, - load_bp_json + get_initial_prompt, + load_bp_json, + select_dataset, ) from cli.core.sandbox_management import ( init_docker, @@ -90,46 +89,81 @@ SANDBOX_DATA_PATH = 
"/workspace/dataset.h5ad" SANDBOX_RESOURCES_DIR = "/workspace/resources" +# ── Benchmark persistence -------------------------------------------------- +timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S") +_LEDGER_PATH = OUTPUTS_DIR / f"benchmark_history_{timestamp}.jsonl" +_SNIPPET_DIR = OUTPUTS_DIR / "snippets" +_SNIPPET_DIR.mkdir(exist_ok=True, parents=True) +_LEDGER_PATH.parent.mkdir(exist_ok=True, parents=True) + # =========================================================================== # 1 · Backend selection # =========================================================================== -backend = Prompt.ask("Choose sandbox backend", choices=["docker", "singularity", "singularity-exec"], default="docker") -force_refresh = Prompt.ask("Force refresh environment?", choices=["y", "n"], default="n").lower() == "y" +backend = Prompt.ask( + "Choose sandbox backend", + choices=["docker", "singularity", "singularity-exec"], + default="docker", +) +force_refresh = ( + Prompt.ask("Force refresh environment?", choices=["y", "n"], default="n").lower() == "y" +) is_exec_mode = backend == "singularity-exec" if backend == "docker": - _BackendManager, _SANDBOX_HANDLE, COPY_CMD, EXECUTE_ENDPOINT, STATUS_ENDPOINT = init_docker( - SCRIPT_DIR, subprocess, console, force_refresh - ) + ( + _BackendManager, + _SANDBOX_HANDLE, + COPY_CMD, + EXECUTE_ENDPOINT, + STATUS_ENDPOINT, + ) = init_docker(SCRIPT_DIR, subprocess, console, force_refresh) SANDBOX_DATA_PATH = "dataset.h5ad" elif backend == "singularity": - _BackendManager, _SANDBOX_HANDLE, COPY_CMD, EXECUTE_ENDPOINT, STATUS_ENDPOINT = init_singularity( - SCRIPT_DIR, subprocess, console, force_refresh - ) + ( + _BackendManager, + _SANDBOX_HANDLE, + COPY_CMD, + EXECUTE_ENDPOINT, + STATUS_ENDPOINT, + ) = init_singularity(SCRIPT_DIR, subprocess, console, force_refresh) elif backend == "singularity-exec": - _BackendManager, _SANDBOX_HANDLE, COPY_CMD, EXECUTE_ENDPOINT, STATUS_ENDPOINT = init_singularity_exec( + ( + _BackendManager, + _SANDBOX_HANDLE, + COPY_CMD, + EXECUTE_ENDPOINT, + STATUS_ENDPOINT, + ) = init_singularity_exec( SCRIPT_DIR, SANDBOX_DATA_PATH, subprocess, console, force_refresh ) else: console.print("[red]Unknown backend.") sys.exit(1) + # =========================================================================== -# 2 · Agent helpers +# 2 · Common Helpers # =========================================================================== - def load_agent_system() -> Tuple[AgentSystem, Agent, str]: + """Load the agent system from a JSON blueprint.""" bp = load_bp_json(console) + if not bp.exists(): + console.print(f"[red]Blueprint {bp} not found.") + sys.exit(1) system = AgentSystem.load_from_json(str(bp)) - driver_name = Prompt.ask("Driver agent", choices=list(system.agents.keys()), default=list(system.agents)[0]) + driver_name = Prompt.ask( + "Driver agent", + choices=list(system.agents.keys()), + default=list(system.agents)[0], + ) driver = system.get_agent(driver_name) instr = system.get_instructions() return system, driver, instr -# Smarter regex – matches inline/backtick/explicit styles -# Match variations like //delegate_to_coder, with optional punctuation. + _DELEG_RE = re.compile(r"delegate_to_([A-Za-z0-9_]+)") + def detect_delegation(msg: str) -> Optional[str]: """Return the *full* command name (e.g. 
'delegate_to_coder') if present.""" m = _DELEG_RE.search(msg) @@ -137,6 +171,7 @@ def detect_delegation(msg: str) -> Optional[str]: def api_alive(url: str, tries: int = 10) -> bool: + """Check if the API is responsive.""" if is_exec_mode: return True for _ in range(tries): @@ -147,11 +182,126 @@ def api_alive(url: str, tries: int = 10) -> bool: time.sleep(1.5) return False + +def _dump_code_snippet(run_id: str, code: str) -> str: + """Write .py under outputs/snippets/ and return the relative path.""" + snippet_path = _SNIPPET_DIR / f"{run_id}.py" + snippet_path.write_text(code, encoding="utf-8") + return str(snippet_path.relative_to(OUTPUTS_DIR)) + + +def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str | None): + """Append a JSONL record for the benchmark run.""" + record = { + "ts": datetime.utcnow().isoformat(timespec="seconds") + "Z", + "run": run_id, + "dataset": meta.get("name"), + "results": results, + } + if code: + record["code_path"] = _dump_code_snippet(run_id, code) + with _LEDGER_PATH.open("a") as fh: + fh.write(json.dumps(record) + "\n") + + # =========================================================================== -# 3 · Interactive loop +# 3 · Unified Benchmark Runner # =========================================================================== +def run_benchmark( + mgr, + benchmark_module: Path, + *, + is_auto: bool, + metadata: Optional[Dict] = None, + agent_name: Optional[str] = None, + code_snippet: Optional[str] = None, +) -> str: + """ + Execute a benchmark module. + In auto mode, saves results and returns a result string for the history. + In interactive mode, just prints results to the console. + """ + console.print( + f"\n[bold cyan]Running benchmark module: {benchmark_module.name}[/bold cyan]" + ) + autometric_base_path = benchmark_module.parent / "AutoMetric.py" + try: + with open(autometric_base_path, "r") as f: + autometric_code = f.read() + with open(benchmark_module, "r") as f: + benchmark_code = f.read() + except FileNotFoundError: + err = f"Benchmark module not found at: {benchmark_module}" + console.print(f"[red]{err}[/red]") + return err if is_auto else "" + + code_to_execute = f""" +# --- Code from AutoMetric.py --- +{autometric_code} +# --- Code from {benchmark_module.name} --- +{benchmark_code} +""" + console.print("[cyan]Executing benchmark code...[/cyan]") + try: + if is_exec_mode: + exec_result = mgr.exec_code(code_to_execute, timeout=300) + else: + exec_result = requests.post( + EXECUTE_ENDPOINT, json={"code": code_to_execute, "timeout": 300}, timeout=310 + ).json() + + table = Table(title="Benchmark Results") + table.add_column("Metric", style="cyan") + table.add_column("Value", style="magenta") + stdout = exec_result.get("stdout", "") + result_dict = {} + try: + result_dict = json.loads(stdout.strip().splitlines()[-1]) + except (json.JSONDecodeError, IndexError) as e: + console.print(f"[yellow]Warning: Could not parse JSON from stdout: {e}[/yellow]") + + if exec_result.get("status") == "ok" and isinstance(result_dict, dict): + for key, value in result_dict.items(): + table.add_row(str(key), str(value)) + if is_auto: + _save_benchmark_record( + run_id=f"{benchmark_module.stem}:{agent_name}:{int(time.time())}", + results=result_dict, + meta=metadata, + code=code_snippet, + ) + else: + table.add_row("Error", exec_result.get("stderr") or "An unknown error occurred.") + console.print(table) + + if is_auto: + return "Benchmark results:\n" + json.dumps(result_dict or {"error": "see console"}) + except Exception as exc: + 
err_msg = f"Benchmark execution error: {exc}" + console.print(f"[red]{err_msg}[/red]") + if is_auto: + return err_msg + return "" + -def run(agent_system: AgentSystem, agent: Agent, roster_instr: str, dataset: Path, metadata: dict, resources: List[Tuple[Path, str]], benchmark_modules: Optional[list[Path]] = None): +# =========================================================================== +# 4 · Unified Main Execution Loop +# =========================================================================== +def run( + agent_system: AgentSystem, + agent: Agent, + roster_instr: str, + dataset: Path, + metadata: dict, + resources: List[Tuple[Path, str]], + *, + is_auto: bool, + initial_user_message: str, + benchmark_modules: Optional[List[Path]] = None, + tries: int = 1, +): + """Main driver for both interactive and automated execution.""" + last_code_snippet: str | None = None mgr = _BackendManager() console.print(f"Launching sandbox ({backend})…") @@ -171,34 +321,42 @@ def run(agent_system: AgentSystem, agent: Agent, roster_instr: str, dataset: Pat res_lines = [f"- {c} (from {h})" for h, c in resources] or ["- (none)"] analysis_ctx = textwrap.dedent( - f"Dataset path: **{SANDBOX_DATA_PATH}**\nResources:\n" + "\n".join(res_lines) + "\n\nMetadata:\n" + json.dumps(metadata, indent=2) + f"Dataset path: **{SANDBOX_DATA_PATH}**\nResources:\n" + + "\n".join(res_lines) + + "\n\nMetadata:\n" + + json.dumps(metadata, indent=2) ) def build_system(a: Agent) -> str: - return roster_instr + "\n\n" + a.get_full_prompt(agent_system.global_policy) + "\n\n" + analysis_ctx + return ( + roster_instr + + "\n\n" + + a.get_full_prompt(agent_system.global_policy) + + "\n\n" + + analysis_ctx + ) history = [{"role": "system", "content": build_system(agent)}] - first_user = "Beginning interactive session. You can ask questions or give commands." 
- history.append({"role": "user", "content": first_user}) + history.append({"role": "user", "content": initial_user_message}) display(console, "system", history[0]["content"]) - display(console, "user", first_user) + display(console, "user", initial_user_message) if BACKEND_CHOICE == "chatgpt": - if not os.getenv("OPENAI_API_KEY"): - console.print("[red]OPENAI_API_KEY not set in .env") - sys.exit(1) openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) else: - # Local Ollama needs no key; model defaults to “llama2” openai = OpenAI(host=OLLAMA_HOST, model="deepseek-r1:70b") + current_agent = agent turn = 0 + tries_left = tries while True: turn += 1 console.print(f"\n[bold]OpenAI call (turn {turn})…") try: - resp = openai.chat.completions.create(model="gpt-4o", messages=history, temperature=0.7) + resp = openai.chat.completions.create( + model="gpt-4o", messages=history, temperature=0.7 + ) except APIError as e: console.print(f"[red]OpenAI error: {e}") break @@ -212,24 +370,23 @@ def build_system(a: Agent) -> str: new_agent = agent_system.get_agent(tgt) if new_agent: console.print(f"[yellow]🔄 Routing to '{tgt}' via {cmd}") - history.append({"role": "assistant", "content": f"🔄 Routing to **{tgt}** (command `{cmd}`)"}) - - # INJECT LOADED CODE SAMPLES ON DELEGATION --- + history.append( + {"role": "assistant", "content": f"🔄 Routing to **{tgt}** (command `{cmd}`)"} + ) if new_agent.code_samples: sample_context = "Here are some relevant code samples for your task:" for filename, code_content in new_agent.code_samples.items(): sample_context += f"\n\n--- Sample from: {filename} ---\n" sample_context += f"```python\n{code_content.strip()}\n```" - history.append({"role": "user", "content": sample_context}) - display(console, "user", sample_context) # Display for clarity - + display(console, "user", sample_context) current_agent = new_agent history.insert(0, {"role": "system", "content": build_system(new_agent)}) continue code = extract_python_code(msg) if code: + last_code_snippet = code console.print("[cyan]Executing code…[/cyan]") try: if is_exec_mode: @@ -241,158 +398,157 @@ def build_system(a: Agent) -> str: feedback = format_execute_response(exec_result, OUTPUTS_DIR) except Exception as exc: feedback = f"Code execution result:\n[Execution error on host: {exc}]" - history.append({"role": "user", "content": feedback}) display(console, "user", feedback) - - def input_loop(): - if benchmark_modules: - console.print("\n[bold]Next message (blank = continue, 'benchmark' to run benchmarks, 'exit' to quit):[/bold]") - else: - console.print("\n[bold]Next message (blank = continue, 'exit' to quit):[/bold]") - try: - user_in = input().strip() - except (EOFError, KeyboardInterrupt): - user_in = "exit" - if user_in.lower() in {"exit", "quit"}: - return "break" - if user_in.lower() == "benchmark" and benchmark_modules: - for benchmark_module in benchmark_modules: - run_benchmark(mgr, benchmark_module) - input_loop() # Recurse to continue the loop after benchmarks - if user_in: - history.append({"role": "user", "content": user_in}) - display(console, "user", user_in) - input_val = input_loop() - if input_val == "break": # User chose to exit - break + + # --- Mode-specific logic --- + if is_auto: + if benchmark_modules: # In auto mode, this is a list with 0 or 1 module + result_str = run_benchmark( + mgr, + benchmark_modules[0], + is_auto=True, + metadata=metadata, + agent_name=current_agent.name, + code_snippet=last_code_snippet, + ) + history.append({"role": "user", "content": result_str}) + display(console, 
"user", result_str) + + tries_left -= 1 + if tries_left <= 0: + console.print("[bold green]Auto run finished.[/bold green]") + break + history.append({"role": "user", "content": ""}) # Auto-continue + else: + # Interactive mode input loop + while True: + prompt_text = ( + "\n[bold]Next message (blank = continue, 'benchmark' to run, 'exit' to quit):[/bold]" + if benchmark_modules + else "\n[bold]Next message (blank = continue, 'exit' to quit):[/bold]" + ) + try: + user_input = Prompt.ask(prompt_text, default="").strip() + except (EOFError, KeyboardInterrupt): + user_input = "exit" + + if user_input.lower() in {"exit", "quit"}: + console.print("Stopping sandbox…") + mgr.stop_container() + return # Exit the entire run function + + if user_input.lower() == "benchmark": + if benchmark_modules: + for bm_module in benchmark_modules: + run_benchmark(mgr, bm_module, is_auto=False) + continue # Re-prompt after running benchmarks + else: + console.print("[yellow]No benchmark modules selected at startup.[/yellow]") + continue + + if user_input: + history.append({"role": "user", "content": user_input}) + display(console, "user", user_input) + break # Exit input loop and proceed to next agent turn console.print("Stopping sandbox…") mgr.stop_container() # =========================================================================== -# 4 · Benchmarking +# 5 · Mode-Specific Setup Functions # =========================================================================== - -def get_benchmark_modules(console: Console, parent_dir: Path) -> Optional[list[Path]]: - """ - Prompts the user to select a benchmark module from the available ones. - Returns the path to the selected module or None if no selection is made. - """ +def get_benchmark_modules(console: Console, parent_dir: Path) -> Optional[List[Path]]: + """Prompt user to select one or more benchmark modules for interactive mode.""" benchmark_dir = parent_dir / "auto_metrics" if not benchmark_dir.exists(): - console.print("[red]No benchmarks directory found.[/red]") return None - - module_names = list(benchmark_dir.glob("*.py")) - # remove AutoMetric.py from modules (it is the base class) - module_names = [m for m in module_names if m.name != "AutoMetric.py"] - if not module_names: - console.print("[red]No benchmark modules found.[/red]") + modules = [m for m in benchmark_dir.glob("*.py") if m.name != "AutoMetric.py"] + if not modules: return None - console.print("\n[bold]Available benchmark modules:[/bold]") - for i, mod in enumerate(module_names, start=1): + for i, mod in enumerate(modules, start=1): console.print(f"{i}. {mod.name}") - console.print(f"{len(module_names)+1}. Select All") - choices = Prompt.ask("Select benchmark modules by number (e.g. 1 2 3 or 1,2,3) (or press Enter to skip)", default="") - choices = re.split(r'[,\s]+', choices) #User input must be seperated by commas or spaces - - if not choices or choices == ['']: + console.print(f"{len(modules)+1}. Select All") + choices_str = Prompt.ask("Select modules (e.g., 1 2 or 1,2,3) (Enter to skip)", default="") + choices = re.split(r"[,|\s]+", choices_str.strip()) + if not choices or choices == [""]: return None - - modules = [] - for choice in choices: - try: - index = int(choice) - 1 - if index == len(module_names): #Handles select all case - return module_names - elif 0 <= index < len(module_names): - modules.append(module_names[index]) - else: - console.print("[red]Invalid selection.[/red]") - return None - except ValueError: - console.print("[red]Invalid input. 
Please enter a number.[/red]") - return None - return modules - -def run_benchmark(mgr, benchmark_module: str): - """ - Runs the benchmark module and displays the results. - """ - console.print(f"\n[bold cyan]Running benchmark module: {benchmark_module}[/bold cyan]") - autometric_base_path = benchmark_module.parent / "AutoMetric.py" + selected = [] try: - # Read the abstract base class definition - with open(autometric_base_path, "r") as f: - autometric_code = f.read() - - with open(benchmark_module, "r") as f: - benchmark_code = f.read() - except FileNotFoundError: - console.print(f"[red]Benchmark module not found at: {benchmark_module}[/red]") - return - - code_to_execute = f""" -# --- Code from AutoMetric.py --- -{autometric_code} -# --- Code from {benchmark_module.name} --- -{benchmark_code} -""" - console.print("[cyan]Executing benchmark code...[/cyan]") - try: - if is_exec_mode: - exec_result = mgr.exec_code(code_to_execute, timeout=300) - else: - exec_result = requests.post( - EXECUTE_ENDPOINT, json={"code": code_to_execute, "timeout": 300}, timeout=310 - ).json() - - # Create a table to display the results - table = Table(title="Benchmark Results") - table.add_column("Metric", style="cyan") - table.add_column("Value", style="magenta") - - # Assuming the benchmark module returns a dictionary of results - stdout = exec_result.get("stdout", "") - try: - result_dict = json.loads(stdout.strip().splitlines()[-1]) # Parse last printed line - except Exception as e: - console.print(f"[yellow]Warning: Could not parse JSON from stdout: {e}[/yellow]") - result_dict = {} - - if exec_result.get("status") == "ok" and isinstance(result_dict, dict): - for key, value in result_dict.items(): - table.add_row(str(key), str(value)) - else: - table.add_row("Error", exec_result.get("stderr") or "An unknown error occurred.") + for choice in choices: + if not choice: continue + index = int(choice) - 1 + if index == len(modules): return modules # Select All + if 0 <= index < len(modules): selected.append(modules[index]) + except (ValueError, IndexError): + console.print("[red]Invalid selection.[/red]") + return None + return selected - console.print(table) - - except Exception as exc: - console.print(f"[red]Benchmark execution error: {exc}[/red]") # =========================================================================== -# 4 · Entry point +# 6 · Entry Point # =========================================================================== - def main(): + """Main entry point to parse args and start the correct mode.""" + parser = argparse.ArgumentParser( + description="Interactive or Automated Agent System Tester.", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument("--auto", action="store_true", help="Run in automated benchmark mode.") + args = parser.parse_args() + load_dotenv(ENV_FILE) - if not os.getenv("OPENAI_API_KEY"): - console.print("[red]OPENAI_API_KEY not set in .env") + if BACKEND_CHOICE == "chatgpt" and not os.getenv("OPENAI_API_KEY"): + console.print("[red]OPENAI_API_KEY not set in .env[/red]") sys.exit(1) - sys, drv, roster = load_agent_system() + sys_, drv, roster = load_agent_system() dp, meta = select_dataset(console, DATASETS_DIR) - benchmark_modules = get_benchmark_modules(console, PARENT_DIR) res = collect_resources(console, SANDBOX_RESOURCES_DIR) - run(sys, drv, roster, dp, meta, res, benchmark_modules) + + if args.auto: + console.print("[bold green]🚀 Running in Automated Mode...[/bold green]") + benchmark_module = get_benchmark_modules(console, PARENT_DIR) + 
initial_user_message = Prompt.ask("Initial user message", default="What should I do with this dataset?") + try: + tries = int(Prompt.ask("Number of automatic turns", default="1")) + if tries <= 0: raise ValueError + except ValueError: + console.print("[yellow]Invalid number – defaulting to 1.[/yellow]") + tries = 1 + run( + agent_system=sys_, + agent=drv, + roster_instr=roster, + dataset=dp, + metadata=meta, + resources=res, + is_auto=True, + initial_user_message=initial_user_message, + benchmark_modules=benchmark_module[:1] if benchmark_module else [], # get_benchmark_modules returns a list; auto mode runs at most the first selected module + tries=tries, + ) + else: + console.print("[bold blue]🚀 Running in Interactive Mode...[/bold blue]") + benchmark_modules = get_benchmark_modules(console, PARENT_DIR) + run( + agent_system=sys_, + agent=drv, + roster_instr=roster, + dataset=dp, + metadata=meta, + resources=res, + is_auto=False, + initial_user_message="Beginning interactive session. You can ask questions or give commands.", + benchmark_modules=benchmark_modules, + ) if __name__ == "__main__": try: main() except KeyboardInterrupt: - console.print("\nInterrupted.") + console.print("\nInterrupted by user. Exiting.") \ No newline at end of file diff --git a/cli/run_automated.sh b/cli/run_automated.sh index 12aed66..db05b05 100755 --- a/cli/run_automated.sh +++ b/cli/run_automated.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash # move *out* of cli/ into its parent (Olaf/) cd "$(dirname "$0")"/.. -python -m cli.prompt_testing.MultiAgentAutoTester "$@" \ No newline at end of file +python -m cli.prompt_testing.MultiAgentTester --auto "$@" \ No newline at end of file From e6df395e58746931cc7a669c835ca50b17413cd4 Mon Sep 17 00:00:00 2001 From: djriffle Date: Wed, 13 Aug 2025 11:12:20 -0400 Subject: [PATCH 03/14] moving agent system into CLI --- cli/olaf/pyproject.toml | 41 +++++++++++++++++++ cli/{agents => olaf/src/olaf}/__init__.py | 0 cli/{ => olaf/src/olaf}/agents/AgentSystem.py | 0 .../src/olaf/agents}/__init__.py | 0 .../src/olaf}/agents/create_agent_system.py | 0 .../src/olaf}/agents/integration_system.json | 0 .../src/olaf}/agents/system_blueprint.json | 0 cli/olaf/src/olaf/core/__init__.py | 0 cli/{ => olaf/src/olaf}/core/io_helpers.py | 0 .../src/olaf}/core/ollama_wrapper.py | 0 .../src/olaf}/core/sandbox_management.py | 0 11 files changed, 41 insertions(+) create mode 100644 cli/olaf/pyproject.toml rename cli/{agents => olaf/src/olaf}/__init__.py (100%) rename cli/{ => olaf/src/olaf}/agents/AgentSystem.py (100%) rename cli/{core => olaf/src/olaf/agents}/__init__.py (100%) rename cli/{ => olaf/src/olaf}/agents/create_agent_system.py (100%) rename cli/{ => olaf/src/olaf}/agents/integration_system.json (100%) rename cli/{ => olaf/src/olaf}/agents/system_blueprint.json (100%) create mode 100644 cli/olaf/src/olaf/core/__init__.py rename cli/{ => olaf/src/olaf}/core/io_helpers.py (100%) rename cli/{ => olaf/src/olaf}/core/ollama_wrapper.py (100%) rename cli/{ => olaf/src/olaf}/core/sandbox_management.py (100%) diff --git a/cli/olaf/pyproject.toml b/cli/olaf/pyproject.toml new file mode 100644 index 0000000..e72214f --- /dev/null +++ b/cli/olaf/pyproject.toml @@ -0,0 +1,41 @@ +[build-system] +requires = ["hatchling>=1.24"] +build-backend = "hatchling.build" + +[project] +name = "olaf" +version = "0.1.0" +description = "CLI + library for " +readme = "README.md" +requires-python = ">=3.10" +authors = [{ name = "Your Name", email = "you@example.com" }] + +# Core runtime deps (PyPI names!) +dependencies = [ + "cellxgene-census", + "tiledbsoma", + "rich", + "numpy", + "docker", # a.k.a. 
docker-py + "python-dotenv", # NOTE: PyPI name is 'python-dotenv' (import is 'dotenv') + "openai", + "jupyter-client", # NOTE: PyPI name has a hyphen + "nbformat" +] + +# If you want a command like `olaf …` +[project.scripts] +olaf = "olaf.cli.main:main" + +# Optional extras (install with: pip install .[dev]) +[project.optional-dependencies] +dev = [ + "pytest", + "ruff", + "mypy", + "ipykernel" +] + +# If you’re using a src/ layout, tell Hatch where packages live. +[tool.hatch.build.targets.wheel] +packages = ["src/olaf"] \ No newline at end of file diff --git a/cli/agents/__init__.py b/cli/olaf/src/olaf/__init__.py similarity index 100% rename from cli/agents/__init__.py rename to cli/olaf/src/olaf/__init__.py diff --git a/cli/agents/AgentSystem.py b/cli/olaf/src/olaf/agents/AgentSystem.py similarity index 100% rename from cli/agents/AgentSystem.py rename to cli/olaf/src/olaf/agents/AgentSystem.py diff --git a/cli/core/__init__.py b/cli/olaf/src/olaf/agents/__init__.py similarity index 100% rename from cli/core/__init__.py rename to cli/olaf/src/olaf/agents/__init__.py diff --git a/cli/agents/create_agent_system.py b/cli/olaf/src/olaf/agents/create_agent_system.py similarity index 100% rename from cli/agents/create_agent_system.py rename to cli/olaf/src/olaf/agents/create_agent_system.py diff --git a/cli/agents/integration_system.json b/cli/olaf/src/olaf/agents/integration_system.json similarity index 100% rename from cli/agents/integration_system.json rename to cli/olaf/src/olaf/agents/integration_system.json diff --git a/cli/agents/system_blueprint.json b/cli/olaf/src/olaf/agents/system_blueprint.json similarity index 100% rename from cli/agents/system_blueprint.json rename to cli/olaf/src/olaf/agents/system_blueprint.json diff --git a/cli/olaf/src/olaf/core/__init__.py b/cli/olaf/src/olaf/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cli/core/io_helpers.py b/cli/olaf/src/olaf/core/io_helpers.py similarity index 100% rename from cli/core/io_helpers.py rename to cli/olaf/src/olaf/core/io_helpers.py diff --git a/cli/core/ollama_wrapper.py b/cli/olaf/src/olaf/core/ollama_wrapper.py similarity index 100% rename from cli/core/ollama_wrapper.py rename to cli/olaf/src/olaf/core/ollama_wrapper.py diff --git a/cli/core/sandbox_management.py b/cli/olaf/src/olaf/core/sandbox_management.py similarity index 100% rename from cli/core/sandbox_management.py rename to cli/olaf/src/olaf/core/sandbox_management.py From 23106546e01b5441be40c83b516992cf4b2c2d7c Mon Sep 17 00:00:00 2001 From: djriffle Date: Wed, 13 Aug 2025 11:53:38 -0400 Subject: [PATCH 04/14] Organized datasets and agent cli tooling --- .../extra_tools/Evaluator.py | 0 .../extra_tools/InteractiveAgentTester.py | 0 .../extra_tools/OneShotAgentTester.py | 0 .../extra_tools/PromptEvolver.py | 0 cli/olaf/README.md | 0 cli/olaf/pyproject.toml | 3 +- .../src/olaf/agents/create_agent_system.py | 142 ++++---- .../src/olaf}/auto_metrics/AutoMetric.py | 0 .../src/olaf}/auto_metrics/CellCountMetric.py | 0 .../olaf}/auto_metrics/CellTypingMetric.py | 0 .../olaf}/auto_metrics/IntegrationMetrics.py | 0 cli/olaf/src/olaf/cli/__init__.py | 0 cli/olaf/src/olaf/cli/create_agent_cli.py | 107 ++++++ cli/olaf/src/olaf/cli/datasets_cli.py | 72 ++++ cli/olaf/src/olaf/cli/main.py | 27 ++ .../src/olaf}/code_samples/Celltyping.py | 0 .../src/olaf}/code_samples/DataCheck.py | 0 .../src/olaf}/code_samples/Doublets.py | 0 .../src/olaf}/code_samples/Downstream.py | 0 .../olaf}/code_samples/Integrate_Harmony.py | 0 
.../src/olaf}/code_samples/Integrate_scvi.py | 0 cli/{ => olaf/src/olaf}/code_samples/MAD.py | 0 .../src/olaf}/code_samples/QC_Inspection.py | 0 .../code_samples/QC_umap_visualization.py | 0 .../olaf}/code_samples/Re-analysis_afterQC.py | 0 .../src/olaf}/code_samples/load_adata.py | 0 cli/olaf/src/olaf/datasets/czi_datasets.py | 323 ++++++++++++++++++ ...anscriptomics_in_mouse_puck_191109_14.h5ad | Bin ...anscriptomics_in_mouse_puck_191109_14.json | 0 ...s_scrna-seq_atlas_-_myeloid_p2_subset.h5ad | Bin ...s_scrna-seq_atlas_-_myeloid_p2_subset.json | 0 .../Basic_scRNA_Agent.txt | 165 --------- 32 files changed, 594 insertions(+), 245 deletions(-) rename cli/{prompt_testing => }/extra_tools/Evaluator.py (100%) rename cli/{prompt_testing => }/extra_tools/InteractiveAgentTester.py (100%) rename cli/{prompt_testing => }/extra_tools/OneShotAgentTester.py (100%) rename cli/{prompt_testing => }/extra_tools/PromptEvolver.py (100%) create mode 100644 cli/olaf/README.md rename cli/{ => olaf/src/olaf}/auto_metrics/AutoMetric.py (100%) rename cli/{ => olaf/src/olaf}/auto_metrics/CellCountMetric.py (100%) rename cli/{ => olaf/src/olaf}/auto_metrics/CellTypingMetric.py (100%) rename cli/{ => olaf/src/olaf}/auto_metrics/IntegrationMetrics.py (100%) create mode 100644 cli/olaf/src/olaf/cli/__init__.py create mode 100644 cli/olaf/src/olaf/cli/create_agent_cli.py create mode 100644 cli/olaf/src/olaf/cli/datasets_cli.py create mode 100644 cli/olaf/src/olaf/cli/main.py rename cli/{ => olaf/src/olaf}/code_samples/Celltyping.py (100%) rename cli/{ => olaf/src/olaf}/code_samples/DataCheck.py (100%) rename cli/{ => olaf/src/olaf}/code_samples/Doublets.py (100%) rename cli/{ => olaf/src/olaf}/code_samples/Downstream.py (100%) rename cli/{ => olaf/src/olaf}/code_samples/Integrate_Harmony.py (100%) rename cli/{ => olaf/src/olaf}/code_samples/Integrate_scvi.py (100%) rename cli/{ => olaf/src/olaf}/code_samples/MAD.py (100%) rename cli/{ => olaf/src/olaf}/code_samples/QC_Inspection.py (100%) rename cli/{ => olaf/src/olaf}/code_samples/QC_umap_visualization.py (100%) rename cli/{ => olaf/src/olaf}/code_samples/Re-analysis_afterQC.py (100%) rename cli/{ => olaf/src/olaf}/code_samples/load_adata.py (100%) create mode 100644 cli/olaf/src/olaf/datasets/czi_datasets.py rename cli/{ => olaf/src/olaf}/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.h5ad (100%) rename cli/{ => olaf/src/olaf}/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.json (100%) rename cli/{ => olaf/src/olaf}/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad (100%) rename cli/{ => olaf/src/olaf}/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json (100%) delete mode 100644 cli/sample_prompt_library/Basic_scRNA_Agent.txt diff --git a/cli/prompt_testing/extra_tools/Evaluator.py b/cli/extra_tools/Evaluator.py similarity index 100% rename from cli/prompt_testing/extra_tools/Evaluator.py rename to cli/extra_tools/Evaluator.py diff --git a/cli/prompt_testing/extra_tools/InteractiveAgentTester.py b/cli/extra_tools/InteractiveAgentTester.py similarity index 100% rename from cli/prompt_testing/extra_tools/InteractiveAgentTester.py rename to cli/extra_tools/InteractiveAgentTester.py diff --git a/cli/prompt_testing/extra_tools/OneShotAgentTester.py b/cli/extra_tools/OneShotAgentTester.py similarity index 100% rename from cli/prompt_testing/extra_tools/OneShotAgentTester.py rename to cli/extra_tools/OneShotAgentTester.py diff --git a/cli/prompt_testing/extra_tools/PromptEvolver.py b/cli/extra_tools/PromptEvolver.py similarity index 100% 
rename from cli/prompt_testing/extra_tools/PromptEvolver.py rename to cli/extra_tools/PromptEvolver.py diff --git a/cli/olaf/README.md b/cli/olaf/README.md new file mode 100644 index 0000000..e69de29 diff --git a/cli/olaf/pyproject.toml b/cli/olaf/pyproject.toml index e72214f..f8587f1 100644 --- a/cli/olaf/pyproject.toml +++ b/cli/olaf/pyproject.toml @@ -20,7 +20,8 @@ dependencies = [ "python-dotenv", # NOTE: PyPI name is 'python-dotenv' (import is 'dotenv') "openai", "jupyter-client", # NOTE: PyPI name has a hyphen - "nbformat" + "nbformat", + "typer" ] # If you want a command like `olaf …` diff --git a/cli/olaf/src/olaf/agents/create_agent_system.py b/cli/olaf/src/olaf/agents/create_agent_system.py index f8fadc6..0102ee1 100644 --- a/cli/olaf/src/olaf/agents/create_agent_system.py +++ b/cli/olaf/src/olaf/agents/create_agent_system.py @@ -2,29 +2,37 @@ import os from typing import Dict, Any from pathlib import Path +from platformdirs import PlatformDirs # pip install platformdirs +import tempfile + +APP_NAME = "olaf" +APP_AUTHOR = "OpenTechBio" # or your org +dirs = PlatformDirs(APP_NAME, APP_AUTHOR) + +# Root for user-specific OLAF files. Precedence: env -> platformdirs. +OLAF_HOME = Path(os.environ.get("OLAF_HOME", dirs.user_data_dir)).expanduser() + +# Subfolders we manage +DEFAULT_AGENT_DIR = OLAF_HOME / "agent_systems" +DEFAULT_SAMPLES_DIR = OLAF_HOME / "code_samples" # A simple class to hold ANSI color codes for terminal output class Colors: - """A class to hold ANSI color codes for terminal output.""" - HEADER = '\033[95m' # Magenta - OKBLUE = '\033[94m' # Blue - OKCYAN = '\033[96m' # Cyan - OKGREEN = '\033[92m' # Green - WARNING = '\033[93m' # Yellow - FAIL = '\033[91m' # Red - ENDC = '\033[0m' # Reset to default - BOLD = '\033[1m' - UNDERLINE = '\033[4m' + HEADER = '\033[95m'; OKBLUE = '\033[94m'; OKCYAN = '\033[96m' + OKGREEN = '\033[92m'; WARNING = '\033[93m'; FAIL = '\033[91m' + ENDC = '\033[0m'; BOLD = '\033[1m'; UNDERLINE = '\033[4m' -# Define the directory where code samples are stored -CODE_SAMPLES_DIR = Path("cli/code_samples") +# Prefer the user’s code_samples under OLAF_HOME; fall back to repo relative path if empty. +REPO_SAMPLES_DIR = Path("cli/code_samples") +CODE_SAMPLES_DIR = DEFAULT_SAMPLES_DIR if (DEFAULT_SAMPLES_DIR.exists() and any(DEFAULT_SAMPLES_DIR.glob("*.py"))) else REPO_SAMPLES_DIR + +def ensure_dir(path: Path) -> Path: + path.mkdir(parents=True, exist_ok=True) + return path def define_global_policy() -> str: - """Asks the user to define a global policy for all agents.""" print(f"\n{Colors.OKBLUE}--- Global Policy Definition ---{Colors.ENDC}") - print("First, let's define a global policy. This is a set of general guidelines that all agents should follow.") - policy_prompt = f"{Colors.WARNING}Enter the global policy text (e.g., 'Always be concise and professional'): {Colors.ENDC}" - policy = input(policy_prompt).strip() + policy = input(f"{Colors.WARNING}Enter the global policy text (e.g., 'Always be concise and professional'): {Colors.ENDC}").strip() if not policy: print(f"{Colors.OKCYAN}No global policy provided. 
Proceeding without one.{Colors.ENDC}") return "" @@ -32,57 +40,54 @@ def define_global_policy() -> str: return policy def get_output_directory() -> str: - """Asks the user for an output directory, with a default option.""" - default_dir = "cli/agent_systems" - dir_prompt = f"{Colors.WARNING}Enter the output directory (press Enter to use '{default_dir}'): {Colors.ENDC}" - user_input = input(dir_prompt).strip() - return user_input or default_dir + """ + Ask the user for an output directory. Default is a user data directory: + - $OLAF_HOME/agent_systems if OLAF_HOME is set + - otherwise platformdirs user_data_dir, e.g.: + macOS: ~/Library/Application Support/olaf/agent_systems + Linux: ~/.local/share/olaf/agent_systems + Windows: %APPDATA%/olaf/agent_systems + """ + default_dir = str(DEFAULT_AGENT_DIR) + prompt = f"{Colors.WARNING}Enter the output directory (press Enter to use '{default_dir}'): {Colors.ENDC}" + user_input = input(prompt).strip() + out = Path(user_input or default_dir).expanduser() + ensure_dir(out) + return str(out) def define_agents() -> Dict[str, Dict[str, Any]]: - """Guides the user through defining all agents and their prompts.""" agents = {} print(f"\n{Colors.OKBLUE}--- Agent Definition ---{Colors.ENDC}") print("Now, let's define your agents. Type 'done' when you have no more agents to add.") - while True: - prompt_text = f"\n{Colors.WARNING}Enter a unique name for the agent (e.g., 'master_agent') or 'done': {Colors.ENDC}" - agent_name = input(prompt_text).strip() - + agent_name = input(f"\n{Colors.WARNING}Enter a unique name for the agent (e.g., 'master_agent') or 'done': {Colors.ENDC}").strip() if agent_name.lower() == 'done': if not agents: print(f"{Colors.FAIL}No agents defined. Exiting.{Colors.ENDC}") return {} break - if not agent_name: print(f"{Colors.FAIL}Agent name cannot be empty. Please try again.{Colors.ENDC}") continue - if agent_name in agents: print(f"{Colors.FAIL}Agent '{agent_name}' already exists. Please use a unique name.{Colors.ENDC}") continue - prompt = input(f"{Colors.WARNING}Enter the system prompt for '{Colors.OKCYAN}{agent_name}{Colors.WARNING}': {Colors.ENDC}").strip() - # Initialize agent with an empty list for code samples agents[agent_name] = {"prompt": prompt, "neighbors": {}, "code_samples": []} print(f"{Colors.OKGREEN}Agent '{Colors.OKCYAN}{agent_name}{Colors.OKGREEN}' added successfully.{Colors.ENDC}") - print(f"\n{Colors.OKBLUE}--- All Agents Defined ---{Colors.ENDC}") for name in agents: print(f"- {Colors.OKCYAN}{name}{Colors.ENDC}") return agents def connect_agents(agents: Dict[str, Dict[str, Any]]) -> None: - """Guides the user through connecting agents to each other.""" print(f"\n{Colors.OKBLUE}--- Agent Connection ---{Colors.ENDC}") print("Now, let's define the connections (neighbors) between agents.") print("Type 'done' at any point to finish connecting agents.") - agent_names = list(agents.keys()) if len(agent_names) < 2: print(f"{Colors.WARNING}You need at least two agents to create a connection. 
Skipping this step.{Colors.ENDC}") return - while True: print(f"\n{Colors.BOLD}Select the agent that will delegate the task (source agent).{Colors.ENDC}") for i, name in enumerate(agent_names): @@ -115,91 +120,70 @@ def connect_agents(agents: Dict[str, Dict[str, Any]]) -> None: "target_agent": target_agent_name, "description": description } - print(f"{Colors.OKGREEN}Successfully connected '{Colors.OKCYAN}{source_agent_name}{Colors.OKGREEN}' to '{Colors.OKCYAN}{target_agent_name}{Colors.OKGREEN}' via '{delegation_command}'.{Colors.ENDC}") + print(f"{Colors.OKGREEN}Connected '{Colors.OKCYAN}{source_agent_name}{Colors.OKGREEN}' → '{Colors.OKCYAN}{target_agent_name}{Colors.OKGREEN}' via '{delegation_command}'.{Colors.ENDC}") def assign_code_samples(agents: Dict[str, Dict[str, Any]]) -> None: - """Interactively assign code sample files to agents.""" print(f"\n{Colors.OKBLUE}--- Code Sample Assignment ---{Colors.ENDC}") - - # Ensure the code samples directory exists - CODE_SAMPLES_DIR.mkdir(exist_ok=True, parents=True) - + ensure_dir(CODE_SAMPLES_DIR) try: sample_files = [f.name for f in CODE_SAMPLES_DIR.glob("*.py")] except Exception as e: print(f"{Colors.FAIL}Could not read code samples directory: {e}{Colors.ENDC}") return - if not sample_files: print(f"{Colors.WARNING}No code samples found in '{CODE_SAMPLES_DIR}'. Skipping assignment.{Colors.ENDC}") - print(f"You can add `.py` files to this directory to make them available.") + print("You can add `.py` files there to make them available.") return - for agent_name, agent_data in agents.items(): while True: - assign_prompt = f"\n{Colors.WARNING}Assign code samples to agent '{Colors.OKCYAN}{agent_name}{Colors.WARNING}'? (y/n): {Colors.ENDC}" - if input(assign_prompt).strip().lower() != 'y': + if input(f"\n{Colors.WARNING}Assign code samples to '{Colors.OKCYAN}{agent_name}{Colors.WARNING}'? (y/n): {Colors.ENDC}").strip().lower() != 'y': break - - print(f"{Colors.BOLD}Available code samples:{Colors.ENDC}") + print(f"{Colors.BOLD}Available code samples from {CODE_SAMPLES_DIR}:{Colors.ENDC}") for i, filename in enumerate(sample_files): print(f" {i + 1}: {Colors.OKCYAN}{filename}{Colors.ENDC}") - - choice_prompt = f"{Colors.WARNING}Enter a number to add a sample, or type 'done': {Colors.ENDC}" - choice = input(choice_prompt).strip().lower() - + choice = input(f"{Colors.WARNING}Enter a number to add a sample, or type 'done': {Colors.ENDC}").strip().lower() if choice == 'done': break - try: index = int(choice) - 1 - if not 0 <= index < len(sample_files): - raise ValueError - + if not 0 <= index < len(sample_files): raise ValueError chosen_file = sample_files[index] if chosen_file not in agent_data["code_samples"]: agent_data["code_samples"].append(chosen_file) print(f"{Colors.OKGREEN}Assigned '{chosen_file}' to '{agent_name}'.{Colors.ENDC}") else: print(f"{Colors.WARNING}'{chosen_file}' is already assigned to this agent.{Colors.ENDC}") - except (ValueError, IndexError): print(f"{Colors.FAIL}Invalid selection. 
Please enter a valid number.{Colors.ENDC}") +def _atomic_write_json(obj: Any, path: Path) -> None: + """Write JSON atomically: write to a tmp file in the same dir, then replace.""" + ensure_dir(path.parent) + with tempfile.NamedTemporaryFile("w", delete=False, dir=str(path.parent), prefix=path.stem, suffix=".tmp") as tmp: + json.dump(obj, tmp, indent=2) + tmp_path = Path(tmp.name) + tmp_path.replace(path) # atomic on POSIX; safe on Windows + def save_configuration(global_policy: str, agents_config: Dict[str, Any], output_dir: str) -> None: - """Saves the final configuration, including the global policy, to a JSON file.""" if not agents_config: - return - - final_structure = { - "global_policy": global_policy, - "agents": agents_config - } - - os.makedirs(output_dir, exist_ok=True) - - filename_prompt = f"\n{Colors.WARNING}Enter a filename for your agent system (e.g., 'my_research_team.json'): {Colors.ENDC}" - filename = input(filename_prompt).strip() - if not filename.endswith('.json'): - filename += '.json' - - file_path = os.path.join(output_dir, filename) - + return + final_structure = {"global_policy": global_policy, "agents": agents_config} + ensure_dir(Path(output_dir)) + filename = input(f"\n{Colors.WARNING}Enter a filename for your agent system (e.g., 'my_research_team.json'): {Colors.ENDC}").strip() + if not filename.endswith(".json"): + filename += ".json" + file_path = Path(output_dir).expanduser() / filename try: - with open(file_path, 'w') as f: - json.dump(final_structure, f, indent=2) + _atomic_write_json(final_structure, file_path) print(f"\n{Colors.OKGREEN}{Colors.BOLD}Success! Agent configuration saved to: {file_path}{Colors.ENDC}") - except IOError as e: + except OSError as e: print(f"\n{Colors.FAIL}Error: Could not save the file. 
{e}{Colors.ENDC}") def main(): - """Main function to run the interactive agent builder.""" print(f"{Colors.HEADER}{Colors.BOLD}--- Welcome to the Interactive Agent Configuration Builder ---{Colors.ENDC}") - global_policy_text = define_global_policy() output_directory = get_output_directory() agents_data = define_agents() - if agents_data: connect_agents(agents_data) assign_code_samples(agents_data) diff --git a/cli/auto_metrics/AutoMetric.py b/cli/olaf/src/olaf/auto_metrics/AutoMetric.py similarity index 100% rename from cli/auto_metrics/AutoMetric.py rename to cli/olaf/src/olaf/auto_metrics/AutoMetric.py diff --git a/cli/auto_metrics/CellCountMetric.py b/cli/olaf/src/olaf/auto_metrics/CellCountMetric.py similarity index 100% rename from cli/auto_metrics/CellCountMetric.py rename to cli/olaf/src/olaf/auto_metrics/CellCountMetric.py diff --git a/cli/auto_metrics/CellTypingMetric.py b/cli/olaf/src/olaf/auto_metrics/CellTypingMetric.py similarity index 100% rename from cli/auto_metrics/CellTypingMetric.py rename to cli/olaf/src/olaf/auto_metrics/CellTypingMetric.py diff --git a/cli/auto_metrics/IntegrationMetrics.py b/cli/olaf/src/olaf/auto_metrics/IntegrationMetrics.py similarity index 100% rename from cli/auto_metrics/IntegrationMetrics.py rename to cli/olaf/src/olaf/auto_metrics/IntegrationMetrics.py diff --git a/cli/olaf/src/olaf/cli/__init__.py b/cli/olaf/src/olaf/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cli/olaf/src/olaf/cli/create_agent_cli.py b/cli/olaf/src/olaf/cli/create_agent_cli.py new file mode 100644 index 0000000..4b970a1 --- /dev/null +++ b/cli/olaf/src/olaf/cli/create_agent_cli.py @@ -0,0 +1,107 @@ +# src/olaf/cli/create_agent.py +from __future__ import annotations + +import os +import json +import tempfile +from pathlib import Path +import typer + +from olaf.agents.create_agent_system import ( + DEFAULT_AGENT_DIR, + DEFAULT_SAMPLES_DIR, + OLAF_HOME, + define_global_policy, + define_agents, + connect_agents, + assign_code_samples, + save_configuration, + Colors, +) + +# Initialize Typer. `no_args_is_help=False` allows the callback to run by default. +app = typer.Typer( + no_args_is_help=False, + help="Create OLAF agent systems. Defaults to interactive mode." +) + +def _run_interactive(output_dir: str, code_samples_dir: str): + """ + The actual logic for the interactive agent system builder. + """ + os.environ.setdefault("OLAF_HOME", str(OLAF_HOME)) + + print(f"{Colors.HEADER}{Colors.BOLD}--- OLAF: Create Agent System (Interactive) ---{Colors.ENDC}") + print(f"Using output directory: {output_dir}") + print(f"Using code samples dir: {code_samples_dir}") + + global_policy_text = define_global_policy() + agents_data = define_agents() + if agents_data: + connect_agents(agents_data) + assign_code_samples(agents_data) + save_configuration(global_policy_text, agents_data, output_dir) + + +@app.callback(invoke_without_command=True) +def main( + ctx: typer.Context, + output_dir: str = typer.Option( + str(DEFAULT_AGENT_DIR), + "--output-dir", + "-o", + help="Where to save the resulting JSON.", + show_default=True, + ), + code_samples_dir: str = typer.Option( + str(DEFAULT_SAMPLES_DIR), + "--code-samples-dir", + help="Where to look for code samples by default.", + show_default=True, + ), +): + """ + Manages agent system creation. + + If no subcommand (like 'quick') is provided, this tool runs in + interactive mode. + """ + # If a subcommand was not invoked, run the default interactive mode. 
+ if ctx.invoked_subcommand is None: + _run_interactive(output_dir=output_dir, code_samples_dir=code_samples_dir) + + +@app.command("quick") +def quick( + name: str = typer.Option(..., "--name", "-n", help="Filename (without .json) for the agent system."), + policy: str = typer.Option("", "--policy", help="Optional global policy text."), + output_dir: str = typer.Option( + str(DEFAULT_AGENT_DIR), + "--output-dir", + "-o", + help="Where to save the resulting JSON.", + show_default=True, + ), +): + """ + Create a minimal agent system non-interactively. + """ + from typing import Any, Dict + agents: Dict[str, Any] = {} + final_structure = {"global_policy": policy, "agents": agents} + + # Ensure the output directory exists + output_path = Path(output_dir).expanduser() + output_path.mkdir(parents=True, exist_ok=True) + path = output_path / f"{name}.json" + + # Use an atomic write to prevent corrupted files + try: + with tempfile.NamedTemporaryFile("w", delete=False, dir=str(path.parent), prefix=path.stem, suffix=".tmp") as tmp: + json.dump(final_structure, tmp, indent=2) + tmp_path = Path(tmp.name) + tmp_path.replace(path) + typer.secho(f"Created {path}", fg=typer.colors.GREEN) + except OSError as e: + typer.secho(f"Error creating file: {e}", fg=typer.colors.RED) + raise typer.Exit(code=1) from e \ No newline at end of file diff --git a/cli/olaf/src/olaf/cli/datasets_cli.py b/cli/olaf/src/olaf/cli/datasets_cli.py new file mode 100644 index 0000000..36d2d97 --- /dev/null +++ b/cli/olaf/src/olaf/cli/datasets_cli.py @@ -0,0 +1,72 @@ +# cli/olaf/src/olaf/cli/datasets_cli.py + +import typer +from typing_extensions import Annotated + +# Import the logic functions from our other file +import olaf.datasets.czi_datasets as datasets + +try: + from rich.console import Console + HAS_RICH = True +except ImportError: + HAS_RICH = False + +# Create a Typer app for the "datasets" subcommand group +datasets_app = typer.Typer( + name="datasets", + help="Browse and download datasets from the CZI CELLxGENE Census. Defaults to interactive mode.", + no_args_is_help=False # Allows our callback to run +) + +@datasets_app.callback(invoke_without_command=True) +def datasets_main(ctx: typer.Context): + """ + If no subcommand is specified, enter interactive mode. + """ + if ctx.invoked_subcommand is None: + console = datasets.Console() + console.print("No subcommand given. Starting interactive CZI Census browser...") + # Ensure dependencies for interactive mode are checked + try: + import numpy + except ImportError: + console.print("[bold red]Error: 'numpy' is required. 
Please 'pip install numpy'.[/bold red]") + raise typer.Exit(1) + datasets.interactive_loop() + +@datasets_app.command("list-versions") +def list_versions(): + """List available CELLxGENE Census versions.""" + datasets.display_versions_list(datasets.Console()) + +@datasets_app.command("list-datasets") +def list_datasets( + version: Annotated[str, typer.Option(help='Census version tag (e.g., "stable", "latest").')], + limit: Annotated[int, typer.Option(help="Max number of datasets to paginate through.")] = None, + page_size: Annotated[int, typer.Option(help="Number of datasets per page.")] = 5, +): + """List source datasets within a specific Census version.""" + datasets.display_paginated_datasets(datasets.Console(), version, limit, page_size) + +@datasets_app.command("show-metadata") +def show_metadata( + version: Annotated[str, typer.Option(help='Census version tag (e.g., "stable").')], + dataset_id: Annotated[str, typer.Option(help="The dataset_id to view.")], +): + """Show all metadata for a specific source dataset.""" + datasets.display_dataset_metadata(datasets.Console(), version, dataset_id) + +@datasets_app.command("download") +def download( + version: Annotated[str, typer.Option(help='Census version tag (e.g., "stable").')], + dataset_id: Annotated[str, typer.Option(help="The dataset_id to download.")], +): + """Download a dataset's H5AD file and metadata JSON.""" + console = datasets.Console() + try: + import numpy + except ImportError: + console.print("[bold red]Error: 'numpy' is required for this command. Please 'pip install numpy'.[/bold red]") + raise typer.Exit(1) + datasets.download_dataset(console, version, dataset_id) \ No newline at end of file diff --git a/cli/olaf/src/olaf/cli/main.py b/cli/olaf/src/olaf/cli/main.py new file mode 100644 index 0000000..d34a922 --- /dev/null +++ b/cli/olaf/src/olaf/cli/main.py @@ -0,0 +1,27 @@ +# cli/olaf/src/olaf/cli/main.py + +import typer + +# Import the app for the 'create-system' command +from .create_agent_cli import app as create_system_app + +# Import the app for the new 'datasets' command +from .datasets_cli import datasets_app + +# Main OLAF application +app = typer.Typer( + name="olaf", + help="OLAF: The Open-source Language Agent Framework", + no_args_is_help=True ) + +# Register the command groups +app.add_typer(create_system_app, name="create-system") +app.add_typer(datasets_app, name="datasets") + + +def main(): + app() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cli/code_samples/Celltyping.py b/cli/olaf/src/olaf/code_samples/Celltyping.py similarity index 100% rename from cli/code_samples/Celltyping.py rename to cli/olaf/src/olaf/code_samples/Celltyping.py diff --git a/cli/code_samples/DataCheck.py b/cli/olaf/src/olaf/code_samples/DataCheck.py similarity index 100% rename from cli/code_samples/DataCheck.py rename to cli/olaf/src/olaf/code_samples/DataCheck.py diff --git a/cli/code_samples/Doublets.py b/cli/olaf/src/olaf/code_samples/Doublets.py similarity index 100% rename from cli/code_samples/Doublets.py rename to cli/olaf/src/olaf/code_samples/Doublets.py diff --git a/cli/code_samples/Downstream.py b/cli/olaf/src/olaf/code_samples/Downstream.py similarity index 100% rename from cli/code_samples/Downstream.py rename to cli/olaf/src/olaf/code_samples/Downstream.py diff --git a/cli/code_samples/Integrate_Harmony.py b/cli/olaf/src/olaf/code_samples/Integrate_Harmony.py similarity index 100% rename from cli/code_samples/Integrate_Harmony.py rename to 
cli/olaf/src/olaf/code_samples/Integrate_Harmony.py diff --git a/cli/code_samples/Integrate_scvi.py b/cli/olaf/src/olaf/code_samples/Integrate_scvi.py similarity index 100% rename from cli/code_samples/Integrate_scvi.py rename to cli/olaf/src/olaf/code_samples/Integrate_scvi.py diff --git a/cli/code_samples/MAD.py b/cli/olaf/src/olaf/code_samples/MAD.py similarity index 100% rename from cli/code_samples/MAD.py rename to cli/olaf/src/olaf/code_samples/MAD.py diff --git a/cli/code_samples/QC_Inspection.py b/cli/olaf/src/olaf/code_samples/QC_Inspection.py similarity index 100% rename from cli/code_samples/QC_Inspection.py rename to cli/olaf/src/olaf/code_samples/QC_Inspection.py diff --git a/cli/code_samples/QC_umap_visualization.py b/cli/olaf/src/olaf/code_samples/QC_umap_visualization.py similarity index 100% rename from cli/code_samples/QC_umap_visualization.py rename to cli/olaf/src/olaf/code_samples/QC_umap_visualization.py diff --git a/cli/code_samples/Re-analysis_afterQC.py b/cli/olaf/src/olaf/code_samples/Re-analysis_afterQC.py similarity index 100% rename from cli/code_samples/Re-analysis_afterQC.py rename to cli/olaf/src/olaf/code_samples/Re-analysis_afterQC.py diff --git a/cli/code_samples/load_adata.py b/cli/olaf/src/olaf/code_samples/load_adata.py similarity index 100% rename from cli/code_samples/load_adata.py rename to cli/olaf/src/olaf/code_samples/load_adata.py diff --git a/cli/olaf/src/olaf/datasets/czi_datasets.py b/cli/olaf/src/olaf/datasets/czi_datasets.py new file mode 100644 index 0000000..a896082 --- /dev/null +++ b/cli/olaf/src/olaf/datasets/czi_datasets.py @@ -0,0 +1,323 @@ +# cli/olaf/src/olaf/datasets/czi_datasets.py + +import os +import re +import json +import math +import shlex +import sys +from pathlib import Path + +import cellxgene_census +from platformdirs import PlatformDirs + +try: + from rich.console import Console + from rich.table import Table + from rich.pretty import pprint + from rich.prompt import Prompt + HAS_RICH = True +except ImportError: + HAS_RICH = False + # Define simple fallback classes if rich is not installed + def pprint(obj): print(obj) + class Console: + def print(self, *args, **kwargs): print(*args) + class Table: + def __init__(self, title=""): + self._title = title + self._rows = [] + self._columns = [] + def add_column(self, header, **kwargs): + self._columns.append(header) + def add_row(self, *items): + if len(items) != len(self._columns): + raise ValueError("Number of items in row does not match number of columns") + self._rows.append(items) + def print_table(self, console): + console.print(self._title) + if not self._columns: + return + col_widths = [len(h) for h in self._columns] + for row in self._rows: + for i, item in enumerate(row): + col_widths[i] = max(col_widths[i], len(str(item))) + header_line = " ".join(f"{h:<{w}}" for h, w in zip(self._columns, col_widths)) + separator = "-" * len(header_line) + console.print(header_line) + console.print(separator) + for row in self._rows: + row_line = " ".join(f"{str(item):<{w}}" for item, w in zip(row, col_widths)) + console.print(row_line) + class Prompt: + @staticmethod + def ask(prompt, choices=None, default=None): + p_text = f"{prompt} " + if choices: + p_text += f"({'/'.join(choices)}) " + if default: + p_text += f"[{default}] " + return input(p_text).strip() + +# --- Path Configuration --- +APP_NAME = "olaf" +APP_AUTHOR = "OpenTechBio" +dirs = PlatformDirs(APP_NAME, APP_AUTHOR) + +OLAF_HOME = Path(os.environ.get("OLAF_HOME", dirs.user_data_dir)).expanduser() +DEFAULT_DATASETS_DIR = 
OLAF_HOME / "datasets" + +def get_datasets_dir() -> Path: + """ + Returns the path to the datasets directory, creating it if it doesn't exist. + """ + DEFAULT_DATASETS_DIR.mkdir(parents=True, exist_ok=True) + return DEFAULT_DATASETS_DIR + +# --- Helper Functions --- +def sanitize_filename(name: str) -> str: + """Removes invalid characters and replaces spaces for use in filenames.""" + name = re.sub(r'[^\w\-.]+', '_', name) + return re.sub(r'_+', '_', name).strip('_').lower() + +# --- Core Data Fetching and Download Functions --- + +def get_census_versions_data(): + """Fetches available CELLxGENE Census versions data.""" + try: + census_versions = cellxgene_census.get_census_version_directory() + versions_list = [] + sorted_versions = sorted( + census_versions.keys(), + key=lambda v: ('0' if v == 'stable' else '1' if v == 'latest' else '2') + v, + reverse=True + ) + for version in sorted_versions: + desc = census_versions[version] + versions_list.append({ + "version": version, + "description": desc.get('description', desc.get('uri', 'N/A')), + "release_date": desc.get("release_date", "N/A") + }) + return versions_list + except Exception as e: + raise RuntimeError(f"Error listing versions: {e}") + +def fetch_source_datasets_data(census_version: str): + """Fetches source datasets DataFrame for a specific Census version.""" + console = Console() + console.print(f"Fetching source datasets info for Census version: [cyan]{census_version}[/cyan]...") + try: + with cellxgene_census.open_soma(census_version=census_version) as census: + datasets_df = census["census_info"]["datasets"].read().concat().to_pandas() + if datasets_df.empty: + console.print(f"No source dataset information found for version {census_version}.") + return datasets_df + except Exception as e: + raise RuntimeError(f"Error fetching datasets for version {census_version}: {e}") + +def get_dataset_metadata_data(census_version: str, dataset_id: str): + """Fetches metadata dictionary for a specific source dataset.""" + console = Console() + console.print(f"Fetching metadata for [cyan]{dataset_id}[/cyan] in Census version: [cyan]{census_version}[/cyan]...") + try: + datasets_df = fetch_source_datasets_data(census_version) + if datasets_df is None or datasets_df.empty: + raise ValueError(f"Could not retrieve datasets for version {census_version}.") + + dataset_metadata = datasets_df[datasets_df['dataset_id'] == dataset_id] + if dataset_metadata.empty: + raise ValueError(f"Dataset ID '{dataset_id}' not found in Census version '{census_version}'.") + return dataset_metadata.iloc[0].to_dict() + except Exception as e: + raise RuntimeError(f"Error fetching metadata for {dataset_id}: {e}") + +def download_dataset(console: Console, census_version: str, dataset_id: str): + """Downloads H5AD file and saves metadata JSON for a dataset.""" + try: + # 1. Get target directory using the new function + target_dir = get_datasets_dir() + console.print(f"Target directory: [blue]{target_dir}[/blue]") + + # 2. Fetch metadata + metadata = get_dataset_metadata_data(census_version, dataset_id) + dataset_title = metadata.get('dataset_title', f'dataset_{dataset_id}') + base_filename = sanitize_filename(dataset_title) or f"dataset_{dataset_id}" + + h5ad_filepath = target_dir / f"{base_filename}.h5ad" + json_filepath = target_dir / f"{base_filename}.json" + + console.print(f"Preparing to download dataset [green]{dataset_title}[/green]...") + if h5ad_filepath.exists() or json_filepath.exists(): + console.print("[yellow]Warning: Output file(s) already exist. 
Skipping download.[/yellow]") + return + + # 3. Download H5AD + console.print(f"Downloading H5AD to [blue]{h5ad_filepath}[/blue]...") + cellxgene_census.download_source_h5ad(dataset_id, to_path=str(h5ad_filepath), census_version=census_version) + console.print("[bold green]H5AD Download complete.[/bold green]") + + # 4. Save Metadata JSON + console.print(f"Saving metadata to [blue]{json_filepath}[/blue]...") + import numpy as np + def convert_types(obj): + if isinstance(obj, np.generic): return obj.item() + if isinstance(obj, np.ndarray): return obj.tolist() + if isinstance(obj, np.void): return None + return obj + with open(json_filepath, 'w', encoding='utf-8') as f: + json.dump(metadata, f, indent=4, default=convert_types, ensure_ascii=False) + console.print("[bold green]Metadata JSON saved successfully.[/bold green]") + + except Exception as e: + console.print(f"[bold red]Download failed:[/bold red] {e}") + sys.exit(1) + +# --- Display and Interaction Functions --- + +def display_versions_list(console: Console): + """Displays available versions.""" + try: + versions_data = get_census_versions_data() + if not versions_data: + console.print("[yellow]No Census versions found.[/yellow]") + return + + table = Table(title="Available CELLxGENE Census Versions") + table.add_column("Version Tag", style="cyan") + table.add_column("Release Date", style="green") + table.add_column("Description", style="magenta") + + for v_data in versions_data: + table.add_row(v_data["version"], v_data["release_date"], v_data["description"]) + + if HAS_RICH: + console.print(table) + else: + table.print_table(console) + except Exception as e: + console.print(f"[bold red]Error displaying versions:[/bold red] {e}") + +def display_paginated_datasets(console: Console, census_version: str, limit: int = None, page_size: int = 5): + """Fetches and displays datasets with pagination.""" + try: + datasets_df = fetch_source_datasets_data(census_version) + if datasets_df is None or datasets_df.empty: + return + + df_view = datasets_df.head(limit) if limit and limit > 0 else datasets_df + total_items_in_view = len(df_view) + if total_items_in_view == 0: + console.print(f"No datasets found for version {census_version}.") + return + + total_pages = math.ceil(total_items_in_view / page_size) + current_page = 1 + + while True: + start_index = (current_page - 1) * page_size + end_index = start_index + page_size + page_df = df_view.iloc[start_index:end_index] + + if page_df.empty: + break + + range_end = min(end_index, total_items_in_view) + table = Table(title=f"Source Datasets in Census {census_version} (Showing {start_index+1}-{range_end} of {total_items_in_view})") + table.add_column("Dataset ID", style="cyan", no_wrap=True) + table.add_column("Dataset Title", style="green", overflow="fold") + table.add_column("Cell Count", style="yellow", justify="right") + + for _, row in page_df.iterrows(): + cell_count_str = f"{int(row.get('cell_count', 0)):,}" if row.get('cell_count') else 'N/A' + table.add_row(row.get('dataset_id', 'N/A'), row.get('dataset_title', 'N/A'), cell_count_str) + + console.print(f"\n--- Page {current_page} of {total_pages} ---") + if HAS_RICH: + console.print(table) + else: + table.print_table(console) + + if total_pages <= 1: break + + prompt_text = "[P]revious, [N]ext, [Q]uit?" 
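+            # Default advances to the next page until the last page, where it defaults + # to quitting; unrecognized input falls through to the "Invalid choice" branch + # below and re-prompts.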
+ action = Prompt.ask(prompt_text, default="N" if current_page < total_pages else "Q").upper() + + if action == "N" and current_page < total_pages: current_page += 1 + elif action == "P" and current_page > 1: current_page -= 1 + elif action == "Q": break + else: console.print("[yellow]Invalid choice.[/yellow]") + + except Exception as e: + console.print(f"[bold red]Error displaying datasets:[/bold red] {e}") + +def display_dataset_metadata(console: Console, census_version: str, dataset_id: str): + """Displays metadata for a specific dataset.""" + try: + metadata_dict = get_dataset_metadata_data(census_version, dataset_id) + console.print(f"\nMetadata for Dataset: [bold green]{dataset_id}[/bold green]") + pprint(metadata_dict) + except Exception as e: + console.print(f"[bold red]Error displaying metadata:[/bold red] {e}") + +def print_interactive_help(console: Console): + """Prints help message for interactive mode.""" + console.print("\n[bold cyan]Available Commands:[/bold cyan]") + console.print(" [green]list_versions[/green] List available CELLxGENE Census versions.") + console.print(" [green]list_datasets[/green] <census_version> [limit] List source datasets (paginated).") + console.print(" [green]show_metadata[/green] <census_version> <dataset_id> Show metadata for a specific dataset.") + console.print(" [green]download[/green] <census_version> <dataset_id> Download dataset H5AD and metadata JSON.") + console.print(" [green]help[/green] Show this help message.") + console.print(" [green]exit[/green] Exit the interactive browser.") + console.print("\nExample: [yellow]download stable <dataset_id>[/yellow]") + +def interactive_loop(): + """Runs the interactive command loop.""" + console = Console() + console.print("[bold blue]Welcome to the Interactive CZI CELLxGENE Census Browser![/bold blue]") + print_interactive_help(console) + + while True: + try: + raw_command = Prompt.ask("\nEnter command ('help' or 'exit')") + if not raw_command: continue + + command_parts = shlex.split(raw_command) + if not command_parts: continue + + command = command_parts[0].lower() + args = command_parts[1:] + + if command == "exit": break + elif command == "help": print_interactive_help(console) + elif command == "list_versions": + if not args: display_versions_list(console) + else: console.print("[yellow]Usage: list_versions[/yellow]") + elif command == "list_datasets": + if not args: + console.print("[yellow]Usage: list_datasets <census_version> [limit][/yellow]") + continue + version = args[0] + limit = int(args[1]) if len(args) > 1 else None + display_paginated_datasets(console, version, limit=limit, page_size=5) + elif command == "show_metadata": + if len(args) < 2: + console.print("[yellow]Usage: show_metadata <census_version> <dataset_id>[/yellow]") + continue + display_dataset_metadata(console, args[0], args[1]) + elif command == "download": + if len(args) < 2: + console.print("[yellow]Usage: download <census_version> <dataset_id>[/yellow]") + continue + download_dataset(console, args[0], args[1]) + else: + console.print(f"[red]Unknown command: '{command}'. Type 'help' for options.[/red]") + except EOFError: + console.print("\n[yellow]EOF detected. Exiting.[/yellow]") + break + except KeyboardInterrupt: + console.print("\n[yellow]Interrupted by user. Type 'exit' to quit.[/yellow]") + except Exception as e: + console.print(f"[bold red]An unexpected error occurred:[/bold red] {e}") + + console.print("[bold blue]Exiting browser. 
Goodbye![/bold blue]") \ No newline at end of file diff --git a/cli/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.h5ad b/cli/olaf/src/olaf/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.h5ad similarity index 100% rename from cli/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.h5ad rename to cli/olaf/src/olaf/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.h5ad diff --git a/cli/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.json b/cli/olaf/src/olaf/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.json similarity index 100% rename from cli/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.json rename to cli/olaf/src/olaf/datasets/spatial_transcriptomics_in_mouse_puck_191109_14.json diff --git a/cli/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad b/cli/olaf/src/olaf/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad similarity index 100% rename from cli/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad rename to cli/olaf/src/olaf/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.h5ad diff --git a/cli/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json b/cli/olaf/src/olaf/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json similarity index 100% rename from cli/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json rename to cli/olaf/src/olaf/datasets/thymus_scrna-seq_atlas_-_myeloid_p2_subset.json diff --git a/cli/sample_prompt_library/Basic_scRNA_Agent.txt b/cli/sample_prompt_library/Basic_scRNA_Agent.txt deleted file mode 100644 index 4f22eb9..0000000 --- a/cli/sample_prompt_library/Basic_scRNA_Agent.txt +++ /dev/null @@ -1,165 +0,0 @@ -You are a highly skilled bioinformatics agent specializing in single-cell RNA-seq data analysis using Python. Your goal is to provide accurate, efficient, and clear analysis while adapting to different datasets and scenarios. You have access to a python code interpreter, so every code block you generate will be executed, and you'll receive feedback on its execution. The code will be executed on a python jupyter kernel and the kernel will remain active after execution retaining all variables in memory. Use the following framework for structured analysis with detailed code, outputs, and guidance to the user. - -**Primary Analysis Flow**: -For analyzing single-cell RNA-seq data using the `Scanpy` package, follow this structured framework: - -### 1. **Data Loading & Package Setup** - a. Load the provided dataset from the working directory. - b. Recognize common formats (e.g., 10X `.h5` or `mtx` files). If multiple samples are present, load them as a batch. - c. Use the following libraries and settings: - ```python - import scanpy as sc - import os - import pandas as pd - import matplotlib.pyplot as plt - import seaborn as sns - import numpy as np - from scipy.stats import median_abs_deviation as mad - import celltypist - from celltypist import models - import anndata as ad - - # Set verbosity and figure parameters - sc.settings.verbosity = 0 - sc.settings.set_figure_params(dpi=50, facecolor="white", frameon=False) - ``` - -### 2. **Initial Data Inspection** - a. **Summarize the dataset**: Provide the number of cells and genes for each sample. - b. 
**Plot initial cell and gene counts** for user reference: - ```python - fig, ax = plt.subplots(figsize=(10, 6)) - n_cells = [adata.n_obs for adata in adatas] - n_genes = [adata.n_vars for adata in adatas] - ax.bar(range(len(adatas)), n_cells, label='Cells') - ax.bar(range(len(adatas)), n_genes, label='Genes', align='edge') - ax.set_title('Cell and Gene Counts Before QC') - plt.show() - ``` - -### 3. **Quality Control (QC) Metrics** - a. Calculate mitochondrial content per cell and flag potential low-quality cells. - ```python - def calculate_mito_percentage(adata): - mito_genes = adata.var_names.str.contains('^MT-') - adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1) / np.sum(adata.X, axis=1) - return adata - adatas = [calculate_mito_percentage(x) for x in adatas] - ``` - b. Visualize the key QC metrics: counts, genes, mitochondrial content: - ```python - for adata in adatas: - sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']) - ``` - -### 4. **Pre-QC Analysis** - a. Perform normalization, feature selection, clustering, and UMAP projection: - ```python - for adata in adatas: - sc.pp.normalize_total(adata) - sc.pp.log1p(adata) - sc.pp.highly_variable_genes(adata, n_top_genes=2000) - sc.tl.pca(adata) - sc.pp.neighbors(adata, n_pcs=20) - sc.tl.umap(adata) - sc.tl.leiden(adata, resolution=0.5) - sc.pl.umap(adata, color=['leiden']) - ``` - b. Plot differential expression for the top 3 genes per cluster: - ```python - sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon') - sc.pl.rank_genes_groups_dotplot(adata, n_genes=3) - ``` - -### 5. **Post-QC Filtering** - a. Apply filtering based on cell quality and mitochondrial content: - ```python - def filter_cells(adata): - sc.pp.filter_cells(adata, min_genes=200) - sc.pp.filter_genes(adata, min_cells=3) - return adata - adatas = [filter_cells(adata) for adata in adatas] - ``` - -### 6. **Reanalysis Post-QC** - a. Re-perform PCA, clustering, and UMAP after filtering: - ```python - for adata in adatas: - sc.tl.pca(adata) - sc.pp.neighbors(adata, n_pcs=20) - sc.tl.umap(adata) - sc.pl.umap(adata, color=['leiden']) - ``` - -### 7. **Cell Type Annotation** - a. Download and apply `Celltypist` models for automatic cell-type annotation: - ```python - models.download_models() - predictions = celltypist.annotate(adata, model='Developing_Mouse_Brain.pkl', majority_voting=True) - adata.obs['celltypes'] = predictions.cell_types - sc.pl.umap(adata, color='celltypes') - ``` - -### 8. **Batch Effect Correction** (if applicable) - a. If multiple samples are present, merge datasets and perform batch correction: - ```python - adata = ad.concat(adatas, label='sample', keys=['sample1', 'sample2']) - sc.pp.combat(adata, key='sample') - sc.pp.neighbors(adata) - sc.tl.umap(adata) - sc.pl.umap(adata, color=['sample', 'celltypes']) - ``` - -### 9. **Final Output and Saving** - a. Save the final integrated dataset in `.h5ad` format: - ```python - adata.write('path/to/final_output.h5ad') - ``` - -**Execution Instructions**: -1. Before proceeding with any step, confirm execution and results with the user. -2. Adjust or modify steps based on the user's input. -3. Output visualizations for the user to inspect results at each step (e.g., UMAP plots, differential expression). -4. Ensure appropriate feedback and quality checks (e.g., warnings, large deviations in mitochondrial content). - -**Customization**: -1. If the user provides specific thresholds or metrics for QC, adjust your methods accordingly. -2. 
Ensure adaptability to multiple formats (e.g., `.h5`, `.mtx`) and large datasets. -3. If batch correction is requested, use advanced methods (e.g., Harmony, scDREAMER) based on the scenario. - -The following dependencies are already installed and available in the Jupyter kernel: - -ansi2html==1.8.0 -scanpy==1.10.2 -scrublet -anndata==0.10.8 -celltypist==1.6.3 -leidenalg==0.10.2 -igraph==0.11.6 -networkx==3.2.1 -pynndescent==0.5.13 -numpy==1.26.4 -scipy==1.13.1 -pandas==2.2.2 -scikit-learn==1.5.1 -umap-learn==0.5.6 -statsmodels==0.14.2 -numba==0.60.0 -matplotlib==3.9.1 -seaborn==0.13.2 -h5py==3.11.0 -openpyxl==3.1.5 -PyPDF2 -tqdm==4.66.4 -psutil==6.0.0 -defusedxml==0.7.1 -requests==2.32.3 - -Whenever you need to run code on the terminal using a package that is not already install, first provide a corresponding Bash code block labeled ```bash``` with the installation commands for all dependencies utilized, if they are not already installed in the environment. Do this for each code snippet you generate, like so: -```bash -pip install -``` - -You can proceed with executing code that utilizes any of these packages without needing to install them. Don't install any additional packages - -Your objective is to guide the user through single-cell RNA-seq analysis, ensuring accuracy, reproducibility, and meaningful insights from the data. \ No newline at end of file From 735e13b5e83b6b47a3435865eed114f4708210c9 Mon Sep 17 00:00:00 2001 From: djriffle Date: Wed, 13 Aug 2025 14:45:25 -0400 Subject: [PATCH 05/14] Added run support --- cli/create_agent_system.sh | 4 - cli/create_benchmark_env.sh | 42 --- cli/olaf/src/olaf/cli/datasets_cli.py | 2 +- cli/olaf/src/olaf/cli/main.py | 3 +- cli/olaf/src/olaf/cli/run_cli.py | 218 +++++++++++++++ cli/olaf/src/olaf/core/sandbox_management.py | 6 +- .../src/olaf/execution}/__init__.py | 0 cli/olaf/src/olaf/execution/runner.py | 249 ++++++++++++++++++ cli/{ => olaf/src/olaf}/sandbox/Dockerfile | 0 cli/{ => olaf/src/olaf}/sandbox/Singularity | 0 cli/olaf/src/olaf/sandbox/__init__.py | 0 .../benchmarking_sandbox_management.py | 0 ...hmarking_sandbox_management_singularity.py | 0 cli/{ => olaf/src/olaf}/sandbox/kernel_api.py | 0 .../src/olaf}/sandbox/offline_kernel.py | 0 .../src/olaf}/sandbox/requirements.txt | 0 cli/{ => olaf/src/olaf}/sandbox/start.sh | 0 .../src/olaf}/sandbox/start_kernel.py | 0 cli/run_automated.sh | 4 - cli/run_interactive.sh | 4 - 20 files changed, 473 insertions(+), 59 deletions(-) delete mode 100755 cli/create_agent_system.sh delete mode 100755 cli/create_benchmark_env.sh create mode 100644 cli/olaf/src/olaf/cli/run_cli.py rename cli/{sandbox => olaf/src/olaf/execution}/__init__.py (100%) create mode 100644 cli/olaf/src/olaf/execution/runner.py rename cli/{ => olaf/src/olaf}/sandbox/Dockerfile (100%) rename cli/{ => olaf/src/olaf}/sandbox/Singularity (100%) create mode 100644 cli/olaf/src/olaf/sandbox/__init__.py rename cli/{ => olaf/src/olaf}/sandbox/benchmarking_sandbox_management.py (100%) rename cli/{ => olaf/src/olaf}/sandbox/benchmarking_sandbox_management_singularity.py (100%) rename cli/{ => olaf/src/olaf}/sandbox/kernel_api.py (100%) rename cli/{ => olaf/src/olaf}/sandbox/offline_kernel.py (100%) rename cli/{ => olaf/src/olaf}/sandbox/requirements.txt (100%) rename cli/{ => olaf/src/olaf}/sandbox/start.sh (100%) rename cli/{ => olaf/src/olaf}/sandbox/start_kernel.py (100%) delete mode 100755 cli/run_automated.sh delete mode 100755 cli/run_interactive.sh diff --git a/cli/create_agent_system.sh b/cli/create_agent_system.sh deleted file 
mode 100755 index bc95946..0000000 --- a/cli/create_agent_system.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash -# move *out* of cli/ into its parent (Olaf/) -cd "$(dirname "$0")"/.. -python -m cli.agents.create_agent_system "$@" \ No newline at end of file diff --git a/cli/create_benchmark_env.sh b/cli/create_benchmark_env.sh deleted file mode 100755 index 94bc64d..0000000 --- a/cli/create_benchmark_env.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -# Get the directory where the script is located -SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" - -# Define the path for the .env file in the script's directory -ENV_FILE_PATH="${SCRIPT_DIR}/.env" - -echo "This script will create a .env file to store your OpenAI API key." -echo "The file will be saved in the script's directory: ${SCRIPT_DIR}" -echo "" # Add a blank line for spacing - -# Prompt the user for their OpenAI API key -# -p: Display the prompt string -# -s: Silent mode (do not echo input characters) - recommended for keys/passwords -# -r: Raw mode (backslashes are not treated as escape characters) -read -p "Please enter your OpenAI API key: " -s -r OPENAI_API_KEY -echo "" # Add a newline after the hidden input - -# Check if the key was entered -if [ -z "$OPENAI_API_KEY" ]; then - echo "Error: No API key entered. Exiting." - exit 1 -fi - -# Write the key to the .env file in the format OPENAI_KEY:key_value -# Overwrites the file if it already exists -echo "OPENAI_API_KEY=${OPENAI_API_KEY}" > "${ENV_FILE_PATH}" - -# Check if the file was created successfully -if [ $? -eq 0 ]; then - echo "" # Add a blank line - echo "Successfully saved the OpenAI API key to ${ENV_FILE_PATH}" - # Optionally, set permissions to be readable only by the user - chmod 600 "${ENV_FILE_PATH}" - echo "Set permissions for ${ENV_FILE_PATH} to read-only for the current user (600)." -else - echo "Error: Failed to write to ${ENV_FILE_PATH}. Please check permissions." - exit 1 -fi - -exit 0 diff --git a/cli/olaf/src/olaf/cli/datasets_cli.py b/cli/olaf/src/olaf/cli/datasets_cli.py index 36d2d97..75fb1a2 100644 --- a/cli/olaf/src/olaf/cli/datasets_cli.py +++ b/cli/olaf/src/olaf/cli/datasets_cli.py @@ -15,7 +15,7 @@ # Create a Typer app for the "datasets" subcommand group datasets_app = typer.Typer( name="datasets", - help="Browse and download datasets from the CZI CELLxGENE Census. 
Defaults to interactive mode.", + help="Browse and download datasets from the CZI CELLxGENE Census.", no_args_is_help=False # Allows our callback to run ) diff --git a/cli/olaf/src/olaf/cli/main.py b/cli/olaf/src/olaf/cli/main.py index d34a922..eafdd0c 100644 --- a/cli/olaf/src/olaf/cli/main.py +++ b/cli/olaf/src/olaf/cli/main.py @@ -7,7 +7,7 @@ # Import the app for the new 'datasets' command from .datasets_cli import datasets_app - +from .run_cli import run_app # Main OLAF application app = typer.Typer( name="olaf", @@ -18,6 +18,7 @@ # Register the command groups app.add_typer(create_system_app, name="create-system") app.add_typer(datasets_app, name="datasets") +app.add_typer(run_app, name="run") def main(): diff --git a/cli/olaf/src/olaf/cli/run_cli.py b/cli/olaf/src/olaf/cli/run_cli.py new file mode 100644 index 0000000..f4da586 --- /dev/null +++ b/cli/olaf/src/olaf/cli/run_cli.py @@ -0,0 +1,218 @@ +# olaf/cli/run_cli.py +import os +import textwrap +from pathlib import Path +from typing import List, cast +import subprocess + +import typer +from rich.console import Console +from rich.prompt import Prompt + +# Import your project's modules and shared configuration +from olaf.agents.AgentSystem import AgentSystem +from olaf.core.io_helpers import collect_resources +from olaf.core.sandbox_management import (init_docker, init_singularity, init_singularity_exec) +from olaf.execution.runner import run_agent_session, SandboxManager +from olaf.datasets.czi_datasets import get_datasets_dir +from olaf.agents.create_agent_system import DEFAULT_AGENT_DIR + +# --- Define package-internal paths --- +PACKAGE_ROOT = Path(__file__).resolve().parent.parent +PACKAGE_AGENTS_DIR = PACKAGE_ROOT / "agents" +PACKAGE_DATASETS_DIR = PACKAGE_ROOT / "datasets" + + +# --- Helper functions for interactive prompts (unchanged) --- + +def _prompt_for_file( + console: Console, + user_dir: Path, + package_dir: Path, + extension: str, + prompt_title: str, +) -> Path: + """ + Generic helper to find files in both user and package directories and prompt for a selection. + User files take priority over package files with the same name. 
+ """ + console.print(f"[bold]Select {prompt_title}:[/bold]") + + found_files = [] + seen_filenames = set() + + if user_dir.exists(): + for file_path in sorted(list(user_dir.glob(f"**/*{extension}"))): + if file_path.name not in seen_filenames: + found_files.append({"path": file_path, "label": "User"}) + seen_filenames.add(file_path.name) + + if package_dir.exists(): + for file_path in sorted(list(package_dir.glob(f"**/*{extension}"))): + if file_path.name not in seen_filenames: + found_files.append({"path": file_path, "label": "Package"}) + seen_filenames.add(file_path.name) + + if not found_files: + console.print(f"[bold red]No '{extension}' files found in your user directory ({user_dir}) or the package directory ({package_dir}).[/bold red]") + raise typer.Exit(1) + + for i, file_info in enumerate(found_files, 1): + console.print(f" [cyan]{i}[/cyan]: {file_info['path'].name} [yellow]({file_info['label']})[/yellow]") + + choice_str = Prompt.ask("Enter the number of your choice", choices=[str(i) for i in range(1, len(found_files) + 1)]) + return found_files[int(choice_str) - 1]['path'] + +def _prompt_for_driver(console: Console, system: AgentSystem) -> str: + """Prompts the user to select a driver agent from the loaded system.""" + console.print("[bold]Select a driver agent:[/bold]") + agents = list(system.agents.keys()) + driver = Prompt.ask("Enter the name of the driver agent", choices=agents, default=agents[0]) + return driver + + +# --- Typer App and Context (unchanged) --- + +run_app = typer.Typer( + name="run", + help="Run an agent system. Prompts for configuration if not provided via flags.", + no_args_is_help=True, +) + +class AppContext: + def __init__(self): + self.console = Console() + self.agent_system: AgentSystem | None = None + self.driver_agent_name: str | None = None + self.roster_instructions: str | None = None + self.analysis_context: str | None = None + self.sandbox_manager: SandboxManager | None = None + self.llm_client: object | None = None + self.initial_history: List[dict] | None = None + +@run_app.callback(invoke_without_command=True) +def main_run_callback( + ctx: typer.Context, + blueprint: Path = typer.Option(None, "--blueprint", "-bp", help="Path to the agent system JSON blueprint.", readable=True), + driver_agent: str = typer.Option(None, "--driver-agent", "-d", help="Name of the agent to start with."), + dataset: Path = typer.Option(None, "--dataset", "-ds", help="Path to the dataset file (.h5ad).", readable=True), + resources_dir: Path = typer.Option(None, "--resources", help="Path to a directory of resource files to mount.", exists=True, file_okay=False), + llm_backend: str = typer.Option("chatgpt", "--llm", help="LLM backend to use.", case_sensitive=False), + ollama_host: str = typer.Option("http://localhost:11434", "--ollama-host", help="Base URL for Ollama backend."), + sandbox: str = typer.Option(None, "--sandbox", help="Sandbox backend to use: 'docker', 'singularity', or 'singularity-exec'."), # <-- Changed default + force_refresh: bool = typer.Option(False, "--force-refresh", help="Force refresh/rebuild of the sandbox environment."), +): + app_context = AppContext() + console = app_context.console + ctx.obj = app_context + + # Steps 1, 2, and 3 are unchanged + if blueprint is None: + blueprint = _prompt_for_file(console, DEFAULT_AGENT_DIR, PACKAGE_AGENTS_DIR, ".json", "Agent System Blueprint") + app_context.agent_system = AgentSystem.load_from_json(str(blueprint)) + + if driver_agent is None: + driver_agent = _prompt_for_driver(console, 
app_context.agent_system) + if driver_agent not in app_context.agent_system.agents: + raise typer.BadParameter(f"Driver agent '{driver_agent}' not found in blueprint.") + app_context.driver_agent_name = driver_agent + app_context.roster_instructions = app_context.agent_system.get_instructions() + + if dataset is None: + dataset = _prompt_for_file(console, get_datasets_dir(), PACKAGE_DATASETS_DIR, ".h5ad", "Dataset") + + # --- Step 4. Configure Sandbox (Corrected Logic) --- + # Prompt for sandbox if not provided as a flag + if sandbox is None: + sandbox = Prompt.ask( + "Choose a sandbox backend", + choices=["docker", "singularity", "singularity-exec"], + default="docker" + ) + + console.print(f"[cyan]Initializing sandbox backend: {sandbox}[/cyan]") + script_dir = Path(__file__).resolve().parent + + manager_class = None + if sandbox == "docker": + manager_class, _, _, _, _ = init_docker(script_dir, subprocess, console, force_refresh=force_refresh) + elif sandbox == "singularity": + manager_class, _, _, _, _ = init_singularity(script_dir, subprocess, console, force_refresh=force_refresh) + elif sandbox == "singularity-exec": + SANDBOX_DATA_PATH = "/workspace/dataset.h5ad" + manager_class, _, _, _, _ = init_singularity_exec(script_dir, SANDBOX_DATA_PATH, subprocess, console, force_refresh=force_refresh) + else: + raise typer.BadParameter(f"Unknown sandbox type '{sandbox}'. Supported types are 'docker', 'singularity', 'singularity-exec'.") + + app_context.sandbox_manager = manager_class() + + # Step 5 and 6 are unchanged + console.print(f"[cyan]Initializing LLM backend: {llm_backend}[/cyan]") + if llm_backend == "chatgpt": + from openai import OpenAI + app_context.llm_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + elif llm_backend == "ollama": + from olaf.core.ollama_wrapper import OllamaClient as OpenAI + app_context.llm_client = OpenAI(host=ollama_host) + else: + raise typer.BadParameter(f"Unknown LLM backend '{llm_backend}'.") + + resources = collect_resources(console, resources_dir) if resources_dir else [] + app_context.analysis_context = textwrap.dedent(f"Dataset path: **{dataset.name}**\n...") + driver = app_context.agent_system.get_agent(driver_agent) + system_prompt = (app_context.roster_instructions + "\n\n" + driver.get_full_prompt() + "\n\n" + app_context.analysis_context) + app_context.initial_history = [{"role": "system", "content": system_prompt}] + + +# --- Subcommands (interactive, auto) are unchanged --- + +@run_app.command("interactive") +def run_interactive(ctx: typer.Context): + """Run the agent system in a manual, interactive chat session.""" + context: AppContext = ctx.obj + context.console.print("\n[bold blue]🚀 Starting Interactive Mode...[/bold blue]") + + history = context.initial_history[:] + history.append({"role": "user", "content": "Beginning interactive session. 
What is the plan?"}) + + run_agent_session( + console=context.console, + agent_system=cast(AgentSystem, context.agent_system), + driver_agent=cast(AgentSystem, context.agent_system).get_agent(cast(str, context.driver_agent_name)), + roster_instructions=cast(str, context.roster_instructions), + analysis_context=cast(str, context.analysis_context), + llm_client=cast(object, context.llm_client), + sandbox_manager=cast(SandboxManager, context.sandbox_manager), + history=history, + is_auto=False + ) + +@run_app.command("auto") +def run_auto( + ctx: typer.Context, + prompt: str = typer.Option(None, "--prompt", "-p", help="Initial prompt for the auto run."), + turns: int = typer.Option(3, "--turns", "-t", help="Number of turns to run automatically."), +): + """Run the agent system automatically for a set number of turns.""" + context: AppContext = ctx.obj + + if prompt is None: + prompt = Prompt.ask("Enter the initial prompt for the automated run", default="Analyze this dataset.") + + context.console.print(f"\n[bold green]🚀 Starting Automated Mode for {turns} turns...[/bold green]") + + history = context.initial_history[:] + history.append({"role": "user", "content": prompt}) + + run_agent_session( + console=context.console, + agent_system=cast(AgentSystem, context.agent_system), + driver_agent=cast(AgentSystem, context.agent_system).get_agent(cast(str, context.driver_agent_name)), + roster_instructions=cast(str, context.roster_instructions), + analysis_context=cast(str, context.analysis_context), + llm_client=cast(object, context.llm_client), + sandbox_manager=cast(SandboxManager, context.sandbox_manager), + history=history, + is_auto=True, + max_turns=turns + ) \ No newline at end of file diff --git a/cli/olaf/src/olaf/core/sandbox_management.py b/cli/olaf/src/olaf/core/sandbox_management.py index 94dae13..10f7824 100644 --- a/cli/olaf/src/olaf/core/sandbox_management.py +++ b/cli/olaf/src/olaf/core/sandbox_management.py @@ -3,7 +3,7 @@ from pathlib import Path import json -from cli.sandbox.benchmarking_sandbox_management import ( +from olaf.sandbox.benchmarking_sandbox_management import ( SandboxManager as _BackendManager, CONTAINER_NAME as _SANDBOX_HANDLE, IMAGE_TAG as _SANDBOX_IMAGE, @@ -32,7 +32,7 @@ def COPY_CMD(src: str, dst: str): return _BackendManager, _SANDBOX_HANDLE, COPY_CMD, EXECUTE_ENDPOINT, STATUS_ENDPOINT def init_singularity(script_dir:str, subprocess, console, force_refresh:bool=False): - import cli.sandbox.benchmarking_sandbox_management_singularity as sing + import olaf.sandbox.benchmarking_sandbox_management_singularity as sing sandbox_dir = script_dir / "sandbox" # optional force‑refresh @@ -72,7 +72,7 @@ def COPY_CMD(src: str, dst: str): def init_singularity_exec(script_dir: str, sanbox_data_path, subprocess, console, force_refresh: bool = False): - import cli.sandbox.benchmarking_sandbox_management_singularity as sing + import olaf.sandbox.benchmarking_sandbox_management_singularity as sing sandbox_dir = script_dir / "sandbox" # optional force‑refresh diff --git a/cli/sandbox/__init__.py b/cli/olaf/src/olaf/execution/__init__.py similarity index 100% rename from cli/sandbox/__init__.py rename to cli/olaf/src/olaf/execution/__init__.py diff --git a/cli/olaf/src/olaf/execution/runner.py b/cli/olaf/src/olaf/execution/runner.py new file mode 100644 index 0000000..fd51235 --- /dev/null +++ b/cli/olaf/src/olaf/execution/runner.py @@ -0,0 +1,249 @@ +# olaf/testing/runner.py +from __future__ import annotations + +import json +import re +import sys +import time +from datetime 
import datetime +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +from rich.console import Console +from rich.table import Table + +# --- Project-specific Imports --- +# These imports assume your project structure allows them. +# You may need to adjust them based on your final package layout. +try: + from olaf.agents.AgentSystem import Agent, AgentSystem + from olaf.core.io_helpers import display, extract_python_code, format_execute_response +except ImportError as e: + print(f"Failed to import a required OLAF module: {e}", file=sys.stderr) + sys.exit(1) + + +# --- Type Hinting & Base Classes --- +# Define a base class for sandbox managers to ensure a consistent interface. +class SandboxManager: + """Abstract base class for sandbox interaction.""" + def start_container(self) -> bool: + raise NotImplementedError + + def stop_container(self) -> None: + raise NotImplementedError + + def exec_code(self, code: str, timeout: int) -> dict: + raise NotImplementedError + +# --- Constants and Path Setup --- +_DELEG_RE = re.compile(r"delegate_to_([A-Za-z0-9_]+)") +_OUTPUTS_DIR = Path("outputs") +_SNIPPET_DIR = _OUTPUTS_DIR / "snippets" +_LEDGER_PATH = _OUTPUTS_DIR / f"benchmark_history_{datetime.utcnow().strftime('%Y%m%d-%H%M%S')}.jsonl" + +def _init_paths(): + """Ensure output directories exist before writing.""" + _SNIPPET_DIR.mkdir(exist_ok=True, parents=True) + _LEDGER_PATH.parent.mkdir(exist_ok=True, parents=True) + +# --- Helper Functions (from original script) --- +def detect_delegation(msg: str) -> Optional[str]: + """Return the *full* command name (e.g. 'delegate_to_coder') if present.""" + m = _DELEG_RE.search(msg) + return f"delegate_to_{m.group(1)}" if m else None + +def _dump_code_snippet(run_id: str, code: str) -> str: + """Write .py under outputs/snippets/ and return the relative path.""" + snippet_path = _SNIPPET_DIR / f"{run_id}.py" + snippet_path.write_text(code, encoding="utf-8") + return str(snippet_path.relative_to(_OUTPUTS_DIR)) + +def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str | None): + """Append a JSONL record for the benchmark run.""" + record = { + "ts": datetime.utcnow().isoformat(timespec="seconds") + "Z", + "run": run_id, + "dataset": meta.get("name"), + "results": results, + } + if code: + record["code_path"] = _dump_code_snippet(run_id, code) + with _LEDGER_PATH.open("a") as fh: + fh.write(json.dumps(record) + "\n") + +# --- Core Runner Functions --- +def run_benchmark( + console: Console, + mgr: SandboxManager, + benchmark_module: Path, + *, + is_auto: bool, + metadata: Optional[Dict] = None, + agent_name: Optional[str] = None, + code_snippet: Optional[str] = None, +) -> str: + """ + Execute a benchmark module inside the sandbox. + In auto mode, saves results and returns a result string for the history. + In interactive mode, prints results to the console. 
+ """ + console.print(f"\n[bold cyan]Running benchmark module: {benchmark_module.name}[/bold cyan]") + autometric_base_path = benchmark_module.parent / "AutoMetric.py" + try: + with open(autometric_base_path, "r") as f: + autometric_code = f.read() + with open(benchmark_module, "r") as f: + benchmark_code = f.read() + except FileNotFoundError as e: + err = f"Benchmark module or AutoMetric.py not found: {e}" + console.print(f"[red]{err}[/red]") + return err if is_auto else "" + + code_to_execute = f"# --- Code from AutoMetric.py ---\n{autometric_code}\n# --- Code from {benchmark_module.name} ---\n{benchmark_code}" + console.print("[cyan]Executing benchmark code...[/cyan]") + + try: + exec_result = mgr.exec_code(code_to_execute, timeout=300) + + table = Table(title="Benchmark Results") + table.add_column("Metric", style="cyan") + table.add_column("Value", style="magenta") + stdout = exec_result.get("stdout", "") + result_dict = {} + try: + # The JSON result is expected to be the last line of stdout + result_dict = json.loads(stdout.strip().splitlines()[-1]) + except (json.JSONDecodeError, IndexError) as e: + console.print(f"[yellow]Warning: Could not parse JSON from stdout: {e}[/yellow]") + + if exec_result.get("status") == "ok" and isinstance(result_dict, dict): + for key, value in result_dict.items(): + table.add_row(str(key), str(value)) + if is_auto: + _save_benchmark_record( + run_id=f"{benchmark_module.stem}:{agent_name}:{int(time.time())}", + results=result_dict, + meta=metadata if metadata else {}, + code=code_snippet, + ) + else: + error_message = exec_result.get("stderr") or "An unknown error occurred." + table.add_row("Error", error_message) + + console.print(table) + return "Benchmark results:\n" + json.dumps(result_dict or {"error": "see console logs"}) + + except Exception as exc: + err_msg = f"Benchmark execution failed: {exc}" + console.print(f"[red]{err_msg}[/red]") + return err_msg + +def run_agent_session( + *, + console: Console, + agent_system: AgentSystem, + driver_agent: Agent, + roster_instructions: str, + analysis_context: str, + llm_client: object, + sandbox_manager: SandboxManager, + history: List[Dict[str, str]], + is_auto: bool, + max_turns: int = 1, + benchmark_modules: Optional[List[Path]] = None, +): + """ + Main driver for both interactive and automated agent execution sessions. + This is the core, refactored loop from your original script. 
+ """ + from rich.prompt import Prompt + _init_paths() + + current_agent = driver_agent + turn = 0 + turns_left = max_turns + last_code_snippet: str | None = None + + while True: + turn += 1 + if is_auto and turn > max_turns: + console.print("[bold green]Auto run finished: Max turns reached.[/bold green]") + break + + console.print(f"\n[bold]LLM call (turn {turn})…[/bold]") + + try: + # Assuming the llm_client has an OpenAI-compatible interface + resp = llm_client.chat.completions.create( + model="gpt-4o", # This could be a parameter + messages=history, + temperature=0.7, + ) + msg = resp.choices[0].message.content + except Exception as e: + console.print(f"[red]LLM API error: {e}[/red]") + break + + history.append({"role": "assistant", "content": msg}) + display(console, f"assistant ({current_agent.name})", msg) + + cmd = detect_delegation(msg) + if cmd and cmd in current_agent.commands: + target_agent_name = current_agent.commands[cmd].target_agent + new_agent = agent_system.get_agent(target_agent_name) + if new_agent: + console.print(f"[yellow]🔄 Routing to '{target_agent_name}' via {cmd}[/yellow]") + history.append({"role": "assistant", "content": f"🔄 Routing to **{target_agent_name}** (command `{cmd}`)"}) + current_agent = new_agent + + # Rebuild system prompt for the new agent + system_prompt = (roster_instructions + "\n\n" + current_agent.get_full_prompt(agent_system.global_policy) + "\n\n" + analysis_context) + history.insert(0, {"role": "system", "content": system_prompt}) + continue + + code = extract_python_code(msg) + if code: + last_code_snippet = code + console.print("[cyan]Executing code in sandbox…[/cyan]") + exec_result = sandbox_manager.exec_code(code, timeout=300) + feedback = format_execute_response(exec_result, _OUTPUTS_DIR) + history.append({"role": "user", "content": feedback}) + display(console, "user", feedback) + + # --- Mode-specific logic --- + if is_auto: + if benchmark_modules: + result_str = run_benchmark( + console, sandbox_manager, benchmark_modules[0], + is_auto=True, metadata={"name": "auto"}, agent_name=current_agent.name, code_snippet=last_code_snippet + ) + history.append({"role": "user", "content": result_str}) + display(console, "user", result_str) + console.print(f"[yellow]Auto-continuing... 
{turn}/{max_turns} turns complete.[/yellow]") + else: + # Interactive mode input loop + while True: + prompt_text = "\n[bold]Next message ('benchmark' to run, 'exit' to quit)[/bold]" + try: + user_input = Prompt.ask(prompt_text, default="").strip() + except (EOFError, KeyboardInterrupt): + user_input = "exit" + + if user_input.lower() in {"exit", "quit"}: + console.print("[bold yellow]Exiting session.[/bold yellow]") + return + + if user_input.lower() == "benchmark": + if benchmark_modules: + for bm_module in benchmark_modules: + run_benchmark(console, sandbox_manager, bm_module, is_auto=False) + continue # Re-prompt after running benchmarks + else: + console.print("[yellow]No benchmark modules were specified at startup.[/yellow]") + continue + + if user_input: + history.append({"role": "user", "content": user_input}) + display(console, "user", user_input) + break # Exit input loop and proceed to next agent turn \ No newline at end of file diff --git a/cli/sandbox/Dockerfile b/cli/olaf/src/olaf/sandbox/Dockerfile similarity index 100% rename from cli/sandbox/Dockerfile rename to cli/olaf/src/olaf/sandbox/Dockerfile diff --git a/cli/sandbox/Singularity b/cli/olaf/src/olaf/sandbox/Singularity similarity index 100% rename from cli/sandbox/Singularity rename to cli/olaf/src/olaf/sandbox/Singularity diff --git a/cli/olaf/src/olaf/sandbox/__init__.py b/cli/olaf/src/olaf/sandbox/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cli/sandbox/benchmarking_sandbox_management.py b/cli/olaf/src/olaf/sandbox/benchmarking_sandbox_management.py similarity index 100% rename from cli/sandbox/benchmarking_sandbox_management.py rename to cli/olaf/src/olaf/sandbox/benchmarking_sandbox_management.py diff --git a/cli/sandbox/benchmarking_sandbox_management_singularity.py b/cli/olaf/src/olaf/sandbox/benchmarking_sandbox_management_singularity.py similarity index 100% rename from cli/sandbox/benchmarking_sandbox_management_singularity.py rename to cli/olaf/src/olaf/sandbox/benchmarking_sandbox_management_singularity.py diff --git a/cli/sandbox/kernel_api.py b/cli/olaf/src/olaf/sandbox/kernel_api.py similarity index 100% rename from cli/sandbox/kernel_api.py rename to cli/olaf/src/olaf/sandbox/kernel_api.py diff --git a/cli/sandbox/offline_kernel.py b/cli/olaf/src/olaf/sandbox/offline_kernel.py similarity index 100% rename from cli/sandbox/offline_kernel.py rename to cli/olaf/src/olaf/sandbox/offline_kernel.py diff --git a/cli/sandbox/requirements.txt b/cli/olaf/src/olaf/sandbox/requirements.txt similarity index 100% rename from cli/sandbox/requirements.txt rename to cli/olaf/src/olaf/sandbox/requirements.txt diff --git a/cli/sandbox/start.sh b/cli/olaf/src/olaf/sandbox/start.sh similarity index 100% rename from cli/sandbox/start.sh rename to cli/olaf/src/olaf/sandbox/start.sh diff --git a/cli/sandbox/start_kernel.py b/cli/olaf/src/olaf/sandbox/start_kernel.py similarity index 100% rename from cli/sandbox/start_kernel.py rename to cli/olaf/src/olaf/sandbox/start_kernel.py diff --git a/cli/run_automated.sh b/cli/run_automated.sh deleted file mode 100755 index db05b05..0000000 --- a/cli/run_automated.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash -# move *out* of cli/ into its parent (Olaf/) -cd "$(dirname "$0")"/.. 
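
The routing in runner.py above hinges on the `_DELEG_RE` pattern: any assistant message containing `delegate_to_<agent>` hands control to that agent. A self-contained sketch of the same detection logic, where only the example messages are invented:

```python
# Self-contained sketch of the delegation detection used in runner.py above;
# the regex and helper mirror _DELEG_RE/detect_delegation, the examples are mine.
import re
from typing import Optional

_DELEG_RE = re.compile(r"delegate_to_([A-Za-z0-9_]+)")

def detect_delegation(msg: str) -> Optional[str]:
    """Return the full command name (e.g. 'delegate_to_coder') if present."""
    m = _DELEG_RE.search(msg)
    return f"delegate_to_{m.group(1)}" if m else None

assert detect_delegation("I will delegate_to_coder for this step.") == "delegate_to_coder"
assert detect_delegation("no routing in this message") is None
```
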
-python -m cli.prompt_testing.MultiAgentTester --auto "$@" \ No newline at end of file diff --git a/cli/run_interactive.sh b/cli/run_interactive.sh deleted file mode 100755 index 6373db0..0000000 --- a/cli/run_interactive.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash -# move *out* of cli/ into its parent (Olaf/) -cd "$(dirname "$0")"/.. -python -m cli.prompt_testing.MultiAgentTester "$@" \ No newline at end of file From b37d244133733ed9a4a1d579c218054a45888555 Mon Sep 17 00:00:00 2001 From: djriffle Date: Wed, 13 Aug 2025 15:37:56 -0400 Subject: [PATCH 06/14] Added cli configuration tooling --- cli/olaf/pyproject.toml | 3 ++- cli/olaf/src/olaf/cli/config_cli.py | 41 +++++++++++++++++++++++++++++ cli/olaf/src/olaf/cli/main.py | 2 ++ cli/olaf/src/olaf/cli/run_cli.py | 5 +++- cli/olaf/src/olaf/config.py | 29 ++++++++++++++++++++ 5 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 cli/olaf/src/olaf/cli/config_cli.py create mode 100644 cli/olaf/src/olaf/config.py diff --git a/cli/olaf/pyproject.toml b/cli/olaf/pyproject.toml index f8587f1..b40ff8d 100644 --- a/cli/olaf/pyproject.toml +++ b/cli/olaf/pyproject.toml @@ -21,7 +21,8 @@ dependencies = [ "openai", "jupyter-client", # NOTE: PyPI name has a hyphen "nbformat", - "typer" + "typer", + "platformdirs" ] # If you want a command like `olaf …` diff --git a/cli/olaf/src/olaf/cli/config_cli.py b/cli/olaf/src/olaf/cli/config_cli.py new file mode 100644 index 0000000..1aa9607 --- /dev/null +++ b/cli/olaf/src/olaf/cli/config_cli.py @@ -0,0 +1,41 @@ +# olaf/cli/config_cli.py +import re +import typer +from rich.console import Console + +# Import the centralized ENV_FILE path +from olaf.config import ENV_FILE + +config_app = typer.Typer( + name="config", + help="Manage OLAF configuration and API keys.", + no_args_is_help=True +) + +console = Console() + +@config_app.command("set-openai-key") +def set_api_key( + api_key: str = typer.Argument(..., help="Your OpenAI API key (e.g., 'sk-...')") +): + """ + Saves your OpenAI API key to the OLAF environment file. 
+ """ + if not api_key.startswith("sk-"): + console.print("[yellow]Warning: Key does not look like a standard OpenAI API key (should start with 'sk-').[/yellow]") + + # Ensure the .env file exists + if not ENV_FILE.exists(): + ENV_FILE.touch() + + content = ENV_FILE.read_text() + key_to_set = f'OPENAI_API_KEY="{api_key}"' + + # Use regex to safely replace the key if it already exists + if re.search(r"^OPENAI_API_KEY=.*$", content, flags=re.MULTILINE): + new_content = re.sub(r"^OPENAI_API_KEY=.*$", key_to_set, content, flags=re.MULTILINE) + else: + new_content = content + f"\n{key_to_set}\n" + + ENV_FILE.write_text(new_content.strip()) + console.print(f"[bold green]✅ OpenAI API key has been set successfully in:[/bold green] {ENV_FILE}") \ No newline at end of file diff --git a/cli/olaf/src/olaf/cli/main.py b/cli/olaf/src/olaf/cli/main.py index eafdd0c..e14aa8f 100644 --- a/cli/olaf/src/olaf/cli/main.py +++ b/cli/olaf/src/olaf/cli/main.py @@ -8,6 +8,7 @@ # Import the app for the new 'datasets' command from .datasets_cli import datasets_app from .run_cli import run_app +from .config_cli import config_app # Main OLAF application app = typer.Typer( name="olaf", @@ -19,6 +20,7 @@ app.add_typer(create_system_app, name="create-system") app.add_typer(datasets_app, name="datasets") app.add_typer(run_app, name="run") +app.add_typer(config_app, name="config") # <-- Register the new config app def main(): diff --git a/cli/olaf/src/olaf/cli/run_cli.py b/cli/olaf/src/olaf/cli/run_cli.py index f4da586..0450c71 100644 --- a/cli/olaf/src/olaf/cli/run_cli.py +++ b/cli/olaf/src/olaf/cli/run_cli.py @@ -8,14 +8,16 @@ import typer from rich.console import Console from rich.prompt import Prompt +from dotenv import load_dotenv # Import your project's modules and shared configuration +from olaf.config import DEFAULT_AGENT_DIR, ENV_FILE + from olaf.agents.AgentSystem import AgentSystem from olaf.core.io_helpers import collect_resources from olaf.core.sandbox_management import (init_docker, init_singularity, init_singularity_exec) from olaf.execution.runner import run_agent_session, SandboxManager from olaf.datasets.czi_datasets import get_datasets_dir -from olaf.agents.create_agent_system import DEFAULT_AGENT_DIR # --- Define package-internal paths --- PACKAGE_ROOT = Path(__file__).resolve().parent.parent @@ -102,6 +104,7 @@ def main_run_callback( sandbox: str = typer.Option(None, "--sandbox", help="Sandbox backend to use: 'docker', 'singularity', or 'singularity-exec'."), # <-- Changed default force_refresh: bool = typer.Option(False, "--force-refresh", help="Force refresh/rebuild of the sandbox environment."), ): + load_dotenv(dotenv_path=ENV_FILE) app_context = AppContext() console = app_context.console ctx.obj = app_context diff --git a/cli/olaf/src/olaf/config.py b/cli/olaf/src/olaf/config.py new file mode 100644 index 0000000..a0ff88a --- /dev/null +++ b/cli/olaf/src/olaf/config.py @@ -0,0 +1,29 @@ +# olaf/config.py +import os +from pathlib import Path +from platformdirs import PlatformDirs + +# Define app-specific identifiers for platformdirs +APP_NAME = "olaf" +APP_AUTHOR = "OpenTechBio" +dirs = PlatformDirs(APP_NAME, APP_AUTHOR) + +# Define the root directory for all user-specific OLAF files. +# This respects the OLAF_HOME environment variable but has a sensible default. 
+OLAF_HOME = Path(os.environ.get("OLAF_HOME", dirs.user_data_dir)).expanduser() + +# Define standard subdirectories +DEFAULT_AGENT_DIR = OLAF_HOME / "agent_systems" +DEFAULT_DATASETS_DIR = OLAF_HOME / "datasets" + +# Define the path to the environment file for storing secrets like API keys +ENV_FILE = OLAF_HOME / ".env" + +def init_olaf_home(): + """Ensures the main OLAF directory and its subdirectories exist.""" + OLAF_HOME.mkdir(parents=True, exist_ok=True) + DEFAULT_AGENT_DIR.mkdir(exist_ok=True) + DEFAULT_DATASETS_DIR.mkdir(exist_ok=True) + +# Automatically initialize directories when this module is imported +init_olaf_home() \ No newline at end of file From 0a85f86ac22ba8d5049d864e58de1a1bc36efc1e Mon Sep 17 00:00:00 2001 From: djriffle Date: Wed, 13 Aug 2025 22:22:34 -0400 Subject: [PATCH 07/14] added llm sandbox and backend selection --- cli/olaf/src/olaf/cli/run_cli.py | 26 ++++++++++++------------ cli/olaf/src/olaf/core/ollama_wrapper.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cli/olaf/src/olaf/cli/run_cli.py b/cli/olaf/src/olaf/cli/run_cli.py index 0450c71..5ac28aa 100644 --- a/cli/olaf/src/olaf/cli/run_cli.py +++ b/cli/olaf/src/olaf/cli/run_cli.py @@ -12,7 +12,6 @@ # Import your project's modules and shared configuration from olaf.config import DEFAULT_AGENT_DIR, ENV_FILE - from olaf.agents.AgentSystem import AgentSystem from olaf.core.io_helpers import collect_resources from olaf.core.sandbox_management import (init_docker, init_singularity, init_singularity_exec) @@ -99,17 +98,17 @@ def main_run_callback( driver_agent: str = typer.Option(None, "--driver-agent", "-d", help="Name of the agent to start with."), dataset: Path = typer.Option(None, "--dataset", "-ds", help="Path to the dataset file (.h5ad).", readable=True), resources_dir: Path = typer.Option(None, "--resources", help="Path to a directory of resource files to mount.", exists=True, file_okay=False), - llm_backend: str = typer.Option("chatgpt", "--llm", help="LLM backend to use.", case_sensitive=False), + llm_backend: str = typer.Option(None, "--llm", help="LLM backend to use: 'chatgpt' or 'ollama'."), ollama_host: str = typer.Option("http://localhost:11434", "--ollama-host", help="Base URL for Ollama backend."), - sandbox: str = typer.Option(None, "--sandbox", help="Sandbox backend to use: 'docker', 'singularity', or 'singularity-exec'."), # <-- Changed default + sandbox: str = typer.Option(None, "--sandbox", help="Sandbox backend to use: 'docker', 'singularity', or 'singularity-exec'."), force_refresh: bool = typer.Option(False, "--force-refresh", help="Force refresh/rebuild of the sandbox environment."), ): load_dotenv(dotenv_path=ENV_FILE) + app_context = AppContext() console = app_context.console ctx.obj = app_context - # Steps 1, 2, and 3 are unchanged if blueprint is None: blueprint = _prompt_for_file(console, DEFAULT_AGENT_DIR, PACKAGE_AGENTS_DIR, ".json", "Agent System Blueprint") app_context.agent_system = AgentSystem.load_from_json(str(blueprint)) @@ -124,14 +123,8 @@ def main_run_callback( if dataset is None: dataset = _prompt_for_file(console, get_datasets_dir(), PACKAGE_DATASETS_DIR, ".h5ad", "Dataset") - # --- Step 4. 
Configure Sandbox (Corrected Logic) --- - # Prompt for sandbox if not provided as a flag if sandbox is None: - sandbox = Prompt.ask( - "Choose a sandbox backend", - choices=["docker", "singularity", "singularity-exec"], - default="docker" - ) + sandbox = Prompt.ask("Choose a sandbox backend", choices=["docker", "singularity", "singularity-exec"], default="docker") console.print(f"[cyan]Initializing sandbox backend: {sandbox}[/cyan]") script_dir = Path(__file__).resolve().parent @@ -145,11 +138,18 @@ def main_run_callback( SANDBOX_DATA_PATH = "/workspace/dataset.h5ad" manager_class, _, _, _, _ = init_singularity_exec(script_dir, SANDBOX_DATA_PATH, subprocess, console, force_refresh=force_refresh) else: - raise typer.BadParameter(f"Unknown sandbox type '{sandbox}'. Supported types are 'docker', 'singularity', 'singularity-exec'.") + raise typer.BadParameter(f"Unknown sandbox type '{sandbox}'.") app_context.sandbox_manager = manager_class() - # Step 5 and 6 are unchanged + # --- Step 5. Configure LLM Client (Corrected Logic) --- + if llm_backend is None: + llm_backend = Prompt.ask("Choose an LLM backend", choices=["chatgpt", "ollama"], default="chatgpt") + + # Only ask for Ollama host if it's the selected backend and the user hasn't already provided a custom host via flags. + if llm_backend == "ollama" and ollama_host == "http://localhost:11434": + ollama_host = Prompt.ask("Enter the Ollama base URL", default="http://localhost:11434") + console.print(f"[cyan]Initializing LLM backend: {llm_backend}[/cyan]") if llm_backend == "chatgpt": from openai import OpenAI diff --git a/cli/olaf/src/olaf/core/ollama_wrapper.py b/cli/olaf/src/olaf/core/ollama_wrapper.py index 1d796ce..961e598 100644 --- a/cli/olaf/src/olaf/core/ollama_wrapper.py +++ b/cli/olaf/src/olaf/core/ollama_wrapper.py @@ -22,7 +22,7 @@ class OllamaClient: print(resp.choices[0].message.content) """ - def __init__(self, host: str = "http://localhost:11434", model: str = "llama2"): + def __init__(self, host: str = "http://localhost:11434", model: str = "deepseek-r1:70b"): if not host.startswith(("http://", "https://")): # ← add host = "http://" + host self._host = host.rstrip("/") From e6157a62ad4dd39e03202833e63085c91496e89b Mon Sep 17 00:00:00 2001 From: djriffle Date: Wed, 13 Aug 2025 22:30:13 -0400 Subject: [PATCH 08/14] added sandbox startup to run --- cli/olaf/src/olaf/cli/run_cli.py | 125 +++++++++++++++---------------- 1 file changed, 62 insertions(+), 63 deletions(-) diff --git a/cli/olaf/src/olaf/cli/run_cli.py b/cli/olaf/src/olaf/cli/run_cli.py index 5ac28aa..44a8492 100644 --- a/cli/olaf/src/olaf/cli/run_cli.py +++ b/cli/olaf/src/olaf/cli/run_cli.py @@ -2,7 +2,7 @@ import os import textwrap from pathlib import Path -from typing import List, cast +from typing import List, Tuple, cast import subprocess import typer @@ -10,69 +10,50 @@ from rich.prompt import Prompt from dotenv import load_dotenv -# Import your project's modules and shared configuration from olaf.config import DEFAULT_AGENT_DIR, ENV_FILE -from olaf.agents.AgentSystem import AgentSystem +from olaf.agents.AgentSystem import Agent, AgentSystem from olaf.core.io_helpers import collect_resources from olaf.core.sandbox_management import (init_docker, init_singularity, init_singularity_exec) from olaf.execution.runner import run_agent_session, SandboxManager from olaf.datasets.czi_datasets import get_datasets_dir -# --- Define package-internal paths --- PACKAGE_ROOT = Path(__file__).resolve().parent.parent PACKAGE_AGENTS_DIR = PACKAGE_ROOT / "agents" 
PACKAGE_DATASETS_DIR = PACKAGE_ROOT / "datasets" -# --- Helper functions for interactive prompts (unchanged) --- - def _prompt_for_file( - console: Console, - user_dir: Path, - package_dir: Path, - extension: str, - prompt_title: str, + console: Console, user_dir: Path, package_dir: Path, extension: str, prompt_title: str ) -> Path: """ Generic helper to find files in both user and package directories and prompt for a selection. User files take priority over package files with the same name. """ console.print(f"[bold]Select {prompt_title}:[/bold]") - found_files = [] seen_filenames = set() - if user_dir.exists(): for file_path in sorted(list(user_dir.glob(f"**/*{extension}"))): if file_path.name not in seen_filenames: found_files.append({"path": file_path, "label": "User"}) seen_filenames.add(file_path.name) - if package_dir.exists(): for file_path in sorted(list(package_dir.glob(f"**/*{extension}"))): if file_path.name not in seen_filenames: found_files.append({"path": file_path, "label": "Package"}) seen_filenames.add(file_path.name) - if not found_files: - console.print(f"[bold red]No '{extension}' files found in your user directory ({user_dir}) or the package directory ({package_dir}).[/bold red]") + console.print(f"[bold red]No '{extension}' files found.[/bold red]") raise typer.Exit(1) - for i, file_info in enumerate(found_files, 1): console.print(f" [cyan]{i}[/cyan]: {file_info['path'].name} [yellow]({file_info['label']})[/yellow]") - choice_str = Prompt.ask("Enter the number of your choice", choices=[str(i) for i in range(1, len(found_files) + 1)]) return found_files[int(choice_str) - 1]['path'] def _prompt_for_driver(console: Console, system: AgentSystem) -> str: - """Prompts the user to select a driver agent from the loaded system.""" console.print("[bold]Select a driver agent:[/bold]") agents = list(system.agents.keys()) - driver = Prompt.ask("Enter the name of the driver agent", choices=agents, default=agents[0]) - return driver - - -# --- Typer App and Context (unchanged) --- + return Prompt.ask("Enter the name of the driver agent", choices=agents, default=agents[0]) run_app = typer.Typer( name="run", @@ -90,6 +71,10 @@ def __init__(self): self.sandbox_manager: SandboxManager | None = None self.llm_client: object | None = None self.initial_history: List[dict] | None = None + self.dataset_path: Path | None = None + self.resources: List[Tuple[Path, str]] = [] + # Store sandbox details + self.sandbox_details: dict = {} @run_app.callback(invoke_without_command=True) def main_run_callback( @@ -104,7 +89,6 @@ def main_run_callback( force_refresh: bool = typer.Option(False, "--force-refresh", help="Force refresh/rebuild of the sandbox environment."), ): load_dotenv(dotenv_path=ENV_FILE) - app_context = AppContext() console = app_context.console ctx.obj = app_context @@ -122,31 +106,29 @@ def main_run_callback( if dataset is None: dataset = _prompt_for_file(console, get_datasets_dir(), PACKAGE_DATASETS_DIR, ".h5ad", "Dataset") + app_context.dataset_path = dataset if sandbox is None: sandbox = Prompt.ask("Choose a sandbox backend", choices=["docker", "singularity", "singularity-exec"], default="docker") - + console.print(f"[cyan]Initializing sandbox backend: {sandbox}[/cyan]") script_dir = Path(__file__).resolve().parent - manager_class = None + manager_class, handle, copy_cmd, exec_endpoint, status_endpoint = (None, None, None, None, None) if sandbox == "docker": - manager_class, _, _, _, _ = init_docker(script_dir, subprocess, console, force_refresh=force_refresh) + manager_class, 
handle, copy_cmd, exec_endpoint, status_endpoint = init_docker(script_dir, subprocess, console, force_refresh=force_refresh) elif sandbox == "singularity": - manager_class, _, _, _, _ = init_singularity(script_dir, subprocess, console, force_refresh=force_refresh) + manager_class, handle, copy_cmd, exec_endpoint, status_endpoint = init_singularity(script_dir, subprocess, console, force_refresh=force_refresh) elif sandbox == "singularity-exec": SANDBOX_DATA_PATH = "/workspace/dataset.h5ad" - manager_class, _, _, _, _ = init_singularity_exec(script_dir, SANDBOX_DATA_PATH, subprocess, console, force_refresh=force_refresh) + manager_class, handle, copy_cmd, exec_endpoint, status_endpoint = init_singularity_exec(script_dir, SANDBOX_DATA_PATH, subprocess, console, force_refresh=force_refresh) else: raise typer.BadParameter(f"Unknown sandbox type '{sandbox}'.") - app_context.sandbox_manager = manager_class() + app_context.sandbox_details = {"handle": handle, "copy_cmd": copy_cmd, "is_exec_mode": sandbox == "singularity-exec"} - # --- Step 5. Configure LLM Client (Corrected Logic) --- if llm_backend is None: llm_backend = Prompt.ask("Choose an LLM backend", choices=["chatgpt", "ollama"], default="chatgpt") - - # Only ask for Ollama host if it's the selected backend and the user hasn't already provided a custom host via flags. if llm_backend == "ollama" and ollama_host == "http://localhost:11434": ollama_host = Prompt.ask("Enter the Ollama base URL", default="http://localhost:11434") @@ -160,35 +142,62 @@ def main_run_callback( else: raise typer.BadParameter(f"Unknown LLM backend '{llm_backend}'.") - resources = collect_resources(console, resources_dir) if resources_dir else [] + app_context.resources = collect_resources(console, resources_dir) if resources_dir else [] app_context.analysis_context = textwrap.dedent(f"Dataset path: **{dataset.name}**\n...") driver = app_context.agent_system.get_agent(driver_agent) - system_prompt = (app_context.roster_instructions + "\n\n" + driver.get_full_prompt() + "\n\n" + app_context.analysis_context) + system_prompt = (app_context.roster_instructions + "\n\n" + driver.get_full_prompt(app_context.agent_system.global_policy) + "\n\n" + app_context.analysis_context) app_context.initial_history = [{"role": "system", "content": system_prompt}] - -# --- Subcommands (interactive, auto) are unchanged --- +def _setup_and_run_session(context: AppContext, history: list, is_auto: bool, max_turns: int): + """Helper to start, run, and stop the sandbox session.""" + sandbox_manager = cast(SandboxManager, context.sandbox_manager) + console = context.console + + console.print("[cyan]Starting sandbox...[/cyan]") + if not sandbox_manager.start_container(): + console.print("[bold red]Failed to start sandbox container.[/bold red]") + raise typer.Exit(1) + + try: + # Data setup logic from original script + details = context.sandbox_details + dataset_path = cast(Path, context.dataset_path) + if details["is_exec_mode"] and hasattr(sandbox_manager, "set_data"): + sandbox_manager.set_data(dataset_path, context.resources) + else: + SANDBOX_DATA_PATH = "/workspace/dataset.h5ad" # Or get from context + details["copy_cmd"](str(dataset_path), f"{details['handle']}:{SANDBOX_DATA_PATH}") + for hp, cp in context.resources: + details["copy_cmd"](str(hp), f"{details['handle']}:{cp}") + + # Run the main agent loop + run_agent_session( + console=console, + agent_system=cast(AgentSystem, context.agent_system), + driver_agent=cast(AgentSystem, context.agent_system).get_agent(cast(str, 
context.driver_agent_name)), + roster_instructions=cast(str, context.roster_instructions), + analysis_context=cast(str, context.analysis_context), + llm_client=cast(object, context.llm_client), + sandbox_manager=sandbox_manager, + history=history, + is_auto=is_auto, + max_turns=max_turns + ) + finally: + console.print("[cyan]Stopping sandbox...[/cyan]") + sandbox_manager.stop_container() @run_app.command("interactive") def run_interactive(ctx: typer.Context): """Run the agent system in a manual, interactive chat session.""" context: AppContext = ctx.obj - context.console.print("\n[bold blue]🚀 Starting Interactive Mode...[/bold blue]") + console = context.console + console.print("\n[bold blue]🚀 Starting Interactive Mode...[/bold blue]") history = context.initial_history[:] history.append({"role": "user", "content": "Beginning interactive session. What is the plan?"}) - run_agent_session( - console=context.console, - agent_system=cast(AgentSystem, context.agent_system), - driver_agent=cast(AgentSystem, context.agent_system).get_agent(cast(str, context.driver_agent_name)), - roster_instructions=cast(str, context.roster_instructions), - analysis_context=cast(str, context.analysis_context), - llm_client=cast(object, context.llm_client), - sandbox_manager=cast(SandboxManager, context.sandbox_manager), - history=history, - is_auto=False - ) + _setup_and_run_session(context, history, is_auto=False, max_turns=-1) @run_app.command("auto") def run_auto( @@ -198,24 +207,14 @@ def run_auto( ): """Run the agent system automatically for a set number of turns.""" context: AppContext = ctx.obj + console = context.console if prompt is None: prompt = Prompt.ask("Enter the initial prompt for the automated run", default="Analyze this dataset.") - context.console.print(f"\n[bold green]🚀 Starting Automated Mode for {turns} turns...[/bold green]") + console.print(f"\n[bold green]🚀 Starting Automated Mode for {turns} turns...[/bold green]") history = context.initial_history[:] history.append({"role": "user", "content": prompt}) - run_agent_session( - console=context.console, - agent_system=cast(AgentSystem, context.agent_system), - driver_agent=cast(AgentSystem, context.agent_system).get_agent(cast(str, context.driver_agent_name)), - roster_instructions=cast(str, context.roster_instructions), - analysis_context=cast(str, context.analysis_context), - llm_client=cast(object, context.llm_client), - sandbox_manager=cast(SandboxManager, context.sandbox_manager), - history=history, - is_auto=True, - max_turns=turns - ) \ No newline at end of file + _setup_and_run_session(context, history, is_auto=True, max_turns=turns) \ No newline at end of file From e3baee09b369fd0b3b0d8bb2fb139836ee307e89 Mon Sep 17 00:00:00 2001 From: djriffle Date: Thu, 14 Aug 2025 16:32:30 -0400 Subject: [PATCH 09/14] fixed singularity dataset bind issue --- cli/olaf/src/olaf/cli/run_cli.py | 30 ++++++++++++++--------- cli/olaf/src/olaf/execution/runner.py | 35 ++++++++++++++------------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/cli/olaf/src/olaf/cli/run_cli.py b/cli/olaf/src/olaf/cli/run_cli.py index 44a8492..6dc005f 100644 --- a/cli/olaf/src/olaf/cli/run_cli.py +++ b/cli/olaf/src/olaf/cli/run_cli.py @@ -21,6 +21,9 @@ PACKAGE_AGENTS_DIR = PACKAGE_ROOT / "agents" PACKAGE_DATASETS_DIR = PACKAGE_ROOT / "datasets" +# This is the static path where the dataset will ALWAYS be inside the container +SANDBOX_DATA_PATH = "/workspace/dataset.h5ad" + def _prompt_for_file( console: Console, user_dir: Path, package_dir: Path, extension: 
str, prompt_title: str @@ -73,7 +76,6 @@ def __init__(self): self.initial_history: List[dict] | None = None self.dataset_path: Path | None = None self.resources: List[Tuple[Path, str]] = [] - # Store sandbox details self.sandbox_details: dict = {} @run_app.callback(invoke_without_command=True) @@ -100,7 +102,7 @@ def main_run_callback( if driver_agent is None: driver_agent = _prompt_for_driver(console, app_context.agent_system) if driver_agent not in app_context.agent_system.agents: - raise typer.BadParameter(f"Driver agent '{driver_agent}' not found in blueprint.") + raise typer.BadParameter(f"Driver agent '{driver_agent}' not found.") app_context.driver_agent_name = driver_agent app_context.roster_instructions = app_context.agent_system.get_instructions() @@ -120,7 +122,6 @@ def main_run_callback( elif sandbox == "singularity": manager_class, handle, copy_cmd, exec_endpoint, status_endpoint = init_singularity(script_dir, subprocess, console, force_refresh=force_refresh) elif sandbox == "singularity-exec": - SANDBOX_DATA_PATH = "/workspace/dataset.h5ad" manager_class, handle, copy_cmd, exec_endpoint, status_endpoint = init_singularity_exec(script_dir, SANDBOX_DATA_PATH, subprocess, console, force_refresh=force_refresh) else: raise typer.BadParameter(f"Unknown sandbox type '{sandbox}'.") @@ -143,7 +144,11 @@ def main_run_callback( raise typer.BadParameter(f"Unknown LLM backend '{llm_backend}'.") app_context.resources = collect_resources(console, resources_dir) if resources_dir else [] - app_context.analysis_context = textwrap.dedent(f"Dataset path: **{dataset.name}**\n...") + + # --- CORRECTED ANALYSIS CONTEXT --- + # Always use the static in-container path for the prompt, not the host path. + app_context.analysis_context = textwrap.dedent(f"Dataset path: **{SANDBOX_DATA_PATH}**\n...") + driver = app_context.agent_system.get_agent(driver_agent) system_prompt = (app_context.roster_instructions + "\n\n" + driver.get_full_prompt(app_context.agent_system.global_policy) + "\n\n" + app_context.analysis_context) app_context.initial_history = [{"role": "system", "content": system_prompt}] @@ -154,23 +159,24 @@ def _setup_and_run_session(context: AppContext, history: list, is_auto: bool, ma console = context.console console.print("[cyan]Starting sandbox...[/cyan]") + + # For exec mode, we must configure the mounts *before* starting. + details = context.sandbox_details + dataset_path = cast(Path, context.dataset_path) + if details["is_exec_mode"] and hasattr(sandbox_manager, "set_data"): + sandbox_manager.set_data(dataset_path, context.resources) + if not sandbox_manager.start_container(): console.print("[bold red]Failed to start sandbox container.[/bold red]") raise typer.Exit(1) try: - # Data setup logic from original script - details = context.sandbox_details - dataset_path = cast(Path, context.dataset_path) - if details["is_exec_mode"] and hasattr(sandbox_manager, "set_data"): - sandbox_manager.set_data(dataset_path, context.resources) - else: - SANDBOX_DATA_PATH = "/workspace/dataset.h5ad" # Or get from context + # For non-exec modes, we copy data *after* starting. 
+ if not details["is_exec_mode"]: details["copy_cmd"](str(dataset_path), f"{details['handle']}:{SANDBOX_DATA_PATH}") for hp, cp in context.resources: details["copy_cmd"](str(hp), f"{details['handle']}:{cp}") - # Run the main agent loop run_agent_session( console=console, agent_system=cast(AgentSystem, context.agent_system), diff --git a/cli/olaf/src/olaf/execution/runner.py b/cli/olaf/src/olaf/execution/runner.py index fd51235..1b7446b 100644 --- a/cli/olaf/src/olaf/execution/runner.py +++ b/cli/olaf/src/olaf/execution/runner.py @@ -1,4 +1,4 @@ -# olaf/testing/runner.py +# olaf/execution/runner.py from __future__ import annotations import json @@ -13,9 +13,8 @@ from rich.table import Table # --- Project-specific Imports --- -# These imports assume your project structure allows them. -# You may need to adjust them based on your final package layout. try: + from olaf.config import OLAF_HOME from olaf.agents.AgentSystem import Agent, AgentSystem from olaf.core.io_helpers import display, extract_python_code, format_execute_response except ImportError as e: @@ -24,7 +23,6 @@ # --- Type Hinting & Base Classes --- -# Define a base class for sandbox managers to ensure a consistent interface. class SandboxManager: """Abstract base class for sandbox interaction.""" def start_container(self) -> bool: @@ -38,7 +36,7 @@ def exec_code(self, code: str, timeout: int) -> dict: # --- Constants and Path Setup --- _DELEG_RE = re.compile(r"delegate_to_([A-Za-z0-9_]+)") -_OUTPUTS_DIR = Path("outputs") +_OUTPUTS_DIR = OLAF_HOME / "runs" _SNIPPET_DIR = _OUTPUTS_DIR / "snippets" _LEDGER_PATH = _OUTPUTS_DIR / f"benchmark_history_{datetime.utcnow().strftime('%Y%m%d-%H%M%S')}.jsonl" @@ -112,7 +110,6 @@ def run_benchmark( stdout = exec_result.get("stdout", "") result_dict = {} try: - # The JSON result is expected to be the last line of stdout result_dict = json.loads(stdout.strip().splitlines()[-1]) except (json.JSONDecodeError, IndexError) as e: console.print(f"[yellow]Warning: Could not parse JSON from stdout: {e}[/yellow]") @@ -155,14 +152,19 @@ def run_agent_session( ): """ Main driver for both interactive and automated agent execution sessions. - This is the core, refactored loop from your original script. 
""" from rich.prompt import Prompt _init_paths() + # --- Display the initial context provided by the CLI --- + for message in history: + role = message.get("role", "unknown") + content = message.get("content", "") + if role in ["system", "user"]: + display(console, role, content) + current_agent = driver_agent turn = 0 - turns_left = max_turns last_code_snippet: str | None = None while True: @@ -174,9 +176,8 @@ def run_agent_session( console.print(f"\n[bold]LLM call (turn {turn})…[/bold]") try: - # Assuming the llm_client has an OpenAI-compatible interface resp = llm_client.chat.completions.create( - model="gpt-4o", # This could be a parameter + model="gpt-4o", messages=history, temperature=0.7, ) @@ -196,10 +197,12 @@ def run_agent_session( console.print(f"[yellow]🔄 Routing to '{target_agent_name}' via {cmd}[/yellow]") history.append({"role": "assistant", "content": f"🔄 Routing to **{target_agent_name}** (command `{cmd}`)"}) current_agent = new_agent - - # Rebuild system prompt for the new agent system_prompt = (roster_instructions + "\n\n" + current_agent.get_full_prompt(agent_system.global_policy) + "\n\n" + analysis_context) + # We replace the last system prompt with the new one for the new agent history.insert(0, {"role": "system", "content": system_prompt}) + # Remove the old system prompt to avoid confusion + if len(history) > 1 and history[1].get("role") == "system": + history.pop(1) continue code = extract_python_code(msg) @@ -211,7 +214,6 @@ def run_agent_session( history.append({"role": "user", "content": feedback}) display(console, "user", feedback) - # --- Mode-specific logic --- if is_auto: if benchmark_modules: result_str = run_benchmark( @@ -222,9 +224,8 @@ def run_agent_session( display(console, "user", result_str) console.print(f"[yellow]Auto-continuing... 
{turn}/{max_turns} turns complete.[/yellow]") else: - # Interactive mode input loop while True: - prompt_text = "\n[bold]Next message ('benchmark' to run, 'exit' to quit)[/bold]" + prompt_text = "\n[bold]Next message ('benchmark' to run selected benchmark, 'exit' to quit)[/bold]" try: user_input = Prompt.ask(prompt_text, default="").strip() except (EOFError, KeyboardInterrupt): @@ -238,7 +239,7 @@ def run_agent_session( if benchmark_modules: for bm_module in benchmark_modules: run_benchmark(console, sandbox_manager, bm_module, is_auto=False) - continue # Re-prompt after running benchmarks + continue else: console.print("[yellow]No benchmark modules were specified at startup.[/yellow]") continue @@ -246,4 +247,4 @@ def run_agent_session( if user_input: history.append({"role": "user", "content": user_input}) display(console, "user", user_input) - break # Exit input loop and proceed to next agent turn \ No newline at end of file + break \ No newline at end of file From 194a77fb5cc05843f845b9790765582517683f27 Mon Sep 17 00:00:00 2001 From: djriffle Date: Thu, 14 Aug 2025 16:39:17 -0400 Subject: [PATCH 10/14] Fixed UX around turn and benchmark selection --- cli/olaf/src/olaf/cli/run_cli.py | 74 ++++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 13 deletions(-) diff --git a/cli/olaf/src/olaf/cli/run_cli.py b/cli/olaf/src/olaf/cli/run_cli.py index 6dc005f..ebd7442 100644 --- a/cli/olaf/src/olaf/cli/run_cli.py +++ b/cli/olaf/src/olaf/cli/run_cli.py @@ -1,13 +1,14 @@ # olaf/cli/run_cli.py import os +import re import textwrap from pathlib import Path -from typing import List, Tuple, cast +from typing import List, Tuple, cast, Optional import subprocess import typer from rich.console import Console -from rich.prompt import Prompt +from rich.prompt import Prompt, IntPrompt from dotenv import load_dotenv from olaf.config import DEFAULT_AGENT_DIR, ENV_FILE @@ -20,8 +21,8 @@ PACKAGE_ROOT = Path(__file__).resolve().parent.parent PACKAGE_AGENTS_DIR = PACKAGE_ROOT / "agents" PACKAGE_DATASETS_DIR = PACKAGE_ROOT / "datasets" +PACKAGE_AUTO_METRICS_DIR = PACKAGE_ROOT / "auto_metrics" -# This is the static path where the dataset will ALWAYS be inside the container SANDBOX_DATA_PATH = "/workspace/dataset.h5ad" @@ -54,10 +55,38 @@ def _prompt_for_file( return found_files[int(choice_str) - 1]['path'] def _prompt_for_driver(console: Console, system: AgentSystem) -> str: + """Prompts the user to select a driver agent from the loaded system.""" console.print("[bold]Select a driver agent:[/bold]") agents = list(system.agents.keys()) return Prompt.ask("Enter the name of the driver agent", choices=agents, default=agents[0]) +def _prompt_for_benchmark_module(console: Console) -> Optional[Path]: + """Finds and prompts the user to select an auto metric script.""" + console.print("[bold]Select a benchmark module (optional):[/bold]") + + # Filter out helper scripts + modules = [ + m for m in PACKAGE_AUTO_METRICS_DIR.glob("*.py") + if m.name not in ["__init__.py", "AutoMetric.py"] + ] + + if not modules: + console.print("[yellow]No benchmark modules found.[/yellow]") + return None + + for i, mod in enumerate(modules, 1): + console.print(f" [cyan]{i}[/cyan]: {mod.name}") + + console.print(f" [cyan]{len(modules) + 1}[/cyan]: Skip") + + choices = [str(i) for i in range(1, len(modules) + 2)] + choice_str = Prompt.ask("Enter the number of your choice", choices=choices, default=str(len(modules) + 1)) + choice_idx = int(choice_str) - 1 + + if choice_idx == len(modules): + return None + return modules[choice_idx] + 
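+# Usage sketch (illustrative only; `_example_select_benchmarks` is a
+# hypothetical helper, not part of the CLI): the prompt above returns a
+# single Path or None, so callers wrap the result in a list before handing
+# it to the session runner, mirroring how the run commands below consume it.
+def _example_select_benchmarks(console: Console) -> Optional[List[Path]]:
+    module = _prompt_for_benchmark_module(console)
+    return [module] if module else None
+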
run_app = typer.Typer( name="run", help="Run an agent system. Prompts for configuration if not provided via flags.", @@ -145,22 +174,19 @@ def main_run_callback( app_context.resources = collect_resources(console, resources_dir) if resources_dir else [] - # --- CORRECTED ANALYSIS CONTEXT --- - # Always use the static in-container path for the prompt, not the host path. app_context.analysis_context = textwrap.dedent(f"Dataset path: **{SANDBOX_DATA_PATH}**\n...") driver = app_context.agent_system.get_agent(driver_agent) system_prompt = (app_context.roster_instructions + "\n\n" + driver.get_full_prompt(app_context.agent_system.global_policy) + "\n\n" + app_context.analysis_context) app_context.initial_history = [{"role": "system", "content": system_prompt}] -def _setup_and_run_session(context: AppContext, history: list, is_auto: bool, max_turns: int): +def _setup_and_run_session(context: AppContext, history: list, is_auto: bool, max_turns: int, benchmark_modules: Optional[List[Path]] = None): """Helper to start, run, and stop the sandbox session.""" sandbox_manager = cast(SandboxManager, context.sandbox_manager) console = context.console console.print("[cyan]Starting sandbox...[/cyan]") - # For exec mode, we must configure the mounts *before* starting. details = context.sandbox_details dataset_path = cast(Path, context.dataset_path) if details["is_exec_mode"] and hasattr(sandbox_manager, "set_data"): @@ -171,7 +197,6 @@ def _setup_and_run_session(context: AppContext, history: list, is_auto: bool, ma raise typer.Exit(1) try: - # For non-exec modes, we copy data *after* starting. if not details["is_exec_mode"]: details["copy_cmd"](str(dataset_path), f"{details['handle']}:{SANDBOX_DATA_PATH}") for hp, cp in context.resources: @@ -187,7 +212,8 @@ def _setup_and_run_session(context: AppContext, history: list, is_auto: bool, ma sandbox_manager=sandbox_manager, history=history, is_auto=is_auto, - max_turns=max_turns + max_turns=max_turns, + benchmark_modules=benchmark_modules ) finally: console.print("[cyan]Stopping sandbox...[/cyan]") @@ -199,17 +225,27 @@ def run_interactive(ctx: typer.Context): context: AppContext = ctx.obj console = context.console console.print("\n[bold blue]🚀 Starting Interactive Mode...[/bold blue]") + + # For consistency, allow selecting benchmarks in interactive mode too + benchmark_module = _prompt_for_benchmark_module(console) history = context.initial_history[:] history.append({"role": "user", "content": "Beginning interactive session. 
What is the plan?"}) - _setup_and_run_session(context, history, is_auto=False, max_turns=-1) + _setup_and_run_session( + context, + history, + is_auto=False, + max_turns=-1, + benchmark_modules=[benchmark_module] if benchmark_module else None + ) @run_app.command("auto") def run_auto( ctx: typer.Context, - prompt: str = typer.Option(None, "--prompt", "-p", help="Initial prompt for the auto run."), - turns: int = typer.Option(3, "--turns", "-t", help="Number of turns to run automatically."), + prompt: Optional[str] = typer.Option(None, "--prompt", "-p", help="Initial prompt for the auto run."), + turns: Optional[int] = typer.Option(None, "--turns", "-t", help="Number of turns to run automatically."), + benchmark_module: Optional[Path] = typer.Option(None, "--benchmark-module", "-bm", help="Path to the auto metric script.", readable=True, exists=True), ): """Run the agent system automatically for a set number of turns.""" context: AppContext = ctx.obj @@ -218,9 +254,21 @@ def run_auto( if prompt is None: prompt = Prompt.ask("Enter the initial prompt for the automated run", default="Analyze this dataset.") + if turns is None: + turns = IntPrompt.ask("Enter the number of turns for the automated run", default=3) + + if benchmark_module is None: + benchmark_module = _prompt_for_benchmark_module(console) + console.print(f"\n[bold green]🚀 Starting Automated Mode for {turns} turns...[/bold green]") history = context.initial_history[:] history.append({"role": "user", "content": prompt}) - _setup_and_run_session(context, history, is_auto=True, max_turns=turns) \ No newline at end of file + _setup_and_run_session( + context, + history, + is_auto=True, + max_turns=turns, + benchmark_modules=[benchmark_module] if benchmark_module else None + ) \ No newline at end of file From 7bfdb6cc235c7938b67b50d37f887fb0d15f608a Mon Sep 17 00:00:00 2001 From: djriffle Date: Thu, 14 Aug 2025 16:54:37 -0400 Subject: [PATCH 11/14] Added cli README --- cli/README.md | 322 +++++----- .../output_to_notebook.py | 0 cli/olaf/src/olaf/agents/AgentSystem.py | 55 +- cli/olaf/src/olaf/cli/run_cli.py | 16 +- cli/prompt_testing/MultiAgentTester.py | 554 ----------------- cli/prompt_testing/__init__.py | 0 cli/tools/__init__.py | 0 cli/tools/czi_browser.py | 580 ------------------ 8 files changed, 201 insertions(+), 1326 deletions(-) rename cli/{tools => extra_tools}/output_to_notebook.py (100%) delete mode 100644 cli/prompt_testing/MultiAgentTester.py delete mode 100644 cli/prompt_testing/__init__.py delete mode 100644 cli/tools/__init__.py delete mode 100644 cli/tools/czi_browser.py diff --git a/cli/README.md b/cli/README.md index 47e38a2..7f81495 100644 --- a/cli/README.md +++ b/cli/README.md @@ -1,166 +1,164 @@ -# Benchmarking and Evolving Agent Prompts for Single-Cell Data Analysis - -**⚠️ Work in Progress:** This tooling is currently under development. Its primary goal is to facilitate rapid iteration, testing, evaluation, and evolution of LLM agent prompts for analyzing single-cell transcriptomics datasets using a secure code execution sandbox. - -## Overview - -This framework provides the necessary tools to: - -1. **Discover and Download Datasets:** Browse and fetch datasets (specifically from the CZI CELLxGENE Census) along with their metadata. -2. **Secure Code Execution:** Run Python code generated by an AI agent within an isolated Docker container (sandbox). The sandbox now runs a Jupyter kernel managed by a **FastAPI service** , providing a stable HTTP interface for code execution. -3. 
**Agent Interaction & Testing (`OneShotAgentTester.py`):** Orchestrate interactions between an AI agent (powered by OpenAI's API), a selected dataset, and the code execution sandbox (via the FastAPI service). Allows testing prompts with limited code execution attempts. -4. **Results Conversion (`output_to_notebook.py`):** Convert the detailed JSON logs from test runs into Jupyter Notebooks (`.ipynb`) for easier review and analysis reproduction. -5. **AI-Powered Evaluation (`evaluator.py`):** Use an LLM (like GPT-4o) to automatically evaluate the performance of the agent based on the conversation logs, assigning a grade and providing comments. -6. **Automated Prompt Evolution (`prompt_evolver.py`):** Iteratively refine an initial agent prompt based on an objective, test results, and AI evaluation feedback to automatically discover more effective prompts. - -## Components - -The framework consists of the following main components: - -* **`.env` / `make_benchmarking_env.sh`:** - * `make_benchmarking_env.sh`: An interactive script to securely prompt for and save your OpenAI API key. - * `.env`: The file (created by the script) storing the `OPENAI_KEY`. This file should be added to your `.gitignore`. -* **`tools/czi_browser.py`:** - * A CLI tool for listing CZI CELLxGENE Census versions and datasets. - * Allows downloading specific datasets (`.h5ad`) and metadata (`.json`) to the `datasets/` directory. -* **`sandbox/`:** Contains the code execution environment. - * `Dockerfile`: Defines the Docker image based on a Python base, adding necessary Python/system dependencies, Jupyter components, FastAPI, Uvicorn, and the application code. - * `requirements.txt`: Lists Python packages installed *inside* the sandbox container (e.g., `anndata`, `scanpy`, `matplotlib`). - * `kernel_api.py`: The FastAPI application running inside the container. It receives code execution requests via HTTP, interacts with a local Jupyter kernel using `jupyter_client`, captures results (stdout, stderr, errors, display data), and returns them as JSON. - * `start_kernel.py`: A simple script used internally by `start.sh` to launch the Jupyter kernel process with specific arguments (e.g., listening IP, ports). - * `start.sh`: The main startup script run by the container (managed by `tini`). It launches the Jupyter kernel in the background and then starts the Uvicorn server to run the `kernel_api.py` FastAPI app. - * `benchmarking_sandbox_management.py`: A Python script (with CLI and interactive modes) primarily used for building the sandbox image and manually starting/stopping the container (which runs the API service). Direct kernel interaction commands have been removed. -* **`datasets/`:** (Created by `czi_browser.py`) - * Stores downloaded `.h5ad` data files and `.json` metadata files. -* **`outputs/`:** (Created automatically) - * Default directory for storing JSON logs from `OneShotAgentTester.py` and `PromptEvolver.py`, evaluation results from `evaluator.py`, and potentially generated notebooks/images. -* **`OneShotAgentTester.py`:** - * Orchestrates a single test run for one or more prompts against a dataset. - * Starts the sandbox container (via `SandboxManager`). - * Copies the dataset into the running container. - * Checks if the internal API service is responsive. - * Manages the interaction loop with the OpenAI API (specified agent model). - * When the agent generates code, it sends the code to the sandbox's FastAPI `/execute` endpoint using the `requests` library. 
- * Formats the JSON response (stdout, stderr, errors, display data) from the API and feeds it back to the agent. - * Saves the full conversation log for the test run(s) to a JSON file in the `outputs/` directory. -* **`output_to_notebook.py`:** - * An interactive script that takes a results JSON file (from `OneShotAgentTester` or `PromptEvolver`) as input. - * Converts the conversation log, including code cells and their outputs (stdout, stderr, errors, display data), into a Jupyter Notebook (`.ipynb`) file. - * Saves the `.ipynb` file in the same directory as the input JSON. -* **`evaluator.py`:** - * An interactive script that processes results JSON files from a specified input directory (defaults to `outputs/`). - * For each test run in the JSON, it formats the conversation and sends it to an OpenAI model (specified evaluator model) with instructions to evaluate the agent's performance (0-100 grade and comments) based on defined criteria (e.g., correctness, efficiency, clarity). - * Saves the evaluations (grade and comments) to JSON files (either aggregated or individually) in a specified output location (defaults to the input directory). -* **`prompt_evolver.py`:** - * An orchestrator script for automatically refining prompts. - * Takes an initial prompt, an objective, a dataset, and the number of iterations. - * In each iteration: - * Runs the current prompt using the testing logic (`run_single_test_iteration`). - * Evaluates the result using the evaluation logic (`call_openai_evaluator`). - * Calls another OpenAI model (specified evolver model) to generate an improved prompt based on the objective, previous prompt, conversation summary, and evaluation feedback. - * Uses the evolved prompt for the next iteration. - * Saves a detailed log of the entire evolution process (prompts, test data, evaluations) and the final evolved prompt. -* **`requirements.txt`:** (Top-level) - * Lists Python packages required for the *host* scripts (`OneShotAgentTester.py`, `evaluator.py`, `prompt_evolver.py`, `czi_browser.py`, etc.). Key dependencies include `openai`, `python-dotenv`, `requests`, `docker`, `rich`, `nbformat`. - -## Setup - -1. **Prerequisites:** - * Python (3.10+ recommended) - * `pip` (Python package installer) - * Docker Desktop or Docker Engine (must be running) - * Git (for cloning the repository) -2. **Install Host Python Dependencies:** - * Create and activate a Python virtual environment (recommended): - ``` - python -m venv venv - source venv/bin/activate # Linux/macOS - # venv\Scripts\activate # Windows CMD - - ``` - * Install required packages for the host scripts: - ``` - pip install -r requirements.txt - - ``` -3. **Set OpenAI API Key:** - * Make the script executable: `chmod +x make_benchmarking_env.sh` - * Run the script and enter your key when prompted: `./make_benchmarking_env.sh` - * This creates the `.env` file. **Ensure `.env` is listed in your `.gitignore` file.** -4. **Prepare Sandbox Requirements:** - * Edit `sandbox/requirements.txt` to include all the additional Python packages needed *inside* the container for agent code execution (e.g., `pandas`, `numpy`, `scipy`, `scikit-learn`, `anndata`, `matplotlib`, `seaborn`). Ensure these are compatible with the base Python version in the `Dockerfile`. - -## Usage - -1. **Download a Dataset:** - * Use the `tools/czi_browser.py` script (run `python tools/czi_browser.py` for interactive mode) to find and download a dataset to the `datasets/` directory. -2. 
**Test a Prompt (`OneShotAgentTester.py`):**
-   * Run the script: `python OneShotAgentTester.py`
-   * Follow prompts to select the prompt source (paste, file, folder), dataset, and max code attempts.
-   * The script starts the sandbox, runs the test(s) by communicating with the internal API, and saves the results to a JSON file in `outputs/`.
-3. **Convert Results to Notebook (`output_to_notebook.py`):**
-   * Run the script: `python output_to_notebook.py`
-   * Enter the path to a results JSON file (e.g., `outputs/benchmark_results_....json`).
-   * An `.ipynb` file will be generated in the same directory.
-4. **Evaluate Results (`evaluator.py`):**
-   * Run the script: `python evaluator.py`
-   * Enter the path to the folder containing results JSON files (defaults to `outputs/`).
-   * Enter the desired output location for evaluation files.
-   * The script calls OpenAI to evaluate each test run and saves the grades/comments.
-5. **Evolve a Prompt (`prompt_evolver.py`):**
-   * Run the script: `python prompt_evolver.py`
-   * Enter the overall objective for the prompt.
-   * Provide the initial prompt (paste or file path).
-   * Select the dataset.
-   * Enter the number of evolution iterations.
-   * Specify the output directory for logs.
-   * The script runs the test-evaluate-evolve loop and saves the full log and the final prompt.
-6. **Manage Sandbox Manually (Optional):**
-   * Use `sandbox/benchmarking_sandbox_management.py` for basic container control:
-     * Build image: `python sandbox/benchmarking_sandbox_management.py build`
-     * Start container (API): `python sandbox/benchmarking_sandbox_management.py start`
-     * Check status: `python sandbox/benchmarking_sandbox_management.py status`
-     * View logs: `python sandbox/benchmarking_sandbox_management.py logs [N]`
-     * Stop container: `python sandbox/benchmarking_sandbox_management.py stop`
-     * Run interactively: `python sandbox/benchmarking_sandbox_management.py`
-
-## File Structure (Updated)
+# OLAF CLI: The Open-source Language Agent Framework 🚀
+**The OLAF CLI is a powerful command-line interface for building, testing, and running sandboxed, multi-agent AI systems.**
+
+It provides a robust framework for orchestrating multiple language agents that can collaborate to perform complex tasks, such as data analysis, in a secure and isolated environment.
+
+At its core, OLAF allows you to define a team of specialized AI agents in a simple JSON "blueprint." You can then deploy this team into a secure sandbox (powered by Docker or Singularity) with a specific dataset and give them a high-level task to solve.
+
+## \#\# Key Features
+
+  * **Multi-Agent Blueprints:** Define agents, their specialized prompts, and how they delegate tasks to each other using a simple JSON configuration (see the sketch just after this list).
+  * **Secure Sandboxing:** Execute agent-generated code in an isolated environment using **Docker** or **Singularity** to protect your host system.
+  * **Interactive & Automated Modes:** Run agent systems in a turn-by-turn interactive chat for debugging or in a fully automated mode for benchmarking.
+  * **Data Curation:** Includes tools to browse and download single-cell datasets from the CZI CELLxGENE Census to easily test your agents.
+  * **Configuration Management:** Easily manage API keys and application settings with built-in commands.
+  * **User-Friendly CLI:** A guided, interactive experience helps you configure every run, with flags available to override settings for use in scripts.
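+The blueprint format is easiest to see by example. The sketch below is illustrative rather than a shipped file: the `agents`, `prompt`, `neighbors`, and `code_samples` keys match what `AgentSystem.load_from_json` reads, while the `global_policy` key and the per-command `target_agent` and `description` fields are assumed names.
+
+```python
+import json
+
+from olaf.agents.AgentSystem import AgentSystem
+
+# Minimal two-agent blueprint: a planner that can delegate to a coder.
+blueprint = {
+    "global_policy": "Explain your plan before writing code.",  # assumed key name
+    "agents": {
+        "planner": {
+            "prompt": "You plan the analysis and delegate coding work.",
+            "neighbors": {
+                # Command names follow the delegate_to_<agent> convention.
+                "delegate_to_coder": {
+                    "target_agent": "coder",
+                    "description": "Hand off code-writing tasks.",  # assumed field
+                },
+            },
+            "code_samples": [],
+        },
+        "coder": {
+            "prompt": "You write and run Python against the dataset in the sandbox.",
+            "neighbors": {},
+            "code_samples": ["load_adata.py"],  # example filename; missing files are skipped with a warning
+        },
+    },
+}
+
+# Write the blueprint to disk, then load it back the way the CLI does.
+with open("my_system.json", "w") as f:
+    json.dump(blueprint, f, indent=2)
+
+system = AgentSystem.load_from_json("my_system.json")
+print(list(system.agents))  # ['planner', 'coder']
+```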
+
+## \#\# Installation
+
+### \#\#\# Prerequisites
+
+Before installing OLAF, you need to have the following installed and configured on your system:
+
+1. **Python** (version 3.9 or higher)
+2. **Pip** (Python's package installer)
+3. **A Sandbox Backend:**
+    * **Docker:** Must be installed and the Docker daemon must be running.
+    * **Singularity (Apptainer):** Must be installed on your system.
+
+### \#\#\# Install from PyPI (Recommended)
+Coming soon!
+
+### \#\#\# Install from Source (For Developers)
+
+To install the latest development version, you can clone the repository and install it in editable mode:
+
+```bash
+git clone https://github.com/OpenTechBio/olaf
+cd olaf
+pip install -e .
+```
-```
-benchmarking/
-├── sandbox/
-│   ├── Dockerfile
-│   ├── kernel_api.py                      # FastAPI application
-│   ├── start_kernel.py                    # Script to launch kernel
-│   ├── start.sh                           # Container startup script (kernel + API)
-│   ├── requirements.txt                   # Requirements for INSIDE the container
-│   └── benchmarking_sandbox_management.py # Simplified manager
-│
-├── datasets/                              # Created by czi_browser.py download
-│   └── .h5ad
-│   └── .json
-│   └── ...
-│
-├── outputs/                               # Default location for results/logs/notebooks
-│   └── benchmark_results_*.json
-│   └── benchmark_results_*.ipynb
-│   └── *_eval.json
-│   └── evolution_log_*.json
-│   └── final_prompt_*.txt
-│   └── output_image_*.png
-│   └── ...
-│
-├── tools/
-│   └── czi_browser.py
-│
-├── make_benchmarking_env.sh               # Used to make the .env file
-├── OneShotAgentTester.py                  # Runs agent tests via API
-├── output_to_notebook.py                  # Converts results JSON to Notebook
-├── evaluator.py                           # Evaluates test results using AI
-├── prompt_evolver.py                      # Orchestrates prompt evolution loop
-├── requirements.txt                       # Requirements for HOST scripts (this file)
-└── README.md                              # This file
-└── .env                                   # Stores API key (add to .gitignore)
-└── .gitignore                             # Should include .env, venv/, __pycache__, outputs/, datasets/
-```
+-----
+
+## \#\# 🚀 Quick Start Guide
+
+This guide will walk you through setting up your API key, downloading a dataset, and launching your first interactive agent session in just a few steps.
+
+### \#\#\# Step 1: Configure Your API Key
+
+First, tell OLAF about your OpenAI API key. This is a one-time setup.
+
+```bash
+olaf config set-openai-key "sk-YourSecretKeyGoesHere"
+```
+
+Your key will be stored securely in a local `.env` file within the OLAF configuration directory.
+
+### \#\#\# Step 2: Download a Dataset
+
+Next, let's get some data for our agents to analyze. Run the `datasets` command to browse and download a sample dataset from the CZI CELLxGENE Census.
+
+```bash
+# This will start the interactive dataset browser
+olaf datasets
+```
+
+Follow the prompts to list versions and datasets, then use the `download` command as instructed.
+
+### \#\#\# Step 3: Run an Agent System\!
+
+Now you're ready to run an agent system. The `run` command is fully interactive if you don't provide any flags. It will guide you through selecting a blueprint, a dataset, and a sandbox environment.
+
+```bash
+olaf run interactive
+```
+
+This will trigger a series of prompts:
+
+1. **Select Agent System Blueprint:** Choose one of the default systems (from the Package) or one you've created (from User).
+2. **Select a driver agent:** Choose which agent in the system will receive the first instruction.
+3. **Select Dataset:** Pick the dataset you downloaded in Step 2.
+4. **Choose a sandbox backend:** Select `docker` or `singularity`.
+5. **Choose an LLM backend:** Select `chatgpt` or `ollama`.
+
+After configuration, the session will begin, and you can start giving instructions to your agent team\!
+
+-----
+
+## \#\# Command Reference
+
+OLAF's commands are organized into logical groups.
+
+### \#\#\# `olaf run`
+
+The main command for executing an agent system.
+
+  * **Run interactively (recommended for manual use):**
+    ```bash
+    olaf run interactive
+    ```
+  * **Run automatically for 5 turns:**
+    ```bash
+    olaf run auto --turns 5 --prompt "Analyze this dataset and generate a UMAP plot."
+    ```
+  * **Run with all options specified (for scripting):**
+    ```bash
+    olaf run interactive \
+      --blueprint ~/.local/share/olaf/agent_systems/my_custom_system.json \
+      --driver-agent data_analyst \
+      --dataset ~/.local/share/olaf/datasets/my_data.h5ad \
+      --sandbox docker \
+      --llm chatgpt
+    ```
+
+### \#\#\# `olaf create-system`
+
+Tools for building new agent system blueprints.
+
+  * **Start the interactive builder:**
+    ```bash
+    olaf create-system
+    ```
+  * **Create a minimal blueprint quickly:**
+    ```bash
+    olaf create-system quick --name my-first-system
+    ```
+
+### \#\#\# `olaf datasets`
+
+Tools for managing datasets.
+
+  * **Start the interactive dataset browser:**
+    ```bash
+    olaf datasets
+    ```
+  * **Download a specific dataset directly:**
+    ```bash
+    olaf datasets download --version stable --dataset-id "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+    ```
+
+### \#\#\# `olaf config`
+
+Manage your OLAF configuration.
+
+  * **Set your OpenAI API key:**
+    ```bash
+    olaf config set-openai-key "sk-..."
+    ```
+
+-----
+
+## \#\# Configuration
+
+OLAF stores all user-generated content and configuration in a central directory. You can override this location by setting the `OLAF_HOME` environment variable.
+
+  * **Default Location:**
+      * **Linux:** `~/.local/share/olaf/`
+      * **macOS:** `~/Library/Application Support/olaf/`
+      * **Windows:** `C:\Users\<username>\AppData\Local\OpenTechBio\olaf\`
+  * **Configuration File:** API keys are stored in `$OLAF_HOME/.env`.
+  * **Agent Systems:** Custom blueprints are saved to `$OLAF_HOME/agent_systems/`.
+  * **Datasets:** Downloaded datasets are stored in `$OLAF_HOME/datasets/`.
+  * **Run Outputs:** Code snippets and logs from agent runs are saved to `$OLAF_HOME/runs/`.
\ No newline at end of file
diff --git a/cli/tools/output_to_notebook.py b/cli/extra_tools/output_to_notebook.py
similarity index 100%
rename from cli/tools/output_to_notebook.py
rename to cli/extra_tools/output_to_notebook.py
diff --git a/cli/olaf/src/olaf/agents/AgentSystem.py b/cli/olaf/src/olaf/agents/AgentSystem.py
index 2219f0b..8d6f915 100644
--- a/cli/olaf/src/olaf/agents/AgentSystem.py
+++ b/cli/olaf/src/olaf/agents/AgentSystem.py
@@ -2,7 +2,15 @@
 from typing import Dict, Optional
 from pathlib import Path
 
-CODE_SAMPLES_DIR = Path("cli/code_samples")
+# Import the central OLAF_HOME path from our config module
+from olaf.config import OLAF_HOME
+
+# 1. The user-specific directory (for custom samples)
+USER_CODE_SAMPLES_DIR = OLAF_HOME / "code_samples"
+USER_CODE_SAMPLES_DIR.mkdir(exist_ok=True) # Ensure it exists
+
+# 2.
The package-internal directory (for default samples), found relative to this file +PACKAGE_CODE_SAMPLES_DIR = Path(__file__).resolve().parent.parent / "code_samples" class Command: @@ -19,7 +27,6 @@ def __repr__(self) -> str: class Agent: """Represents a single agent in the system.""" - # Updated to accept a dictionary of loaded code samples def __init__(self, name: str, prompt: str, commands: Dict[str, Command], code_samples: Dict[str, str]): self.name = name self.prompt = prompt @@ -27,7 +34,6 @@ def __init__(self, name: str, prompt: str, commands: Dict[str, Command], code_sa self.code_samples = code_samples def __repr__(self) -> str: - # Updated to show if code samples are loaded sample_keys = list(self.code_samples.keys()) return f"Agent(name='{self.name}', commands={list(self.commands.keys())}, samples={sample_keys})" @@ -67,11 +73,10 @@ def __init__(self, global_policy: str, agents: Dict[str, Agent]): @classmethod def load_from_json(cls, file_path: str) -> 'AgentSystem': """ - Parses the JSON blueprint, reads code sample files from disk, - and builds the AgentSystem data structure. + Parses the JSON blueprint, reads code sample files from disk from both user + and package locations, and builds the AgentSystem data structure. """ print(f"Loading agent system from: {file_path}") - blueprint_path = Path(file_path).parent with open(file_path, 'r') as f: config = json.load(f) @@ -79,7 +84,6 @@ def load_from_json(cls, file_path: str) -> 'AgentSystem': agents: Dict[str, Agent] = {} for agent_name, agent_data in config.get('agents', {}).items(): - # --- Load Commands (unchanged) --- commands: Dict[str, Command] = {} for cmd_name, cmd_data in agent_data.get('neighbors', {}).items(): commands[cmd_name] = Command( @@ -89,29 +93,38 @@ def load_from_json(cls, file_path: str) -> 'AgentSystem': ) loaded_samples: Dict[str, str] = {} - # Get the list of filenames from the JSON, e.g., ["load_data.py", "plot.py"] sample_filenames = agent_data.get('code_samples', []) if sample_filenames: print(f" Loading code samples for '{agent_name}'...") for filename in sample_filenames: - try: - # Construct the full path to the sample file - sample_path = CODE_SAMPLES_DIR / filename - # Read the file content and store it in the dictionary - loaded_samples[filename] = sample_path.read_text(encoding="utf-8") - print(f" ✅ Loaded {filename}") - except FileNotFoundError: - print(f" ❌ WARNING: Code sample file not found and will be skipped: {sample_path}") - except Exception as e: - print(f" ❌ ERROR: Could not read code sample file {sample_path}: {e}") - - # --- Create Agent with loaded samples --- + user_path = USER_CODE_SAMPLES_DIR / filename + package_path = PACKAGE_CODE_SAMPLES_DIR / filename + + # Default to package path, but overwrite if user path exists + path_to_load = None + source_label = "" + if user_path.exists(): + path_to_load = user_path + source_label = "User" + elif package_path.exists(): + path_to_load = package_path + source_label = "Package" + + if path_to_load: + try: + loaded_samples[filename] = path_to_load.read_text(encoding="utf-8") + print(f" ✅ Loaded {filename} (from {source_label})") + except Exception as e: + print(f" ❌ ERROR: Could not read code sample file {path_to_load}: {e}") + else: + print(f" ❌ WARNING: Code sample file '{filename}' not found in any location.") + agent = Agent( name=agent_name, prompt=agent_data['prompt'], commands=commands, - code_samples=loaded_samples # Pass the dictionary of loaded code + code_samples=loaded_samples ) agents[agent_name] = agent diff --git 
a/cli/olaf/src/olaf/cli/run_cli.py b/cli/olaf/src/olaf/cli/run_cli.py index ebd7442..82693c2 100644 --- a/cli/olaf/src/olaf/cli/run_cli.py +++ b/cli/olaf/src/olaf/cli/run_cli.py @@ -14,7 +14,7 @@ from olaf.config import DEFAULT_AGENT_DIR, ENV_FILE from olaf.agents.AgentSystem import Agent, AgentSystem from olaf.core.io_helpers import collect_resources -from olaf.core.sandbox_management import (init_docker, init_singularity, init_singularity_exec) +from olaf.core.sandbox_management import (init_docker, init_singularity_exec) from olaf.execution.runner import run_agent_session, SandboxManager from olaf.datasets.czi_datasets import get_datasets_dir @@ -64,7 +64,6 @@ def _prompt_for_benchmark_module(console: Console) -> Optional[Path]: """Finds and prompts the user to select an auto metric script.""" console.print("[bold]Select a benchmark module (optional):[/bold]") - # Filter out helper scripts modules = [ m for m in PACKAGE_AUTO_METRICS_DIR.glob("*.py") if m.name not in ["__init__.py", "AutoMetric.py"] @@ -116,7 +115,7 @@ def main_run_callback( resources_dir: Path = typer.Option(None, "--resources", help="Path to a directory of resource files to mount.", exists=True, file_okay=False), llm_backend: str = typer.Option(None, "--llm", help="LLM backend to use: 'chatgpt' or 'ollama'."), ollama_host: str = typer.Option("http://localhost:11434", "--ollama-host", help="Base URL for Ollama backend."), - sandbox: str = typer.Option(None, "--sandbox", help="Sandbox backend to use: 'docker', 'singularity', or 'singularity-exec'."), + sandbox: str = typer.Option(None, "--sandbox", help="Sandbox backend to use: 'docker' or 'singularity'."), force_refresh: bool = typer.Option(False, "--force-refresh", help="Force refresh/rebuild of the sandbox environment."), ): load_dotenv(dotenv_path=ENV_FILE) @@ -140,7 +139,7 @@ def main_run_callback( app_context.dataset_path = dataset if sandbox is None: - sandbox = Prompt.ask("Choose a sandbox backend", choices=["docker", "singularity", "singularity-exec"], default="docker") + sandbox = Prompt.ask("Choose a sandbox backend", choices=["docker", "singularity"], default="docker") console.print(f"[cyan]Initializing sandbox backend: {sandbox}[/cyan]") script_dir = Path(__file__).resolve().parent @@ -149,13 +148,13 @@ def main_run_callback( if sandbox == "docker": manager_class, handle, copy_cmd, exec_endpoint, status_endpoint = init_docker(script_dir, subprocess, console, force_refresh=force_refresh) elif sandbox == "singularity": - manager_class, handle, copy_cmd, exec_endpoint, status_endpoint = init_singularity(script_dir, subprocess, console, force_refresh=force_refresh) - elif sandbox == "singularity-exec": + # This now correctly maps to the 'singularity-exec' implementation manager_class, handle, copy_cmd, exec_endpoint, status_endpoint = init_singularity_exec(script_dir, SANDBOX_DATA_PATH, subprocess, console, force_refresh=force_refresh) else: - raise typer.BadParameter(f"Unknown sandbox type '{sandbox}'.") + raise typer.BadParameter(f"Unknown sandbox type '{sandbox}'. 
Supported: 'docker', 'singularity'.") app_context.sandbox_manager = manager_class() - app_context.sandbox_details = {"handle": handle, "copy_cmd": copy_cmd, "is_exec_mode": sandbox == "singularity-exec"} + # This check now correctly identifies the exec-style singularity backend + app_context.sandbox_details = {"handle": handle, "copy_cmd": copy_cmd, "is_exec_mode": sandbox == "singularity"} if llm_backend is None: llm_backend = Prompt.ask("Choose an LLM backend", choices=["chatgpt", "ollama"], default="chatgpt") @@ -226,7 +225,6 @@ def run_interactive(ctx: typer.Context): console = context.console console.print("\n[bold blue]🚀 Starting Interactive Mode...[/bold blue]") - # For consistency, allow selecting benchmarks in interactive mode too benchmark_module = _prompt_for_benchmark_module(console) history = context.initial_history[:] diff --git a/cli/prompt_testing/MultiAgentTester.py b/cli/prompt_testing/MultiAgentTester.py deleted file mode 100644 index adcf003..0000000 --- a/cli/prompt_testing/MultiAgentTester.py +++ /dev/null @@ -1,554 +0,0 @@ -#!/usr/bin/env python3 -""" -Interactive and Auto Agent System Tester (v1.4-refactored) -========================================================= -This script combines two execution modes: -- Interactive Mode: A standard chat-like interface for manual testing. -- Automated Mode: Runs the agent with a given prompt for a set number of turns - for benchmarking purposes. - -Use the --auto flag to enable automated mode. -This version has been refactored to reduce code duplication. -""" -from __future__ import annotations - -import argparse -import base64 -import json -import os -import re -import subprocess -import sys -import textwrap -import time -from datetime import datetime -from pathlib import Path -from typing import Dict, List, Optional, Tuple - -from rich.prompt import Prompt -from rich.table import Table -# -- Pick LLM backend --------------------------------------------------- -from rich.prompt import Prompt -BACKEND_CHOICE = Prompt.ask( - "LLM backend", - choices=["chatgpt", "ollama"], - default="chatgpt", -) -OLLAMA_HOST = "http://localhost:11434" -if BACKEND_CHOICE == "ollama": - OLLAMA_HOST = Prompt.ask( - "Ollama base URL", - default="http://localhost:11434", - ) -# ── Dependencies ------------------------------------------------------------ -try: - from dotenv import load_dotenv - - if BACKEND_CHOICE == "ollama": - from cli.core.ollama_wrapper import OllamaClient as OpenAI - APIError = Exception # Ollama does not have a specific APIError - else: - from openai import APIError, OpenAI - - import requests - from rich.console import Console -except ImportError as e: - print(f"Missing dependency: {e}", file=sys.stderr) - sys.exit(1) - -# ── Agent framework --------------------------------------------------------- -try: - from cli.agents.AgentSystem import Agent, AgentSystem -except ImportError: - print("[ERROR] Could not import backend.agents.agent_system", file=sys.stderr) - raise - -# ── Local helpers ----------------------------------------------------------- -from cli.core.io_helpers import ( - collect_resources, - display, - extract_python_code, - format_execute_response, - get_initial_prompt, - load_bp_json, - select_dataset, -) -from cli.core.sandbox_management import ( - init_docker, - init_singularity, - init_singularity_exec, -) - -console = Console() -SCRIPT_DIR = Path(__file__).resolve().parent -PARENT_DIR = SCRIPT_DIR.parent -DATASETS_DIR = PARENT_DIR / "datasets" -OUTPUTS_DIR = PARENT_DIR / "outputs" -ENV_FILE = PARENT_DIR / 
".env" - -SANDBOX_DATA_PATH = "/workspace/dataset.h5ad" -SANDBOX_RESOURCES_DIR = "/workspace/resources" - -# ── Benchmark persistence -------------------------------------------------- -timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S") -_LEDGER_PATH = OUTPUTS_DIR / f"benchmark_history_{timestamp}.jsonl" -_SNIPPET_DIR = OUTPUTS_DIR / "snippets" -_SNIPPET_DIR.mkdir(exist_ok=True, parents=True) -_LEDGER_PATH.parent.mkdir(exist_ok=True, parents=True) - -# =========================================================================== -# 1 · Backend selection -# =========================================================================== -backend = Prompt.ask( - "Choose sandbox backend", - choices=["docker", "singularity", "singularity-exec"], - default="docker", -) -force_refresh = ( - Prompt.ask("Force refresh environment?", choices=["y", "n"], default="n").lower() == "y" -) -is_exec_mode = backend == "singularity-exec" - -if backend == "docker": - ( - _BackendManager, - _SANDBOX_HANDLE, - COPY_CMD, - EXECUTE_ENDPOINT, - STATUS_ENDPOINT, - ) = init_docker(SCRIPT_DIR, subprocess, console, force_refresh) - SANDBOX_DATA_PATH = "dataset.h5ad" -elif backend == "singularity": - ( - _BackendManager, - _SANDBOX_HANDLE, - COPY_CMD, - EXECUTE_ENDPOINT, - STATUS_ENDPOINT, - ) = init_singularity(SCRIPT_DIR, subprocess, console, force_refresh) -elif backend == "singularity-exec": - ( - _BackendManager, - _SANDBOX_HANDLE, - COPY_CMD, - EXECUTE_ENDPOINT, - STATUS_ENDPOINT, - ) = init_singularity_exec( - SCRIPT_DIR, SANDBOX_DATA_PATH, subprocess, console, force_refresh - ) -else: - console.print("[red]Unknown backend.") - sys.exit(1) - - -# =========================================================================== -# 2 · Common Helpers -# =========================================================================== -def load_agent_system() -> Tuple[AgentSystem, Agent, str]: - """Load the agent system from a JSON blueprint.""" - bp = load_bp_json(console) - if not bp.exists(): - console.print(f"[red]Blueprint {bp} not found.") - sys.exit(1) - system = AgentSystem.load_from_json(str(bp)) - driver_name = Prompt.ask( - "Driver agent", - choices=list(system.agents.keys()), - default=list(system.agents)[0], - ) - driver = system.get_agent(driver_name) - instr = system.get_instructions() - return system, driver, instr - - -_DELEG_RE = re.compile(r"delegate_to_([A-Za-z0-9_]+)") - - -def detect_delegation(msg: str) -> Optional[str]: - """Return the *full* command name (e.g. 
'delegate_to_coder') if present.""" - m = _DELEG_RE.search(msg) - return f"delegate_to_{m.group(1)}" if m else None - - -def api_alive(url: str, tries: int = 10) -> bool: - """Check if the API is responsive.""" - if is_exec_mode: - return True - for _ in range(tries): - try: - if requests.get(url, timeout=2).json().get("status") == "ok": - return True - except Exception: - time.sleep(1.5) - return False - - -def _dump_code_snippet(run_id: str, code: str) -> str: - """Write .py under outputs/snippets/ and return the relative path.""" - snippet_path = _SNIPPET_DIR / f"{run_id}.py" - snippet_path.write_text(code, encoding="utf-8") - return str(snippet_path.relative_to(OUTPUTS_DIR)) - - -def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str | None): - """Append a JSONL record for the benchmark run.""" - record = { - "ts": datetime.utcnow().isoformat(timespec="seconds") + "Z", - "run": run_id, - "dataset": meta.get("name"), - "results": results, - } - if code: - record["code_path"] = _dump_code_snippet(run_id, code) - with _LEDGER_PATH.open("a") as fh: - fh.write(json.dumps(record) + "\n") - - -# =========================================================================== -# 3 · Unified Benchmark Runner -# =========================================================================== -def run_benchmark( - mgr, - benchmark_module: Path, - *, - is_auto: bool, - metadata: Optional[Dict] = None, - agent_name: Optional[str] = None, - code_snippet: Optional[str] = None, -) -> str: - """ - Execute a benchmark module. - In auto mode, saves results and returns a result string for the history. - In interactive mode, just prints results to the console. - """ - console.print( - f"\n[bold cyan]Running benchmark module: {benchmark_module.name}[/bold cyan]" - ) - autometric_base_path = benchmark_module.parent / "AutoMetric.py" - try: - with open(autometric_base_path, "r") as f: - autometric_code = f.read() - with open(benchmark_module, "r") as f: - benchmark_code = f.read() - except FileNotFoundError: - err = f"Benchmark module not found at: {benchmark_module}" - console.print(f"[red]{err}[/red]") - return err if is_auto else "" - - code_to_execute = f""" -# --- Code from AutoMetric.py --- -{autometric_code} -# --- Code from {benchmark_module.name} --- -{benchmark_code} -""" - console.print("[cyan]Executing benchmark code...[/cyan]") - try: - if is_exec_mode: - exec_result = mgr.exec_code(code_to_execute, timeout=300) - else: - exec_result = requests.post( - EXECUTE_ENDPOINT, json={"code": code_to_execute, "timeout": 300}, timeout=310 - ).json() - - table = Table(title="Benchmark Results") - table.add_column("Metric", style="cyan") - table.add_column("Value", style="magenta") - stdout = exec_result.get("stdout", "") - result_dict = {} - try: - result_dict = json.loads(stdout.strip().splitlines()[-1]) - except (json.JSONDecodeError, IndexError) as e: - console.print(f"[yellow]Warning: Could not parse JSON from stdout: {e}[/yellow]") - - if exec_result.get("status") == "ok" and isinstance(result_dict, dict): - for key, value in result_dict.items(): - table.add_row(str(key), str(value)) - if is_auto: - _save_benchmark_record( - run_id=f"{benchmark_module.stem}:{agent_name}:{int(time.time())}", - results=result_dict, - meta=metadata, - code=code_snippet, - ) - else: - table.add_row("Error", exec_result.get("stderr") or "An unknown error occurred.") - console.print(table) - - if is_auto: - return "Benchmark results:\n" + json.dumps(result_dict or {"error": "see console"}) - except Exception 
as exc: - err_msg = f"Benchmark execution error: {exc}" - console.print(f"[red]{err_msg}[/red]") - if is_auto: - return err_msg - return "" - - -# =========================================================================== -# 4 · Unified Main Execution Loop -# =========================================================================== -def run( - agent_system: AgentSystem, - agent: Agent, - roster_instr: str, - dataset: Path, - metadata: dict, - resources: List[Tuple[Path, str]], - *, - is_auto: bool, - initial_user_message: str, - benchmark_modules: Optional[List[Path]] = None, - tries: int = 1, -): - """Main driver for both interactive and automated execution.""" - last_code_snippet: str | None = None - mgr = _BackendManager() - console.print(f"Launching sandbox ({backend})…") - - if is_exec_mode and hasattr(mgr, "set_data"): - mgr.set_data(dataset, resources) - if not mgr.start_container(): - console.print("[red]Failed to start sandbox") - return - if not api_alive(STATUS_ENDPOINT): - console.print("[red]Kernel API not responsive.") - return - - if not is_exec_mode: - COPY_CMD(str(dataset), f"{_SANDBOX_HANDLE}:{SANDBOX_DATA_PATH}") - for hp, cp in resources: - COPY_CMD(str(hp), f"{_SANDBOX_HANDLE}:{cp}") - - res_lines = [f"- {c} (from {h})" for h, c in resources] or ["- (none)"] - analysis_ctx = textwrap.dedent( - f"Dataset path: **{SANDBOX_DATA_PATH}**\nResources:\n" - + "\n".join(res_lines) - + "\n\nMetadata:\n" - + json.dumps(metadata, indent=2) - ) - - def build_system(a: Agent) -> str: - return ( - roster_instr - + "\n\n" - + a.get_full_prompt(agent_system.global_policy) - + "\n\n" - + analysis_ctx - ) - - history = [{"role": "system", "content": build_system(agent)}] - history.append({"role": "user", "content": initial_user_message}) - display(console, "system", history[0]["content"]) - display(console, "user", initial_user_message) - - if BACKEND_CHOICE == "chatgpt": - openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) - else: - openai = OpenAI(host=OLLAMA_HOST, model="deepseek-r1:70b") - - current_agent = agent - turn = 0 - tries_left = tries - - while True: - turn += 1 - console.print(f"\n[bold]OpenAI call (turn {turn})…") - try: - resp = openai.chat.completions.create( - model="gpt-4o", messages=history, temperature=0.7 - ) - except APIError as e: - console.print(f"[red]OpenAI error: {e}") - break - msg = resp.choices[0].message.content - history.append({"role": "assistant", "content": msg}) - display(console, f"assistant ({current_agent.name})", msg) - - cmd = detect_delegation(msg) - if cmd and cmd in current_agent.commands: - tgt = current_agent.commands[cmd].target_agent - new_agent = agent_system.get_agent(tgt) - if new_agent: - console.print(f"[yellow]🔄 Routing to '{tgt}' via {cmd}") - history.append( - {"role": "assistant", "content": f"🔄 Routing to **{tgt}** (command `{cmd}`)"} - ) - if new_agent.code_samples: - sample_context = "Here are some relevant code samples for your task:" - for filename, code_content in new_agent.code_samples.items(): - sample_context += f"\n\n--- Sample from: {filename} ---\n" - sample_context += f"```python\n{code_content.strip()}\n```" - history.append({"role": "user", "content": sample_context}) - display(console, "user", sample_context) - current_agent = new_agent - history.insert(0, {"role": "system", "content": build_system(new_agent)}) - continue - - code = extract_python_code(msg) - if code: - last_code_snippet = code - console.print("[cyan]Executing code…[/cyan]") - try: - if is_exec_mode: - exec_result = mgr.exec_code(code, 
timeout=300) - else: - exec_result = requests.post( - EXECUTE_ENDPOINT, json={"code": code, "timeout": 300}, timeout=310 - ).json() - feedback = format_execute_response(exec_result, OUTPUTS_DIR) - except Exception as exc: - feedback = f"Code execution result:\n[Execution error on host: {exc}]" - history.append({"role": "user", "content": feedback}) - display(console, "user", feedback) - - # --- Mode-specific logic --- - if is_auto: - if benchmark_modules: # In auto mode, this is a list with 0 or 1 module - result_str = run_benchmark( - mgr, - benchmark_modules[0], - is_auto=True, - metadata=metadata, - agent_name=current_agent.name, - code_snippet=last_code_snippet, - ) - history.append({"role": "user", "content": result_str}) - display(console, "user", result_str) - - tries_left -= 1 - if tries_left <= 0: - console.print("[bold green]Auto run finished.[/bold green]") - break - history.append({"role": "user", "content": ""}) # Auto-continue - else: - # Interactive mode input loop - while True: - prompt_text = ( - "\n[bold]Next message (blank = continue, 'benchmark' to run, 'exit' to quit):[/bold]" - if benchmark_modules - else "\n[bold]Next message (blank = continue, 'exit' to quit):[/bold]" - ) - try: - user_input = Prompt.ask(prompt_text, default="").strip() - except (EOFError, KeyboardInterrupt): - user_input = "exit" - - if user_input.lower() in {"exit", "quit"}: - console.print("Stopping sandbox…") - mgr.stop_container() - return # Exit the entire run function - - if user_input.lower() == "benchmark": - if benchmark_modules: - for bm_module in benchmark_modules: - run_benchmark(mgr, bm_module, is_auto=False) - continue # Re-prompt after running benchmarks - else: - console.print("[yellow]No benchmark modules selected at startup.[/yellow]") - continue - - if user_input: - history.append({"role": "user", "content": user_input}) - display(console, "user", user_input) - break # Exit input loop and proceed to next agent turn - - console.print("Stopping sandbox…") - mgr.stop_container() - - -# =========================================================================== -# 5 · Mode-Specific Setup Functions -# =========================================================================== -def get_benchmark_modules(console: Console, parent_dir: Path) -> Optional[List[Path]]: - """Prompt user to select one or more benchmark modules for interactive mode.""" - benchmark_dir = parent_dir / "auto_metrics" - if not benchmark_dir.exists(): - return None - modules = [m for m in benchmark_dir.glob("*.py") if m.name != "AutoMetric.py"] - if not modules: - return None - console.print("\n[bold]Available benchmark modules:[/bold]") - for i, mod in enumerate(modules, start=1): - console.print(f"{i}. {mod.name}") - console.print(f"{len(modules)+1}. 
Select All") - choices_str = Prompt.ask("Select modules (e.g., 1 2 or 1,2,3) (Enter to skip)", default="") - choices = re.split(r"[,|\s]+", choices_str.strip()) - if not choices or choices == [""]: - return None - selected = [] - try: - for choice in choices: - if not choice: continue - index = int(choice) - 1 - if index == len(modules): return modules # Select All - if 0 <= index < len(modules): selected.append(modules[index]) - except (ValueError, IndexError): - console.print("[red]Invalid selection.[/red]") - return None - return selected - - -# =========================================================================== -# 6 · Entry Point -# =========================================================================== -def main(): - """Main entry point to parse args and start the correct mode.""" - parser = argparse.ArgumentParser( - description="Interactive or Automated Agent System Tester.", - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument("--auto", action="store_true", help="Run in automated benchmark mode.") - args = parser.parse_args() - - load_dotenv(ENV_FILE) - if BACKEND_CHOICE == "chatgpt" and not os.getenv("OPENAI_API_KEY"): - console.print("[red]OPENAI_API_KEY not set in .env[/red]") - sys.exit(1) - - sys_, drv, roster = load_agent_system() - dp, meta = select_dataset(console, DATASETS_DIR) - res = collect_resources(console, SANDBOX_RESOURCES_DIR) - - if args.auto: - console.print("[bold green]🚀 Running in Automated Mode...[/bold green]") - benchmark_module = get_benchmark_modules(console, PARENT_DIR) - initial_user_message = Prompt.ask("Initial user message", default="What should I do with this dataset?") - try: - tries = int(Prompt.ask("Number of automatic turns", default="1")) - if tries <= 0: raise ValueError - except ValueError: - console.print("[yellow]Invalid number – defaulting to 1.[/yellow]") - tries = 1 - run( - agent_system=sys_, - agent=drv, - roster_instr=roster, - dataset=dp, - metadata=meta, - resources=res, - is_auto=True, - initial_user_message=initial_user_message, - benchmark_modules=[benchmark_module] if benchmark_module else [], - tries=tries, - ) - else: - console.print("[bold blue]🚀 Running in Interactive Mode...[/bold blue]") - benchmark_modules = get_benchmark_modules(console, PARENT_DIR) - run( - agent_system=sys_, - agent=drv, - roster_instr=roster, - dataset=dp, - metadata=meta, - resources=res, - is_auto=False, - initial_user_message="Beginning interactive session. You can ask questions or give commands.", - benchmark_modules=benchmark_modules, - ) - - -if __name__ == "__main__": - try: - main() - except KeyboardInterrupt: - console.print("\nInterrupted by user. 
Exiting.") \ No newline at end of file diff --git a/cli/prompt_testing/__init__.py b/cli/prompt_testing/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/cli/tools/__init__.py b/cli/tools/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/cli/tools/czi_browser.py b/cli/tools/czi_browser.py deleted file mode 100644 index 597ea09..0000000 --- a/cli/tools/czi_browser.py +++ /dev/null @@ -1,580 +0,0 @@ -#!/usr/bin/env python -import argparse -import cellxgene_census -import sys -import math -import shlex # For parsing interactive commands safely -import os # For path operations and directory creation -import json # For saving metadata -import re # For sanitizing filenames - -try: - from rich.console import Console - from rich.table import Table - from rich.pretty import pprint - from rich.prompt import Prompt # For interactive prompts - HAS_RICH = True -except ImportError: - HAS_RICH = False - # Simple print/input fallback if rich is not installed - def pprint(obj): print(obj) - class Console: - def print(self, *args, **kwargs): print(*args) - class Table: - # Basic fallback Table class - def __init__(self, title=""): - self._title = title - self._rows = [] - self._columns = [] - self._styles = {} # Dummy style storage - def add_column(self, header, style=""): - self._columns.append(header) - self._styles[header] = style # Store style info even if unused - def add_row(self, *items): - # Ensure row has same number of items as columns - if len(items) != len(self._columns): - raise ValueError("Number of items in row does not match number of columns") - self._rows.append(items) - def __rich_console__(self, console, options): # Dummy method for rich compatibility - # Basic text rendering for fallback - yield self._title - yield "\t".join(self._columns) - for row in self._rows: yield "\t".join(map(str, row)) - def print_table(self, console): # Custom print method if rich not available - console.print(self._title) - if self._columns: # Only print header/rows if columns exist - col_widths = [len(h) for h in self._columns] - for row in self._rows: - for i, item in enumerate(row): - col_widths[i] = max(col_widths[i], len(str(item))) - - header_line = " ".join(f"{h:<{w}}" for h, w in zip(self._columns, col_widths)) - separator = "-" * len(header_line) - console.print(header_line) - console.print(separator) - for row in self._rows: - row_line = " ".join(f"{str(item):<{w}}" for item, w in zip(row, col_widths)) - console.print(row_line) - - class Prompt: - @staticmethod - def ask(prompt, choices=None, default=None): - p_text = f"{prompt} " - if choices: - choices_str = '/'.join(choices) - p_text += f"({choices_str}) " - if default: - p_text += f"[{default}] " - return input(p_text).strip() - -# --- Helper Functions --- - -def sanitize_filename(name): - """Removes invalid characters and replaces spaces for use in filenames.""" - # Remove characters that are not alphanumeric, underscore, or hyphen - name = re.sub(r'[^\w\-]+', '_', name) - # Replace multiple underscores with a single one - name = re.sub(r'_+', '_', name) - # Remove leading/trailing underscores - name = name.strip('_') - # Convert to lowercase - return name.lower() - -def ensure_datasets_dir_exists(base_dir="../datasets"): - """Checks if the target directory exists and creates it if not.""" - # Get the absolute path relative to the script location - script_dir = os.path.dirname(os.path.abspath(__file__)) - target_dir = os.path.abspath(os.path.join(script_dir, base_dir)) - - if not 
os.path.exists(target_dir): - print(f"Creating target directory: {target_dir}") - try: - os.makedirs(target_dir) - except OSError as e: - raise OSError(f"Failed to create directory {target_dir}: {e}") - elif not os.path.isdir(target_dir): - raise NotADirectoryError(f"Target path {target_dir} exists but is not a directory.") - return target_dir - - -# --- Core Data Fetching Functions --- - -def get_census_versions_data(): - """Fetches available CELLxGENE Census versions data.""" - try: - census_versions = cellxgene_census.get_census_version_directory() - versions_list = [] - # Prioritize 'stable', then 'latest', then sort others reverse chronologically - sorted_versions = sorted( - census_versions.keys(), - key=lambda v: ('0' if v == 'stable' else '1' if v == 'latest' else '2') + v, - reverse=True # Puts stable/latest effectively first, then sorts dates reverse - ) - - for version in sorted_versions: - description = census_versions[version] - release_date = "N/A" - try: - # Avoid fetching description again if already present - release_date = description.get("release_date") - if not release_date: - details = cellxgene_census.get_census_version_description(version) - release_date = details.get("release_date", "N/A") - except Exception: - pass # Ignore if details can't be fetched - versions_list.append({ - "version": version, - "description": description.get('description', description.get('uri', 'N/A')), - "release_date": release_date - }) - return versions_list - except Exception as e: - raise RuntimeError(f"Error listing versions: {e}") - -def fetch_source_datasets_data(census_version): - """Fetches source datasets DataFrame for a specific Census version.""" - console = Console() - console.print(f"Fetching source datasets info for Census version: [cyan]{census_version}[/cyan]...") - try: - # Check if version is valid before opening (optional, but good practice) - available_versions = cellxgene_census.get_census_version_directory() - if census_version not in available_versions: - console.print(f"[bold red]Error:[/bold red] Census version '{census_version}' not found.") - # Attempt to list versions to help user - try: - versions_data = get_census_versions_data() - console.print("Available versions:") - for v in versions_data: - console.print(f" - {v['version']} ({v.get('release_date', 'N/A')})") - except Exception: - console.print("(Could not fetch list of available versions)") - return None - - # Inform user about specific date mapping if using 'stable'/'latest' - try: - version_description = cellxgene_census.get_census_version_description(census_version) - actual_version = version_description.get("release_date", census_version) - if census_version in ["stable", "latest"] and actual_version != census_version: - console.print(f"The \"{census_version}\" release is currently [bold green]{actual_version}[/bold green]. Specify 'census_version=\"{actual_version}\"' in future calls to open_soma() to ensure data consistency.") - except Exception: - console.print(f"[yellow]Warning: Could not verify exact date for '{census_version}'. 
Proceeding...[/yellow]") - - - with cellxgene_census.open_soma(census_version=census_version) as census: - if "census_info" not in census or "datasets" not in census["census_info"]: - raise RuntimeError("Census object structure unexpected: 'census_info' or 'datasets' missing.") - - datasets_df = census["census_info"]["datasets"].read().concat().to_pandas() - if datasets_df.empty: - console.print(f"No source dataset information found for version {census_version}.") - return datasets_df # Return empty DataFrame - return datasets_df - except Exception as e: - raise RuntimeError(f"Error fetching datasets for version {census_version}: {e}") - - -def get_dataset_metadata_data(census_version, dataset_id): - """Fetches metadata dictionary for a specific source dataset.""" - console = Console() - console.print(f"Fetching metadata for dataset [cyan]{dataset_id}[/cyan] in Census version: [cyan]{census_version}[/cyan]...") - try: - # Reuse fetch_source_datasets_data which includes version check - datasets_df = fetch_source_datasets_data(census_version) - if datasets_df is None: # Check if fetch failed (e.g., invalid version) - raise ValueError(f"Could not retrieve dataset list for version {census_version}.") - if datasets_df.empty: # Check if fetch succeeded but returned empty - raise ValueError(f"No datasets found for version {census_version}, cannot fetch metadata.") - - dataset_metadata = datasets_df[datasets_df['dataset_id'] == dataset_id] - - if dataset_metadata.empty: - raise ValueError(f"Dataset ID '{dataset_id}' not found in Census version '{census_version}'.") - - return dataset_metadata.iloc[0].to_dict() - except Exception as e: - # Catch specific errors if needed, otherwise re-raise or wrap - raise RuntimeError(f"Error fetching metadata for dataset {dataset_id}: {e}") - - -# --- Download Function --- - -def download_dataset(console, census_version, dataset_id): - """Downloads the H5AD file and saves metadata JSON for a dataset.""" - try: - # 1. Ensure target directory exists - target_dir = ensure_datasets_dir_exists() - console.print(f"Target directory: [blue]{target_dir}[/blue]") - - # 2. Fetch metadata first to get the title and verify dataset exists - metadata = get_dataset_metadata_data(census_version, dataset_id) # Handles errors - dataset_title = metadata.get('dataset_title', f'dataset_{dataset_id}') # Fallback title - - # 3. Generate filenames - base_filename = sanitize_filename(dataset_title) - if not base_filename: # Handle cases where title sanitizes to empty string - base_filename = f"dataset_{dataset_id}" - h5ad_filename = f"{base_filename}.h5ad" - json_filename = f"{base_filename}.json" - h5ad_filepath = os.path.join(target_dir, h5ad_filename) - json_filepath = os.path.join(target_dir, json_filename) - - console.print(f"Preparing to download dataset:") - console.print(f" ID: [cyan]{dataset_id}[/cyan]") - console.print(f" Title: [green]{dataset_title}[/green]") - console.print(f" Version: [cyan]{census_version}[/cyan]") - console.print(f" Output H5AD: [blue]{h5ad_filepath}[/blue]") - console.print(f" Output JSON: [blue]{json_filepath}[/blue]") - - # Check if files already exist (optional, add overwrite flag later if needed) - if os.path.exists(h5ad_filepath) or os.path.exists(json_filepath): - console.print("[yellow]Warning: One or both output files already exist. Skipping download.[/yellow]") - console.print("[yellow] (Delete existing files or implement an --overwrite flag to replace.)[/yellow]") - return # Or prompt user, or add an overwrite flag - - # 4. 
Download H5AD - console.print(f"Downloading H5AD file...") - cellxgene_census.download_source_h5ad( - dataset_id=dataset_id, - to_path=h5ad_filepath, - census_version=census_version - ) - console.print("[bold green]H5AD Download complete.[/bold green]") - - # 5. Save Metadata JSON - console.print("Saving metadata JSON file...") - try: - with open(json_filepath, 'w', encoding='utf-8') as f: - # Convert numpy types to standard Python types if necessary - def convert_types(obj): - if isinstance(obj, (np.int_, np.intc, np.intp, np.int8, - np.int16, np.int32, np.int64, np.uint8, - np.uint16, np.uint32, np.uint64)): - return int(obj) - elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)): - return float(obj) - elif isinstance(obj, (np.ndarray,)): # Handle arrays if needed - return obj.tolist() # Or other representation - elif isinstance(obj, (np.bool_)): - return bool(obj) - elif isinstance(obj, (np.void)): # Handle complex types if they appear - return None # Or suitable representation - return obj - - # Import numpy locally for type checking if needed - import numpy as np - json.dump(metadata, f, indent=4, default=convert_types, ensure_ascii=False) - console.print("[bold green]Metadata JSON saved successfully.[/bold green]") - except Exception as json_e: - console.print(f"[bold red]Error saving metadata JSON:[/bold red] {json_e}") - # Decide if we should clean up the downloaded H5AD file - # try: - # os.remove(h5ad_filepath) - # console.print(f"[yellow]Cleaned up partially downloaded H5AD file.[/yellow]") - # except OSError: - # pass - - except (ValueError, RuntimeError, OSError, NotADirectoryError, Exception) as e: - console.print(f"[bold red]Download failed:[/bold red] {e}") - # Potentially add more specific error handling based on exception type - -# --- Display and Interaction Functions --- - -def display_versions_list(console): - """Displays available versions.""" - try: - versions_data = get_census_versions_data() - if not versions_data: - console.print("[yellow]No Census versions found.[/yellow]") - return - - table = Table(title="Available CELLxGENE Census Versions") - table.add_column("Version Tag", style="cyan", justify="right") - table.add_column("Release Date", style="green") - table.add_column("Description / URL", style="magenta") - - - for v_data in versions_data: - table.add_row(v_data["version"], v_data["release_date"], v_data["description"]) - - if HAS_RICH: - console.print(table) - else: - table.print_table(console) # Use fallback print - except Exception as e: - console.print(f"[bold red]Error displaying versions:[/bold red] {e}") - - -def display_paginated_datasets(console, census_version, limit=None, page_size=5): - """Fetches and displays datasets with pagination.""" - try: - datasets_df = fetch_source_datasets_data(census_version) - if datasets_df is None: # Error handled in fetch - return - if datasets_df.empty: # Message handled in fetch - return - - if limit is not None and limit > 0: - datasets_df = datasets_df.head(limit) - total_items_in_view = len(datasets_df) # Number we are actually paging through - if total_items_in_view == 0: - console.print(f"No datasets found matching the criteria within the limit of {limit}.") - return - else: - total_items_in_view = len(datasets_df) - limit = total_items_in_view # Set limit for display consistency - - if total_items_in_view == 0: - console.print(f"No datasets found for version {census_version}.") - return - - total_pages = math.ceil(total_items_in_view / page_size) - current_page = 1 - - while True: - 
start_index = (current_page - 1) * page_size - end_index = start_index + page_size - page_df = datasets_df.iloc[start_index:end_index] - - if page_df.empty and current_page > 1: # Handle reaching end with partial page - console.print("[yellow]No more datasets to display.[/yellow]") - break - elif page_df.empty: # Only happens if total_items_in_view was 0 initially - console.print("[yellow]No datasets to display.[/yellow]") - break - - range_end = min(end_index, total_items_in_view) - table = Table(title=f"Source Datasets in Census {census_version} (Showing {start_index+1}-{range_end} of {total_items_in_view})") - table.add_column("Dataset ID", style="cyan", no_wrap=True) - table.add_column("Collection Name", style="magenta", overflow="fold") - table.add_column("Dataset Title", style="green", overflow="fold") - table.add_column("Cell Count", style="yellow", justify="right") - - for _, row in page_df.iterrows(): - # Safely format cell_count, handling potential None or non-numeric types - cell_count = row.get('cell_count') - cell_count_str = 'N/A' - if cell_count is not None: - try: - cell_count_str = f"{int(cell_count):,}" - except (ValueError, TypeError): - cell_count_str = str(cell_count) # Fallback to string if not int-convertible - - table.add_row( - row.get('dataset_id', 'N/A'), - row.get('collection_name', 'N/A'), - row.get('dataset_title', 'N/A'), - cell_count_str - ) - - console.print(f"\n--- Page {current_page} of {total_pages} ---") - if HAS_RICH: - console.print(table) - else: - table.print_table(console) - - if total_pages <= 1: - break # No more pages - - choices = [] - prompt_text = "Action" - if current_page > 1: choices.append("P") - if current_page < total_pages: choices.append("N") - choices.append("Q") - - prompt_parts = [] - if "P" in choices: prompt_parts.append("[P]revious") - if "N" in choices: prompt_parts.append("[N]ext") - prompt_parts.append("[Q]uit listing") - prompt_text = ", ".join(prompt_parts) + "?" 
-
-
-            default_action = "Q"
-            if current_page < total_pages: default_action = "N"
-            elif current_page > 1: default_action = "P"
-
-
-            action = Prompt.ask(
-                prompt_text,
-                choices=choices,
-                default=default_action
-            ).upper()
-
-            if action == "N" and current_page < total_pages:
-                current_page += 1
-            elif action == "P" and current_page > 1:
-                current_page -= 1
-            elif action == "Q":
-                break
-            else:
-                console.print("[yellow]Invalid choice.[/yellow]")
-
-    except Exception as e:
-        console.print(f"[bold red]Error displaying datasets:[/bold red] {e}")
-
-def display_dataset_metadata(console, census_version, dataset_id):
-    """Displays metadata for a specific dataset."""
-    try:
-        metadata_dict = get_dataset_metadata_data(census_version, dataset_id)
-        console.print(f"\nMetadata for Dataset: [bold green]{dataset_id}[/bold green]")
-        pprint(metadata_dict) # Use rich's pprint or fallback print
-    except Exception as e:
-        console.print(f"[bold red]Error displaying metadata:[/bold red] {e}")
-
-
-def print_interactive_help(console):
-    """Prints help message for interactive mode."""
-    console.print("\n[bold cyan]Available Commands:[/bold cyan]")
-    console.print("  [green]list_versions[/green]                                List available CELLxGENE Census versions.")
-    console.print("  [green]list_datasets[/green] <census_version> [limit]       List source datasets (paginated).")
-    console.print("      <census_version>: stable, latest, or YYYY-MM-DD")
-    console.print("      [limit] (optional): Total number of datasets to fetch.")
-    console.print("  [green]show_metadata[/green] <census_version> <dataset_id>  Show metadata for a specific dataset.")
-    console.print("  [green]download[/green] <census_version> <dataset_id>       Download dataset H5AD and metadata JSON.")
-    console.print("  [green]help[/green]                                         Show this help message.")
-    console.print("  [green]exit[/green]                                         Exit the interactive browser.")
-    console.print("\nExample: [yellow]download stable <dataset_id>[/yellow]")
-
-
-def interactive_loop():
-    """Runs the interactive command loop."""
-    console = Console()
-    console.print("[bold blue]Welcome to the Interactive CZI CELLxGENE Census Browser![/bold blue]")
-    print_interactive_help(console)
-
-    while True:
-        try:
-            if HAS_RICH:
-                raw_command = Prompt.ask("\nEnter command (\'help\' or \'exit\')")
-            else:
-                raw_command = input("\nEnter command ('help' or 'exit'): ").strip()
-
-            if not raw_command:
-                continue
-
-            try:
-                command_parts = shlex.split(raw_command)
-            except ValueError as e:
-                console.print(f"[red]Error parsing command (check quotes?): {e}[/red]")
-                continue
-
-            if not command_parts: continue
-
-            command = command_parts[0].lower()
-            args = command_parts[1:]
-
-            if command == "exit":
-                break
-            elif command == "help":
-                print_interactive_help(console)
-            elif command == "list_versions":
-                if len(args) == 0:
-                    display_versions_list(console)
-                else:
-                    console.print("[yellow]Usage: list_versions[/yellow]")
-            elif command == "list_datasets":
-                version = args[0] if len(args) > 0 else None
-                limit = None
-                if len(args) > 1:
-                    try:
-                        limit = int(args[1])
-                        if limit <= 0:
-                            console.print("[red]Limit must be a positive integer.[/red]")
-                            continue
-                    except ValueError:
-                        console.print(f"[red]Invalid limit '{args[1]}'. Must be an integer.[/red]")
-                        continue
-                if version:
-                    display_paginated_datasets(console, version, limit=limit, page_size=5)
-                else:
-                    console.print("[yellow]Usage: list_datasets <census_version> [limit][/yellow]")
-            elif command == "show_metadata":
-                version = args[0] if len(args) > 0 else None
-                dataset_id = args[1] if len(args) > 1 else None
-                if version and dataset_id:
-                    display_dataset_metadata(console, version, dataset_id)
-                else:
-                    console.print("[yellow]Usage: show_metadata <census_version> <dataset_id>[/yellow]")
-            elif command == "download":
-                version = args[0] if len(args) > 0 else None
-                dataset_id = args[1] if len(args) > 1 else None
-                if version and dataset_id:
-                    download_dataset(console, version, dataset_id)
-                else:
-                    console.print("[yellow]Usage: download <census_version> <dataset_id>[/yellow]")
-            else:
-                console.print(f"[red]Unknown command: '{command}'. Type 'help' for options.[/red]")
-
-        except EOFError:
-            console.print("\n[yellow]EOF detected. Exiting.[/yellow]")
-            break
-        except KeyboardInterrupt:
-            console.print("\n[yellow]Interrupted by user. Type 'exit' to quit.[/yellow]")
-        except Exception as e:
-            console.print(f"[bold red]An unexpected error occurred in the interactive loop:[/bold red] {e}")
-
-
-    console.print("[bold blue]Exiting browser. Goodbye![/bold blue]")
-
-
-def main():
-    # Check if running interactively (no arguments other than script name)
-    if len(sys.argv) == 1:
-        interactive_loop()
-        sys.exit(0)
-
-    # --- Original argparse logic for non-interactive mode ---
-    parser = argparse.ArgumentParser(
-        description="CZI CELLxGENE Census Browser CLI. Run without arguments for interactive mode.",
-        formatter_class=argparse.RawTextHelpFormatter # Keep help text formatting
-    )
-    subparsers = parser.add_subparsers(dest='command', help='Available commands (run without arguments for interactive mode)')
-
-    # Subparser for listing census versions
-    parser_list_versions = subparsers.add_parser('list-versions', help='List available CELLxGENE Census versions')
-    parser_list_versions.set_defaults(func=lambda args: display_versions_list(Console()))
-
-    # Subparser for listing datasets within a version
-    parser_list_datasets = subparsers.add_parser('list-datasets', help='List source datasets within a specific Census version (paginated)')
-    parser_list_datasets.add_argument('--version', required=True, help='Census version tag (e.g., "stable", "latest", "YYYY-MM-DD")')
-    parser_list_datasets.add_argument('--limit', type=int, default=None, help='Maximum number of datasets to fetch and paginate through')
-    parser_list_datasets.add_argument('--page-size', type=int, default=5, help='Number of datasets to show per page (default: 5)')
-    parser_list_datasets.set_defaults(func=lambda args: display_paginated_datasets(Console(), args.version, args.limit, args.page_size))
-
-    # Subparser for showing metadata for a specific dataset
-    parser_show_metadata = subparsers.add_parser('show-metadata', help='Show metadata for a specific source dataset')
-    parser_show_metadata.add_argument('--version', required=True, help='Census version tag')
-    parser_show_metadata.add_argument('--dataset-id', required=True, help='The dataset_id')
-    parser_show_metadata.set_defaults(func=lambda args: display_dataset_metadata(Console(), args.version, args.dataset_id))
-
-    # Subparser for downloading a dataset
-    parser_download = subparsers.add_parser('download', help='Download dataset H5AD and metadata JSON')
-    parser_download.add_argument('--version', required=True, help='Census version tag')
-    parser_download.add_argument('--dataset-id', required=True, help='The dataset_id to download')
-    parser_download.set_defaults(func=lambda args: download_dataset(Console(), args.version, args.dataset_id))
-
-
-    # Allow showing help if no subcommand is given when args are present
-    if len(sys.argv) > 1 and sys.argv[1] not in ['list-versions', 'list-datasets', 'show-metadata', 'download', '-h', '--help']:
-        args = parser.parse_args(sys.argv[1:2]) # Parse just the first potential command
-    else:
-        args = parser.parse_args()
-
-    if hasattr(args, 'func'):
-        try:
-            args.func(args)
-        except Exception as e:
-            Console().print(f"[bold red]Command failed:[/bold red] {e}")
-            sys.exit(1)
-    else:
-        if len(sys.argv) > 1:
-            parser.print_help()
-
-
-if __name__ == "__main__":
-    # Need numpy for JSON conversion of metadata types
-    try:
-        import numpy as np
-    except ImportError:
-        print("Error: The 'numpy' package is required for saving metadata. Please install it (`pip install numpy`).")
-        sys.exit(1)
-    main()

From a88c4c636eb3caccb8fc6582ed99acd0d32711a5 Mon Sep 17 00:00:00 2001
From: djriffle
Date: Thu, 14 Aug 2025 16:54:58 -0400
Subject: [PATCH 12/14] added pip cli specific readme

---
 cli/olaf/README.md | 164 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)

diff --git a/cli/olaf/README.md b/cli/olaf/README.md
index e69de29..7f81495 100644
--- a/cli/olaf/README.md
+++ b/cli/olaf/README.md
@@ -0,0 +1,164 @@
+# OLAF CLI: The Open-source Language Agent Framework 🚀
+
+**The OLAF CLI is a powerful command-line interface for building, testing, and running sandboxed, multi-agent AI systems.**
+
+It provides a robust framework for orchestrating multiple language agents that can collaborate to perform complex tasks, such as data analysis, in a secure and isolated environment.
+
+At its core, OLAF allows you to define a team of specialized AI agents in a simple JSON "blueprint." You can then deploy this team into a secure sandbox (powered by Docker or Singularity) with a specific dataset and give them a high-level task to solve.
+
+## \#\# Key Features
+
+  * **Multi-Agent Blueprints:** Define agents, their specialized prompts, and how they delegate tasks to each other using a simple JSON configuration.
+  * **Secure Sandboxing:** Execute agent-generated code in an isolated environment using **Docker** or **Singularity** to protect your host system.
+  * **Interactive & Automated Modes:** Run agent systems in a turn-by-turn interactive chat for debugging or in a fully automated mode for benchmarking.
+  * **Data Curation:** Includes tools to browse and download single-cell datasets from the CZI CELLxGENE Census to easily test your agents.
+  * **Configuration Management:** Easily manage API keys and application settings with built-in commands.
+  * **User-Friendly CLI:** A guided, interactive experience helps you configure every run, with flags available to override settings for use in scripts.
+
+## \#\# Installation
+
+### \#\#\# Prerequisites
+
+Before installing OLAF, you need to have the following installed and configured on your system:
+
+1. **Python** (version 3.9 or higher)
+2. **Pip** (Python's package installer)
+3. **A Sandbox Backend:**
+      * **Docker:** Must be installed and the Docker daemon must be running.
+      * **Singularity (Apptainer):** Must be installed on your system.
+
+### \#\#\# Install from PyPI (Recommended)
+Coming soon!
+ +### \#\#\# Install from Source (For Developers) + +To install the latest development version, you can clone the repository and install it in editable mode: + +```bash +git clone https://github.com/OpenTechBio/Ola +cd olaf +pip install -e . +``` + +----- + +## \#\# 🚀 Quick Start Guide + +This guide will walk you through setting up your API key, downloading a dataset, and launching your first interactive agent session in just a few steps. + +### \#\#\# Step 1: Configure Your API Key + +First, tell OLAF about your OpenAI API key. This is a one-time setup. + +```bash +olaf config set-openai-key "sk-YourSecretKeyGoesHere" +``` + +Your key will be stored securely in a local `.env` file within the OLAF configuration directory. + +### \#\#\# Step 2: Download a Dataset + +Next, let's get some data for our agents to analyze. Run the `datasets` command to browse and download a sample dataset from the CZI CELLxGENE Census. + +```bash +# This will start the interactive dataset browser +olaf datasets +``` + +Follow the prompts to list versions and datasets, then use the `download` command as instructed. + +### \#\#\# Step 3: Run an Agent System\! + +Now you're ready to run an agent system. The `run` command is fully interactive if you don't provide any flags. It will guide you through selecting a blueprint, a dataset, and a sandbox environment. + +```bash +olaf run interactive +``` + +This will trigger a series of prompts: + +1. **Select Agent System Blueprint:** Choose one of the default systems (from the Package) or one you've created (from User). +2. **Select a driver agent:** Choose which agent in the system will receive the first instruction. +3. **Select Dataset:** Pick the dataset you downloaded in Step 2. +4. **Choose a sandbox backend:** Select `docker` or `singularity`. +5. **Choose an LLM backend:** Select `chatgpt` or `ollama`. + +After configuration, the session will begin, and you can start giving instructions to your agent team\! + +----- + +## \#\# Command Reference + +OLAF's commands are organized into logical groups. + +### \#\#\# `olaf run` + +The main command for executing an agent system. + + * **Run interactively (recommended for manual use):** + ```bash + olaf run interactive + ``` + * **Run automatically for 5 turns:** + ```bash + olaf run auto --turns 5 --prompt "Analyze this dataset and generate a UMAP plot." + ``` + * **Run with all options specified (for scripting):** + ```bash + olaf run interactive \ + --blueprint ~/.local/share/olaf/agent_systems/my_custom_system.json \ + --driver-agent data_analyst \ + --dataset ~/.local/share/olaf/datasets/my_data.h5ad \ + --sandbox docker \ + --llm chatgpt + ``` + +### \#\#\# `olaf create-system` + +Tools for building new agent system blueprints. + + * **Start the interactive builder:** + ```bash + olaf create-system + ``` + * **Create a minimal blueprint quickly:** + ```bash + olaf create-system quick --name my-first-system + ``` + +### \#\#\# `olaf datasets` + +Tools for managing datasets. + + * **Start the interactive dataset browser:** + ```bash + olaf datasets + ``` + * **Download a specific dataset directly:** + ```bash + olaf datasets download --version stable --dataset-id "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + ``` + +### \#\#\# `olaf config` + +Manage your OLAF configuration. + + * **Set your OpenAI API key:** + ```bash + olaf config set-openai-key "sk-..." + ``` + +----- + +## \#\# Configuration + +OLAF stores all user-generated content and configuration in a central directory. 
You can override this location by setting the `OLAF_HOME` environment variable.
+
+  * **Default Location:**
+      * **Linux:** `~/.local/share/olaf/`
+      * **macOS:** `~/Library/Application Support/olaf/`
+      * **Windows:** `C:\Users\<username>\AppData\Local\OpenTechBio\olaf\`
+  * **Configuration File:** API keys are stored in `$OLAF_HOME/.env`.
+  * **Agent Systems:** Custom blueprints are saved to `$OLAF_HOME/agent_systems/`.
+  * **Datasets:** Downloaded datasets are stored in `$OLAF_HOME/datasets/`.
+  * **Run Outputs:** Code snippets and logs from agent runs are saved to `$OLAF_HOME/runs/`.
\ No newline at end of file

From a7e576409935d65a42bd041c05b422bb506863df Mon Sep 17 00:00:00 2001
From: djriffle
Date: Thu, 14 Aug 2025 18:31:53 -0400
Subject: [PATCH 13/14] Fixed cli readmes

---
 cli/README.md      | 30 +++++++++++++++---------------
 cli/olaf/README.md | 30 +++++++++++++++---------------
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/cli/README.md b/cli/README.md
index 7f81495..9e8d725 100644
--- a/cli/README.md
+++ b/cli/README.md
@@ -6,7 +6,7 @@ It provides a robust framework for orchestrating multiple language agents that c
 
 At its core, OLAF allows you to define a team of specialized AI agents in a simple JSON "blueprint." You can then deploy this team into a secure sandbox (powered by Docker or Singularity) with a specific dataset and give them a high-level task to solve.
 
-## \#\# Key Features
+## Key Features
 
   * **Multi-Agent Blueprints:** Define agents, their specialized prompts, and how they delegate tasks to each other using a simple JSON configuration.
   * **Secure Sandboxing:** Execute agent-generated code in an isolated environment using **Docker** or **Singularity** to protect your host system.
@@ -15,9 +15,9 @@ At its core, OLAF allows you to define a team of specialized AI agents in a simp
   * **Configuration Management:** Easily manage API keys and application settings with built-in commands.
   * **User-Friendly CLI:** A guided, interactive experience helps you configure every run, with flags available to override settings for use in scripts.
 
-## \#\# Installation
+## Installation
 
-### \#\#\# Prerequisites
+### Prerequisites
 
 Before installing OLAF, you need to have the following installed and configured on your system:
 
@@ -27,10 +27,10 @@ Before installing OLAF, you need to have the following installed and configured
       * **Docker:** Must be installed and the Docker daemon must be running.
      * **Singularity (Apptainer):** Must be installed on your system.
 
-### \#\#\# Install from PyPI (Recommended)
+### Install from PyPI (Recommended)
 Coming soon!
 
-### \#\#\# Install from Source (For Developers)
+### Install from Source (For Developers)
 
 To install the latest development version, you can clone the repository and install it in editable mode:
 
@@ -42,11 +42,11 @@ pip install -e .
 
 -----
 
-## \#\# 🚀 Quick Start Guide
+## 🚀 Quick Start Guide
 
 This guide will walk you through setting up your API key, downloading a dataset, and launching your first interactive agent session in just a few steps.
 
-### \#\#\# Step 1: Configure Your API Key
+### Step 1: Configure Your API Key
 
 First, tell OLAF about your OpenAI API key. This is a one-time setup.
 
@@ -56,7 +56,7 @@ olaf config set-openai-key "sk-YourSecretKeyGoesHere"
 
 Your key will be stored securely in a local `.env` file within the OLAF configuration directory.
 
-### \#\#\# Step 2: Download a Dataset
+### Step 2: Download a Dataset
 
 Next, let's get some data for our agents to analyze.
Run the `datasets` command to browse and download a sample dataset from the CZI CELLxGENE Census. @@ -67,7 +67,7 @@ olaf datasets Follow the prompts to list versions and datasets, then use the `download` command as instructed. -### \#\#\# Step 3: Run an Agent System\! +### Step 3: Run an Agent System\! Now you're ready to run an agent system. The `run` command is fully interactive if you don't provide any flags. It will guide you through selecting a blueprint, a dataset, and a sandbox environment. @@ -87,11 +87,11 @@ After configuration, the session will begin, and you can start giving instructio ----- -## \#\# Command Reference +## Command Reference OLAF's commands are organized into logical groups. -### \#\#\# `olaf run` +### `olaf run` The main command for executing an agent system. @@ -113,7 +113,7 @@ The main command for executing an agent system. --llm chatgpt ``` -### \#\#\# `olaf create-system` +### `olaf create-system` Tools for building new agent system blueprints. @@ -126,7 +126,7 @@ Tools for building new agent system blueprints. olaf create-system quick --name my-first-system ``` -### \#\#\# `olaf datasets` +### `olaf datasets` Tools for managing datasets. @@ -139,7 +139,7 @@ Tools for managing datasets. olaf datasets download --version stable --dataset-id "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" ``` -### \#\#\# `olaf config` +### `olaf config` Manage your OLAF configuration. @@ -150,7 +150,7 @@ Manage your OLAF configuration. ----- -## \#\# Configuration +## Configuration OLAF stores all user-generated content and configuration in a central directory. You can override this location by setting the `OLAF_HOME` environment variable. diff --git a/cli/olaf/README.md b/cli/olaf/README.md index 7f81495..9e8d725 100644 --- a/cli/olaf/README.md +++ b/cli/olaf/README.md @@ -6,7 +6,7 @@ It provides a robust framework for orchestrating multiple language agents that c At its core, OLAF allows you to define a team of specialized AI agents in a simple JSON "blueprint." You can then deploy this team into a secure sandbox (powered by Docker or Singularity) with a specific dataset and give them a high-level task to solve. -## \#\# Key Features +## Key Features * **Multi-Agent Blueprints:** Define agents, their specialized prompts, and how they delegate tasks to each other using a simple JSON configuration. * **Secure Sandboxing:** Execute agent-generated code in an isolated environment using **Docker** or **Singularity** to protect your host system. @@ -15,9 +15,9 @@ At its core, OLAF allows you to define a team of specialized AI agents in a simp * **Configuration Management:** Easily manage API keys and application settings with built-in commands. * **User-Friendly CLI:** A guided, interactive experience helps you configure every run, with flags available to override settings for use in scripts. -## \#\# Installation +## Installation -### \#\#\# Prerequisites +### Prerequisites Before installing OLAF, you need to have the following installed and configured on your system: @@ -27,10 +27,10 @@ Before installing OLAF, you need to have the following installed and configured * **Docker:** Must be installed and the Docker daemon must be running. * **Singularity (Apptainer):** Must be installed on your system. -### \#\#\# Install from PyPI (Recommended) +### Install from PyPI (Recommended) Coming soon! 
-### \#\#\# Install from Source (For Developers) +### Install from Source (For Developers) To install the latest development version, you can clone the repository and install it in editable mode: @@ -42,11 +42,11 @@ pip install -e . ----- -## \#\# 🚀 Quick Start Guide +## 🚀 Quick Start Guide This guide will walk you through setting up your API key, downloading a dataset, and launching your first interactive agent session in just a few steps. -### \#\#\# Step 1: Configure Your API Key +### Step 1: Configure Your API Key First, tell OLAF about your OpenAI API key. This is a one-time setup. @@ -56,7 +56,7 @@ olaf config set-openai-key "sk-YourSecretKeyGoesHere" Your key will be stored securely in a local `.env` file within the OLAF configuration directory. -### \#\#\# Step 2: Download a Dataset +### Step 2: Download a Dataset Next, let's get some data for our agents to analyze. Run the `datasets` command to browse and download a sample dataset from the CZI CELLxGENE Census. @@ -67,7 +67,7 @@ olaf datasets Follow the prompts to list versions and datasets, then use the `download` command as instructed. -### \#\#\# Step 3: Run an Agent System\! +### Step 3: Run an Agent System\! Now you're ready to run an agent system. The `run` command is fully interactive if you don't provide any flags. It will guide you through selecting a blueprint, a dataset, and a sandbox environment. @@ -87,11 +87,11 @@ After configuration, the session will begin, and you can start giving instructio ----- -## \#\# Command Reference +## Command Reference OLAF's commands are organized into logical groups. -### \#\#\# `olaf run` +### `olaf run` The main command for executing an agent system. @@ -113,7 +113,7 @@ The main command for executing an agent system. --llm chatgpt ``` -### \#\#\# `olaf create-system` +### `olaf create-system` Tools for building new agent system blueprints. @@ -126,7 +126,7 @@ Tools for building new agent system blueprints. olaf create-system quick --name my-first-system ``` -### \#\#\# `olaf datasets` +### `olaf datasets` Tools for managing datasets. @@ -139,7 +139,7 @@ Tools for managing datasets. olaf datasets download --version stable --dataset-id "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" ``` -### \#\#\# `olaf config` +### `olaf config` Manage your OLAF configuration. @@ -150,7 +150,7 @@ Manage your OLAF configuration. ----- -## \#\# Configuration +## Configuration OLAF stores all user-generated content and configuration in a central directory. You can override this location by setting the `OLAF_HOME` environment variable. 
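The READMEs patched above describe agent teams defined in a JSON "blueprint" but never show one. For orientation, a minimal blueprint might look like the sketch below; the field names (`name`, `description`, `agents`, `system_prompt`, `delegates_to`) are illustrative assumptions rather than OLAF's confirmed schema, and `olaf create-system quick --name my-first-system` is the way to generate a file in the authoritative format.

```json
{
  "name": "my-first-system",
  "description": "A planner agent that delegates coding work to a coder agent.",
  "agents": [
    {
      "name": "planner",
      "system_prompt": "Break the user's analysis goal into concrete steps and delegate coding tasks to the coder.",
      "delegates_to": ["coder"]
    },
    {
      "name": "coder",
      "system_prompt": "Write and execute Python in the sandbox to complete each delegated step.",
      "delegates_to": []
    }
  ]
}
```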
From 114375ee56e81e2c0aac77e23a1af32eb15e3f03 Mon Sep 17 00:00:00 2001 From: Dylan Riffle Date: Fri, 15 Aug 2025 00:47:52 -0400 Subject: [PATCH 14/14] Added cli docker support --- cli/olaf/src/olaf/core/sandbox_management.py | 43 ++----------------- cli/olaf/src/olaf/sandbox/Dockerfile | 33 +++++--------- .../benchmarking_sandbox_management.py | 24 ++++++++--- cli/olaf/src/olaf/sandbox/requirements.txt | 4 +- 4 files changed, 34 insertions(+), 70 deletions(-) diff --git a/cli/olaf/src/olaf/core/sandbox_management.py b/cli/olaf/src/olaf/core/sandbox_management.py index 10f7824..0570976 100644 --- a/cli/olaf/src/olaf/core/sandbox_management.py +++ b/cli/olaf/src/olaf/core/sandbox_management.py @@ -1,6 +1,8 @@ + import time -from typing import List, Tuple, Dict, Optional +from typing import List, Tuple, Dict from pathlib import Path + import json from olaf.sandbox.benchmarking_sandbox_management import ( @@ -12,7 +14,6 @@ def init_docker(script_dir:str, subprocess, console, force_refresh:bool=False): - sandbox_dir = script_dir / "workspace" # --- optional force‑refresh logic -------------------------------------- if force_refresh: console.print("[yellow]Forcing Docker sandbox refresh…[/yellow]") @@ -31,49 +32,11 @@ def COPY_CMD(src: str, dst: str): return _BackendManager, _SANDBOX_HANDLE, COPY_CMD, EXECUTE_ENDPOINT, STATUS_ENDPOINT -def init_singularity(script_dir:str, subprocess, console, force_refresh:bool=False): - import olaf.sandbox.benchmarking_sandbox_management_singularity as sing - sandbox_dir = script_dir / "sandbox" - - # optional force‑refresh - if force_refresh: - console.print("[yellow]Forcing Singularity sandbox refresh…[/yellow]") - try: - sing.stop_instance() - except Exception: - pass # ignore if not running - if sing.SIF_PATH.exists(): - sing.SIF_PATH.unlink() - console.print( - f"[green]Deleted {sing.SIF_PATH.name} – it will be re‑downloaded on next start.[/green]" - ) - - class _SingInstanceWrapper: - def start_container(self): - return sing.start_instance() - - def stop_container(self): - return sing.stop_instance() - - _BackendManager = _SingInstanceWrapper - _SANDBOX_HANDLE = sing.INSTANCE_NAME - _API_PORT = sing.API_PORT_HOST - - def COPY_CMD(src: str, dst: str): - console.print( - f"[yellow]Singularity instance: ensure {src} is reachable at {dst} via bind mount.[/yellow]" - ) - - EXECUTE_ENDPOINT = f"http://localhost:{_API_PORT}/execute" - STATUS_ENDPOINT = f"http://localhost:{_API_PORT}/status" - - return _BackendManager, _SANDBOX_HANDLE, COPY_CMD, EXECUTE_ENDPOINT, STATUS_ENDPOINT def init_singularity_exec(script_dir: str, sanbox_data_path, subprocess, console, force_refresh: bool = False): import olaf.sandbox.benchmarking_sandbox_management_singularity as sing - sandbox_dir = script_dir / "sandbox" # optional force‑refresh if force_refresh: diff --git a/cli/olaf/src/olaf/sandbox/Dockerfile b/cli/olaf/src/olaf/sandbox/Dockerfile index c10e567..8ce8461 100644 --- a/cli/olaf/src/olaf/sandbox/Dockerfile +++ b/cli/olaf/src/olaf/sandbox/Dockerfile @@ -1,13 +1,10 @@ -# Use official Python slim image based on Debian (adjust version if needed) +# Use official Python slim image based on Debian FROM python:3.11-slim # Set DEBIAN_FRONTEND to noninteractive to prevent interactive prompts ENV DEBIAN_FRONTEND=noninteractive # --- Install System Dependencies --- -# Combine apt-get operations into a single layer to leverage caching. -# This layer rarely changes unless system dependencies are added/removed. -# Install tini, tzdata, build tools, C libraries, and utilities. 
RUN apt-get update && \ apt-get install -y --no-install-recommends \ tini \ @@ -35,38 +32,38 @@ RUN apt-get update && \ && rm -rf /var/lib/apt/lists/* # --- Create Non-Root User & Group --- -# These arguments and user setup steps rarely change. ARG NB_USER="sandboxuser" ARG NB_UID=1001 ARG NB_GID=1001 ENV USER=${NB_USER} ENV HOME=/home/${NB_USER} -# Add user's local bin to PATH early ENV PATH=${HOME}/.local/bin:${PATH} -# Create group, user, add to sudoers (run as root) RUN groupadd -g ${NB_GID} ${NB_USER} && \ useradd -m -s /bin/bash -u ${NB_UID} -g ${NB_GID} ${NB_USER} && \ adduser ${NB_USER} sudo && \ echo "${NB_USER} ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers +# --- Create /workspace Directory as Root --- +# This step is now done BEFORE switching to the non-root user. +RUN mkdir -p /workspace && \ + chown ${NB_USER}:${NB_GID} /workspace + # --- Install Python Dependencies --- COPY ./requirements.txt /tmp/requirements.txt -# Install Python packages. This layer is cached if requirements.txt hasn't changed. -# Run pip installs as the target user to ensure correct permissions and paths. -# Switch user and set working directory *before* pip install --user. +# Now, switch to the non-root user USER ${NB_USER} WORKDIR ${HOME} RUN python -m pip install --no-cache-dir --upgrade pip --user && \ python -m pip install --no-cache-dir --user \ - # Core Jupyter components (pin versions for stability) ipython==8.12.0 \ traitlets==5.9.0 \ jupyter_client==8.3.0 \ jupyter_core==5.3.1 \ pyzmq==25.1.0 \ + tornado==6.3.2 \ ipykernel==6.25.1 \ # FastAPI dependencies @@ -77,28 +74,20 @@ RUN python -m pip install --no-cache-dir --upgrade pip --user && \ -r /tmp/requirements.txt # --- Application Setup --- -# Copy application code and scripts AFTER dependencies are installed. -# Changes to these files will only invalidate the cache from this point. COPY --chown=${NB_USER}:${NB_GID} ./kernel_api.py ${HOME}/kernel_api.py COPY --chown=${NB_USER}:${NB_GID} ./start_kernel.py ${HOME}/start_kernel.py COPY --chown=${NB_USER}:${NB_GID} ./start.sh ${HOME}/start.sh -# Create user directories and make scripts executable in a single layer +# Create user-specific directories and make scripts executable +# The /workspace creation has been moved, so we only handle user-owned files here. 
RUN mkdir -p ${HOME}/.local/share/jupyter \ ${HOME}/.ipython/profile_default/startup \ ${HOME}/.ipython/profile_default/static && \ chmod +x ${HOME}/start_kernel.py ${HOME}/start.sh # --- Runtime Configuration --- -# Expose the FastAPI port (informational) EXPOSE 8000 - -# Set environment variable for kernel port (used by start_kernel.py) ENV IPY_BASE_PORT=4000 -# Use tini as the entrypoint; it will execute the CMD -# Ensure tini installed via apt is in the default PATH or use /usr/bin/tini ENTRYPOINT ["/usr/bin/tini", "--"] - -# Set the default command to run the startup script from user's home -CMD ["/home/sandboxuser/start.sh"] +CMD ["/home/sandboxuser/start.sh"] \ No newline at end of file diff --git a/cli/olaf/src/olaf/sandbox/benchmarking_sandbox_management.py b/cli/olaf/src/olaf/sandbox/benchmarking_sandbox_management.py index 1f8e624..3aa0905 100644 --- a/cli/olaf/src/olaf/sandbox/benchmarking_sandbox_management.py +++ b/cli/olaf/src/olaf/sandbox/benchmarking_sandbox_management.py @@ -6,12 +6,9 @@ import argparse import os import time -import subprocess # Still needed for docker cp (if used elsewhere) import shlex -import json -import io -import tempfile # May not be needed anymore - +from typing import Dict +import requests # --- Third-Party Imports --- try: import docker @@ -259,7 +256,22 @@ def start_container(self, rebuild=False): self.stop_container(remove=True, container_obj=current_container) self.container = None return False - + + def exec_code(self, code: str, timeout: int = 300) -> Dict: + """Executes code by sending it to the API inside the container.""" + api_url = f"http://localhost:{API_PORT_HOST}/execute" + try: + response = requests.post( + api_url, + json={"code": code, "timeout": timeout}, + timeout=timeout + 10 + ) + response.raise_for_status() + return response.json() + except requests.RequestException as e: + console.print(f"[bold red]API request to sandbox failed: {e}[/bold red]") + return {"status": "error", "stdout": "", "stderr": f"Host-level request error: {e}"} + def stop_container(self, remove=False, container_obj=None): """Stops the container and optionally removes it.""" # Find the container to stop if not provided diff --git a/cli/olaf/src/olaf/sandbox/requirements.txt b/cli/olaf/src/olaf/sandbox/requirements.txt index a540117..839acf0 100644 --- a/cli/olaf/src/olaf/sandbox/requirements.txt +++ b/cli/olaf/src/olaf/sandbox/requirements.txt @@ -5,7 +5,7 @@ pandas==2.2.2 scikit-learn==1.5.1 numba==0.60.0 statsmodels==0.14.2 -h5py==3.11.0 +h5py==3.10.0 # Visualization matplotlib==3.9.1 @@ -48,5 +48,5 @@ scarches harmonypy # Additional Tools -rapids-singlecell +# rapids-singlecell scib-metrics
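
The `exec_code` helper added in the final patch gives host code a single call for round-tripping a snippet through the sandbox's `/execute` endpoint. A minimal usage sketch follows, under two stated assumptions: the manager's class name (the hunk shows only its methods, so `BenchmarkingSandboxManager` is a guess) and the `status`/`stdout` keys of the success payload (only the error-path shape is confirmed by the diff).

```python
# Hypothetical host-side driver for the exec_code() helper added above.
# Assumptions: the manager class name (the hunk shows only its methods) and
# the success-payload keys; the error-payload shape matches the diff.
from olaf.sandbox.benchmarking_sandbox_management import (
    BenchmarkingSandboxManager,  # hypothetical name -- substitute the real class
)

mgr = BenchmarkingSandboxManager()
if mgr.start_container():
    try:
        # exec_code() POSTs {"code": ..., "timeout": ...} to the in-container
        # FastAPI /execute endpoint and returns the decoded JSON response.
        result = mgr.exec_code("print(2 + 2)", timeout=60)
        if result.get("status") == "error":
            print("sandbox error:", result.get("stderr"))
        else:
            print("sandbox stdout:", result.get("stdout"))
    finally:
        mgr.stop_container(remove=True)
```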