BasisResearch · eb8680 · Oct 9, 2025 · Oct 9, 2025 · Oct 10, 2025 · Oct 10, 2025
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -19,6 +19,10 @@ jobs:
         with:
           enable-cache: true
 
+      - name: Install pandoc  # refresh the apt index first; the runner image's copy can be stale
+        run: |
+          sudo apt-get update && sudo apt-get install -y pandoc
+
       - name: Install dependencies
         run: |
           uv sync --all-extras --dev

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -22,6 +22,10 @@ jobs:
         with:
           enable-cache: true
 
+      - name: Install pandoc  # refresh the apt index first; the runner image's copy can be stale
+        run: |
+          sudo apt-get update && sudo apt-get install -y pandoc
+
       - name: Install Python dependencies
         run: |
           uv sync --all-extras --dev --python ${{ matrix.python-version }}

diff --git a/.github/workflows/test_llm.yml b/.github/workflows/test_llm.yml
@@ -0,0 +1,38 @@
+# Runs tests/test_handlers_llm_provider.py with the OpenAI and Anthropic API keys set.
+name: LLM Integration Tests
+
+on:
+  pull_request:
+    branches:
+      - master
+      - 'staging-*'
+  # Allow manual trigger
+  workflow_dispatch:
+
+jobs:
+  test-llm:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Quoted so YAML keeps the version a string; no inner quotes, because the
+        # value is pasted verbatim into the shell command below.
+        python-version: ["3.13"]
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Install Python dependencies
+        run: |
+          uv sync --all-extras --dev --python ${{ matrix.python-version }}
+
+      - name: Run LLM integration tests
+        env:
+          # API keys are read from the repository secrets of the same name.
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+        run: |
+          uv run pytest tests/test_handlers_llm_provider.py -v --tb=short
diff --git a/.github/workflows/test_notebooks.yml b/.github/workflows/test_notebooks.yml
@@ -26,6 +26,10 @@ jobs:
         with:
           enable-cache: true
 
+      - name: Install pandoc  # refresh the apt index first; the runner image's copy can be stale
+        run: |
+          sudo apt-get update && sudo apt-get install -y pandoc
+
       - name: Install Python packages
         run: |
           uv sync --all-extras --dev

diff --git a/Makefile b/Makefile
@@ -1,3 +1,5 @@
+.PHONY: lint format test test-notebooks rebuild-fixtures FORCE
+
 lint: FORCE
 	./scripts/lint.sh
 
@@ -10,4 +12,7 @@ test: lint FORCE
 test-notebooks: lint FORCE
 	./scripts/test_notebooks.sh
 
+rebuild-fixtures:  # runs the LLM provider tests with REBUILD_FIXTURES=true in the environment
+	REBUILD_FIXTURES=true uv run pytest tests/test_handlers_llm_provider.py
+
 FORCE:
diff --git a/docs/source/beam.py b/docs/source/beam.py
@@ -0,0 +1,182 @@
+"""This example demonstrates a beam search over a program that uses a `choose`
+effect for nondeterminism and `score` effect to weigh its choices.
+
+"""
+
+import functools
+import heapq
+import random
+from collections.abc import Callable
+from dataclasses import dataclass
+from pprint import pprint
+
+from effectful.ops.semantics import fwd, handler
+from effectful.ops.syntax import ObjectInterpretation, defop, implements
+
+
+@defop
+def choose[T](choices: list[T]) -> T:
+    """Pick one element of ``choices``.
+
+    The default implementation draws with ``random.choice`` and prints the draw.
+    """
+    result = random.choice(choices)
+    print(f"choose({choices}) = {result}")
+    return result
+
+
+@defop
+def score(value: float) -> None:
+    """Report ``value`` as a score contribution; the default body does nothing."""
+    pass
+
+
+class Suspend(Exception):
+    """Raised by ``ChooseOnceIntp`` to cut a run of the model short."""
+
+
+class ReplayIntp(ObjectInterpretation):
+    """Answers the first ``len(trace)`` ``choose`` calls from a recorded trace.
+
+    Calls made after the trace is exhausted are forwarded with ``fwd()``.
+    """
+
+    def __init__(self, trace):
+        self.trace = trace
+        self.step = 0
+
+    @implements(choose)
+    def _(self, *args, **kwargs):
+        if self.step < len(self.trace):
+            # Entries are ``((args, kwargs), result)``; hand back the result.
+            result = self.trace[self.step][1]
+            self.step += 1
+            return result
+        return fwd()
+
+
+class TraceIntp(ObjectInterpretation):
+    """Records every ``choose`` call it sees as ``((args, kwargs), result)``."""
+
+    def __init__(self):
+        self.trace = []
+
+    @implements(choose)
+    def _(self, *args, **kwargs):
+        result = fwd()
+        self.trace.append(((args, kwargs), result))
+        return result
+
+
+class ScoreIntp(ObjectInterpretation):
+    """Sums the values passed to ``score`` into ``self.score``."""
+
+    def __init__(self):
+        self.score = 0.0
+
+    @implements(score)
+    def _(self, value):
+        self.score += value
+
+
+class ChooseOnceIntp(ObjectInterpretation):
+    """Forwards its first ``choose`` call and raises ``Suspend`` on later ones."""
+
+    def __init__(self):
+        self.is_first_call = True
+
+    @implements(choose)
+    def _(self, *args, **kwargs):
+        if not self.is_first_call:
+            raise Suspend
+
+        self.is_first_call = False
+        return fwd()
+
+
+@dataclass
+class BeamCandidate[S, T]:
+    """Represents a candidate execution path in beam search."""
+
+    # ``choose`` calls recorded so far, as ``((args, kwargs), result)`` entries.
+    trace: list[S]
+    # Sum of the ``score`` values reported during the run that produced ``trace``.
+    score: float
+    # True while the run has not yet reached the end of the model.
+    in_progress: bool
+    # Return value of the model; ``None`` while the run is still in progress.
+    result: T | None
+
+    def __lt__(self, other: "BeamCandidate[S, T]") -> bool:
+        """Order candidates by score so ``heapq`` keeps the lowest score on top."""
+        return self.score < other.score
+
+    def expand[**P](self, model_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs):
+        """Rerun ``model_fn``, replaying this trace and making one more choice.
+
+        Returns a new candidate; this one is left unmodified.
+        """
+        in_progress = False
+        result = None
+        score_intp = ScoreIntp()
+        trace_intp = TraceIntp()
+        # Relies on the last-listed (innermost) handler seeing ``choose`` first:
+        # the tracer, then the replayer, then the choose-once guard.
+        with (
+            handler(score_intp),
+            handler(ChooseOnceIntp()),
+            handler(ReplayIntp(self.trace)),
+            handler(trace_intp),
+        ):
+            try:
+                result = model_fn(*args, **kwargs)
+            except Suspend:
+                in_progress = True
+
+        return BeamCandidate(trace_intp.trace, score_intp.score, in_progress, result)
+
+
+def beam_search[**P, S, T](
+    model_fn: Callable[P, T], beam_width=3
+) -> Callable[P, list[BeamCandidate[S, T]]]:
+    """Wrap ``model_fn`` so that calling it runs a beam search over its choices.
+
+    The wrapper returns the final beam as a heap-ordered (not sorted) list; it
+    returns once no candidate in the beam is still in progress.
+    """
+
+    @functools.wraps(model_fn)
+    def wrapper(*args, **kwargs):
+        beam = [BeamCandidate([], 0.0, True, None)]
+
+        while True:
+            # Each in-progress candidate is expanded ``beam_width`` times.
+            expandable = [c for c in beam if c.in_progress] * beam_width
+            if not expandable:
+                return beam
+
+            new_candidates = [c.expand(model_fn, *args, **kwargs) for c in expandable]
+
+            for c in new_candidates:
+                # On a full beam, push then drop the lowest score (maybe ``c``).
+                if len(beam) >= beam_width:
+                    heapq.heappushpop(beam, c)
+                else:
+                    heapq.heappush(beam, c)
+
+    return wrapper
+
+
+if __name__ == "__main__":
+
+    def model():
+        s1 = choose(range(100))
+        score(s1)
+        s2 = choose(range(-100, 100))
+        score(s2)
+        s3 = choose(range(-100, 100))
+        score(s3)
+        return s3
+
+    result: list[BeamCandidate] = beam_search(model)()
+    pprint(result)
diff --git a/docs/source/beam_search_example.rst b/docs/source/beam_search_example.rst
@@ -0,0 +1,26 @@
+Angelic Nondeterminism
+======================
+
+Here we give an example of *angelic nondeterminism* in effectful [#f1]_.
+Our model is a nondeterministic program that makes choices using a ``choose`` effect and uses a ``score`` effect to sum up a final score.
+We implement a beam search that optimizes this final score as a handler for the ``choose`` and ``score`` effects.
+
+The beam search works by running the model until it reaches a ``choose``, at which point the continuation is captured.
+This continuation is resumed multiple times with different values from ``choose`` to expand the beam.
+The intermediate score is used to rank the beam candidates.
+
+Because Python does not have support for first-class continuations, we use *thermometer continuations* [#f2]_.
+A thermometer continuation works by tracking any nondeterminism
+(essentially, the model is rerun from the start replaying the ``choose`` effects).
+If ``choose`` is the only source of nondeterminism, then recording the value returned by
+each ``choose`` and replaying those values on a rerun brings the model back to the point where it was suspended.
+
+.. literalinclude:: ./beam.py
+    :language: python
+
+References
+----------
+
+.. [#f1] Li, Z., Solar-Lezama, A., Yue, Y., and Zheng, S., "EnCompass: Enhancing Agent Programming with Search Over Program Execution Paths", 2025. https://arxiv.org/abs/2512.03571
+
+.. [#f2] James Koppel, Gabriel Scherer, and Armando Solar-Lezama. 2018. Capturing the future by replaying the past (functional pearl). Proc. ACM Program. Lang. 2, ICFP, Article 76 (September 2018), 29 pages. https://doi.org/10.1145/3236771
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -12,10 +12,8 @@
 
 import os
 import sys
-from typing import List
 
 sys.path.insert(0, os.path.abspath("../../"))
-import sphinx_rtd_theme  # noqa: E402
 
 # -- Project information -----------------------------------------------------
 
@@ -69,7 +67,7 @@
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns: List[str] = []
+exclude_patterns: list[str] = []
 
 
 # -- Options for HTML output -------------------------------------------------

diff --git a/docs/source/effectful.rst b/docs/source/effectful.rst
@@ -15,7 +15,6 @@ Syntax
    :members:
    :undoc-members:
 
-   .. autofunction:: effectful.ops.syntax.defterm(value: T) -> Expr[T]
    .. autofunction:: effectful.ops.syntax.defdata(value: Term[T]) -> Expr[T]
 
 Semantics
@@ -41,6 +40,27 @@ Handlers
    :undoc-members:
 
 
+LLM
+^^^
+
+.. automodule:: effectful.handlers.llm
+   :members:
+   :undoc-members:
+
+Encoding
+""""""""
+
+.. automodule:: effectful.handlers.llm.encoding
+   :members:
+   :undoc-members:
+
+Providers
+"""""""""
+
+.. automodule:: effectful.handlers.llm.providers
+   :members:
+   :undoc-members:
+
 Jax
 ^^^
 

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -16,6 +16,7 @@ Table of Contents
    minipyro_example
    lambda_example
    semi_ring_example
+   beam_search_example
 
 .. toctree::
    :maxdepth: 2

diff --git a/docs/source/lambda_.py b/docs/source/lambda_.py
@@ -1,5 +1,6 @@
 import functools
-from typing import Annotated, Callable
+from collections.abc import Callable
+from typing import Annotated
 
 from effectful.ops.semantics import coproduct, evaluate, fvsof, fwd, handler
 from effectful.ops.syntax import Scoped, defdata, defop, syntactic_eq
@@ -102,7 +103,7 @@ def sort_add(x: Expr[int], y: Expr[int]) -> Expr[int]:
         case Term(add_, (a, Term(vx, ()))), Term(vy, ()) if add_ == add and id(vx) > id(
             vy
         ):
-            return (a + vy()) + vx()  # type: ignore
+            return (a + vy()) + vx()
         case _:
             return fwd()