From 124bb931fc4e41c010a59ee717c318b10aaa909e Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Tue, 1 Apr 2025 13:43:22 +0300
Subject: [PATCH 01/38] Use sa.sql.text for SQL statement execution in Block
 class

---
 core/src/datayoga_core/blocks/relational/write/block.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/core/src/datayoga_core/blocks/relational/write/block.py b/core/src/datayoga_core/blocks/relational/write/block.py
index 1348d420..39c1ac7d 100644
--- a/core/src/datayoga_core/blocks/relational/write/block.py
+++ b/core/src/datayoga_core/blocks/relational/write/block.py
@@ -10,7 +10,6 @@
 from datayoga_core.context import Context
 from datayoga_core.opcode import OpCode
 from datayoga_core.result import BlockResult, Result, Status
-from sqlalchemy import text
 from sqlalchemy.exc import OperationalError
 
 logger = logging.getLogger("dy")
@@ -210,7 +209,7 @@ def process_records(
     def execute(self, statement: Any, records: List[Dict[str, Any]]):
         """Executes a SQL statement with given records."""
         if isinstance(statement, str):
-            statement = text(statement)
+            statement = sa.sql.text(statement)
 
         logger.debug(f"Executing {statement} on {records}")
         connected = False

From 4773f9aed61249b297b740fd75b7d756fc26accb Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Tue, 1 Apr 2025 13:43:47 +0300
Subject: [PATCH 02/38] Add batch processing capability to Block class and
 update schema for batch_size

---
 .../datayoga_core/blocks/std/read/block.py    | 29 ++++++++++++++++---
 .../blocks/std/read/block.schema.json         | 10 ++++++-
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/core/src/datayoga_core/blocks/std/read/block.py b/core/src/datayoga_core/blocks/std/read/block.py
index b649c88b..184423a1 100644
--- a/core/src/datayoga_core/blocks/std/read/block.py
+++ b/core/src/datayoga_core/blocks/std/read/block.py
@@ -16,19 +16,40 @@ class Block(DyProducer):
 
     def init(self, context: Optional[Context] = None):
         logger.debug(f"Initializing {self.get_block_name()}")
+        self.batch_size = int(self.properties.get("batch_size", 1000))
+        logger.info(f"!Using batch size: {self.batch_size}")
+
+    async def process_batch(self, records: List[Dict[str, Any]]) -> AsyncGenerator[List[Message], None]:
+        """Process records and yield batches according to batch_size"""
+        batch = []
+        for record in records:
+            batch.append(self.get_message(record))
+
+            # When batch is full, yield it
+            if len(batch) >= self.batch_size:
+                logger.info(f"Yielding batch of {len(batch)} records")
+                yield batch
+                batch = []
+
+        # Yield any remaining records
+        if batch:
+            logger.info(f"Yielding final batch of {len(batch)} records")
+            yield batch
 
     async def produce(self) -> AsyncGenerator[List[Message], None]:
         if select.select([sys.stdin, ], [], [], 0.0)[0]:
             # piped data exists
+            all_records = []
             for data in sys.stdin:
-                for record in self.get_records(data):
-                    yield [self.get_message(record)]
+                all_records.extend(self.get_records(data))
         else:
             # interactive mode
             print("Enter data to process:")
             data = input()
-            for record in self.get_records(data):
-                yield [self.get_message(record)]
+            all_records = self.get_records(data)
+
+        async for batch in self.process_batch(all_records):
+            yield batch
 
     @staticmethod
     def get_records(data: str) -> List[Dict[str, Any]]:
diff --git a/core/src/datayoga_core/blocks/std/read/block.schema.json b/core/src/datayoga_core/blocks/std/read/block.schema.json
index 11453dbf..38ad05af 100644
--- a/core/src/datayoga_core/blocks/std/read/block.schema.json
+++ b/core/src/datayoga_core/blocks/std/read/block.schema.json
@@ -1,4 +1,12 @@
 {
   "title": "std.read",
-  "description": "Read from the standard input"
+  "description": "Read from the standard input",
+  "type": "object",
+  "properties": {
+    "batch_size": {
+      "type": "integer",
+      "description": "Number of records to process in a single batch",
+      "default": 1000
+    }
+  }
 }

From c02ab0485bc580febefad6c54716ed218c815399 Mon Sep 17 00:00:00 2001
From: github-actions <github-actions@github.com>
Date: Tue, 1 Apr 2025 10:45:27 +0000
Subject: [PATCH 03/38] update json schemas

---
 schemas/job.schema.json | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/schemas/job.schema.json b/schemas/job.schema.json
index d9031291..340e0e68 100644
--- a/schemas/job.schema.json
+++ b/schemas/job.schema.json
@@ -176,7 +176,15 @@
             "properties": {
               "with": {
                 "title": "std.read",
-                "description": "Read from the standard input"
+                "description": "Read from the standard input",
+                "type": "object",
+                "properties": {
+                  "batch_size": {
+                    "type": "integer",
+                    "description": "Number of records to process in a single batch",
+                    "default": 1000
+                  }
+                }
               }
             }
           }

From 3a43bc1bb95400a02fa5b66f1c48de1b85354039 Mon Sep 17 00:00:00 2001
From: github-actions <github-actions@github.com>
Date: Tue, 1 Apr 2025 10:45:48 +0000
Subject: [PATCH 04/38] update autogenerated docs

---
 docs/reference/blocks/std_read.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/docs/reference/blocks/std_read.md b/docs/reference/blocks/std_read.md
index 32b6904b..aca1c24a 100644
--- a/docs/reference/blocks/std_read.md
+++ b/docs/reference/blocks/std_read.md
@@ -8,4 +8,17 @@ grand_parent: Reference
 Read from the standard input
 
 
+**Properties**
+
+|Name|Type|Description|Required|
+|----|----|-----------|--------|
+|**batch\_size**|`integer`|Number of records to process in a single batch<br/>Default: `1000`<br/>||
+
+**Example**
+
+```yaml
+batch_size: 1000
+
+```
+
 

From 0b51d63147ec5b7716d233edd5bebac2638c03b6 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Tue, 1 Apr 2025 13:52:49 +0300
Subject: [PATCH 05/38] Fix log message to remove unnecessary exclamation mark
 in batch size initialization

---
 core/src/datayoga_core/blocks/std/read/block.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/datayoga_core/blocks/std/read/block.py b/core/src/datayoga_core/blocks/std/read/block.py
index 184423a1..e0b60b13 100644
--- a/core/src/datayoga_core/blocks/std/read/block.py
+++ b/core/src/datayoga_core/blocks/std/read/block.py
@@ -17,7 +17,7 @@ class Block(DyProducer):
     def init(self, context: Optional[Context] = None):
         logger.debug(f"Initializing {self.get_block_name()}")
         self.batch_size = int(self.properties.get("batch_size", 1000))
-        logger.info(f"!Using batch size: {self.batch_size}")
+        logger.info(f"Using batch size: {self.batch_size}")
 
     async def process_batch(self, records: List[Dict[str, Any]]) -> AsyncGenerator[List[Message], None]:
         """Process records and yield batches according to batch_size"""

From 8f4f992a77106536b42d321b32dfca015e6fb0c1 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Sun, 28 Dec 2025 14:42:17 +0200
Subject: [PATCH 06/38] update json schemas

---
 schemas/job.schema.json | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/schemas/job.schema.json b/schemas/job.schema.json
index 74b3e45e..1b2a2533 100644
--- a/schemas/job.schema.json
+++ b/schemas/job.schema.json
@@ -1371,7 +1371,15 @@
             "properties": {
               "with": {
                 "description": "Read from the standard input",
-                "title": "std.read"
+                "properties": {
+                  "batch_size": {
+                    "default": 1000,
+                    "description": "Number of records to process in a single batch",
+                    "type": "integer"
+                  }
+                },
+                "title": "std.read",
+                "type": "object"
               }
             }
           }

From 807d61ad8a0f1a50b3fc45572e7d6e12d934a3a8 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Sun, 28 Dec 2025 15:19:30 +0200
Subject: [PATCH 07/38] Increase timeout for integration tests from 10 to 15
 minutes

---
 .github/workflows/integration-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
index 7b4b819c..66b9ee0d 100644
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@@ -8,7 +8,7 @@ concurrency:
 jobs:
   integration-tests:
     runs-on: ubuntu-22.04
-    timeout-minutes: 10
+    timeout-minutes: 15
 
     steps:
       - name: Check out repository code

From 633d9bfb4420f814ed543dabd416d89f92eb3ddb Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 16:14:10 +0300
Subject: [PATCH 08/38] Add design spec for producer batching unification
 (#400)

Brainstormed design for unifying batch handling across all 7 producer
blocks (std/read, files/read_csv, relational/read, parquet/read,
redis/read_stream, azure/read_event_hub, http/receiver). Closes the
gap behind #294, #295, #296, #377 by making the Producer base class
own batching via a new produce_chunks() hook.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 ...28-producer-batching-unification-design.md | 385 ++++++++++++++++++
 1 file changed, 385 insertions(+)
 create mode 100644 docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md

diff --git a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
new file mode 100644
index 00000000..81692cdc
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
@@ -0,0 +1,385 @@
+# Producer batching unification
+
+**Status:** Design — pending implementation
+**Date:** 2026-05-28
+**Issue:** #400
+**Closes:** #294, #295, #296, #377 (as a side effect of the refactor)
+
+## Problem
+
+Seven producer blocks each handle (or fail to handle) batching differently:
+
+| Producer | Bounded/Streaming | `batch_size` today | Behavior |
+|---|---|---|---|
+| `std/read` | bounded | yes, default 1000 *(on `batch_size_in_std_read_block` branch)* | custom `process_batch` accumulator |
+| `files/read_csv` | bounded | yes, default 1000 | own `islice(reader, batch_size)` loop |
+| `relational/read` | bounded | **no** — hardcoded `fetchmany(10000)` | yields one row at a time downstream (bug) |
+| `parquet/read` | bounded | **no** | yields one row at a time (bug) |
+| `redis/read_stream` | streaming | **no** | yields one record at a time (bug #377) |
+| `azure/read_event_hub` | streaming | yes, default 300, **but** controls *SDK callback batch size*, not pipeline batch size | drains internal queue in unbounded batches |
+| `http/receiver` | streaming | **no** | yields one record per HTTP request (bug) |
+
+Four are actively buggy (yielding single records into the pipeline when batches are intended). One uses `batch_size` with a different semantic. Each producer that has implemented batching has done it differently.
+
+The duplication is the root cause of issues #294, #295, #296, and #377 — all four are the same gap, in different blocks.
+
+## Goal
+
+Make the `Producer` base class own batching. Subclasses describe how to fetch records; the base class controls the size and timing of batches yielded to the pipeline.
+
+After the change:
+
+- `batch_size` means the same thing in every producer: the maximum number of records yielded per downstream batch.
+- Adding a new producer cannot reintroduce the "yield single records" bug — there's no place for it to happen.
+- Streaming producers get an optional `flush_ms` so partial batches flush on inactivity instead of being held indefinitely.
+
+Non-goals: changing the `Job`/`Step` pipeline, adding new sources, restructuring the Result/payload model (that's #245).
+
+## Design
+
+### Base-class contract
+
+```python
+# core/src/datayoga_core/producer.py
+
+class Producer(Block):
+    DEFAULT_BATCH_SIZE = 1000
+    DEFAULT_FLUSH_MS = None  # streaming subclasses override
+
+    @abstractmethod
+    async def produce_chunks(self) -> AsyncGenerator[List[Message], None]:
+        """Yield natural chunks of any size. Base class re-chunks to batch_size."""
+        raise NotImplementedError
+
+    async def produce(self) -> AsyncGenerator[List[Message], None]:
+        """Public entry point. Reads chunks from produce_chunks() and re-emits
+        in exact batch_size slices, with optional time-based flush."""
+        ...
+```
+
+Subclasses override `produce_chunks` instead of `produce`. They emit chunks of any size — whatever's natural to the source (a Parquet row group, a `fetchmany` result, an `xreadgroup` response, an Event Hub callback batch, a single record).
+
+The base class accumulates chunks and re-emits them in exact `batch_size` slices, flushing whatever's left on end-of-stream.
+
+### `batch_size` and `flush_ms` are read lazily
+
+`produce()` reads `self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)` on first call, not in `init()`. This avoids the "subclass forgot `super().init(context)`" footgun.
+
+### `flush_ms` implementation
+
+For streaming sources, partial batches must flush on inactivity, otherwise a low-traffic stream could hold records indefinitely.
+
+Implementation uses an internal queue + background pump task, mirroring the pattern already in `azure/read_event_hub`:
+
+```python
+async def produce(self) -> AsyncGenerator[List[Message], None]:
+    batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
+    flush_ms = self.properties.get("flush_ms", self.DEFAULT_FLUSH_MS)
+    timeout = (flush_ms / 1000) if flush_ms is not None else None
+
+    queue: asyncio.Queue[Optional[List[Message]]] = asyncio.Queue()
+    EOS = object()
+
+    async def pump():
+        try:
+            async for chunk in self.produce_chunks():
+                if chunk:
+                    await queue.put(chunk)
+        finally:
+            await queue.put(EOS)
+
+    pump_task = asyncio.create_task(pump())
+    buffer: List[Message] = []
+    try:
+        while True:
+            try:
+                item = await asyncio.wait_for(queue.get(), timeout=timeout)
+            except asyncio.TimeoutError:
+                if buffer:
+                    yield buffer
+                    buffer = []
+                continue
+
+            if item is EOS:
+                if buffer:
+                    yield buffer
+                return
+
+            buffer.extend(item)
+            while len(buffer) >= batch_size:
+                yield buffer[:batch_size]
+                buffer = buffer[batch_size:]
+    finally:
+        pump_task.cancel()
+        with suppress(asyncio.CancelledError):
+            await pump_task
+```
+
+Why a queue and not `asyncio.wait_for(anext(gen), timeout)`: cancelling `__anext__` on an async generator with side effects (open connections, partial reads) can leave it in a broken state. Cancelling the *pump task* boundary is safe; the generator finishes its current chunk before the pump's `try/finally` runs.
+
+`flush_ms = None` ⇒ `timeout = None` ⇒ `queue.get()` waits forever ⇒ no time-based flush. Bounded sources don't set `flush_ms` and aren't affected.
+
+### Schema fragments
+
+Two shared fragments in `core/src/datayoga_core/resources/schemas/`:
+
+`batchable.schema.json`:
+```json
+{
+  "type": "object",
+  "properties": {
+    "batch_size": {
+      "type": "integer",
+      "minimum": 1,
+      "description": "Maximum number of records yielded per downstream batch",
+      "default": 1000
+    }
+  }
+}
+```
+
+`streamable.schema.json`:
+```json
+{
+  "type": "object",
+  "allOf": [{ "$ref": "batchable.schema.json" }],
+  "properties": {
+    "flush_ms": {
+      "type": ["integer", "null"],
+      "minimum": 1,
+      "description": "If set, flush a partial batch after this many ms of inactivity. null = wait until batch_size or end-of-stream.",
+      "default": 1000
+    }
+  }
+}
+```
+
+Bounded producer schemas `$ref` `batchable`; streaming producer schemas `$ref` `streamable`. The fragments are the single source of truth for the description, validation, and default.
+
+### Per-producer changes
+
+**`std/read`** (bounded)
+
+Replace `process_batch` with a single-chunk yield. Base class slices.
+
+```python
+async def produce_chunks(self):
+    if select.select([sys.stdin], [], [], 0.0)[0]:
+        all_records = [r for line in sys.stdin for r in self.get_records(line)]
+    else:
+        print("Enter data to process:")
+        all_records = self.get_records(input())
+    if all_records:
+        yield [self.get_message(r) for r in all_records]
+```
+
+**`files/read_csv`** (bounded)
+
+Keeps the `islice` read loop but moves it into `produce_chunks`, yielding up to `batch_size` rows per chunk. Base class re-emits.
+
+```python
+async def produce_chunks(self):
+    batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
+    with open(self.file, "r", encoding=self.encoding) as f:
+        reader = DictReader(f, fieldnames=self.fields,
+                            delimiter=self.delimiter, quotechar=self.quotechar)
+        for _ in range(self.skip):
+            next(reader, None)
+        counter = iter(count())
+        while True:
+            chunk = [{self.MSG_ID_FIELD: f"{next(counter)}", **r}
+                     for r in islice(reader, batch_size)]
+            if not chunk:
+                return
+            yield chunk
+```
+
+**`relational/read`** (bounded)
+
+`batch_size` uses the framework default (1000). `fetch_size` defaults to **10000** to preserve today's driver-roundtrip count as the no-config baseline. Result: strict improvement vs. today (downstream goes from 1-record batches to 1000-record batches; rows fetched per DB roundtrip stay at 10000).
+
+```python
+class Block(DyProducer):
+    DEFAULT_FETCH_SIZE = 10000
+
+    async def produce_chunks(self):
+        fetch_size = int(self.properties.get("fetch_size", self.DEFAULT_FETCH_SIZE))
+        result = self.connection.execution_options(stream_results=True).execute(self.tbl.select())
+        while True:
+            rows = result.fetchmany(fetch_size)
+            if not rows:
+                return
+            yield [utils.add_uid(dict(r._asdict())) for r in rows]
+```
+
+Schema adds optional `fetch_size` with default 10000.
+
+**`parquet/read`** (bounded)
+
+Fix one-by-one yield. Each row group becomes one chunk; base class re-emits in `batch_size` slices.
+
+```python
+async def produce_chunks(self):
+    pf = ParquetFile(self.file)
+    counter = iter(count())
+    for df in pf.iter_row_groups():
+        yield [{self.MSG_ID_FIELD: str(next(counter)), **row.to_dict()}
+               for _, row in df.iterrows()]
+```
+
+**`redis/read_stream`** (streaming, closes #377)
+
+Use `count=batch_size` on `xreadgroup`. Yield each batch as a chunk. Class overrides `DEFAULT_FLUSH_MS = 1000`.
+
+```python
+class Block(DyProducer):
+    DEFAULT_FLUSH_MS = 1000
+
+    async def produce_chunks(self):
+        batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
+        read_pending = True
+        while True:
+            streams = self.redis_client.xreadgroup(
+                self.consumer_group, self.requesting_consumer,
+                {self.stream: "0" if read_pending else ">"},
+                count=batch_size,
+                block=100 if self.snapshot else 0,  # streaming blocks forever; snapshot polls briefly
+            )
+            for stream in streams:
+                chunk = []
+                for key, value in stream[1]:
+                    payload = orjson.loads(value[next(iter(value))])
+                    payload[self.MSG_ID_FIELD] = key
+                    chunk.append(payload)
+                if chunk:
+                    yield chunk
+            if self.snapshot and not read_pending:
+                return
+            read_pending = False
+```
+
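+`flush_ms` (default 1000) ensures partial batches flush during low-volume periods. The pump task can sit blocked inside `xreadgroup` indefinitely — that's fine, because the pump and the consumer side of the base-class queue are decoupled. When a single message finally arrives, it lands in the queue immediately and `flush_ms` flushes the partial batch downstream. NOTE(review): this decoupling assumes the blocking `xreadgroup` call does not stall the event loop; the call sketched above is not awaited, so if `redis_client` is a synchronous client it would need to run in an executor (or use an async client) for the `flush_ms` timer to fire — confirm during implementation.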
+
+**`azure/read_event_hub`** (streaming, breaking change)
+
+Existing `batch_size` property → renamed `max_batch_size` (matches SDK semantic, default 300). New `batch_size` (pipeline semantic, default 1000) comes from the streamable fragment.
+
+```python
+class Block(DyProducer):
+    DEFAULT_FLUSH_MS = 1000
+
+    def init(self, context=None):
+        self.max_batch_size = int(self.properties.get("max_batch_size", 300))
+        # ... existing client setup ...
+        self.events = {}
+        self.messages = asyncio.Queue()
+
+    async def produce_chunks(self):
+        asyncio.create_task(self.receive_batch())  # uses self.max_batch_size
+        while True:
+            msg = await self.messages.get()
+            chunk = [msg]
+            while not self.messages.empty():
+                chunk.append(self.messages.get_nowait())
+            yield chunk
+```
+
+**Migration:** Users with `batch_size: 300` in YAML thinking it controls SDK callbacks must rename to `max_batch_size: 300`. No backward-compat shim. Called out in CHANGELOG.
+
+The schema for `azure/read_event_hub` also gains `additionalProperties: false` (it doesn't have it today). Without this, an old `batch_size: 300` in YAML would silently be ignored as an unknown property after the rename. With it, validation fails loudly with a clear error.
+
+**`http/receiver`** (streaming)
+
+Drain the queue per chunk; `flush_ms` flushes partial batches when traffic is low.
+
+```python
+class Block(DyProducer):
+    DEFAULT_FLUSH_MS = 1000
+
+    async def produce_chunks(self):
+        queue: Queue = Queue(maxsize=1000)
+        async def handler(request):
+            try:
+                queue.put_nowait(orjson.loads(await request.read()))
+                return HTTPOk()
+            except Exception:
+                logger.exception("Got exception while parsing request:")
+                return HTTPInternalServerError()
+        runner = ServerRunner(Server(handler))
+        await runner.setup()
+        srv = TCPSite(runner, self.host, self.port)
+        await srv.start()
+        try:
+            counter = iter(count())
+            while True:
+                msg = await queue.get()
+                chunk = [{self.MSG_ID_FIELD: f"{next(counter)}", **msg}]
+                while not queue.empty():
+                    chunk.append({self.MSG_ID_FIELD: f"{next(counter)}", **queue.get_nowait()})
+                yield chunk
+        finally:
+            with suppress(Exception):
+                await srv.stop()
+```
+
+### Defaults summary
+
+| Producer | `batch_size` | `flush_ms` | Other |
+|---|---|---|---|
+| `std/read` | 1000 | — | — |
+| `files/read_csv` | 1000 | — | — |
+| `relational/read` | 1000 | — | optional `fetch_size`, defaults to 10000 |
+| `parquet/read` | 1000 | — | — |
+| `redis/read_stream` | 1000 | 1000 | — |
+| `azure/read_event_hub` | 1000 | 1000 | `max_batch_size` 300 (renamed from old `batch_size`) |
+| `http/receiver` | 1000 | 1000 | — |
+
+## Tests
+
+**New base-class tests** (`core/src/datayoga_core/tests/test_producer_batching.py`):
+
+A `FakeProducer` whose `produce_chunks` yields scripted chunks. Cases:
+
+- One 5000-record chunk + `batch_size=1000` → five batches of 1000.
+- Three chunks of [200, 300, 400] + `batch_size=1000` → one batch of 900 on EOS (no empty trailing).
+- 1500 records + `batch_size=1000` → batches of [1000, 500].
+- `flush_ms=100` with a producer that sleeps 200ms between chunks → partial batches flush on inactivity.
+- `flush_ms=None` holds records indefinitely (asserted with a timeout that the next batch doesn't arrive early).
+- Empty chunk yields are ignored (no empty batches emitted).
+- Pump-task cleanup: cancelling the consumer cancels the pump cleanly (no warnings, no leaks).
+
+**Per-producer tests:**
+
+- `std/read`, `files/read_csv` — existing tests adapted; assert batch counts/sizes match `batch_size`.
+- `relational/read` — assert it yields batches (not single rows); assert `fetch_size` controls driver calls independently of `batch_size`.
+- `parquet/read` — multi-row-group file; batches honor `batch_size` regardless of row-group boundaries.
+- `redis/read_stream` — assert `xreadgroup` called with `count=batch_size`. The `redis_to_relational` integration test (mentioned in #377) provides the end-to-end signal; it depends on the batch-fallback in `relational/write` shipped in commit `7e5b6f7`, which is already in place.
+- `azure/read_event_hub` — assert validation rejects legacy `batch_size: 500` with no `max_batch_size`; assert `max_batch_size: 500, batch_size: 100` results in SDK callbacks of 500 and downstream batches of 100.
+- `http/receiver` — send N records via webhook; assert they land in batches of `batch_size`, or partial batches after `flush_ms`.
+
+## Documentation
+
+- Update `docs/reference/blocks/*_read.md` for each affected producer (`batch_size`, `flush_ms`, `fetch_size`, `max_batch_size` where applicable).
+- Add a section in `docs/processing-strategies.md` explaining the producer batching model: chunked subclass output, base-class re-chunking, `flush_ms` for streaming sources.
+- CHANGELOG entry calling out:
+  - New `batch_size`/`flush_ms` on previously non-batching producers.
+  - **Breaking:** `azure/read_event_hub.batch_size` renamed to `max_batch_size`; the name `batch_size` now means pipeline batch size.
+
+## Risks and trade-offs
+
+1. **`Producer` ABC change.** `produce_chunks` is now the abstract method. Any external/downstream custom producer subclassing `Producer` and overriding `produce()` directly will break. Acceptable given datayoga's surface area; called out in CHANGELOG.
+
+2. **Event Hub silent-semantic-change risk.** The breaking rename is intentional. Adding `additionalProperties: false` to the Event Hub schema (which it lacks today) is part of this change so that old `batch_size: 300` configs fail validation loudly, not get silently ignored.
+
+3. **`flush_ms` semantics on Job shutdown.** When the producer is being cancelled (`Job.shutdown` → `Step.stop`), the pump's `try/finally` ensures `EOS` is queued. The `produce()` loop sees `EOS` and flushes the final partial batch. Verified by the `test_producer_batching` shutdown case.
+
+4. **`relational/read` defaults.** `fetch_size` defaults to 10000 to preserve today's DB roundtrip count. `batch_size` defaults to 1000, matching the framework default. Net effect vs. today: downstream batches grow from 1 to 1000 (huge improvement); DB roundtrips unchanged. Users with memory pressure on large rows can set a smaller `fetch_size` explicitly. Documented in the block's reference page.
+
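+5. **Re-chunking cost.** Lists are sliced with `buffer[:n]` / `buffer[n:]` — O(len(buffer)) per batch, because the tail copy dominates when a chunk is much larger than `batch_size`. Negligible relative to per-record block work for typical chunk sizes; no benchmark required.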
+
+## Out of scope
+
+- Changing the `Result`/payload internal field representation (issue #245).
+- Adding new connector blocks (Snowflake #392, Kafka, S3 #351, RabbitMQ #265, Kinesis #264).
+- Pulling Prometheus out of core (#336).
+- Backpressure / queue sizing changes to the `Step` pipeline.

From 0696f9c8e014086c0b1d22f999b0bbe0047efe05 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 16:28:33 +0300
Subject: [PATCH 09/38] Add implementation plan for producer batching
 unification (#400)

Task-by-task TDD plan covering: schema fragment loader, Producer base
class, and per-producer migrations (std/read, files/read_csv,
parquet/read, relational/read, redis/read_stream, http/receiver,
azure/read_event_hub), plus autogen + docs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 ...026-05-28-producer-batching-unification.md | 2203 +++++++++++++++++
 1 file changed, 2203 insertions(+)
 create mode 100644 docs/superpowers/plans/2026-05-28-producer-batching-unification.md

diff --git a/docs/superpowers/plans/2026-05-28-producer-batching-unification.md b/docs/superpowers/plans/2026-05-28-producer-batching-unification.md
new file mode 100644
index 00000000..a53f0e0f
--- /dev/null
+++ b/docs/superpowers/plans/2026-05-28-producer-batching-unification.md
@@ -0,0 +1,2203 @@
+# Producer Batching Unification Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Move batching out of individual producer blocks into the `Producer` base class so every read block has consistent `batch_size` behavior, and three buggy producers stop yielding single records.
+
+**Architecture:** The `Producer` base class gets a new abstract-by-convention hook `produce_chunks()` that yields lists of any size. Its `produce()` method becomes a re-chunker that emits exact `batch_size` batches, with an optional `flush_ms` timeout-flush for streaming sources. Schema fragments (`batchable.schema.json`, `streamable.schema.json`) provide the shared `batch_size`/`flush_ms` definitions, resolved at load time via a `$inherit` convention. Each of the 7 producer blocks migrates to override `produce_chunks` instead of `produce`.
+
+**Tech Stack:** Python 3.7+, asyncio, jsonschema, pytest (asyncio mode), SQLAlchemy, redis-py, aiohttp, azure-eventhub.
+
+**Spec:** `docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md`
+**Issue:** #400
+
+---
+
+## File Structure
+
+**Created:**
+- `core/src/datayoga_core/resources/schemas/batchable.schema.json` — fragment exposing `batch_size`
+- `core/src/datayoga_core/resources/schemas/streamable.schema.json` — fragment exposing `batch_size` and `flush_ms` (a superset of `batchable`)
+- `core/src/datayoga_core/schema_utils.py` — `$inherit` resolver used by Block + Job
+- `core/src/datayoga_core/tests/__init__.py` — empty, makes the tests package importable
+- `core/src/datayoga_core/tests/test_schema_inherit.py` — tests for the `$inherit` resolver
+- `core/src/datayoga_core/tests/test_producer_batching.py` — base-class batching tests
+- `core/src/datayoga_core/blocks/parquet/read/tests/__init__.py` (if package missing)
+- `core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py`
+- `core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py`
+- `core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py`
+- `core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py`
+- `core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py`
+- `core/src/datayoga_core/blocks/http/receiver/tests/__init__.py`
+- `core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py`
+- `core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py`
+- `core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py`
+- `core/src/datayoga_core/blocks/relational/read/tests/__init__.py`
+- `core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py`
+
+**Modified:**
+- `core/src/datayoga_core/producer.py` — adds `produce_chunks` and a default `produce()` that re-chunks
+- `core/src/datayoga_core/block.py` — `get_json_schema()` runs through `$inherit` resolver
+- `core/src/datayoga_core/job.py` — `get_json_schema()` loop runs each loaded schema through the resolver
+- `core/src/datayoga_core/blocks/std/read/block.py` — replace `process_batch` with `produce_chunks`
+- `core/src/datayoga_core/blocks/std/read/block.schema.json` — use `$inherit: ["batchable"]`
+- `core/src/datayoga_core/blocks/files/read_csv/block.py` — `produce_chunks` (drop `islice` loop in `produce`)
+- `core/src/datayoga_core/blocks/files/read_csv/block.schema.json` — drop inline `batch_size`, add `$inherit`
+- `core/src/datayoga_core/blocks/parquet/read/block.py` — `produce_chunks` per row group
+- `core/src/datayoga_core/blocks/parquet/read/block.schema.json` — add `$inherit`
+- `core/src/datayoga_core/blocks/relational/read/block.py` — `produce_chunks` with `fetch_size`
+- `core/src/datayoga_core/blocks/relational/read/block.schema.json` — add `$inherit` + `fetch_size` property
+- `core/src/datayoga_core/blocks/redis/read_stream/block.py` — `produce_chunks` with `count=batch_size`
+- `core/src/datayoga_core/blocks/redis/read_stream/block.schema.json` — `$inherit: ["streamable"]`
+- `core/src/datayoga_core/blocks/http/receiver/block.py` — `produce_chunks` drains queue
+- `core/src/datayoga_core/blocks/http/receiver/block.schema.json` — `$inherit: ["streamable"]`
+- `core/src/datayoga_core/blocks/azure/read_event_hub/block.py` — `produce_chunks`, rename `batch_size` → `max_batch_size`
+- `core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json` — rename property, add `additionalProperties: false`, `$inherit: ["streamable"]`
+- `schemas/job.schema.json` — regenerated at the end
+- `docs/reference/blocks/*.md` — regenerated at the end
+- `docs/processing-strategies.md` — new section on producer batching
+
+---
+
+## Task 1: Schema fragment loader
+
+Adds the `$inherit` convention and the two shared fragments. After this task, schemas referencing `batchable` / `streamable` get the fragments' properties merged in at load time.
+
+**Files:**
+- Create: `core/src/datayoga_core/resources/schemas/batchable.schema.json`
+- Create: `core/src/datayoga_core/resources/schemas/streamable.schema.json`
+- Create: `core/src/datayoga_core/schema_utils.py`
+- Create: `core/src/datayoga_core/tests/__init__.py`
+- Create: `core/src/datayoga_core/tests/test_schema_inherit.py`
+- Modify: `core/src/datayoga_core/block.py` (lines 44–59)
+- Modify: `core/src/datayoga_core/job.py` (lines 223–244)
+
+- [ ] **Step 1.1: Create the `batchable` fragment**
+
+Create `core/src/datayoga_core/resources/schemas/batchable.schema.json`:
+
+```json
+{
+  "title": "batchable",
+  "description": "Producer batching mixin: declares batch_size for producers that yield records in batches.",
+  "type": "object",
+  "properties": {
+    "batch_size": {
+      "type": "integer",
+      "minimum": 1,
+      "description": "Maximum number of records yielded per downstream batch.",
+      "default": 1000
+    }
+  }
+}
+```
+
+- [ ] **Step 1.2: Create the `streamable` fragment**
+
+Create `core/src/datayoga_core/resources/schemas/streamable.schema.json`:
+
+```json
+{
+  "title": "streamable",
+  "description": "Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources.",
+  "type": "object",
+  "properties": {
+    "batch_size": {
+      "type": "integer",
+      "minimum": 1,
+      "description": "Maximum number of records yielded per downstream batch.",
+      "default": 1000
+    },
+    "flush_ms": {
+      "type": ["integer", "null"],
+      "minimum": 1,
+      "description": "If set, flush a partial batch after this many ms of inactivity. Set to null to disable the timeout and wait until batch_size or end-of-stream.",
+      "default": 1000
+    }
+  }
+}
+```
+
+- [ ] **Step 1.3: Create empty tests package**
+
+If `core/src/datayoga_core/tests/__init__.py` does not exist, create it as an empty file. (Several test modules in this plan live in `core/src/datayoga_core/tests/`; the directory must be importable.)
+
+```bash
+test -f core/src/datayoga_core/tests/__init__.py || touch core/src/datayoga_core/tests/__init__.py
+```
+
+- [ ] **Step 1.4: Write the failing test for `$inherit` resolution**
+
+Create `core/src/datayoga_core/tests/test_schema_inherit.py`:
+
+```python
+import json
+from pathlib import Path
+
+import pytest
+
+from datayoga_core.schema_utils import resolve_inherits
+
+
+SCHEMAS_DIR = (
+    Path(__file__).resolve().parent.parent / "resources" / "schemas"
+)
+
+
+def test_inherit_merges_fragment_properties():
+    schema = {
+        "title": "demo",
+        "type": "object",
+        "$inherit": ["batchable"],
+        "properties": {"foo": {"type": "string"}},
+        "additionalProperties": False,
+    }
+    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
+    assert "$inherit" not in resolved
+    assert "batch_size" in resolved["properties"]
+    assert resolved["properties"]["batch_size"]["default"] == 1000
+    assert resolved["properties"]["foo"] == {"type": "string"}
+    assert resolved["additionalProperties"] is False
+
+
+def test_inherit_local_property_wins_over_fragment():
+    schema = {
+        "type": "object",
+        "$inherit": ["batchable"],
+        "properties": {
+            "batch_size": {"type": "integer", "minimum": 1, "default": 50}
+        },
+    }
+    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
+    assert resolved["properties"]["batch_size"]["default"] == 50
+
+
+def test_inherit_streamable_brings_both_props():
+    schema = {"type": "object", "$inherit": ["streamable"], "properties": {}}
+    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
+    assert "batch_size" in resolved["properties"]
+    assert "flush_ms" in resolved["properties"]
+
+
+def test_schema_without_inherit_unchanged():
+    schema = {
+        "type": "object",
+        "properties": {"foo": {"type": "string"}},
+        "additionalProperties": False,
+    }
+    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
+    assert resolved == schema
+
+
+def test_unknown_fragment_raises():
+    schema = {"type": "object", "$inherit": ["nope"], "properties": {}}
+    with pytest.raises(FileNotFoundError):
+        resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
+```
+
+- [ ] **Step 1.5: Run test to verify it fails**
+
+Run:
+```bash
+cd core && python -m pytest src/datayoga_core/tests/test_schema_inherit.py -v
+```
+
+Expected: FAIL with `ModuleNotFoundError: No module named 'datayoga_core.schema_utils'`.
+
+- [ ] **Step 1.6: Implement the resolver**
+
+Create `core/src/datayoga_core/schema_utils.py`:
+
+```python
+"""Schema composition helpers.
+
+Producers and other blocks can declare `"$inherit": ["batchable"]` at the
+top of their block.schema.json to pull in shared property definitions from
+the fragments in resources/schemas/. `resolve_inherits` merges the
+fragments' `properties` into the local schema (local properties win), then
+removes the `$inherit` key. Schemas without `$inherit` are returned as-is.
+"""
+from __future__ import annotations
+
+import copy
+from os import path
+from typing import Any, Dict, List
+
+from datayoga_core import utils
+
+
+def resolve_inherits(schema: Dict[str, Any], schemas_dir: str = None) -> Dict[str, Any]:
+    """Merge any fragments listed in $inherit into the schema's properties.
+
+    Args:
+        schema: The schema to resolve. Mutated in place and also returned.
+        schemas_dir: Directory containing the fragment files. Defaults to
+            the bundled/non-bundled resources/schemas directory.
+
+    Returns:
+        The mutated schema with $inherit removed and fragment properties merged.
+    """
+    inherits: List[str] = schema.get("$inherit") or []
+    if not inherits:
+        return schema
+
+    if schemas_dir is None:
+        schemas_dir = utils.get_resource_path("schemas")
+
+    merged_properties: Dict[str, Any] = {}
+    for fragment_name in inherits:
+        fragment_path = path.join(schemas_dir, f"{fragment_name}.schema.json")
+        if not path.isfile(fragment_path):
+            raise FileNotFoundError(
+                f"Schema fragment '{fragment_name}' not found at {fragment_path}"
+            )
+        fragment = utils.read_json(fragment_path)
+        merged_properties.update(copy.deepcopy(fragment.get("properties", {})))
+
+    # Local properties take precedence over inherited ones.
+    local_properties = schema.get("properties", {})
+    merged_properties.update(local_properties)
+
+    schema["properties"] = merged_properties
+    schema.pop("$inherit", None)
+    return schema
+```
+
+- [ ] **Step 1.7: Run test to verify it passes**
+
+Run:
+```bash
+cd core && python -m pytest src/datayoga_core/tests/test_schema_inherit.py -v
+```
+
+Expected: 5 passed.
+
+- [ ] **Step 1.8: Wire resolver into `Block.get_json_schema`**
+
+Modify `core/src/datayoga_core/block.py`. After loading the schema (currently `return utils.read_json(json_schema_file)` on line 59), pass it through the resolver.
+
+Replace lines 44–59 with:
+
+```python
+    def get_json_schema(self) -> Dict[str, Any]:
+        """Returns the JSON Schema for this block.
+
+        Returns:
+            Dict[str, Any]: JSON Schema.
+        """
+        json_schema_file = path.join(
+            utils.get_bundled_dir(),
+            os.path.relpath(
+                os.path.dirname(sys.modules[self.__module__].__file__),
+                start=os.path.dirname(__file__)),
+            "block.schema.json") if utils.is_bundled() else path.join(
+            os.path.dirname(os.path.realpath(sys.modules[self.__module__].__file__)),
+            "block.schema.json")
+        logger.debug(f"loading schema from {json_schema_file}")
+        from datayoga_core.schema_utils import resolve_inherits
+        return resolve_inherits(utils.read_json(json_schema_file))
+```
+
+Note: the `from datayoga_core.schema_utils import resolve_inherits` line is inside the function to avoid a circular import (schema_utils imports from utils, utils imports from block).
+
+- [ ] **Step 1.9: Wire resolver into `Job.get_json_schema`**
+
+Modify `core/src/datayoga_core/job.py`. Inside the `for block_type, schema_path in block_info:` loop (around line 240–243), apply the resolver to each loaded schema.
+
+Find this block:
+```python
+        for block_type, schema_path in block_info:
+            block_types.append(block_type)
+            # load schema file
+            schema = utils.read_json(f"{schema_path}")
+            # append to the array of allOf for the full schema
+```
+
+Replace with:
+```python
+        from datayoga_core.schema_utils import resolve_inherits
+        for block_type, schema_path in block_info:
+            block_types.append(block_type)
+            # load schema file
+            schema = resolve_inherits(utils.read_json(f"{schema_path}"))
+            # append to the array of allOf for the full schema
+```
+
+- [ ] **Step 1.10: Verify existing block validation still passes**
+
+Run the full core test suite to make sure nothing regressed (no producer is using `$inherit` yet, so behavior should be unchanged):
+
+```bash
+cd core && python -m pytest src/datayoga_core/ -x -q
+```
+
+Expected: all existing tests pass; the 5 new `test_schema_inherit.py` tests also pass.
+
+- [ ] **Step 1.11: Commit**
+
+```bash
+git add core/src/datayoga_core/resources/schemas/batchable.schema.json \
+        core/src/datayoga_core/resources/schemas/streamable.schema.json \
+        core/src/datayoga_core/schema_utils.py \
+        core/src/datayoga_core/tests/__init__.py \
+        core/src/datayoga_core/tests/test_schema_inherit.py \
+        core/src/datayoga_core/block.py \
+        core/src/datayoga_core/job.py
+git commit -m "Add \$inherit schema fragment resolver (#400)"
+```
+
+---
+
+## Task 2: Producer base class with batching
+
+Add `produce_chunks()` and a default `produce()` that re-chunks. Existing subclasses override `produce()` directly and are unaffected until migrated in later tasks.
+
+**Files:**
+- Create: `core/src/datayoga_core/tests/test_producer_batching.py`
+- Modify: `core/src/datayoga_core/producer.py`
+
+- [ ] **Step 2.1: Write the failing tests**
+
+Create `core/src/datayoga_core/tests/test_producer_batching.py`:
+
+```python
+import asyncio
+from typing import AsyncGenerator, List, Optional
+
+import pytest
+
+from datayoga_core.context import Context
+from datayoga_core.producer import Message, Producer
+
+
+def _msg(i: int) -> dict:
+    return {Producer.MSG_ID_FIELD: str(i), "v": i}
+
+
+class FakeProducer(Producer):
+    """Producer driven by a scripted list of chunks plus optional sleeps."""
+
+    def __init__(self, properties=None, *, chunks=None, sleep_before=None):
+        # schema for a FakeProducer; declare batch_size/flush_ms so validation passes
+        self._test_schema = {
+            "type": "object",
+            "properties": {
+                "batch_size": {"type": "integer", "minimum": 1},
+                "flush_ms": {"type": ["integer", "null"], "minimum": 1},
+            },
+        }
+        self._chunks = chunks or []
+        self._sleep_before = sleep_before or []
+        super().__init__(properties or {})
+
+    def get_json_schema(self):
+        return self._test_schema
+
+    def init(self, context: Optional[Context] = None):
+        pass
+
+    async def produce_chunks(self) -> AsyncGenerator[List[Message], None]:
+        for i, chunk in enumerate(self._chunks):
+            if i < len(self._sleep_before) and self._sleep_before[i]:
+                await asyncio.sleep(self._sleep_before[i])
+            yield chunk
+
+
+async def _drain(producer: Producer):
+    out = []
+    async for batch in producer.produce():
+        out.append(batch)
+    return out
+
+
+@pytest.mark.asyncio
+async def test_rechunks_one_large_chunk():
+    chunks = [[_msg(i) for i in range(5000)]]
+    p = FakeProducer({"batch_size": 1000}, chunks=chunks)
+    batches = await _drain(p)
+    assert [len(b) for b in batches] == [1000, 1000, 1000, 1000, 1000]
+
+
+@pytest.mark.asyncio
+async def test_accumulates_small_chunks_and_flushes_on_eos():
+    chunks = [[_msg(i) for i in range(200)],
+              [_msg(i) for i in range(200, 500)],
+              [_msg(i) for i in range(500, 900)]]
+    p = FakeProducer({"batch_size": 1000}, chunks=chunks)
+    batches = await _drain(p)
+    assert [len(b) for b in batches] == [900]
+
+
+@pytest.mark.asyncio
+async def test_partial_final_batch_on_eos():
+    chunks = [[_msg(i) for i in range(1500)]]
+    p = FakeProducer({"batch_size": 1000}, chunks=chunks)
+    batches = await _drain(p)
+    assert [len(b) for b in batches] == [1000, 500]
+
+
+@pytest.mark.asyncio
+async def test_empty_chunks_are_ignored():
+    chunks = [[], [_msg(1), _msg(2)], [], [_msg(3)]]
+    p = FakeProducer({"batch_size": 10}, chunks=chunks)
+    batches = await _drain(p)
+    assert [len(b) for b in batches] == [3]
+
+
+@pytest.mark.asyncio
+async def test_flush_ms_emits_partial_on_inactivity():
+    # one chunk of 2 records, then a 300ms wait before the second chunk (1 record);
+    # flush_ms=100 should flush the partial batch of 2 well before that chunk arrives.
+    chunks = [[_msg(1), _msg(2)], [_msg(3)]]
+    sleeps = [0, 0.3]
+    p = FakeProducer({"batch_size": 100, "flush_ms": 100},
+                     chunks=chunks, sleep_before=sleeps)
+
+    received = []
+    started = asyncio.get_event_loop().time()
+    timings = []
+    async for batch in p.produce():
+        timings.append(asyncio.get_event_loop().time() - started)
+        received.append(batch)
+
+    assert [len(b) for b in received] == [2, 1]
+    # first flush happens because of inactivity (~100ms), not waiting for chunk 2
+    assert timings[0] < 0.25, f"expected first flush before 250ms, got {timings[0]}"
+
+
+@pytest.mark.asyncio
+async def test_no_flush_ms_holds_records_until_eos():
+    chunks = [[_msg(1)], [_msg(2)]]
+    sleeps = [0, 0.1]
+    p = FakeProducer({"batch_size": 100}, chunks=chunks, sleep_before=sleeps)
+    batches = await _drain(p)
+    assert [len(b) for b in batches] == [2]  # combined on EOS, never flushed mid-stream
+
+
+@pytest.mark.asyncio
+async def test_consumer_cancellation_cleans_up_pump():
+    chunks = [[_msg(i)] for i in range(1000)]
+    p = FakeProducer({"batch_size": 10, "flush_ms": 50}, chunks=chunks,
+                     sleep_before=[0.05] * 1000)
+
+    gen = p.produce()
+    first = await gen.__anext__()
+    assert len(first) >= 1
+    await gen.aclose()
+    # If pump task wasn't cleaned up we'd see a "Task was destroyed but it is
+    # pending!" warning here. Sleep briefly so the loop has a chance to surface it.
+    await asyncio.sleep(0.1)
+```
+
+- [ ] **Step 2.2: Run tests to verify they fail**
+
+Run:
+```bash
+cd core && python -m pytest src/datayoga_core/tests/test_producer_batching.py -v
+```
+
+Expected: All 7 tests FAIL with `TypeError: Can't instantiate abstract class FakeProducer with abstract methods produce` (because `produce` is currently abstract and `FakeProducer` doesn't override it; it overrides `produce_chunks` which doesn't exist yet).
+
+- [ ] **Step 2.3: Implement the new `Producer` base class**
+
+Replace the contents of `core/src/datayoga_core/producer.py` with:
+
+```python
+import asyncio
+import logging
+from contextlib import suppress
+from typing import Any, AsyncGenerator, Dict, List
+
+from .block import Block
+
+logger = logging.getLogger("dy")
+
+
+class Message:
+    def __init__(self, msg_id: str, value: Dict[str, Any]):
+        self.msg_id = msg_id
+        self.value = value
+
+
+class Producer(Block):
+    """Base class for producer (read) blocks.
+
+    Subclasses override `produce_chunks()` to yield chunks of any size from
+    the source. The default `produce()` re-chunks them to exactly `batch_size`
+    records per batch (smaller on flush_ms timeout or end-of-stream).
+
+    Legacy subclasses may still override `produce()` directly. They bypass
+    the base-class batching and `produce_chunks` is not called.
+    """
+
+    DEFAULT_BATCH_SIZE = 1000
+    DEFAULT_FLUSH_MS = None  # streaming subclasses override to enable timeout flush
+
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        """Yield natural-size chunks from the source.
+
+        Subclasses should override this method. The base-class `produce()`
+        will re-chunk the output to exact `batch_size` slices.
+        """
+        raise NotImplementedError(
+            f"{type(self).__name__} must override produce_chunks() or produce()"
+        )
+        # Make this an async generator for type-checking purposes.
+        yield  # pragma: no cover
+
+    async def produce(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        """Re-chunks `produce_chunks()` output to exact batch_size batches.
+
+        Reads `batch_size` and `flush_ms` from properties lazily so subclasses
+        don't need to remember to call `super().init()`.
+        """
+        batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
+        flush_ms = self.properties.get("flush_ms", self.DEFAULT_FLUSH_MS)
+        timeout = (flush_ms / 1000) if flush_ms else None
+
+        queue: asyncio.Queue = asyncio.Queue()
+        EOS = object()
+
+        async def pump():
+            try:
+                async for chunk in self.produce_chunks():
+                    if chunk:
+                        await queue.put(chunk)
+            except asyncio.CancelledError:
+                raise
+            except Exception as exc:
+                logger.exception("produce_chunks raised; ending stream: %s", exc)
+            finally:
+                await queue.put(EOS)
+
+        pump_task = asyncio.create_task(pump())
+        buffer: List[Dict[str, Any]] = []
+        try:
+            while True:
+                try:
+                    item = await asyncio.wait_for(queue.get(), timeout=timeout)
+                except asyncio.TimeoutError:
+                    if buffer:
+                        yield buffer
+                        buffer = []
+                    continue
+
+                if item is EOS:
+                    if buffer:
+                        yield buffer
+                    return
+
+                buffer.extend(item)
+                while len(buffer) >= batch_size:
+                    yield buffer[:batch_size]
+                    buffer = buffer[batch_size:]
+        finally:
+            pump_task.cancel()
+            with suppress(asyncio.CancelledError, Exception):
+                await pump_task
+
+    def ack(self, msg_ids: List[str]):
+        """Sends acknowledge for the message IDs of records that have been processed."""
+        pass
+```
+
+Key differences from the current file:
+- `produce()` is no longer `@abstractmethod` — it has a default implementation.
+- `produce_chunks()` is the new override hook (not formally `@abstractmethod`, so legacy subclasses that only override `produce()` can still be instantiated).
+- `Message` class unchanged.
+
+- [ ] **Step 2.4: Run tests to verify they pass**
+
+Run:
+```bash
+cd core && python -m pytest src/datayoga_core/tests/test_producer_batching.py -v
+```
+
+Expected: 7 passed.
+
+- [ ] **Step 2.5: Run the full core test suite to confirm no regressions**
+
+Existing producers all still override `produce()`, so their behavior is unchanged.
+
+```bash
+cd core && python -m pytest src/datayoga_core/ -x -q
+```
+
+Expected: all tests pass (including the new `test_producer_batching` and `test_schema_inherit`).
+
+- [ ] **Step 2.6: Commit**
+
+```bash
+git add core/src/datayoga_core/producer.py \
+        core/src/datayoga_core/tests/test_producer_batching.py
+git commit -m "Producer base class re-chunks via produce_chunks (#400)"
+```
+
+---
+
+## Task 3: Migrate `std/read`
+
+`std/read` already has `batch_size` and a custom `process_batch` accumulator. Replace it with a `produce_chunks` that yields one chunk; the base class re-chunks.
+
+**Files:**
+- Modify: `core/src/datayoga_core/blocks/std/read/block.py`
+- Modify: `core/src/datayoga_core/blocks/std/read/block.schema.json`
+
+- [ ] **Step 3.1: Write the failing test**
+
+There is no existing `tests/` directory under `std/read`. The std/read producer is exercised indirectly by integration tests, but we add a unit test for batching here.
+
+Create `core/src/datayoga_core/blocks/std/read/tests/__init__.py` (empty file) and `core/src/datayoga_core/blocks/std/read/tests/test_std_read.py`:
+
+```python
+import asyncio
+from unittest.mock import patch
+
+import orjson
+import pytest
+
+from datayoga_core.blocks.std.read.block import Block
+
+
+async def _drain(producer):
+    out = []
+    async for batch in producer.produce():
+        out.append(batch)
+    return out
+
+
+@pytest.mark.asyncio
+async def test_std_read_batches_to_batch_size():
+    payload = [{"i": i} for i in range(2500)]
+    fake_stdin = [orjson.dumps(payload).decode()]
+
+    block = Block({"batch_size": 1000})
+    block.init()
+
+    with patch("datayoga_core.blocks.std.read.block.select.select",
+               return_value=([object()], [], [])), \
+         patch("datayoga_core.blocks.std.read.block.sys.stdin", fake_stdin):
+        batches = await _drain(block)
+
+    assert [len(b) for b in batches] == [1000, 1000, 500]
+    # records carry their MSG_ID_FIELD and original payload values
+    flat = [r for b in batches for r in b]
+    assert flat[0]["i"] == 0
+    assert all(Block.MSG_ID_FIELD in r for r in flat)
+```
+
+- [ ] **Step 3.2: Run test to verify it fails**
+
+Run:
+```bash
+cd core && python -m pytest src/datayoga_core/blocks/std/read/tests/test_std_read.py -v
+```
+
+Expected: FAIL — the current implementation overrides `produce()` directly and batches through its own `process_batch` helper, so the new base-class `produce()` machinery is never exercised. The test may also fail because the batch logic from the `batch_size_in_std_read_block` branch does not interact cleanly with the test mocks. (The point of this step is to drive the migration; the failure shape is secondary.)
+
+- [ ] **Step 3.3: Migrate `std/read` to `produce_chunks`**
+
+Replace the contents of `core/src/datayoga_core/blocks/std/read/block.py` with:
+
+```python
+import logging
+import select
+import sys
+import uuid
+from typing import Any, AsyncGenerator, Dict, List, Optional
+
+import orjson
+from datayoga_core.context import Context
+from datayoga_core.producer import Producer as DyProducer
+
+logger = logging.getLogger("dy")
+
+
+class Block(DyProducer):
+    def init(self, context: Optional[Context] = None):
+        logger.debug(f"Initializing {self.get_block_name()}")
+
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        if select.select([sys.stdin], [], [], 0.0)[0]:
+            all_records: List[Dict[str, Any]] = []
+            for line in sys.stdin:
+                all_records.extend(self.get_records(line))
+        else:
+            print("Enter data to process:")
+            all_records = self.get_records(input())
+
+        if all_records:
+            yield [self.get_message(record) for record in all_records]
+
+    @staticmethod
+    def get_records(data: str) -> List[Dict[str, Any]]:
+        records = orjson.loads(data)
+        if isinstance(records, dict):
+            records = [records]
+        return records
+
+    def get_message(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        return {self.MSG_ID_FIELD: str(uuid.uuid4()), **record}
+```
+
+The `process_batch`, `batch_size` init read, and `produce` override are all gone. The base class handles batching.
+
+- [ ] **Step 3.4: Update the schema to use the fragment**
+
+Replace the contents of `core/src/datayoga_core/blocks/std/read/block.schema.json` with:
+
+```json
+{
+  "title": "std.read",
+  "description": "Read from the standard input",
+  "type": "object",
+  "$inherit": ["batchable"],
+  "properties": {},
+  "additionalProperties": false
+}
+```
+
+The `batch_size` declaration now comes from the fragment.
+
+- [ ] **Step 3.5: Run test to verify it passes**
+
+```bash
+cd core && python -m pytest src/datayoga_core/blocks/std/read/tests/test_std_read.py -v
+```
+
+Expected: PASS.
+
+- [ ] **Step 3.6: Run the full core suite**
+
+```bash
+cd core && python -m pytest src/datayoga_core/ -x -q
+```
+
+Expected: all tests pass.
+
+- [ ] **Step 3.7: Commit**
+
+```bash
+git add core/src/datayoga_core/blocks/std/read/block.py \
+        core/src/datayoga_core/blocks/std/read/block.schema.json \
+        core/src/datayoga_core/blocks/std/read/tests/__init__.py \
+        core/src/datayoga_core/blocks/std/read/tests/test_std_read.py
+git commit -m "Migrate std/read to produce_chunks (#400, #296)"
+```
+
+---
+
+## Task 4: Migrate `files/read_csv`
+
+Replace the `produce()` override and `islice` loop with a `produce_chunks` that yields one chunk per `batch_size` rows. The base class re-chunks to the configured `batch_size`.
+
+**Files:**
+- Modify: `core/src/datayoga_core/blocks/files/read_csv/block.py`
+- Modify: `core/src/datayoga_core/blocks/files/read_csv/block.schema.json`
+
+- [ ] **Step 4.1: Write the failing test**
+
+Create `core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py`:
+
+```python
+from pathlib import Path
+
+import pytest
+
+from datayoga_core.blocks.files.read_csv.block import Block
+
+
+async def _drain(producer):
+    out = []
+    async for batch in producer.produce():
+        out.append(batch)
+    return out
+
+
+@pytest.fixture
+def csv_path(tmp_path) -> Path:
+    p = tmp_path / "data.csv"
+    rows = ["fname,lname"] + [f"first{i},last{i}" for i in range(2500)]
+    p.write_text("\n".join(rows) + "\n", encoding="utf-8")
+    return p
+
+
+@pytest.mark.asyncio
+async def test_csv_batches_to_batch_size(csv_path):
+    block = Block({"file": str(csv_path), "batch_size": 1000, "skip": 1})
+    block.init()
+    batches = await _drain(block)
+    assert [len(b) for b in batches] == [1000, 1000, 500]
+    # message ids are populated
+    assert all(Block.MSG_ID_FIELD in r for b in batches for r in b)
+    # first row content
+    assert batches[0][0]["fname"] == "first0"
+
+
+@pytest.mark.asyncio
+async def test_csv_default_batch_size(csv_path):
+    block = Block({"file": str(csv_path), "skip": 1})
+    block.init()
+    batches = await _drain(block)
+    # default batch_size is 1000
+    assert [len(b) for b in batches] == [1000, 1000, 500]
+```
+
+- [ ] **Step 4.2: Run test to verify it fails**
+
+```bash
+cd core && python -m pytest src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py -v
+```
+
+Expected: the tests may already PASS — the current `produce()` override already batches with `islice`, so this is not a true red step. That's fine; the test exists to *protect* the contract. Proceed to the migration anyway and confirm the test still passes afterward.
+
+- [ ] **Step 4.3: Migrate `files/read_csv` to `produce_chunks`**
+
+Replace the contents of `core/src/datayoga_core/blocks/files/read_csv/block.py` with:
+
+```python
+import logging
+import os
+from abc import ABCMeta
+from contextlib import suppress
+from csv import DictReader
+from itertools import count, islice
+from typing import Any, AsyncGenerator, Dict, List, Optional
+
+from datayoga_core.context import Context
+from datayoga_core.producer import Producer as DyProducer
+
+logger = logging.getLogger("dy")
+
+
+class Block(DyProducer, metaclass=ABCMeta):
+
+    def init(self, context: Optional[Context] = None):
+        logger.debug(f"Initializing {self.get_block_name()}")
+        csv_file = self.properties["file"]
+        if os.path.isabs(csv_file) or context is None:
+            self.file = csv_file
+        else:
+            self.file = os.path.join(context.properties.get("data_path"), csv_file)
+        logger.debug(f"file: {self.file}")
+        self.encoding = self.properties.get("encoding", "utf-8")
+        self.fields = self.properties.get("fields")
+        self.skip = self.properties.get("skip", 0)
+        self.delimiter = self.properties.get("delimiter", ",")
+        self.quotechar = self.properties.get("quotechar", "\"")
+
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        logger.debug("Reading CSV")
+        batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
+
+        with open(self.file, "r", encoding=self.encoding) as read_obj:
+            reader = DictReader(read_obj, fieldnames=self.fields,
+                                delimiter=self.delimiter, quotechar=self.quotechar)
+            for _ in range(self.skip):
+                with suppress(StopIteration):
+                    next(reader)
+            counter = iter(count())
+            while True:
+                chunk = [
+                    {self.MSG_ID_FIELD: f"{next(counter)}", **record}
+                    for record in islice(reader, batch_size)
+                ]
+                if not chunk:
+                    return
+                yield chunk
+```
+
+`init()` no longer reads `batch_size` into `self.batch_size`; the value is read lazily from `self.properties` inside `produce_chunks`.
+
+- [ ] **Step 4.4: Update the schema**
+
+Replace `core/src/datayoga_core/blocks/files/read_csv/block.schema.json` with:
+
+```json
+{
+  "title": "files.read_csv",
+  "description": "Read data from CSV",
+  "type": "object",
+  "$inherit": ["batchable"],
+  "properties": {
+    "file": {
+      "description": "Filename. Can contain a regexp or glob expression",
+      "type": "string"
+    },
+    "encoding": {
+      "description": "Encoding to use for reading the file",
+      "type": "string",
+      "default": "utf-8"
+    },
+    "fields": {
+      "type": "array",
+      "title": "List of columns to use",
+      "description": "List of columns to use for extract",
+      "default": null,
+      "examples": [["fname", "lname"]],
+      "minLength": 1,
+      "additionalItems": true,
+      "items": {
+        "type": "string",
+        "description": "field name",
+        "examples": ["fname"]
+      }
+    },
+    "skip": {
+      "description": "Number of lines to skip",
+      "type": "number",
+      "minimum": 0,
+      "default": 0
+    },
+    "delimiter": {
+      "description": "Delimiter to use for splitting the csv records",
+      "type": "string",
+      "minLength": 1,
+      "maxLength": 1,
+      "default": ","
+    },
+    "quotechar": {
+      "description": "A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '",
+      "type": "string",
+      "minLength": 1,
+      "maxLength": 1,
+      "default": "\""
+    }
+  },
+  "additionalProperties": false,
+  "required": ["file"],
+  "examples": [
+    {
+      "file": "archive.csv",
+      "delimiter": ";"
+    }
+  ]
+}
+```
+
+The `batch_size` inline property is removed; it comes from the `batchable` fragment.
+
+- [ ] **Step 4.5: Run test to verify it passes**
+
+```bash
+cd core && python -m pytest src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py -v
+```
+
+Expected: 2 passed.
+
+- [ ] **Step 4.6: Run the full core suite**
+
+```bash
+cd core && python -m pytest src/datayoga_core/ -x -q
+```
+
+Expected: all tests pass.
+
+- [ ] **Step 4.7: Commit**
+
+```bash
+git add core/src/datayoga_core/blocks/files/read_csv/block.py \
+        core/src/datayoga_core/blocks/files/read_csv/block.schema.json \
+        core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py \
+        core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py
+git commit -m "Migrate files/read_csv to produce_chunks (#400)"
+```
+
+---
+
+## Task 5: Migrate `parquet/read` (fixes one-by-one bug)
+
+Today `parquet/read` iterates each row of each row group and yields a single-record list per iteration. Migrate it to yield each row group as a single chunk; the base class re-chunks to `batch_size`.
+
+**Files:**
+- Modify: `core/src/datayoga_core/blocks/parquet/read/block.py`
+- Modify: `core/src/datayoga_core/blocks/parquet/read/block.schema.json`
+
+- [ ] **Step 5.1: Write the failing test**
+
+Create `core/src/datayoga_core/blocks/parquet/read/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py`:
+
+```python
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from datayoga_core.blocks.parquet.read.block import Block
+
+
+async def _drain(producer):
+    out = []
+    async for batch in producer.produce():
+        out.append(batch)
+    return out
+
+
+@pytest.fixture
+def parquet_path(tmp_path) -> Path:
+    p = tmp_path / "data.parquet"
+    df = pd.DataFrame({"i": list(range(2500))})
+    # row_group_offsets=1000 creates 3 row groups (1000, 1000, 500)
+    from fastparquet import write as fp_write
+    fp_write(str(p), df, row_group_offsets=1000)
+    return p
+
+
+@pytest.mark.asyncio
+async def test_parquet_batches_to_batch_size(parquet_path):
+    block = Block({"file": str(parquet_path), "batch_size": 1000})
+    block.init()
+    batches = await _drain(block)
+    assert [len(b) for b in batches] == [1000, 1000, 500]
+    flat = [r for b in batches for r in b]
+    assert flat[0]["i"] == 0
+    assert all(Block.MSG_ID_FIELD in r for r in flat)
+
+
+@pytest.mark.asyncio
+async def test_parquet_rechunks_across_row_groups(parquet_path):
+    # row groups are [1000, 1000, 500]; batch_size=750 should give batches of
+    # [750, 750, 750, 250] regardless of row group boundaries.
+    block = Block({"file": str(parquet_path), "batch_size": 750})
+    block.init()
+    batches = await _drain(block)
+    assert [len(b) for b in batches] == [750, 750, 750, 250]
+```
+
+- [ ] **Step 5.2: Run test to verify it fails**
+
+```bash
+cd core && python -m pytest src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py -v
+```
+
+Expected: FAIL — current implementation yields batches of size 1, so the assertions fail.
+
+- [ ] **Step 5.3: Migrate `parquet/read`**
+
+Replace the contents of `core/src/datayoga_core/blocks/parquet/read/block.py` with:
+
+```python
+import logging
+import os
+from abc import ABCMeta
+from itertools import count
+from typing import Any, AsyncGenerator, Dict, List, Optional
+
+from datayoga_core.context import Context
+from datayoga_core.producer import Producer as DyProducer
+from fastparquet import ParquetFile
+
+logger = logging.getLogger("dy")
+
+
+class Block(DyProducer, metaclass=ABCMeta):
+
+    def init(self, context: Optional[Context] = None):
+        logger.debug(f"Initializing {self.get_block_name()}")
+        parquet_file = self.properties["file"]
+        if os.path.isabs(parquet_file) or context is None:
+            self.file = parquet_file
+        else:
+            self.file = os.path.join(context.properties.get("data_path"), parquet_file)
+        logger.debug(f"file: {self.file}")
+
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        logger.debug("Reading parquet")
+        pf = ParquetFile(self.file)
+        counter = iter(count())
+        for df in pf.iter_row_groups():
+            yield [
+                {self.MSG_ID_FIELD: str(next(counter)), **row.to_dict()}
+                for _, row in df.iterrows()
+            ]
+```
+
+- [ ] **Step 5.4: Update the schema**
+
+Replace `core/src/datayoga_core/blocks/parquet/read/block.schema.json` with:
+
+```json
+{
+  "title": "parquet.read",
+  "description": "Read data from parquet",
+  "type": "object",
+  "$inherit": ["batchable"],
+  "properties": {
+    "file": {
+      "description": "Filename. Can contain a regexp or glob expression",
+      "type": "string"
+    }
+  },
+  "additionalProperties": false,
+  "required": ["file"],
+  "examples": [
+    {
+      "file": "data.parquet"
+    }
+  ]
+}
+```
+
+- [ ] **Step 5.5: Run test to verify it passes**
+
+```bash
+cd core && python -m pytest src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py -v
+```
+
+Expected: 2 passed.
+
+- [ ] **Step 5.6: Run the full core suite**
+
+```bash
+cd core && python -m pytest src/datayoga_core/ -x -q
+```
+
+Expected: all tests pass.
+
+- [ ] **Step 5.7: Commit**
+
+```bash
+git add core/src/datayoga_core/blocks/parquet/read/block.py \
+        core/src/datayoga_core/blocks/parquet/read/block.schema.json \
+        core/src/datayoga_core/blocks/parquet/read/tests/__init__.py \
+        core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py
+git commit -m "Migrate parquet/read to produce_chunks, fix one-by-one yield (#400, #293)"
+```
+
+---
+
+## Task 6: Migrate `relational/read` (fix bug + add `fetch_size`)
+
+Today `relational/read` does `fetchmany(10000)` then yields one row at a time. Migrate to `produce_chunks` that yields each `fetchmany` result. Add an optional `fetch_size` property; default to 10000 to preserve today's DB round-trip count.
+
+**Files:**
+- Modify: `core/src/datayoga_core/blocks/relational/read/block.py`
+- Modify: `core/src/datayoga_core/blocks/relational/read/block.schema.json`
+
+- [ ] **Step 6.1: Write the failing test**
+
+Create `core/src/datayoga_core/blocks/relational/read/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py`:
+
+```python
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from datayoga_core.blocks.relational.read.block import Block
+
+
+async def _drain(producer):
+    out = []
+    async for batch in producer.produce():
+        out.append(batch)
+    return out
+
+
+def _fake_result(rows):
+    """Build a fake SQLAlchemy result that returns rows in fetchmany chunks."""
+    state = {"i": 0}
+
+    def fetchmany(n):
+        i = state["i"]
+        chunk = rows[i:i + n]
+        state["i"] += len(chunk)
+        return chunk
+
+    res = MagicMock()
+    res.fetchmany.side_effect = fetchmany
+    res.execution_options.return_value = res
+    return res
+
+
+class _Row:
+    def __init__(self, d):
+        self._d = d
+
+    def _asdict(self):
+        return self._d
+
+
+@pytest.mark.asyncio
+async def test_relational_read_yields_batches_not_rows():
+    rows = [_Row({"i": i}) for i in range(2500)]
+    fake_result = _fake_result(rows)
+
+    block = Block.__new__(Block)
+    block.properties = {"batch_size": 1000}
+    block.connection = MagicMock()
+    block.tbl = MagicMock()
+    block.tbl.select.return_value = "SELECT *"
+    block.connection.execution_options.return_value.execute.return_value = fake_result
+
+    batches = await _drain(block)
+    assert [len(b) for b in batches] == [1000, 1000, 500]
+
+
+@pytest.mark.asyncio
+async def test_relational_read_fetch_size_independent_of_batch_size():
+    rows = [_Row({"i": i}) for i in range(5000)]
+    fake_result = _fake_result(rows)
+
+    block = Block.__new__(Block)
+    block.properties = {"batch_size": 1000, "fetch_size": 2500}
+    block.connection = MagicMock()
+    block.tbl = MagicMock()
+    block.tbl.select.return_value = "SELECT *"
+    block.connection.execution_options.return_value.execute.return_value = fake_result
+
+    batches = await _drain(block)
+    # Downstream batches are still batch_size=1000
+    assert [len(b) for b in batches] == [1000, 1000, 1000, 1000, 1000]
+    # Driver fetched in fetch_size=2500 chunks: 2500 + 2500 + 0 = 3 calls
+    fetch_sizes = [c.args[0] for c in fake_result.fetchmany.call_args_list]
+    assert fetch_sizes[0] == 2500
+    assert fetch_sizes[1] == 2500
+
+
+@pytest.mark.asyncio
+async def test_relational_read_default_fetch_size_is_10000():
+    rows = [_Row({"i": i}) for i in range(500)]
+    fake_result = _fake_result(rows)
+
+    block = Block.__new__(Block)
+    block.properties = {}
+    block.connection = MagicMock()
+    block.tbl = MagicMock()
+    block.tbl.select.return_value = "SELECT *"
+    block.connection.execution_options.return_value.execute.return_value = fake_result
+
+    await _drain(block)
+    fetch_sizes = [c.args[0] for c in fake_result.fetchmany.call_args_list]
+    assert fetch_sizes[0] == 10000
+```
+
+- [ ] **Step 6.2: Run test to verify it fails**
+
+```bash
+cd core && python -m pytest src/datayoga_core/blocks/relational/read/tests/test_relational_read.py -v
+```
+
+Expected: FAIL — the current `produce()` yields one row at a time, so `[len(b) for b in batches]` is `[1] * 2500`.
+
+- [ ] **Step 6.3: Migrate `relational/read`**
+
+Replace the contents of `core/src/datayoga_core/blocks/relational/read/block.py` with:
+
+```python
+import logging
+from typing import Any, AsyncGenerator, Dict, List, Optional
+
+import sqlalchemy as sa
+from datayoga_core import utils
+from datayoga_core.blocks.relational import utils as relational_utils
+from datayoga_core.context import Context
+from datayoga_core.producer import Producer as DyProducer
+
+logger = logging.getLogger("dy")
+
+
+class Block(DyProducer):
+    DEFAULT_FETCH_SIZE = 10000
+
+    def init(self, context: Optional[Context] = None):
+        self.engine, self.db_type = relational_utils.get_engine(
+            self.properties["connection"],
+            context,
+            autocommit=False,
+        )
+
+        self.schema = self.properties.get("schema")
+        self.table = self.properties.get("table")
+        self.opcode_field = self.properties.get("opcode_field")
+        self.load_strategy = self.properties.get("load_strategy")
+        self.keys = self.properties.get("keys")
+        self.mapping = self.properties.get("mapping")
+
+        self.tbl = sa.Table(self.table, sa.MetaData(schema=self.schema), autoload_with=self.engine)
+
+        logger.debug(f"Connecting to {self.db_type}")
+        self.connection = self.engine.connect()
+
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        fetch_size = int(self.properties.get("fetch_size", self.DEFAULT_FETCH_SIZE))
+        result = self.connection.execution_options(stream_results=True).execute(self.tbl.select())
+        while True:
+            rows = result.fetchmany(fetch_size)
+            if not rows:
+                return
+            yield [utils.add_uid(dict(row._asdict())) for row in rows]
+
+    def stop(self):
+        self.connection.close()
+        self.engine.dispose()
+```
+
+- [ ] **Step 6.4: Update the schema**
+
+Replace `core/src/datayoga_core/blocks/relational/read/block.schema.json` with:
+
+```json
+{
+  "title": "relational.read",
+  "description": "Read a table from an SQL-compatible data store",
+  "type": "object",
+  "$inherit": ["batchable"],
+  "additionalProperties": false,
+  "examples": [
+    {
+      "id": "read_snowflake",
+      "type": "relational.read",
+      "properties": {
+        "connection": "eu_datalake",
+        "table": "employees",
+        "schema": "dbo"
+      }
+    }
+  ],
+  "properties": {
+    "connection": {
+      "type": "string",
+      "title": "The connection to use for loading",
+      "description": "Logical connection name as defined in the connections.dy.yaml",
+      "examples": ["europe_db", "target", "eu_dwh"]
+    },
+    "schema": {
+      "type": "string",
+      "title": "The table schema of the table",
+      "description": "If left blank, the default schema of this connection will be used as defined in the connections.dy.yaml",
+      "examples": ["dbo"]
+    },
+    "table": {
+      "type": "string",
+      "title": "The table name",
+      "description": "Table name",
+      "examples": ["employees"]
+    },
+    "columns": {
+      "type": "array",
+      "title": "Optional subset of columns to load",
+      "items": {
+        "type": ["string", "object"],
+        "title": "name of column"
+      },
+      "examples": [["fname", { "lname": "last_name" }]]
+    },
+    "fetch_size": {
+      "type": "integer",
+      "minimum": 1,
+      "description": "Driver-level rows fetched per round-trip. Defaults to 10000.",
+      "default": 10000
+    }
+  },
+  "required": ["connection", "table"]
+}
+```
+
+- [ ] **Step 6.5: Run test to verify it passes**
+
+```bash
+cd core && python -m pytest src/datayoga_core/blocks/relational/read/tests/test_relational_read.py -v
+```
+
+Expected: 3 passed.
+
+- [ ] **Step 6.6: Run the full core suite**
+
+```bash
+cd core && python -m pytest src/datayoga_core/ -x -q
+```
+
+Expected: all tests pass.
+
+- [ ] **Step 6.7: Commit**
+
+```bash
+git add core/src/datayoga_core/blocks/relational/read/block.py \
+        core/src/datayoga_core/blocks/relational/read/block.schema.json \
+        core/src/datayoga_core/blocks/relational/read/tests/__init__.py \
+        core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py
+git commit -m "Migrate relational/read to produce_chunks, add fetch_size (#400, #295)"
+```
+
+---
+
+## Task 7: Migrate `http/receiver` (fix one-by-one)
+
+The receiver currently yields one record per HTTP request. Migrate it so that each chunk contains every record currently waiting in the queue; `flush_ms` ensures partial batches are still flushed during low-traffic periods.
+
+**Files:**
+- Modify: `core/src/datayoga_core/blocks/http/receiver/block.py`
+- Modify: `core/src/datayoga_core/blocks/http/receiver/block.schema.json`
+
+- [ ] **Step 7.1: Write the failing test**
+
+Create `core/src/datayoga_core/blocks/http/receiver/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py`:
+
+```python
+import asyncio
+
+import aiohttp
+import pytest
+
+from datayoga_core.blocks.http.receiver.block import Block
+
+
+def _free_port():
+    import socket
+    with socket.socket() as s:
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
+
+
+@pytest.mark.asyncio
+async def test_http_receiver_batches_incoming_requests():
+    port = _free_port()
+    block = Block({"host": "127.0.0.1", "port": port,
+                   "batch_size": 50, "flush_ms": 200})
+    block.init()
+
+    received = []
+
+    async def consumer():
+        async for batch in block.produce():
+            received.append(batch)
+            if sum(len(b) for b in received) >= 60:
+                return
+
+    consumer_task = asyncio.create_task(consumer())
+    await asyncio.sleep(0.2)  # let server start
+
+    async with aiohttp.ClientSession() as session:
+        for i in range(60):
+            async with session.post(f"http://127.0.0.1:{port}", json={"i": i}) as r:
+                assert r.status == 200
+
+    await asyncio.wait_for(consumer_task, timeout=5)
+
+    flat = [r for b in received for r in b]
+    assert len(flat) == 60
+    # Most records arrive in a full batch_size=50 batch; the rest arrive as a
+    # partial batch flushed by flush_ms.
+    assert any(len(b) == 50 for b in received)
+    assert all(Block.MSG_ID_FIELD in r for r in flat)
+```
+
+- [ ] **Step 7.2: Run test to verify it fails**
+
+```bash
+cd core && python -m pytest src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py -v
+```
+
+Expected: FAIL — current implementation yields one record per batch; `assert any(len(b) == 50 ...)` is false.
+
+- [ ] **Step 7.3: Migrate `http/receiver`**
+
+Replace the contents of `core/src/datayoga_core/blocks/http/receiver/block.py` with:
+
+```python
+import logging
+from abc import ABCMeta
+from asyncio import Queue
+from contextlib import suppress
+from itertools import count
+from typing import Any, AsyncGenerator, Dict, List, Optional
+
+import orjson
+from aiohttp.web import (BaseRequest, HTTPInternalServerError, HTTPOk,
+                         Response, Server, ServerRunner, TCPSite)
+from datayoga_core.context import Context
+from datayoga_core.producer import Producer as DyProducer
+
+logger = logging.getLogger("dy")
+
+
+class Block(DyProducer, metaclass=ABCMeta):
+    port: int
+    host: str
+    DEFAULT_FLUSH_MS = 1000
+
+    def init(self, context: Optional[Context] = None):
+        logger.debug(f"Initializing {self.get_block_name()}")
+        self.port = int(self.properties.get("port", 8080))
+        self.host = self.properties.get("host", "0.0.0.0")
+
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        queue: Queue = Queue(maxsize=1000)
+
+        async def handler(request: BaseRequest) -> Response:
+            try:
+                queue.put_nowait(orjson.loads(await request.read()))
+                return HTTPOk()
+            except Exception:
+                logger.exception("Got exception while parsing request:")
+                return HTTPInternalServerError()
+
+        runner = ServerRunner(Server(handler))
+        await runner.setup()
+        srv = TCPSite(runner, self.host, self.port)
+        await srv.start()
+        logger.info(f"Listening on {self.host}:{self.port}...")
+
+        try:
+            counter = iter(count())
+            while True:
+                first = await queue.get()
+                chunk = [{self.MSG_ID_FIELD: f"{next(counter)}", **first}]
+                while not queue.empty():
+                    record = queue.get_nowait()
+                    chunk.append({self.MSG_ID_FIELD: f"{next(counter)}", **record})
+                yield chunk
+        finally:
+            with suppress(Exception):
+                await srv.stop()
+```
+
+- [ ] **Step 7.4: Update the schema**
+
+Replace `core/src/datayoga_core/blocks/http/receiver/block.schema.json` with:
+
+```json
+{
+  "title": "http.receiver",
+  "description": "Receives HTTP requests and process the data.",
+  "type": "object",
+  "$inherit": ["streamable"],
+  "properties": {
+    "host": {
+      "description": "Host to listen",
+      "type": "string",
+      "default": "0.0.0.0"
+    },
+    "port": {
+      "description": "Port to listen",
+      "type": "integer",
+      "default": 8080
+    }
+  },
+  "additionalProperties": false,
+  "examples": [
+    {
+      "host": "localhost",
+      "port": 8080
+    }
+  ]
+}
+```
+
+- [ ] **Step 7.5: Run test to verify it passes**
+
+```bash
+cd core && python -m pytest src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py -v
+```
+
+Expected: 1 passed.
+
+- [ ] **Step 7.6: Run the full core suite**
+
+```bash
+cd core && python -m pytest src/datayoga_core/ -x -q
+```
+
+Expected: all tests pass.
+
+- [ ] **Step 7.7: Commit**
+
+```bash
+git add core/src/datayoga_core/blocks/http/receiver/block.py \
+        core/src/datayoga_core/blocks/http/receiver/block.schema.json \
+        core/src/datayoga_core/blocks/http/receiver/tests/__init__.py \
+        core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
+git commit -m "Migrate http/receiver to produce_chunks (#400)"
+```
+
+---
+
+## Task 8: Migrate `redis/read_stream` (closes #377)
+
+The redis stream producer yields one record at a time today. Migrate so it requests `count=batch_size` from `xreadgroup` and yields each response as a chunk; `flush_ms` flushes partial batches during low-volume periods.
+
+**Files:**
+- Modify: `core/src/datayoga_core/blocks/redis/read_stream/block.py`
+- Modify: `core/src/datayoga_core/blocks/redis/read_stream/block.schema.json`
+
+- [ ] **Step 8.1: Write the failing test**
+
+Create `core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py`:
+
+```python
+from unittest.mock import MagicMock
+
+import pytest
+
+from datayoga_core.blocks.redis.read_stream.block import Block
+
+
+def _mk_block(properties, redis_client):
+    block = Block.__new__(Block)
+    block.properties = properties
+    block.redis_client = redis_client
+    block.stream = "mystream"
+    block.snapshot = properties.get("_snapshot", True)
+    block.consumer_group = "g"
+    block.requesting_consumer = "c"
+    return block
+
+
+@pytest.mark.asyncio
+async def test_redis_uses_count_equal_to_batch_size():
+    redis = MagicMock()
+    # First call returns pending messages, second call returns "no new", which
+    # ends snapshot mode.
+    payload_a = (b"1-0", {b"data": b'{"i": 1}'})
+    payload_b = (b"2-0", {b"data": b'{"i": 2}'})
+    redis.xreadgroup.side_effect = [
+        [(b"mystream", [payload_a, payload_b])],  # pending
+        [(b"mystream", [])],                        # nothing new -> exit
+    ]
+
+    block = _mk_block({"batch_size": 250, "_snapshot": True}, redis)
+    batches = []
+    async for b in block.produce():
+        batches.append(b)
+
+    assert all(c.kwargs.get("count") == 250 or (len(c.args) >= 4 and c.args[3] == 250)
+               for c in redis.xreadgroup.call_args_list), \
+        "xreadgroup should be called with count=batch_size"
+
+
+@pytest.mark.asyncio
+async def test_redis_yields_records_as_a_batch_not_one_by_one():
+    redis = MagicMock()
+    pages = [(f"{i}-0".encode(), {b"data": f'{{"i": {i}}}'.encode()}) for i in range(5)]
+    redis.xreadgroup.side_effect = [
+        [(b"mystream", pages)],
+        [(b"mystream", [])],
+    ]
+
+    block = _mk_block({"batch_size": 100, "_snapshot": True}, redis)
+    batches = []
+    async for b in block.produce():
+        batches.append(b)
+
+    # 5 records arrive as one chunk; base class re-emits as one batch of 5.
+    assert [len(b) for b in batches] == [5]
+    assert batches[0][0]["i"] == 0
+```
+
+- [ ] **Step 8.2: Run test to verify it fails**
+
+```bash
+cd core && python -m pytest src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py -v
+```
+
+Expected: FAIL — current `xreadgroup` call passes `count=None`, and the producer yields one record at a time.
+
+- [ ] **Step 8.3: Migrate `redis/read_stream`**
+
+Replace the contents of `core/src/datayoga_core/blocks/redis/read_stream/block.py` with:
+
+```python
+import logging
+from typing import Any, AsyncGenerator, Dict, List, Optional
+
+import datayoga_core.blocks.redis.utils as redis_utils
+import orjson
+from datayoga_core.connection import Connection
+from datayoga_core.context import Context
+from datayoga_core.producer import Producer as DyProducer
+
+logger = logging.getLogger("dy")
+
+
+class Block(DyProducer):
+    DEFAULT_FLUSH_MS = 1000
+
+    def init(self, context: Optional[Context] = None):
+        logger.debug(f"Initializing {self.get_block_name()}")
+        connection_details = Connection.get_connection_details(self.properties["connection"], context)
+        self.redis_client = redis_utils.get_client(connection_details)
+        self.stream = self.properties["stream_name"]
+        self.snapshot = self.properties.get("snapshot", False)
+        self.consumer_group = f'datayoga_job_{context.properties.get("job_name", "") if context else ""}'
+        self.requesting_consumer = "dy_consumer_a"
+        stream_groups = self.redis_client.xinfo_groups(self.stream)
+        if next(filter(lambda x: x["name"] == self.consumer_group, stream_groups), None) is None:
+            logger.info(f"Creating a new {self.consumer_group} consumer group associated with the {self.stream}")
+            self.redis_client.xgroup_create(self.stream, self.consumer_group, 0)
+
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        logger.debug(f"Running {self.get_block_name()}")
+        batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
+        read_pending = True
+
+        while True:
+            streams = self.redis_client.xreadgroup(
+                self.consumer_group, self.requesting_consumer,
+                {self.stream: "0" if read_pending else ">"},
+                count=batch_size,
+                block=100 if self.snapshot else 0,
+            )
+
+            yielded_any = False
+            for stream in streams:
+                logger.debug(f"Messages in {self.stream} stream (pending: {read_pending}):\n\t{stream}")
+                chunk: List[Dict[str, Any]] = []
+                for key, value in stream[1]:
+                    payload = orjson.loads(value[next(iter(value))])
+                    payload[self.MSG_ID_FIELD] = key
+                    chunk.append(payload)
+                if chunk:
+                    yielded_any = True
+                    yield chunk
+
+            # Snapshot ends after a pending-read followed by a "no new" read.
+            if self.snapshot and not read_pending and not yielded_any:
+                return
+
+            read_pending = False
+
+    def ack(self, msg_ids: List[str]):
+        for msg_id in msg_ids:
+            logger.info(f"Acking {msg_id} message in {self.stream} stream of {self.consumer_group} consumer group")
+            self.redis_client.xack(self.stream, self.consumer_group, msg_id)
+```
+
+Note: snapshot termination is slightly tightened: in snapshot mode the loop now exits only when a non-pending (`>`) read returns no messages, matching the spec's intent. This is more robust than the original `if self.snapshot and not read_pending: break`.
+
+- [ ] **Step 8.4: Update the schema**
+
+Replace `core/src/datayoga_core/blocks/redis/read_stream/block.schema.json` with:
+
+```json
+{
+  "title": "redis.read_stream",
+  "description": "Read from Redis stream",
+  "type": "object",
+  "$inherit": ["streamable"],
+  "properties": {
+    "connection": { "description": "Connection name", "type": "string" },
+    "stream_name": {
+      "type": "string",
+      "title": "Source stream name",
+      "description": "Source stream name"
+    },
+    "snapshot": {
+      "type": "boolean",
+      "title": "Snapshot current entries and quit",
+      "description": "Snapshot current entries and quit",
+      "default": false
+    }
+  },
+  "additionalProperties": false,
+  "required": ["connection", "stream_name"]
+}
+```
+
+- [ ] **Step 8.5: Run test to verify it passes**
+
+```bash
+cd core && python -m pytest src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py -v
+```
+
+Expected: 2 passed.
+
+- [ ] **Step 8.6: Run the full core suite**
+
+```bash
+cd core && python -m pytest src/datayoga_core/ -x -q
+```
+
+Expected: all tests pass.
+
+- [ ] **Step 8.7: Commit**
+
+```bash
+git add core/src/datayoga_core/blocks/redis/read_stream/block.py \
+        core/src/datayoga_core/blocks/redis/read_stream/block.schema.json \
+        core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py \
+        core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
+git commit -m "Migrate redis/read_stream to batched xreadgroup (#400, #377)"
+```
+
+---
+
+## Task 9: Migrate `azure/read_event_hub` (rename `batch_size` → `max_batch_size`)
+
+Today `batch_size` controls the SDK callback size, not the pipeline batch size. Rename to `max_batch_size`, add `additionalProperties: false`, and use the streamable fragment so the *new* `batch_size` means pipeline batch size.
+
+**Files:**
+- Modify: `core/src/datayoga_core/blocks/azure/read_event_hub/block.py`
+- Modify: `core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json`
+
+- [ ] **Step 9.1: Write the failing test**
+
+Create `core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py`:
+
+```python
+import pytest
+from jsonschema import ValidationError
+
+from datayoga_core.blocks.azure.read_event_hub.block import Block
+
+
+def _minimal_props(extra=None):
+    base = {
+        "event_hub_connection_string": "Endpoint=sb://x/;SharedAccessKeyName=k;SharedAccessKey=v;EntityPath=eh",
+        "event_hub_consumer_group_name": "$Default",
+        "event_hub_name": "eh",
+        "checkpoint_store_connection_string": "DefaultEndpointsProtocol=https;AccountName=a;AccountKey=k==",
+        "checkpoint_store_container_name": "chk",
+    }
+    if extra:
+        base.update(extra)
+    return base
+
+
+def test_unknown_property_rejected_by_validation():
+    """additionalProperties: false catches typos like the legacy 'batch_sz'."""
+    with pytest.raises(ValidationError):
+        Block(_minimal_props({"batch_sz": 300}))
+
+
+def test_max_batch_size_accepted():
+    """The renamed SDK-level property is now max_batch_size."""
+    block = Block(_minimal_props({"max_batch_size": 500, "batch_size": 100}))
+    assert block.properties["max_batch_size"] == 500
+    assert block.properties["batch_size"] == 100
+
+
+def test_max_batch_size_defaults_to_300_when_omitted():
+    """init() reads max_batch_size with a default of 300 if not present."""
+    # We can't safely call init() in unit tests (it instantiates the Azure
+    # SDK client); read the property via the same path init() does.
+    block = Block(_minimal_props())
+    assert int(block.properties.get("max_batch_size", 300)) == 300
+
+
+def test_renamed_schema_has_additional_properties_false():
+    """Schema after rename: max_batch_size + streamable's batch_size/flush_ms,
+    no unknown properties allowed."""
+    block = Block(_minimal_props())
+    schema = block.get_json_schema()
+    assert schema.get("additionalProperties") is False
+    assert "max_batch_size" in schema["properties"]
+    assert "batch_size" in schema["properties"]  # from streamable fragment
+    assert "flush_ms" in schema["properties"]    # from streamable fragment
+
+
+def test_batch_size_300_is_silently_repurposed():
+    """A user upgrading from a pre-rename version with batch_size: 300 (which
+    used to mean SDK callback size) will see their YAML still validate, but
+    batch_size now means pipeline batch size. This is documented in the PR
+    description and processing-strategies.md as a breaking change."""
+    block = Block(_minimal_props({"batch_size": 300}))
+    # Schema validation passes — batch_size is a known property (now pipeline-meaning).
+    # The user must rename to max_batch_size: 300 to preserve old behavior.
+    assert block.properties["batch_size"] == 300
+    assert "max_batch_size" not in block.properties
+```
+
+- [ ] **Step 9.2: Run test to verify it fails**
+
+```bash
+cd core && python -m pytest src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py -v
+```
+
+Expected: most of the 5 tests FAIL — current schema has no `additionalProperties: false`, no `max_batch_size`, no `$inherit`.
+
+- [ ] **Step 9.3: Update the schema**
+
+Replace `core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json` with:
+
+```json
+{
+  "title": "azure.read_event_hub",
+  "description": "Read from Azure Event Hub",
+  "type": "object",
+  "$inherit": ["streamable"],
+  "properties": {
+    "event_hub_connection_string": {
+      "type": "string",
+      "description": "The connection string for the Azure Event Hub namespace."
+    },
+    "event_hub_consumer_group_name": {
+      "type": "string",
+      "description": "The name of the consumer group to read events from."
+    },
+    "event_hub_name": {
+      "type": "string",
+      "description": "The name of the Azure Event Hub."
+    },
+    "checkpoint_store_connection_string": {
+      "type": "string",
+      "description": "The connection string for the Azure Storage account used as the checkpoint store."
+    },
+    "checkpoint_store_container_name": {
+      "type": "string",
+      "description": "The name of the container within the checkpoint store to store the checkpoints."
+    },
+    "max_batch_size": {
+      "type": "integer",
+      "minimum": 1,
+      "description": "Maximum number of events to receive in each SDK callback. Renamed from the previous batch_size which used to mean this. Defaults to 300.",
+      "default": 300
+    }
+  },
+  "additionalProperties": false,
+  "required": [
+    "event_hub_connection_string",
+    "event_hub_consumer_group_name",
+    "event_hub_name",
+    "checkpoint_store_connection_string",
+    "checkpoint_store_container_name"
+  ]
+}
+```
+
+- [ ] **Step 9.4: Migrate the producer**
+
+Replace the contents of `core/src/datayoga_core/blocks/azure/read_event_hub/block.py` with:
+
+```python
+import asyncio
+import logging
+from typing import Any, AsyncGenerator, Dict, List, Optional
+
+import orjson
+from azure.eventhub import EventData, PartitionContext
+from azure.eventhub.aio import EventHubConsumerClient
+from azure.eventhub.extensions.checkpointstoreblobaio import \
+    BlobCheckpointStore
+from datayoga_core.context import Context
+from datayoga_core.producer import Producer as DyProducer
+
+logger = logging.getLogger("dy")
+
+
+class Block(DyProducer):
+    """Azure Event Hub block for reading events."""
+
+    DEFAULT_FLUSH_MS = 1000
+
+    def init(self, context: Optional[Context] = None):
+        logger.debug(f"Initializing {self.get_block_name()}")
+        self.max_batch_size = int(self.properties.get("max_batch_size", 300))
+        self.consumer_client = EventHubConsumerClient.from_connection_string(
+            conn_str=self.properties["event_hub_connection_string"],
+            consumer_group=self.properties["event_hub_consumer_group_name"],
+            eventhub_name=self.properties["event_hub_name"],
+            checkpoint_store=BlobCheckpointStore.from_connection_string(
+                self.properties["checkpoint_store_connection_string"],
+                self.properties["checkpoint_store_container_name"]),
+        )
+        self.events: Dict[Any, Any] = {}
+        self.messages: asyncio.Queue = asyncio.Queue()
+
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        logger.debug(f"Running {self.get_block_name()}")
+        logger.debug("Starting event receiving process")
+        asyncio.create_task(self.receive_batch())
+
+        while True:
+            first = await self.messages.get()
+            chunk = [first]
+            while not self.messages.empty():
+                chunk.append(self.messages.get_nowait())
+            yield chunk
+
+    async def receive_batch(self):
+        await self.consumer_client.receive_batch(
+            on_event_batch=self.on_event_batch,
+            max_batch_size=self.max_batch_size,
+            starting_position="-1",
+        )
+
+    async def on_event_batch(self, partition_context: PartitionContext, events: List[EventData]):
+        logger.debug(f"Received batch of events from partition: {partition_context.partition_id}")
+        for event in events:
+            try:
+                payload = orjson.loads(event.body_as_str(encoding="UTF-8"))
+                msg_id = event.system_properties[b"x-opt-sequence-number"]
+                self.events[msg_id] = (event, partition_context)
+                payload[self.MSG_ID_FIELD] = msg_id
+                await self.messages.put(payload)
+            except Exception as e:
+                logger.error(e)
+
+    async def complete_events(self, msg_ids: List[str]):
+        for msg_id in msg_ids:
+            logger.debug(f"Acking {msg_id} event")
+            event, partition_context = self.events.pop(msg_id, (None, None))
+            if event is not None:
+                await partition_context.update_checkpoint(event)
+            else:
+                logger.warning(f"Couldn't find event {msg_id} for acknowledging")
+
+    def ack(self, msg_ids: List[str]):
+        asyncio.create_task(self.complete_events(msg_ids))
+```
+
+- [ ] **Step 9.5: Run test to verify it passes**
+
+```bash
+cd core && python -m pytest src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py -v
+```
+
+Expected: 5 passed.
+
+- [ ] **Step 9.6: Run the full core suite**
+
+```bash
+cd core && python -m pytest src/datayoga_core/ -x -q
+```
+
+Expected: all tests pass.
+
+- [ ] **Step 9.7: Commit**
+
+```bash
+git add core/src/datayoga_core/blocks/azure/read_event_hub/block.py \
+        core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json \
+        core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py \
+        core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
+git commit -m "Migrate azure/read_event_hub; rename batch_size -> max_batch_size (#400, BREAKING)"
+```
+
+---
+
+## Task 10: Regenerate autogenerated schemas and docs
+
+The aggregated `schemas/job.schema.json` and the per-block markdown in `docs/reference/blocks/` are generated by scripts. After the per-block schema changes, regenerate them.
+
+**Files:**
+- Modify: `schemas/job.schema.json`
+- Modify: `docs/reference/blocks/std_read.md`, `files_read_csv.md`, `parquet_read.md`, `relational_read.md`, `redis_read_stream.md`, `http_receiver.md`, `azure_read_event_hub.md` (autogenerated)
+
+- [ ] **Step 10.1: Regenerate the JSON schemas**
+
+```bash
+bash scripts/generate-jsonschemas.sh
+```
+
+Expected output: `JSON schemas generated successfully`.
+
+- [ ] **Step 10.2: Regenerate the reference docs**
+
+```bash
+bash scripts/generate-docs.sh
+```
+
+Expected: completes without error.
+
+- [ ] **Step 10.3: Inspect the diff**
+
+```bash
+git diff schemas/ docs/reference/blocks/ | head -200
+```
+
+Expected: `batch_size` (and `flush_ms` for streaming producers, `fetch_size` for relational/read, `max_batch_size` for event_hub) appear in the appropriate schema entries and docs.
+
+- [ ] **Step 10.4: Commit**
+
+```bash
+git add schemas/job.schema.json docs/reference/blocks/
+git commit -m "Regenerate JSON schemas and reference docs after producer batching (#400)"
+```
+
+---
+
+## Task 11: Document the producer batching model in processing-strategies
+
+**Files:**
+- Modify: `docs/processing-strategies.md`
+
+- [ ] **Step 11.1: Add a section on producer batching**
+
+Append the following section to `docs/processing-strategies.md` (or replace an existing section if one already covers it):
+
+````markdown
+## Producer Batching
+
+Every producer block (any block that reads from a source — `std/read`, `files/read_csv`, `parquet/read`, `relational/read`, `redis/read_stream`, `azure/read_event_hub`, `http/receiver`) accepts a `batch_size` property. The producer base class re-chunks the source's output into batches of `batch_size` records, regardless of how the source delivers them (per row, per row group, per `fetchmany`, per network message). A batch is smaller than `batch_size` only when it is the final batch at end-of-stream or, for streaming producers, when it is flushed early by the `flush_ms` timeout.
+
+```yaml
+input:
+  uses: files.read_csv
+  with:
+    file: people.csv
+    batch_size: 500   # downstream steps process 500 records per call
+```
+
+Default: `1000`.
+
+### Streaming producers and `flush_ms`
+
+Streaming producers (`redis/read_stream`, `azure/read_event_hub`, `http/receiver`) also accept `flush_ms`. If no new records arrive within that many milliseconds, any partial batch is flushed downstream instead of being held until `batch_size` is reached.
+
+```yaml
+input:
+  uses: redis.read_stream
+  with:
+    connection: my_redis
+    stream_name: events
+    batch_size: 1000
+    flush_ms: 500   # emit a partial batch after 500ms of inactivity
+```
+
+Default: `1000` ms. Set to `null` to disable time-based flushing (records are held until `batch_size` or end-of-stream).
+
+### `relational/read` and `fetch_size`
+
+`relational/read` exposes an extra `fetch_size` property that controls how many rows are pulled from the database driver per round-trip, independent of the pipeline `batch_size`. Default: `10000`. Tune lower for memory pressure with wide rows; tune higher if you want fewer DB round-trips and downstream processing is the bottleneck.
+
+### `azure/read_event_hub` migration note
+
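+In earlier versions, `batch_size` on `azure/read_event_hub` controlled the SDK callback batch size, not the pipeline batch size. As of #400 that property has been renamed to `max_batch_size` to match the SDK's parameter name, and `batch_size` now consistently means pipeline batch size, as it does for every other producer. Existing configurations that set `batch_size` still validate but now get the pipeline-level meaning; rename the property to `max_batch_size` to keep the previous behavior.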
+````
+
+- [ ] **Step 11.2: Commit**
+
+```bash
+git add docs/processing-strategies.md
+git commit -m "Document producer batching model in processing-strategies (#400)"
+```
+
+---
+
+## Task 12: Full verification and push branch
+
+- [ ] **Step 12.1: Run full core test suite**
+
+```bash
+cd core && python -m pytest src/datayoga_core/ -v
+```
+
+Expected: all tests pass. Notably:
+- `test_producer_batching.py` (7 tests)
+- `test_schema_inherit.py` (5 tests)
+- `test_std_read.py`, `test_read_csv.py`, `test_parquet_read.py`, `test_relational_read.py`, `test_http_receiver.py`, `test_redis_read_stream.py`, `test_event_hub.py` (12 tests total)
+- All pre-existing tests still pass.
+
+- [ ] **Step 12.2: Inspect the branch's commit history**
+
+```bash
+git log --oneline 400-producer-batching-unification ^main
+```
+
+Expected: a clean sequence of commits — one per task — each referencing #400.
+
+- [ ] **Step 12.3: Push the branch**
+
+```bash
+git push -u origin 400-producer-batching-unification
+```
+
+Expected: branch pushed to remote.
+
+- [ ] **Step 12.4: Open a draft PR (deferred — confirm with user first)**
+
+Before opening the PR, ask the user whether to open it as draft or ready-for-review, and confirm the body content. Do not run `gh pr create` autonomously.
+
+The PR description should call out the breaking change explicitly (no CHANGELOG file exists in this repo, so the PR description is the canonical place):
+
+> **Breaking change:** `azure/read_event_hub.batch_size` has been renamed to `max_batch_size`. The name `batch_size` now means pipeline batch size on this block, consistent with every other producer. Users with `batch_size: <N>` in their YAML for `azure/read_event_hub` must rename it to `max_batch_size: <N>` to preserve the previous SDK callback size semantics; a literal `batch_size: <N>` will still validate, but with the new pipeline-level meaning.

From 5c178b6074c00db8d3bc2cbe1e82d9b7e0d857c8 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 16:35:23 +0300
Subject: [PATCH 10/38] Add $inherit schema fragment resolver (#400)

---
 core/src/datayoga_core/block.py               |  3 +-
 core/src/datayoga_core/job.py                 |  3 +-
 .../resources/schemas/batchable.schema.json   | 13 ++++
 .../resources/schemas/streamable.schema.json  | 19 ++++++
 core/src/datayoga_core/schema_utils.py        | 52 ++++++++++++++++
 core/src/datayoga_core/tests/__init__.py      |  0
 .../tests/test_schema_inherit.py              | 62 +++++++++++++++++++
 7 files changed, 150 insertions(+), 2 deletions(-)
 create mode 100644 core/src/datayoga_core/resources/schemas/batchable.schema.json
 create mode 100644 core/src/datayoga_core/resources/schemas/streamable.schema.json
 create mode 100644 core/src/datayoga_core/schema_utils.py
 create mode 100644 core/src/datayoga_core/tests/__init__.py
 create mode 100644 core/src/datayoga_core/tests/test_schema_inherit.py

diff --git a/core/src/datayoga_core/block.py b/core/src/datayoga_core/block.py
index 29363953..2a83036d 100644
--- a/core/src/datayoga_core/block.py
+++ b/core/src/datayoga_core/block.py
@@ -56,7 +56,8 @@ def get_json_schema(self) -> Dict[str, Any]:
             os.path.dirname(os.path.realpath(sys.modules[self.__module__].__file__)),
             "block.schema.json")
         logger.debug(f"loading schema from {json_schema_file}")
-        return utils.read_json(json_schema_file)
+        from datayoga_core.schema_utils import resolve_inherits
+        return resolve_inherits(utils.read_json(json_schema_file))
 
     @abstractmethod
     def init(self, context: Optional[Context] = None):
diff --git a/core/src/datayoga_core/job.py b/core/src/datayoga_core/job.py
index 082dde7c..6fac1132 100644
--- a/core/src/datayoga_core/job.py
+++ b/core/src/datayoga_core/job.py
@@ -237,10 +237,11 @@ def get_json_schema(whitelisted_blocks: Optional[List[str]] = None) -> Dict[str,
         # Now build the sorted lists
         block_types = []
         block_schemas = []
+        from datayoga_core.schema_utils import resolve_inherits
         for block_type, schema_path in block_info:
             block_types.append(block_type)
             # load schema file
-            schema = utils.read_json(f"{schema_path}")
+            schema = resolve_inherits(utils.read_json(f"{schema_path}"))
             # append to the array of allOf for the full schema
             # we use allOf for better error reporting
             block_schemas.append({
diff --git a/core/src/datayoga_core/resources/schemas/batchable.schema.json b/core/src/datayoga_core/resources/schemas/batchable.schema.json
new file mode 100644
index 00000000..f158d4fb
--- /dev/null
+++ b/core/src/datayoga_core/resources/schemas/batchable.schema.json
@@ -0,0 +1,13 @@
+{
+  "title": "batchable",
+  "description": "Producer batching mixin: declares batch_size for producers that yield records in batches.",
+  "type": "object",
+  "properties": {
+    "batch_size": {
+      "type": "integer",
+      "minimum": 1,
+      "description": "Maximum number of records yielded per downstream batch.",
+      "default": 1000
+    }
+  }
+}
diff --git a/core/src/datayoga_core/resources/schemas/streamable.schema.json b/core/src/datayoga_core/resources/schemas/streamable.schema.json
new file mode 100644
index 00000000..761c6d65
--- /dev/null
+++ b/core/src/datayoga_core/resources/schemas/streamable.schema.json
@@ -0,0 +1,19 @@
+{
+  "title": "streamable",
+  "description": "Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources.",
+  "type": "object",
+  "properties": {
+    "batch_size": {
+      "type": "integer",
+      "minimum": 1,
+      "description": "Maximum number of records yielded per downstream batch.",
+      "default": 1000
+    },
+    "flush_ms": {
+      "type": ["integer", "null"],
+      "minimum": 1,
+      "description": "If set, flush a partial batch after this many ms of inactivity. Set to null to disable time-based flushing (wait until batch_size or end-of-stream).",
+      "default": 1000
+    }
+  }
+}
diff --git a/core/src/datayoga_core/schema_utils.py b/core/src/datayoga_core/schema_utils.py
new file mode 100644
index 00000000..77bdee45
--- /dev/null
+++ b/core/src/datayoga_core/schema_utils.py
@@ -0,0 +1,52 @@
+"""Schema composition helpers.
+
+Producers and other blocks can declare `"$inherit": ["batchable"]` at the
+top of their block.schema.json to pull in shared property definitions from
+the fragments in resources/schemas/. `resolve_inherits` merges the
+fragments' `properties` into the local schema (local properties win), then
+removes the `$inherit` key. Schemas without `$inherit` are returned as-is.
+"""
+from __future__ import annotations
+
+import copy
+from os import path
+from typing import Any, Dict, List
+
+from datayoga_core import utils
+
+
+def resolve_inherits(schema: Dict[str, Any], schemas_dir: str = None) -> Dict[str, Any]:
+    """Merge any fragments listed in $inherit into the schema's properties.
+
+    Args:
+        schema: The schema to resolve. Mutated in place and also returned.
+        schemas_dir: Directory containing the fragment files. Defaults to
+            the bundled/non-bundled resources/schemas directory.
+
+    Returns:
+        The mutated schema with $inherit removed and fragment properties merged.
+    """
+    inherits: List[str] = schema.get("$inherit") or []
+    if not inherits:
+        return schema
+
+    if schemas_dir is None:
+        schemas_dir = utils.get_resource_path("schemas")
+
+    merged_properties: Dict[str, Any] = {}
+    for fragment_name in inherits:
+        fragment_path = path.join(schemas_dir, f"{fragment_name}.schema.json")
+        if not path.isfile(fragment_path):
+            raise FileNotFoundError(
+                f"Schema fragment '{fragment_name}' not found at {fragment_path}"
+            )
+        fragment = utils.read_json(fragment_path)
+        merged_properties.update(copy.deepcopy(fragment.get("properties", {})))
+
+    # Local properties take precedence over inherited ones.
+    local_properties = schema.get("properties", {})
+    merged_properties.update(local_properties)
+
+    schema["properties"] = merged_properties
+    schema.pop("$inherit", None)
+    return schema
diff --git a/core/src/datayoga_core/tests/__init__.py b/core/src/datayoga_core/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/core/src/datayoga_core/tests/test_schema_inherit.py b/core/src/datayoga_core/tests/test_schema_inherit.py
new file mode 100644
index 00000000..77178c37
--- /dev/null
+++ b/core/src/datayoga_core/tests/test_schema_inherit.py
@@ -0,0 +1,62 @@
+import json
+from pathlib import Path
+
+import pytest
+
+from datayoga_core.schema_utils import resolve_inherits
+
+
+SCHEMAS_DIR = (
+    Path(__file__).resolve().parent.parent / "resources" / "schemas"
+)
+
+
+def test_inherit_merges_fragment_properties():
+    schema = {
+        "title": "demo",
+        "type": "object",
+        "$inherit": ["batchable"],
+        "properties": {"foo": {"type": "string"}},
+        "additionalProperties": False,
+    }
+    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
+    assert "$inherit" not in resolved
+    assert "batch_size" in resolved["properties"]
+    assert resolved["properties"]["batch_size"]["default"] == 1000
+    assert resolved["properties"]["foo"] == {"type": "string"}
+    assert resolved["additionalProperties"] is False
+
+
+def test_inherit_local_property_wins_over_fragment():
+    schema = {
+        "type": "object",
+        "$inherit": ["batchable"],
+        "properties": {
+            "batch_size": {"type": "integer", "minimum": 1, "default": 50}
+        },
+    }
+    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
+    assert resolved["properties"]["batch_size"]["default"] == 50
+
+
+def test_inherit_streamable_brings_both_props():
+    schema = {"type": "object", "$inherit": ["streamable"], "properties": {}}
+    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
+    assert "batch_size" in resolved["properties"]
+    assert "flush_ms" in resolved["properties"]
+
+
+def test_schema_without_inherit_unchanged():
+    schema = {
+        "type": "object",
+        "properties": {"foo": {"type": "string"}},
+        "additionalProperties": False,
+    }
+    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
+    assert resolved == schema
+
+
+def test_unknown_fragment_raises():
+    schema = {"type": "object", "$inherit": ["nope"], "properties": {}}
+    with pytest.raises(FileNotFoundError):
+        resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))

From cb126c665fbdbf4ba4f52f6236dc3280825cd634 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 16:45:16 +0300
Subject: [PATCH 11/38] Tighten $inherit resolver: reject non-list, guard
 against nested (#400)

---
 core/src/datayoga_core/block.py               |  1 +
 core/src/datayoga_core/job.py                 |  1 +
 core/src/datayoga_core/schema_utils.py        | 18 +++++++---
 .../tests/test_schema_inherit.py              | 33 ++++++++++++++++++-
 4 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/core/src/datayoga_core/block.py b/core/src/datayoga_core/block.py
index 2a83036d..a0b65e06 100644
--- a/core/src/datayoga_core/block.py
+++ b/core/src/datayoga_core/block.py
@@ -56,6 +56,7 @@ def get_json_schema(self) -> Dict[str, Any]:
             os.path.dirname(os.path.realpath(sys.modules[self.__module__].__file__)),
             "block.schema.json")
         logger.debug(f"loading schema from {json_schema_file}")
+        # Lazy import: schema_utils -> utils -> block creates a circular import at module load.
         from datayoga_core.schema_utils import resolve_inherits
         return resolve_inherits(utils.read_json(json_schema_file))
 
diff --git a/core/src/datayoga_core/job.py b/core/src/datayoga_core/job.py
index 6fac1132..9df8c267 100644
--- a/core/src/datayoga_core/job.py
+++ b/core/src/datayoga_core/job.py
@@ -237,6 +237,7 @@ def get_json_schema(whitelisted_blocks: Optional[List[str]] = None) -> Dict[str,
         # Now build the sorted lists
         block_types = []
         block_schemas = []
+        # Lazy import: schema_utils -> utils -> block creates a circular import at module load.
         from datayoga_core.schema_utils import resolve_inherits
         for block_type, schema_path in block_info:
             block_types.append(block_type)
diff --git a/core/src/datayoga_core/schema_utils.py b/core/src/datayoga_core/schema_utils.py
index 77bdee45..8f6657f7 100644
--- a/core/src/datayoga_core/schema_utils.py
+++ b/core/src/datayoga_core/schema_utils.py
@@ -10,12 +10,12 @@
 
 import copy
 from os import path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 from datayoga_core import utils
 
 
-def resolve_inherits(schema: Dict[str, Any], schemas_dir: str = None) -> Dict[str, Any]:
+def resolve_inherits(schema: Dict[str, Any], schemas_dir: Optional[str] = None) -> Dict[str, Any]:
     """Merge any fragments listed in $inherit into the schema's properties.
 
     Args:
@@ -26,9 +26,13 @@ def resolve_inherits(schema: Dict[str, Any], schemas_dir: str = None) -> Dict[st
     Returns:
         The mutated schema with $inherit removed and fragment properties merged.
     """
-    inherits: List[str] = schema.get("$inherit") or []
-    if not inherits:
+    inherits = schema.get("$inherit")
+    if inherits is None or inherits == []:
         return schema
+    if not isinstance(inherits, list) or not all(isinstance(name, str) for name in inherits):
+        raise TypeError(
+            f"$inherit must be a list of fragment names (strings), got {inherits!r}"
+        )
 
     if schemas_dir is None:
         schemas_dir = utils.get_resource_path("schemas")
@@ -41,6 +45,12 @@ def resolve_inherits(schema: Dict[str, Any], schemas_dir: str = None) -> Dict[st
                 f"Schema fragment '{fragment_name}' not found at {fragment_path}"
             )
         fragment = utils.read_json(fragment_path)
+        if fragment.get("$inherit"):
+            raise ValueError(
+                f"Schema fragment '{fragment_name}' itself contains $inherit; "
+                "nested inheritance is not supported. Inline the parent fragment's "
+                "properties or restructure the hierarchy."
+            )
         merged_properties.update(copy.deepcopy(fragment.get("properties", {})))
 
     # Local properties take precedence over inherited ones.
diff --git a/core/src/datayoga_core/tests/test_schema_inherit.py b/core/src/datayoga_core/tests/test_schema_inherit.py
index 77178c37..c22ea2c8 100644
--- a/core/src/datayoga_core/tests/test_schema_inherit.py
+++ b/core/src/datayoga_core/tests/test_schema_inherit.py
@@ -1,4 +1,3 @@
-import json
 from pathlib import Path
 
 import pytest
@@ -60,3 +59,35 @@ def test_unknown_fragment_raises():
     schema = {"type": "object", "$inherit": ["nope"], "properties": {}}
     with pytest.raises(FileNotFoundError):
         resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
+
+
+def test_inherit_string_value_raises_type_error():
+    schema = {"type": "object", "$inherit": "batchable", "properties": {}}
+    with pytest.raises(TypeError):
+        resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
+
+
+def test_inherit_non_string_items_raises_type_error():
+    schema = {"type": "object", "$inherit": ["batchable", 123], "properties": {}}
+    with pytest.raises(TypeError):
+        resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
+
+
+def test_inherit_empty_list_returns_unchanged():
+    schema = {"type": "object", "$inherit": [], "properties": {"foo": {}}}
+    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
+    # Early-return path: schema is returned as-is (no mutation, no key removal).
+    assert resolved is schema
+
+
+def test_nested_inherit_raises_value_error(tmp_path):
+    # Build a fragment dir with a fragment that has its own $inherit.
+    (tmp_path / "parent.schema.json").write_text(
+        '{"properties": {"x": {"type": "string"}}}'
+    )
+    (tmp_path / "child.schema.json").write_text(
+        '{"$inherit": ["parent"], "properties": {"y": {"type": "string"}}}'
+    )
+    schema = {"$inherit": ["child"], "type": "object", "properties": {}}
+    with pytest.raises(ValueError, match="nested inheritance is not supported"):
+        resolve_inherits(schema, schemas_dir=str(tmp_path))

From 09319184076f6683018ee26fa10be9099722c1f8 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 16:45:59 +0300
Subject: [PATCH 12/38] Remove unused List import in schema_utils (#400)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 core/src/datayoga_core/schema_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/core/src/datayoga_core/schema_utils.py b/core/src/datayoga_core/schema_utils.py
index 8f6657f7..e009a984 100644
--- a/core/src/datayoga_core/schema_utils.py
+++ b/core/src/datayoga_core/schema_utils.py
@@ -10,7 +10,7 @@
 
 import copy
 from os import path
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Optional
 
 from datayoga_core import utils
 

From c9dbe921bf51738030f6fbef4c35dbb631c215f5 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 16:49:03 +0300
Subject: [PATCH 13/38] Producer base class re-chunks via produce_chunks (#400)

---
 core/src/datayoga_core/producer.py            |  88 ++++++++++--
 .../tests/test_producer_batching.py           | 126 ++++++++++++++++++
 2 files changed, 203 insertions(+), 11 deletions(-)
 create mode 100644 core/src/datayoga_core/tests/test_producer_batching.py

diff --git a/core/src/datayoga_core/producer.py b/core/src/datayoga_core/producer.py
index e32b2e01..2b61390d 100644
--- a/core/src/datayoga_core/producer.py
+++ b/core/src/datayoga_core/producer.py
@@ -1,8 +1,12 @@
-from abc import abstractmethod
+import asyncio
+import logging
+from contextlib import suppress
 from typing import Any, AsyncGenerator, Dict, List
 
 from .block import Block
 
+logger = logging.getLogger("dy")
+
 
 class Message:
     def __init__(self, msg_id: str, value: Dict[str, Any]):
@@ -11,20 +15,82 @@ def __init__(self, msg_id: str, value: Dict[str, Any]):
 
 
 class Producer(Block):
+    """Base class for producer (read) blocks.
+
+    Subclasses override `produce_chunks()` to yield chunks of any size from
+    the source. The default `produce()` re-chunks them to exactly `batch_size`
+    records per batch (smaller on flush_ms timeout or end-of-stream).
+
+    Legacy subclasses may still override `produce()` directly. They bypass
+    the base-class batching and `produce_chunks` is not called.
+    """
 
-    @abstractmethod
-    async def produce(self) -> AsyncGenerator[List[Message], None]:
-        """Produces data
+    DEFAULT_BATCH_SIZE = 1000
+    DEFAULT_FLUSH_MS = None  # streaming subclasses override to enable timeout flush
 
-        Returns:
-            AsyncGenerator[List[Message], None]: A generator of message batches.
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        """Yield natural-size chunks from the source.
+
+        Subclasses should override this method. The base-class `produce()`
+        will re-chunk the output to exact `batch_size` slices.
         """
-        raise NotImplementedError
+        raise NotImplementedError(
+            f"{type(self).__name__} must override produce_chunks() or produce()"
+        )
+        # Unreachable yield: makes this an async generator, so iterating it raises the error above.
+        yield  # pragma: no cover
 
-    def ack(self, msg_ids: List[str]):
-        """Sends acknowledge for the message IDs of the records that have been processed
+    async def produce(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        """Re-chunks `produce_chunks()` output to exact batch_size batches.
 
-        Args:
-            msg_ids (List[str]): Message IDs
+        Reads `batch_size` and `flush_ms` from properties lazily so subclasses
+        don't need to remember to call `super().init()`.
         """
+        batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
+        flush_ms = self.properties.get("flush_ms", self.DEFAULT_FLUSH_MS)
+        timeout = (flush_ms / 1000) if flush_ms else None
+
+        queue: asyncio.Queue = asyncio.Queue()
+        EOS = object()
+
+        async def pump():
+            try:
+                async for chunk in self.produce_chunks():
+                    if chunk:
+                        await queue.put(chunk)
+            except asyncio.CancelledError:
+                raise
+            except Exception as exc:
+                logger.exception("produce_chunks raised; ending stream: %s", exc)
+            finally:
+                await queue.put(EOS)
+
+        pump_task = asyncio.create_task(pump())
+        buffer: List[Dict[str, Any]] = []
+        try:
+            while True:
+                try:
+                    item = await asyncio.wait_for(queue.get(), timeout=timeout)
+                except asyncio.TimeoutError:
+                    if buffer:
+                        yield buffer
+                        buffer = []
+                    continue
+
+                if item is EOS:
+                    if buffer:
+                        yield buffer
+                    return
+
+                buffer.extend(item)
+                while len(buffer) >= batch_size:
+                    yield buffer[:batch_size]
+                    buffer = buffer[batch_size:]
+        finally:
+            pump_task.cancel()
+            with suppress(asyncio.CancelledError, Exception):
+                await pump_task
+
+    def ack(self, msg_ids: List[str]):
+        """Sends acknowledge for the message IDs of records that have been processed."""
         pass
diff --git a/core/src/datayoga_core/tests/test_producer_batching.py b/core/src/datayoga_core/tests/test_producer_batching.py
new file mode 100644
index 00000000..59601786
--- /dev/null
+++ b/core/src/datayoga_core/tests/test_producer_batching.py
@@ -0,0 +1,126 @@
+import asyncio
+from typing import AsyncGenerator, List, Optional
+
+import pytest
+
+from datayoga_core.context import Context
+from datayoga_core.producer import Message, Producer
+
+
+def _msg(i: int) -> dict:
+    return {Producer.MSG_ID_FIELD: str(i), "v": i}
+
+
+class FakeProducer(Producer):
+    """Producer driven by a scripted list of chunks plus optional sleeps."""
+
+    def __init__(self, properties=None, *, chunks=None, sleep_before=None):
+        # schema for a FakeProducer; declare batch_size/flush_ms so validation passes
+        self._test_schema = {
+            "type": "object",
+            "properties": {
+                "batch_size": {"type": "integer", "minimum": 1},
+                "flush_ms": {"type": ["integer", "null"], "minimum": 1},
+            },
+        }
+        self._chunks = chunks or []
+        self._sleep_before = sleep_before or []
+        super().__init__(properties or {})
+
+    def get_json_schema(self):
+        return self._test_schema
+
+    def init(self, context: Optional[Context] = None):
+        pass
+
+    async def produce_chunks(self) -> AsyncGenerator[List[Message], None]:
+        for i, chunk in enumerate(self._chunks):
+            if i < len(self._sleep_before) and self._sleep_before[i]:
+                await asyncio.sleep(self._sleep_before[i])
+            yield chunk
+
+
+async def _drain(producer: Producer):
+    out = []
+    async for batch in producer.produce():
+        out.append(batch)
+    return out
+
+
+@pytest.mark.asyncio
+async def test_rechunks_one_large_chunk():
+    chunks = [[_msg(i) for i in range(5000)]]
+    p = FakeProducer({"batch_size": 1000}, chunks=chunks)
+    batches = await _drain(p)
+    assert [len(b) for b in batches] == [1000, 1000, 1000, 1000, 1000]
+
+
+@pytest.mark.asyncio
+async def test_accumulates_small_chunks_and_flushes_on_eos():
+    chunks = [[_msg(i) for i in range(200)],
+              [_msg(i) for i in range(200, 500)],
+              [_msg(i) for i in range(500, 900)]]
+    p = FakeProducer({"batch_size": 1000}, chunks=chunks)
+    batches = await _drain(p)
+    assert [len(b) for b in batches] == [900]
+
+
+@pytest.mark.asyncio
+async def test_partial_final_batch_on_eos():
+    chunks = [[_msg(i) for i in range(1500)]]
+    p = FakeProducer({"batch_size": 1000}, chunks=chunks)
+    batches = await _drain(p)
+    assert [len(b) for b in batches] == [1000, 500]
+
+
+@pytest.mark.asyncio
+async def test_empty_chunks_are_ignored():
+    chunks = [[], [_msg(1), _msg(2)], [], [_msg(3)]]
+    p = FakeProducer({"batch_size": 10}, chunks=chunks)
+    batches = await _drain(p)
+    assert [len(b) for b in batches] == [3]
+
+
+@pytest.mark.asyncio
+async def test_flush_ms_emits_partial_on_inactivity():
+    # one chunk of 2 records, then a 300ms wait before the second chunk arrives;
+    # flush_ms=100 should flush the partial batch of 2 well before that chunk.
+    chunks = [[_msg(1), _msg(2)], [_msg(3)]]
+    sleeps = [0, 0.3]
+    p = FakeProducer({"batch_size": 100, "flush_ms": 100},
+                     chunks=chunks, sleep_before=sleeps)
+
+    received = []
+    started = asyncio.get_event_loop().time()
+    timings = []
+    async for batch in p.produce():
+        timings.append(asyncio.get_event_loop().time() - started)
+        received.append(batch)
+
+    assert [len(b) for b in received] == [2, 1]
+    # first flush happens because of inactivity (~100ms), not waiting for chunk 2
+    assert timings[0] < 0.25, f"expected first flush before 250ms, got {timings[0]}"
+
+
+@pytest.mark.asyncio
+async def test_no_flush_ms_holds_records_until_eos():
+    chunks = [[_msg(1)], [_msg(2)]]
+    sleeps = [0, 0.1]
+    p = FakeProducer({"batch_size": 100}, chunks=chunks, sleep_before=sleeps)
+    batches = await _drain(p)
+    assert [len(b) for b in batches] == [2]  # combined on EOS, never flushed mid-stream
+
+
+@pytest.mark.asyncio
+async def test_consumer_cancellation_cleans_up_pump():
+    chunks = [[_msg(i)] for i in range(1000)]
+    p = FakeProducer({"batch_size": 10, "flush_ms": 50}, chunks=chunks,
+                     sleep_before=[0.05] * 1000)
+
+    gen = p.produce()
+    first = await gen.__anext__()
+    assert len(first) >= 1
+    await gen.aclose()
+    # If pump task wasn't cleaned up we'd see a "Task was destroyed but it is
+    # pending!" warning here. Sleep briefly so the loop has a chance to surface it.
+    await asyncio.sleep(0.1)

From f1311d88309e08a2165784795a36a1c065b4a10b Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 16:52:22 +0300
Subject: [PATCH 14/38] Migrate std/read to produce_chunks (#400, #296)

---
 .../datayoga_core/blocks/std/read/block.py    | 44 ++++---------------
 .../blocks/std/read/block.schema.json         | 10 ++---
 .../blocks/std/read/tests/__init__.py         |  0
 .../blocks/std/read/tests/test_std_read.py    | 32 ++++++++++++++
 4 files changed, 44 insertions(+), 42 deletions(-)
 create mode 100644 core/src/datayoga_core/blocks/std/read/tests/__init__.py
 create mode 100644 core/src/datayoga_core/blocks/std/read/tests/test_std_read.py

diff --git a/core/src/datayoga_core/blocks/std/read/block.py b/core/src/datayoga_core/blocks/std/read/block.py
index e0b60b13..1c51839d 100644
--- a/core/src/datayoga_core/blocks/std/read/block.py
+++ b/core/src/datayoga_core/blocks/std/read/block.py
@@ -6,59 +6,33 @@
 
 import orjson
 from datayoga_core.context import Context
-from datayoga_core.producer import Message
 from datayoga_core.producer import Producer as DyProducer
 
 logger = logging.getLogger("dy")
 
 
 class Block(DyProducer):
-
     def init(self, context: Optional[Context] = None):
         logger.debug(f"Initializing {self.get_block_name()}")
-        self.batch_size = int(self.properties.get("batch_size", 1000))
-        logger.info(f"Using batch size: {self.batch_size}")
-
-    async def process_batch(self, records: List[Dict[str, Any]]) -> AsyncGenerator[List[Message], None]:
-        """Process records and yield batches according to batch_size"""
-        batch = []
-        for record in records:
-            batch.append(self.get_message(record))
-
-            # When batch is full, yield it
-            if len(batch) >= self.batch_size:
-                logger.info(f"Yielding batch of {len(batch)} records")
-                yield batch
-                batch = []
 
-        # Yield any remaining records
-        if batch:
-            logger.info(f"Yielding final batch of {len(batch)} records")
-            yield batch
-
-    async def produce(self) -> AsyncGenerator[List[Message], None]:
-        if select.select([sys.stdin, ], [], [], 0.0)[0]:
-            # piped data exists
-            all_records = []
-            for data in sys.stdin:
-                all_records.extend(self.get_records(data))
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        if select.select([sys.stdin], [], [], 0.0)[0]:
+            all_records: List[Dict[str, Any]] = []
+            for line in sys.stdin:
+                all_records.extend(self.get_records(line))
         else:
-            # interactive mode
             print("Enter data to process:")
-            data = input()
-            all_records = self.get_records(data)
+            all_records = self.get_records(input())
 
-        async for batch in self.process_batch(all_records):
-            yield batch
+        if all_records:
+            yield [self.get_message(record) for record in all_records]
 
     @staticmethod
     def get_records(data: str) -> List[Dict[str, Any]]:
         records = orjson.loads(data)
-
         if isinstance(records, dict):
             records = [records]
-
         return records
 
-    def get_message(self, record: Dict[str, Any]) -> Message:
+    def get_message(self, record: Dict[str, Any]) -> Dict[str, Any]:
         return {self.MSG_ID_FIELD: str(uuid.uuid4()), **record}
diff --git a/core/src/datayoga_core/blocks/std/read/block.schema.json b/core/src/datayoga_core/blocks/std/read/block.schema.json
index 38ad05af..2214ac05 100644
--- a/core/src/datayoga_core/blocks/std/read/block.schema.json
+++ b/core/src/datayoga_core/blocks/std/read/block.schema.json
@@ -2,11 +2,7 @@
   "title": "std.read",
   "description": "Read from the standard input",
   "type": "object",
-  "properties": {
-    "batch_size": {
-      "type": "integer",
-      "description": "Number of records to process in a single batch",
-      "default": 1000
-    }
-  }
+  "$inherit": ["batchable"],
+  "properties": {},
+  "additionalProperties": false
 }
diff --git a/core/src/datayoga_core/blocks/std/read/tests/__init__.py b/core/src/datayoga_core/blocks/std/read/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py
new file mode 100644
index 00000000..609f0915
--- /dev/null
+++ b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py
@@ -0,0 +1,32 @@
+from unittest.mock import patch
+
+import orjson
+import pytest
+
+from datayoga_core.blocks.std.read.block import Block
+
+
+async def _drain(producer):
+    out = []
+    async for batch in producer.produce():
+        out.append(batch)
+    return out
+
+
+@pytest.mark.asyncio
+async def test_std_read_batches_to_batch_size():
+    payload = [{"i": i} for i in range(2500)]
+    fake_stdin = [orjson.dumps(payload).decode()]
+
+    block = Block({"batch_size": 1000})
+    block.init()
+
+    with patch("datayoga_core.blocks.std.read.block.select.select",
+               return_value=([object()], [], [])), \
+         patch("datayoga_core.blocks.std.read.block.sys.stdin", fake_stdin):
+        batches = await _drain(block)
+
+    assert [len(b) for b in batches] == [1000, 1000, 500]
+    flat = [r for b in batches for r in b]
+    assert flat[0]["i"] == 0
+    assert all(Block.MSG_ID_FIELD in r for r in flat)

From 12c13fb14e4097d9b1187660117ac675e6de5076 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 16:56:25 +0300
Subject: [PATCH 15/38] Migrate files/read_csv to produce_chunks (#400)

---
 .../blocks/files/read_csv/block.py            | 32 ++++++----------
 .../blocks/files/read_csv/block.schema.json   |  7 +---
 .../blocks/files/read_csv/tests/__init__.py   |  0
 .../files/read_csv/tests/test_read_csv.py     | 38 +++++++++++++++++++
 4 files changed, 51 insertions(+), 26 deletions(-)
 create mode 100644 core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py
 create mode 100644 core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py

diff --git a/core/src/datayoga_core/blocks/files/read_csv/block.py b/core/src/datayoga_core/blocks/files/read_csv/block.py
index c4bca6f6..336450dc 100644
--- a/core/src/datayoga_core/blocks/files/read_csv/block.py
+++ b/core/src/datayoga_core/blocks/files/read_csv/block.py
@@ -4,10 +4,9 @@
 from contextlib import suppress
 from csv import DictReader
 from itertools import count, islice
-from typing import AsyncGenerator, List, Optional
+from typing import Any, AsyncGenerator, Dict, List, Optional
 
 from datayoga_core.context import Context
-from datayoga_core.producer import Message
 from datayoga_core.producer import Producer as DyProducer
 
 logger = logging.getLogger("dy")
@@ -18,40 +17,33 @@ class Block(DyProducer, metaclass=ABCMeta):
     def init(self, context: Optional[Context] = None):
         logger.debug(f"Initializing {self.get_block_name()}")
         csv_file = self.properties["file"]
-
         if os.path.isabs(csv_file) or context is None:
             self.file = csv_file
         else:
             self.file = os.path.join(context.properties.get("data_path"), csv_file)
-
         logger.debug(f"file: {self.file}")
-
         self.encoding = self.properties.get("encoding", "utf-8")
-        self.batch_size = self.properties.get("batch_size", 1000)
         self.fields = self.properties.get("fields")
         self.skip = self.properties.get("skip", 0)
         self.delimiter = self.properties.get("delimiter", ",")
         self.quotechar = self.properties.get("quotechar", "\"")
 
-    async def produce(self) -> AsyncGenerator[List[Message], None]:
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
         logger.debug("Reading CSV")
+        batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
 
         with open(self.file, "r", encoding=self.encoding) as read_obj:
-            reader = DictReader(read_obj, fieldnames=self.fields, delimiter=self.delimiter, quotechar=self.quotechar)
-            counter = iter(count())
-
+            reader = DictReader(read_obj, fieldnames=self.fields,
+                                delimiter=self.delimiter, quotechar=self.quotechar)
             for _ in range(self.skip):
                 with suppress(StopIteration):
                     next(reader)
-
+            counter = iter(count())
             while True:
-                sliced = islice(reader, self.batch_size)
-                records = [{self.MSG_ID_FIELD: f"{next(counter)}", **record} for record in sliced]
-
-                if not records:
-                    logger.debug(f"Done reading {self.file}")
+                chunk = [
+                    {self.MSG_ID_FIELD: f"{next(counter)}", **record}
+                    for record in islice(reader, batch_size)
+                ]
+                if not chunk:
                     return
-
-                logger.debug(f"Producing {len(records)} records")
-
-                yield records
+                yield chunk
diff --git a/core/src/datayoga_core/blocks/files/read_csv/block.schema.json b/core/src/datayoga_core/blocks/files/read_csv/block.schema.json
index 39e7118a..ca7d638b 100644
--- a/core/src/datayoga_core/blocks/files/read_csv/block.schema.json
+++ b/core/src/datayoga_core/blocks/files/read_csv/block.schema.json
@@ -2,6 +2,7 @@
   "title": "files.read_csv",
   "description": "Read data from CSV",
   "type": "object",
+  "$inherit": ["batchable"],
   "properties": {
     "file": {
       "description": "Filename. Can contain a regexp or glob expression",
@@ -39,12 +40,6 @@
       "maxLength": 1,
       "default": ","
     },
-    "batch_size": {
-      "description": "Number of records to read per batch",
-      "type": "number",
-      "minimum": 1,
-      "default": 1000
-    },
     "quotechar": {
       "description": "A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '",
       "type": "string",
diff --git a/core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py b/core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py
new file mode 100644
index 00000000..16cb9b17
--- /dev/null
+++ b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py
@@ -0,0 +1,38 @@
+from pathlib import Path
+
+import pytest
+
+from datayoga_core.blocks.files.read_csv.block import Block
+
+
+async def _drain(producer):
+    out = []
+    async for batch in producer.produce():
+        out.append(batch)
+    return out
+
+
+@pytest.fixture
+def csv_path(tmp_path) -> Path:
+    p = tmp_path / "data.csv"
+    rows = ["fname,lname"] + [f"first{i},last{i}" for i in range(2500)]
+    p.write_text("\n".join(rows) + "\n", encoding="utf-8")
+    return p
+
+
+@pytest.mark.asyncio
+async def test_csv_batches_to_batch_size(csv_path):
+    block = Block({"file": str(csv_path), "batch_size": 1000})
+    block.init()
+    batches = await _drain(block)
+    assert [len(b) for b in batches] == [1000, 1000, 500]
+    assert all(Block.MSG_ID_FIELD in r for b in batches for r in b)
+    assert batches[0][0]["fname"] == "first0"
+
+
+@pytest.mark.asyncio
+async def test_csv_default_batch_size(csv_path):
+    block = Block({"file": str(csv_path)})
+    block.init()
+    batches = await _drain(block)
+    assert [len(b) for b in batches] == [1000, 1000, 500]

From 1af0c66a21f4c48eaab751a6c1b857fc861a1608 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 16:59:34 +0300
Subject: [PATCH 16/38] Migrate parquet/read to produce_chunks, fix one-by-one
 yield (#400, #293)

---
 .../blocks/parquet/read/block.py              | 19 ++++----
 .../blocks/parquet/read/block.schema.json     |  1 +
 .../blocks/parquet/read/tests/__init__.py     |  0
 .../parquet/read/tests/test_parquet_read.py   | 43 +++++++++++++++++++
 4 files changed, 52 insertions(+), 11 deletions(-)
 create mode 100644 core/src/datayoga_core/blocks/parquet/read/tests/__init__.py
 create mode 100644 core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py

diff --git a/core/src/datayoga_core/blocks/parquet/read/block.py b/core/src/datayoga_core/blocks/parquet/read/block.py
index f72e6490..1c7128c6 100644
--- a/core/src/datayoga_core/blocks/parquet/read/block.py
+++ b/core/src/datayoga_core/blocks/parquet/read/block.py
@@ -1,10 +1,10 @@
 import logging
 import os
 from abc import ABCMeta
-from typing import AsyncGenerator, List, Optional
+from itertools import count
+from typing import Any, AsyncGenerator, Dict, List, Optional
 
 from datayoga_core.context import Context
-from datayoga_core.producer import Message
 from datayoga_core.producer import Producer as DyProducer
 from fastparquet import ParquetFile
 
@@ -16,21 +16,18 @@ class Block(DyProducer, metaclass=ABCMeta):
     def init(self, context: Optional[Context] = None):
         logger.debug(f"Initializing {self.get_block_name()}")
         parquet_file = self.properties["file"]
-
         if os.path.isabs(parquet_file) or context is None:
             self.file = parquet_file
         else:
             self.file = os.path.join(context.properties.get("data_path"), parquet_file)
-
         logger.debug(f"file: {self.file}")
 
-    async def produce(self) -> AsyncGenerator[List[Message], None]:
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
         logger.debug("Reading parquet")
-
         pf = ParquetFile(self.file)
-
-        count = 0
+        counter = iter(count())
         for df in pf.iter_row_groups():
-            for _, data in df.iterrows():
-                yield [{self.MSG_ID_FIELD: str(count), **data.to_dict()}]
-                count += 1
+            yield [
+                {self.MSG_ID_FIELD: str(next(counter)), **row.to_dict()}
+                for _, row in df.iterrows()
+            ]
diff --git a/core/src/datayoga_core/blocks/parquet/read/block.schema.json b/core/src/datayoga_core/blocks/parquet/read/block.schema.json
index 13bcec76..395b3edd 100644
--- a/core/src/datayoga_core/blocks/parquet/read/block.schema.json
+++ b/core/src/datayoga_core/blocks/parquet/read/block.schema.json
@@ -2,6 +2,7 @@
   "title": "parquet.read",
   "description": "Read data from parquet",
   "type": "object",
+  "$inherit": ["batchable"],
   "properties": {
     "file": {
       "description": "Filename. Can contain a regexp or glob expression",
diff --git a/core/src/datayoga_core/blocks/parquet/read/tests/__init__.py b/core/src/datayoga_core/blocks/parquet/read/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py
new file mode 100644
index 00000000..ab6d8517
--- /dev/null
+++ b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py
@@ -0,0 +1,43 @@
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+from datayoga_core.blocks.parquet.read.block import Block
+
+
+async def _drain(producer):
+    out = []
+    async for batch in producer.produce():
+        out.append(batch)
+    return out
+
+
+@pytest.fixture
+def parquet_path(tmp_path) -> Path:
+    p = tmp_path / "data.parquet"
+    df = pd.DataFrame({"i": list(range(2500))})
+    from fastparquet import write as fp_write
+    fp_write(str(p), df, row_group_offsets=1000)
+    return p
+
+
+@pytest.mark.asyncio
+async def test_parquet_batches_to_batch_size(parquet_path):
+    block = Block({"file": str(parquet_path), "batch_size": 1000})
+    block.init()
+    batches = await _drain(block)
+    assert [len(b) for b in batches] == [1000, 1000, 500]
+    flat = [r for b in batches for r in b]
+    assert flat[0]["i"] == 0
+    assert all(Block.MSG_ID_FIELD in r for r in flat)
+
+
+@pytest.mark.asyncio
+async def test_parquet_rechunks_across_row_groups(parquet_path):
+    # row groups are [1000, 1000, 500]; batch_size=750 should give batches of
+    # [750, 750, 750, 250] regardless of row group boundaries.
+    block = Block({"file": str(parquet_path), "batch_size": 750})
+    block.init()
+    batches = await _drain(block)
+    assert [len(b) for b in batches] == [750, 750, 750, 250]

From 85ac26a9642c895b34a9d9d8107203c81554670f Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 17:01:52 +0300
Subject: [PATCH 17/38] Migrate relational/read to produce_chunks, add
 fetch_size (#400, #295)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../blocks/relational/read/block.py           | 19 +++--
 .../blocks/relational/read/block.schema.json  |  7 ++
 .../blocks/relational/read/tests/__init__.py  |  0
 .../read/tests/test_relational_read.py        | 79 +++++++++++++++++++
 4 files changed, 95 insertions(+), 10 deletions(-)
 create mode 100644 core/src/datayoga_core/blocks/relational/read/tests/__init__.py
 create mode 100644 core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py

diff --git a/core/src/datayoga_core/blocks/relational/read/block.py b/core/src/datayoga_core/blocks/relational/read/block.py
index 97d8dcdd..2b04f3c3 100644
--- a/core/src/datayoga_core/blocks/relational/read/block.py
+++ b/core/src/datayoga_core/blocks/relational/read/block.py
@@ -1,23 +1,23 @@
 import logging
-from typing import AsyncGenerator, List, Optional
+from typing import Any, AsyncGenerator, Dict, List, Optional
 
 import sqlalchemy as sa
 from datayoga_core import utils
 from datayoga_core.blocks.relational import utils as relational_utils
 from datayoga_core.context import Context
-from datayoga_core.producer import Message
 from datayoga_core.producer import Producer as DyProducer
 
 logger = logging.getLogger("dy")
 
 
 class Block(DyProducer):
+    DEFAULT_FETCH_SIZE = 10000
 
     def init(self, context: Optional[Context] = None):
         self.engine, self.db_type = relational_utils.get_engine(
             self.properties["connection"],
             context,
-            autocommit=False
+            autocommit=False,
         )
 
         self.schema = self.properties.get("schema")
@@ -32,15 +32,14 @@ def init(self, context: Optional[Context] = None):
         logger.debug(f"Connecting to {self.db_type}")
         self.connection = self.engine.connect()
 
-    async def produce(self) -> AsyncGenerator[List[Message], None]:
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        fetch_size = int(self.properties.get("fetch_size", self.DEFAULT_FETCH_SIZE))
         result = self.connection.execution_options(stream_results=True).execute(self.tbl.select())
-
         while True:
-            chunk = result.fetchmany(10000)
-            if not chunk:
-                break
-            for row in chunk:
-                yield [utils.add_uid(dict(row._asdict()))]
+            rows = result.fetchmany(fetch_size)
+            if not rows:
+                return
+            yield [utils.add_uid(dict(row._asdict())) for row in rows]
 
     def stop(self):
         self.connection.close()
diff --git a/core/src/datayoga_core/blocks/relational/read/block.schema.json b/core/src/datayoga_core/blocks/relational/read/block.schema.json
index 4a65a8fc..df5bc8b2 100644
--- a/core/src/datayoga_core/blocks/relational/read/block.schema.json
+++ b/core/src/datayoga_core/blocks/relational/read/block.schema.json
@@ -2,6 +2,7 @@
   "title": "relational.read",
   "description": "Read a table from an SQL-compatible data store",
   "type": "object",
+  "$inherit": ["batchable"],
   "additionalProperties": false,
   "examples": [
     {
@@ -41,6 +42,12 @@
         "title": "name of column"
       },
       "examples": [["fname", { "lname": "last_name" }]]
+    },
+    "fetch_size": {
+      "type": "integer",
+      "minimum": 1,
+      "description": "Driver-level rows fetched per round-trip. Defaults to 10000.",
+      "default": 10000
     }
   },
   "required": ["connection", "table"]
diff --git a/core/src/datayoga_core/blocks/relational/read/tests/__init__.py b/core/src/datayoga_core/blocks/relational/read/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py
new file mode 100644
index 00000000..0fba4629
--- /dev/null
+++ b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py
@@ -0,0 +1,79 @@
+from unittest.mock import MagicMock
+
+import pytest
+
+from datayoga_core.blocks.relational.read.block import Block
+
+
+async def _drain(producer):
+    out = []
+    async for batch in producer.produce():
+        out.append(batch)
+    return out
+
+
+def _fake_result(rows):
+    """Build a fake SQLAlchemy result that returns rows in fetchmany chunks."""
+    state = {"i": 0}
+
+    def fetchmany(n):
+        i = state["i"]
+        chunk = rows[i:i + n]
+        state["i"] += len(chunk)
+        return chunk
+
+    res = MagicMock()
+    res.fetchmany.side_effect = fetchmany
+    res.execution_options.return_value = res
+    return res
+
+
+class _Row:
+    def __init__(self, d):
+        self._d = d
+
+    def _asdict(self):
+        return self._d
+
+
+def _mk_block(properties, fake_result):
+    block = Block.__new__(Block)
+    block.properties = properties
+    block.connection = MagicMock()
+    block.tbl = MagicMock()
+    block.tbl.select.return_value = "SELECT *"
+    block.connection.execution_options.return_value.execute.return_value = fake_result
+    return block
+
+
+@pytest.mark.asyncio
+async def test_relational_read_yields_batches_not_rows():
+    rows = [_Row({"i": i}) for i in range(2500)]
+    fake_result = _fake_result(rows)
+    block = _mk_block({"batch_size": 1000}, fake_result)
+    batches = await _drain(block)
+    assert [len(b) for b in batches] == [1000, 1000, 500]
+
+
+@pytest.mark.asyncio
+async def test_relational_read_fetch_size_independent_of_batch_size():
+    rows = [_Row({"i": i}) for i in range(5000)]
+    fake_result = _fake_result(rows)
+    block = _mk_block({"batch_size": 1000, "fetch_size": 2500}, fake_result)
+    batches = await _drain(block)
+    # Downstream batches are still batch_size=1000
+    assert [len(b) for b in batches] == [1000, 1000, 1000, 1000, 1000]
+    # Driver fetched in fetch_size=2500 chunks (2500 + 2500 + 0)
+    fetch_sizes = [c.args[0] for c in fake_result.fetchmany.call_args_list]
+    assert fetch_sizes[0] == 2500
+    assert fetch_sizes[1] == 2500
+
+
+@pytest.mark.asyncio
+async def test_relational_read_default_fetch_size_is_10000():
+    rows = [_Row({"i": i}) for i in range(500)]
+    fake_result = _fake_result(rows)
+    block = _mk_block({}, fake_result)
+    await _drain(block)
+    fetch_sizes = [c.args[0] for c in fake_result.fetchmany.call_args_list]
+    assert fetch_sizes[0] == 10000

From 3b72998380518ac85fe333d7ac03531dd75957e4 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 17:04:16 +0300
Subject: [PATCH 18/38] Migrate http/receiver to produce_chunks (#400)

---
 .../blocks/http/receiver/block.py             | 20 +++++----
 .../blocks/http/receiver/block.schema.json    |  1 +
 .../blocks/http/receiver/tests/__init__.py    |  0
 .../http/receiver/tests/test_http_receiver.py | 44 +++++++++++++++++++
 4 files changed, 56 insertions(+), 9 deletions(-)
 create mode 100644 core/src/datayoga_core/blocks/http/receiver/tests/__init__.py
 create mode 100644 core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py

diff --git a/core/src/datayoga_core/blocks/http/receiver/block.py b/core/src/datayoga_core/blocks/http/receiver/block.py
index f325e56b..3f5b1833 100644
--- a/core/src/datayoga_core/blocks/http/receiver/block.py
+++ b/core/src/datayoga_core/blocks/http/receiver/block.py
@@ -3,13 +3,12 @@
 from asyncio import Queue
 from contextlib import suppress
 from itertools import count
-from typing import AsyncGenerator, List, Optional
+from typing import Any, AsyncGenerator, Dict, List, Optional
 
 import orjson
 from aiohttp.web import (BaseRequest, HTTPInternalServerError, HTTPOk,
                          Response, Server, ServerRunner, TCPSite)
 from datayoga_core.context import Context
-from datayoga_core.producer import Message
 from datayoga_core.producer import Producer as DyProducer
 
 logger = logging.getLogger("dy")
@@ -18,20 +17,21 @@
 class Block(DyProducer, metaclass=ABCMeta):
     port: int
     host: str
+    DEFAULT_FLUSH_MS = 1000
 
     def init(self, context: Optional[Context] = None):
         logger.debug(f"Initializing {self.get_block_name()}")
         self.port = int(self.properties.get("port", 8080))
         self.host = self.properties.get("host", "0.0.0.0")
 
-    async def produce(self) -> AsyncGenerator[List[Message], None]:
-        queue = Queue(maxsize=1000)
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        queue: Queue = Queue(maxsize=1000)
 
         async def handler(request: BaseRequest) -> Response:
             try:
                 queue.put_nowait(orjson.loads(await request.read()))
                 return HTTPOk()
-            except Exception:  # noqa
+            except Exception:
                 logger.exception("Got exception while parsing request:")
                 return HTTPInternalServerError()
 
@@ -43,11 +43,13 @@ async def handler(request: BaseRequest) -> Response:
 
         try:
             counter = iter(count())
-
             while True:
-                data = await queue.get()
-                yield [{self.MSG_ID_FIELD: f"{next(counter)}", **data}]
-
+                first = await queue.get()
+                chunk = [{self.MSG_ID_FIELD: f"{next(counter)}", **first}]
+                while not queue.empty():
+                    record = queue.get_nowait()
+                    chunk.append({self.MSG_ID_FIELD: f"{next(counter)}", **record})
+                yield chunk
         finally:
             with suppress(Exception):
                 await srv.stop()
diff --git a/core/src/datayoga_core/blocks/http/receiver/block.schema.json b/core/src/datayoga_core/blocks/http/receiver/block.schema.json
index c5189b5f..a52edcc5 100644
--- a/core/src/datayoga_core/blocks/http/receiver/block.schema.json
+++ b/core/src/datayoga_core/blocks/http/receiver/block.schema.json
@@ -2,6 +2,7 @@
   "title": "http.receiver",
   "description": "Receives HTTP requests and process the data.",
   "type": "object",
+  "$inherit": ["streamable"],
   "properties": {
     "host": {
       "description": "Host to listen",
diff --git a/core/src/datayoga_core/blocks/http/receiver/tests/__init__.py b/core/src/datayoga_core/blocks/http/receiver/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
new file mode 100644
index 00000000..613d91d7
--- /dev/null
+++ b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
@@ -0,0 +1,44 @@
+import asyncio
+
+import aiohttp
+import pytest
+
+from datayoga_core.blocks.http.receiver.block import Block
+
+
+def _free_port():
+    import socket
+    with socket.socket() as s:
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
+
+
+@pytest.mark.asyncio
+async def test_http_receiver_batches_incoming_requests():
+    port = _free_port()
+    block = Block({"host": "127.0.0.1", "port": port,
+                   "batch_size": 50, "flush_ms": 200})
+    block.init()
+
+    received = []
+
+    async def consumer():
+        async for batch in block.produce():
+            received.append(batch)
+            if sum(len(b) for b in received) >= 60:
+                return
+
+    consumer_task = asyncio.create_task(consumer())
+    await asyncio.sleep(0.2)  # let server start
+
+    async with aiohttp.ClientSession() as session:
+        for i in range(60):
+            async with session.post(f"http://127.0.0.1:{port}", json={"i": i}) as r:
+                assert r.status == 200
+
+    await asyncio.wait_for(consumer_task, timeout=5)
+
+    flat = [r for b in received for r in b]
+    assert len(flat) == 60
+    assert any(len(b) == 50 for b in received)
+    assert all(Block.MSG_ID_FIELD in r for r in flat)

From 0b774ac3ce082327c6a36b35e83509162984048a Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 17:07:04 +0300
Subject: [PATCH 19/38] Migrate redis/read_stream to batched xreadgroup (#400,
 #377)

---
 .../blocks/redis/read_stream/block.py         | 35 +++++++-----
 .../redis/read_stream/block.schema.json       |  1 +
 .../redis/read_stream/tests/__init__.py       |  0
 .../tests/test_redis_read_stream.py           | 54 +++++++++++++++++++
 4 files changed, 76 insertions(+), 14 deletions(-)
 create mode 100644 core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py
 create mode 100644 core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py

diff --git a/core/src/datayoga_core/blocks/redis/read_stream/block.py b/core/src/datayoga_core/blocks/redis/read_stream/block.py
index 667ed02d..136d0963 100644
--- a/core/src/datayoga_core/blocks/redis/read_stream/block.py
+++ b/core/src/datayoga_core/blocks/redis/read_stream/block.py
@@ -1,23 +1,22 @@
 import logging
-from typing import AsyncGenerator, List, Optional
+from typing import Any, AsyncGenerator, Dict, List, Optional
 
 import datayoga_core.blocks.redis.utils as redis_utils
 import orjson
 from datayoga_core.connection import Connection
 from datayoga_core.context import Context
-from datayoga_core.producer import Message
 from datayoga_core.producer import Producer as DyProducer
 
 logger = logging.getLogger("dy")
 
 
 class Block(DyProducer):
+    DEFAULT_FLUSH_MS = 1000
+
     def init(self, context: Optional[Context] = None):
         logger.debug(f"Initializing {self.get_block_name()}")
-
         connection_details = Connection.get_connection_details(self.properties["connection"], context)
         self.redis_client = redis_utils.get_client(connection_details)
-
         self.stream = self.properties["stream_name"]
         self.snapshot = self.properties.get("snapshot", False)
         self.consumer_group = f'datayoga_job_{context.properties.get("job_name", "") if context else ""}'
@@ -27,25 +26,44 @@ def init(self, context: Optional[Context] = None):
             logger.info(f"Creating a new {self.consumer_group} consumer group associated with the {self.stream}")
             self.redis_client.xgroup_create(self.stream, self.consumer_group, 0)
 
-    async def produce(self) -> AsyncGenerator[List[Message], None]:
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        """Yield stream entries in chunks of at most batch_size records.
+
+        Pending entries (delivered to this consumer but not acknowledged) are replayed first, page by page;
+        new entries follow. In snapshot mode the generator ends once a read of new entries comes back empty.
+        """
         logger.debug(f"Running {self.get_block_name()}")
-
+        batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
         read_pending = True
-        while True:
-            # Read pending messages (fetched by us before but not acknowledged) in the first time, then consume new messages
-            streams = self.redis_client.xreadgroup(self.consumer_group, self.requesting_consumer, {
-                self.stream: "0" if read_pending else ">"}, None, 100 if self.snapshot else 0)
 
+        # The pending backlog is read in pages of batch_size, so remember where the next page starts.
+        pending_from = "0"
+        while True:
+            streams = self.redis_client.xreadgroup(
+                self.consumer_group, self.requesting_consumer,
+                {self.stream: pending_from if read_pending else ">"},
+                count=batch_size,
+                block=100 if self.snapshot else 0,
+            )
+
+            fetched = 0
             for stream in streams:
                 logger.debug(f"Messages in {self.stream} stream (pending: {read_pending}):\n\t{stream}")
+                chunk: List[Dict[str, Any]] = []
                 for key, value in stream[1]:
                     payload = orjson.loads(value[next(iter(value))])
                     payload[self.MSG_ID_FIELD] = key
-                    yield [payload]
+                    chunk.append(payload)
+                if chunk:
+                    fetched += len(chunk)
+                    pending_from = chunk[-1][self.MSG_ID_FIELD]
+                    yield chunk
 
-            # Quit after consuming pending current messages in case of snapshot
-            if self.snapshot and not read_pending:
-                break
+            if read_pending and fetched >= batch_size:
+                # A full page means more pending entries may remain; keep paging before switching to ">".
+                continue
+            if self.snapshot and not read_pending and not fetched:
+                return
 
             read_pending = False
 
diff --git a/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json b/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json
index bc2d148c..f7e0a948 100644
--- a/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json
+++ b/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json
@@ -2,6 +2,7 @@
   "title": "redis.read_stream",
   "description": "Read from Redis stream",
   "type": "object",
+  "$inherit": ["streamable"],
   "properties": {
     "connection": { "description": "Connection name", "type": "string" },
     "stream_name": {
diff --git a/core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py b/core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
new file mode 100644
index 00000000..f45b8d67
--- /dev/null
+++ b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
@@ -0,0 +1,54 @@
+from unittest.mock import MagicMock
+
+import pytest
+
+from datayoga_core.blocks.redis.read_stream.block import Block
+
+
+def _mk_block(properties, redis_client):
+    block = Block.__new__(Block)
+    block.properties = properties
+    block.redis_client = redis_client
+    block.stream = "mystream"
+    block.snapshot = properties.get("_snapshot", True)
+    block.consumer_group = "g"
+    block.requesting_consumer = "c"
+    return block
+
+
+@pytest.mark.asyncio
+async def test_redis_uses_count_equal_to_batch_size():
+    redis = MagicMock()
+    payload_a = (b"1-0", {b"data": b'{"i": 1}'})
+    payload_b = (b"2-0", {b"data": b'{"i": 2}'})
+    redis.xreadgroup.side_effect = [
+        [(b"mystream", [payload_a, payload_b])],  # pending
+        [(b"mystream", [])],                       # nothing new -> exit
+    ]
+
+    block = _mk_block({"batch_size": 250, "_snapshot": True}, redis)
+    batches = []
+    async for b in block.produce():
+        batches.append(b)
+
+    assert all(c.kwargs.get("count") == 250 or (len(c.args) >= 4 and c.args[3] == 250)
+               for c in redis.xreadgroup.call_args_list), \
+        "xreadgroup should be called with count=batch_size"
+
+
+@pytest.mark.asyncio
+async def test_redis_yields_records_as_a_batch_not_one_by_one():
+    redis = MagicMock()
+    pages = [(f"{i}-0".encode(), {b"data": f'{{"i": {i}}}'.encode()}) for i in range(5)]
+    redis.xreadgroup.side_effect = [
+        [(b"mystream", pages)],
+        [(b"mystream", [])],
+    ]
+
+    block = _mk_block({"batch_size": 100, "_snapshot": True}, redis)
+    batches = []
+    async for b in block.produce():
+        batches.append(b)
+
+    assert [len(b) for b in batches] == [5]
+    assert batches[0][0]["i"] == 0

From 38cf4ec7edf55bfdc741179e40239ae215e6d3cc Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 17:10:25 +0300
Subject: [PATCH 20/38] Migrate azure/read_event_hub; rename batch_size ->
 max_batch_size (#400, BREAKING)

---
 .../blocks/azure/read_event_hub/block.py      | 66 +++++--------------
 .../azure/read_event_hub/block.schema.json    |  7 +-
 .../azure/read_event_hub/tests/__init__.py    |  0
 .../read_event_hub/tests/test_event_hub.py    | 56 ++++++++++++++++
 4 files changed, 76 insertions(+), 53 deletions(-)
 create mode 100644 core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py
 create mode 100644 core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py

diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/block.py b/core/src/datayoga_core/blocks/azure/read_event_hub/block.py
index d91497ed..77f76d7d 100644
--- a/core/src/datayoga_core/blocks/azure/read_event_hub/block.py
+++ b/core/src/datayoga_core/blocks/azure/read_event_hub/block.py
@@ -1,6 +1,6 @@
 import asyncio
 import logging
-from typing import AsyncGenerator, List, Optional
+from typing import Any, AsyncGenerator, Dict, List, Optional
 
 import orjson
 from azure.eventhub import EventData, PartitionContext
@@ -8,7 +8,6 @@
 from azure.eventhub.extensions.checkpointstoreblobaio import \
     BlobCheckpointStore
 from datayoga_core.context import Context
-from datayoga_core.producer import Message
 from datayoga_core.producer import Producer as DyProducer
 
 logger = logging.getLogger("dy")
@@ -17,67 +16,43 @@
 class Block(DyProducer):
     """Azure Event Hub block for reading events."""
 
-    def init(self, context: Optional[Context] = None):
-        """Initializes the block.
+    DEFAULT_FLUSH_MS = 1000
 
-        Args:
-            context (Context, optional): The block context. Defaults to None.
-        """
+    def init(self, context: Optional[Context] = None):
         logger.debug(f"Initializing {self.get_block_name()}")
-
-        self.batch_size = self.properties.get("batch_size", 300)
-
+        self.max_batch_size = int(self.properties.get("max_batch_size", 300))
         self.consumer_client = EventHubConsumerClient.from_connection_string(
             conn_str=self.properties["event_hub_connection_string"],
             consumer_group=self.properties["event_hub_consumer_group_name"],
             eventhub_name=self.properties["event_hub_name"],
             checkpoint_store=BlobCheckpointStore.from_connection_string(
                 self.properties["checkpoint_store_connection_string"],
-                self.properties["checkpoint_store_container_name"])
+                self.properties["checkpoint_store_container_name"]),
         )
+        self.events: Dict[Any, Any] = {}
+        self.messages: asyncio.Queue = asyncio.Queue()
 
-        self.events = {}  # Retrieved events by sequence number, used for acknowledging them once processed
-        self.messages = asyncio.Queue()
-
-    async def produce(self) -> AsyncGenerator[List[Message], None]:
-        """Starts the event receiving process and yield batches of messages.
-
-        Yields:
-            AsyncGenerator[List[Message], None]: A generator of message batches.
-        """
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
         logger.debug(f"Running {self.get_block_name()}")
-
         logger.debug("Starting event receiving process")
-        asyncio.create_task(self.receive_batch())
+        self._receive_task = asyncio.create_task(self.receive_batch())  # keep a reference so the task is not GC'd
 
         while True:
-            if not self.messages.empty():
-                batch = []
-                while not self.messages.empty():
-                    message = await self.messages.get()
-                    batch.append(message)
-
-                yield batch
-
-            await asyncio.sleep(0.1)
+            first = await self.messages.get()
+            chunk = [first]
+            while not self.messages.empty():
+                chunk.append(self.messages.get_nowait())
+            yield chunk
 
     async def receive_batch(self):
-        """Receives events in batches from the Event Hub."""
         await self.consumer_client.receive_batch(
             on_event_batch=self.on_event_batch,
-            max_batch_size=self.batch_size,
-            starting_position="-1",  # read from the beginning of the partition.
+            max_batch_size=self.max_batch_size,
+            starting_position="-1",
         )
 
     async def on_event_batch(self, partition_context: PartitionContext, events: List[EventData]):
-        """Processes each batch of events received from the Event Hub.
-
-        Args:
-            partition_context (PartitionContext): The partition context.
-            events (List[EventData]): The list of events in the batch.
-        """
         logger.debug(f"Received batch of events from partition: {partition_context.partition_id}")
-
         for event in events:
             try:
                 payload = orjson.loads(event.body_as_str(encoding="UTF-8"))
@@ -89,24 +64,13 @@ async def on_event_batch(self, partition_context: PartitionContext, events: List
                 logger.error(e)
 
     async def complete_events(self, msg_ids: List[str]):
-        """Completes the events and update the checkpoint.
-
-        Args:
-            msg_ids (List[str]): The list of message IDs to complete.
-        """
         for msg_id in msg_ids:
             logger.debug(f"Acking {msg_id} event")
             event, partition_context = self.events.pop(msg_id, (None, None))
-
             if event is not None:
                 await partition_context.update_checkpoint(event)
             else:
                 logger.warning(f"Couldn't find event {msg_id} for acknowledging")
 
     def ack(self, msg_ids: List[str]):
-        """Acknowledges the completion of events.
-
-        Args:
-            msg_ids (List[str]): The list of message IDs to acknowledge.
-        """
         asyncio.create_task(self.complete_events(msg_ids))
diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json b/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json
index 908c211c..f663d383 100644
--- a/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json
+++ b/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json
@@ -2,6 +2,7 @@
   "title": "azure.read_event_hub",
   "description": "Read from Azure Event Hub",
   "type": "object",
+  "$inherit": ["streamable"],
   "properties": {
     "event_hub_connection_string": {
       "type": "string",
@@ -23,12 +24,14 @@
       "type": "string",
       "description": "The name of the container within the checkpoint store to store the checkpoints."
     },
-    "batch_size": {
+    "max_batch_size": {
       "type": "integer",
-      "description": "The maximum number of events to receive in each batch.",
+      "minimum": 1,
+      "description": "Maximum number of events to receive in each SDK callback. Renamed from the previous batch_size which used to mean this. Defaults to 300.",
       "default": 300
     }
   },
+  "additionalProperties": false,
   "required": [
     "event_hub_connection_string",
     "event_hub_consumer_group_name",
diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
new file mode 100644
index 00000000..074b7c36
--- /dev/null
+++ b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
@@ -0,0 +1,56 @@
+import pytest
+from jsonschema import ValidationError
+
+from datayoga_core.blocks.azure.read_event_hub.block import Block
+
+
+def _minimal_props(extra=None):
+    base = {
+        "event_hub_connection_string": "Endpoint=sb://x/;SharedAccessKeyName=k;SharedAccessKey=v;EntityPath=eh",
+        "event_hub_consumer_group_name": "$Default",
+        "event_hub_name": "eh",
+        "checkpoint_store_connection_string": "DefaultEndpointsProtocol=https;AccountName=a;AccountKey=k==",
+        "checkpoint_store_container_name": "chk",
+    }
+    if extra:
+        base.update(extra)
+    return base
+
+
+def test_unknown_property_rejected_by_validation():
+    """additionalProperties: false catches typos like 'batch_sz'."""
+    with pytest.raises(ValidationError):
+        Block(_minimal_props({"batch_sz": 300}))
+
+
+def test_max_batch_size_accepted():
+    """The renamed SDK-level property is now max_batch_size."""
+    block = Block(_minimal_props({"max_batch_size": 500, "batch_size": 100}))
+    assert block.properties["max_batch_size"] == 500
+    assert block.properties["batch_size"] == 100
+
+
+def test_max_batch_size_defaults_to_300_when_omitted():
+    """The block's init() reads max_batch_size with a default of 300."""
+    block = Block(_minimal_props())
+    assert int(block.properties.get("max_batch_size", 300)) == 300
+
+
+def test_renamed_schema_has_additional_properties_false():
+    """Schema after rename: max_batch_size + streamable's batch_size/flush_ms,
+    no unknown properties allowed."""
+    block = Block(_minimal_props())
+    schema = block.get_json_schema()
+    assert schema.get("additionalProperties") is False
+    assert "max_batch_size" in schema["properties"]
+    assert "batch_size" in schema["properties"]
+    assert "flush_ms" in schema["properties"]
+
+
+def test_batch_size_300_is_silently_repurposed():
+    """A user upgrading from a pre-rename version with batch_size: 300 (which
+    used to mean SDK callback size) will see their YAML still validate, but
+    batch_size now means pipeline batch size. Documented as breaking change."""
+    block = Block(_minimal_props({"batch_size": 300}))
+    assert block.properties["batch_size"] == 300
+    assert "max_batch_size" not in block.properties

From b67bc4a8e345e202610f27af0271d41e841af686 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 17:15:46 +0300
Subject: [PATCH 21/38] Regenerate JSON schemas and reference docs after
 producer batching (#400)

---
 docs/reference/batchable.md                   | 24 +++++++
 docs/reference/blocks/azure_read_event_hub.md |  5 +-
 docs/reference/blocks/files_read_csv.md       |  1 -
 docs/reference/blocks/relational_read.md      |  1 +
 docs/reference/blocks/std_read.md             | 14 +---
 docs/reference/connections.md                 |  2 +-
 docs/reference/job.md                         |  2 +-
 docs/reference/streamable.md                  | 26 +++++++
 schemas/job.schema.json                       | 68 +++++++++++++++++--
 9 files changed, 121 insertions(+), 22 deletions(-)
 create mode 100644 docs/reference/batchable.md
 create mode 100644 docs/reference/streamable.md

diff --git a/docs/reference/batchable.md b/docs/reference/batchable.md
new file mode 100644
index 00000000..4c344fa8
--- /dev/null
+++ b/docs/reference/batchable.md
@@ -0,0 +1,24 @@
+---
+parent: Reference
+nav_order: 1
+---
+
+# batchable
+
+Producer batching mixin: declares batch_size for producers that yield records in batches.
+
+
+**Properties**
+
+|Name|Type|Description|Required|
+|----|----|-----------|--------|
+|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.<br/>Default: `1000`<br/>Minimum: `1`<br/>||
+
+**Example**
+
+```yaml
+batch_size: 1000
+
+```
+
+
diff --git a/docs/reference/blocks/azure_read_event_hub.md b/docs/reference/blocks/azure_read_event_hub.md
index b247fb30..578b968c 100644
--- a/docs/reference/blocks/azure_read_event_hub.md
+++ b/docs/reference/blocks/azure_read_event_hub.md
@@ -17,12 +17,13 @@ Read from Azure Event Hub
 |**event\_hub\_name**|`string`|The name of the Azure Event Hub.<br/>|yes|
 |**checkpoint\_store\_connection\_string**|`string`|The connection string for the Azure Storage account used as the checkpoint store.<br/>|yes|
 |**checkpoint\_store\_container\_name**|`string`|The name of the container within the checkpoint store to store the checkpoints.<br/>|yes|
-|**batch\_size**|`integer`|The maximum number of events to receive in each batch.<br/>Default: `300`<br/>|no|
+|**max\_batch\_size**|`integer`|Maximum number of events to receive in each SDK callback. Renamed from the previous batch_size which used to mean this. Defaults to 300.<br/>Default: `300`<br/>Minimum: `1`<br/>|no|
 
+**Additional Properties:** not allowed  
 **Example**
 
 ```yaml
-batch_size: 300
+max_batch_size: 300
 
 ```
 
diff --git a/docs/reference/blocks/files_read_csv.md b/docs/reference/blocks/files_read_csv.md
index 3f47237f..8948865a 100644
--- a/docs/reference/blocks/files_read_csv.md
+++ b/docs/reference/blocks/files_read_csv.md
@@ -17,7 +17,6 @@ Read data from CSV
 |[**fields**](#fields)<br/>(List of columns to use)|`string[]`|List of columns to use for extract<br/>Minimal Length: `1`<br/>|no|
 |**skip**|`number`|Number of lines to skip<br/>Default: `0`<br/>Minimum: `0`<br/>|no|
 |**delimiter**|`string`|Delimiter to use for splitting the csv records<br/>Default: `","`<br/>Minimal Length: `1`<br/>Maximal Length: `1`<br/>|no|
-|**batch\_size**|`number`|Number of records to read per batch<br/>Default: `1000`<br/>Minimum: `1`<br/>|no|
 |**quotechar**|`string`|A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '<br/>Default: `"\""`<br/>Minimal Length: `1`<br/>Maximal Length: `1`<br/>|no|
 
 **Additional Properties:** not allowed  
diff --git a/docs/reference/blocks/relational_read.md b/docs/reference/blocks/relational_read.md
index 1b11df44..4bb5248c 100644
--- a/docs/reference/blocks/relational_read.md
+++ b/docs/reference/blocks/relational_read.md
@@ -16,6 +16,7 @@ Read a table from an SQL-compatible data store
 |**schema**<br/>(The table schema of the table)|`string`|If left blank, the default schema of this connection will be used as defined in the connections.dy.yaml<br/>|no|
 |**table**<br/>(The table name)|`string`|Table name<br/>|yes|
 |[**columns**](#columns)<br/>(Optional subset of columns to load)|`array`||no|
+|**fetch\_size**|`integer`|Driver-level rows fetched per round-trip. Defaults to 10000.<br/>Default: `10000`<br/>Minimum: `1`<br/>|no|
 
 **Additional Properties:** not allowed  
 **Example**
diff --git a/docs/reference/blocks/std_read.md b/docs/reference/blocks/std_read.md
index aca1c24a..bee360f2 100644
--- a/docs/reference/blocks/std_read.md
+++ b/docs/reference/blocks/std_read.md
@@ -8,17 +8,7 @@ grand_parent: Reference
 Read from the standard input
 
 
-**Properties**
-
-|Name|Type|Description|Required|
-|----|----|-----------|--------|
-|**batch\_size**|`integer`|Number of records to process in a single batch<br/>Default: `1000`<br/>||
-
-**Example**
-
-```yaml
-batch_size: 1000
-
-```
+**No properties.**
 
+**Additional Properties:** not allowed  
 
diff --git a/docs/reference/connections.md b/docs/reference/connections.md
index bfc2b8d0..580fbb39 100644
--- a/docs/reference/connections.md
+++ b/docs/reference/connections.md
@@ -1,6 +1,6 @@
 ---
 parent: Reference
-nav_order: 1
+nav_order: 2
 ---
 
 # Connections
diff --git a/docs/reference/job.md b/docs/reference/job.md
index ed88211d..615a6da8 100644
--- a/docs/reference/job.md
+++ b/docs/reference/job.md
@@ -1,6 +1,6 @@
 ---
 parent: Reference
-nav_order: 2
+nav_order: 3
 ---
 
 # Job
diff --git a/docs/reference/streamable.md b/docs/reference/streamable.md
new file mode 100644
index 00000000..49f499cd
--- /dev/null
+++ b/docs/reference/streamable.md
@@ -0,0 +1,26 @@
+---
+parent: Reference
+nav_order: 4
+---
+
+# streamable
+
+Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources.
+
+
+**Properties**
+
+|Name|Type|Description|Required|
+|----|----|-----------|--------|
+|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.<br/>Default: `1000`<br/>Minimum: `1`<br/>||
+|**flush\_ms**|`integer`, `null`|If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.<br/>Default: `1000`<br/>Minimum: `1`<br/>||
+
+**Example**
+
+```yaml
+batch_size: 1000
+flush_ms: 1000
+
+```
+
+
diff --git a/schemas/job.schema.json b/schemas/job.schema.json
index 1b2a2533..ad0f20b9 100644
--- a/schemas/job.schema.json
+++ b/schemas/job.schema.json
@@ -111,11 +111,13 @@
           "then": {
             "properties": {
               "with": {
+                "additionalProperties": false,
                 "description": "Read from Azure Event Hub",
                 "properties": {
                   "batch_size": {
-                    "default": 300,
-                    "description": "The maximum number of events to receive in each batch.",
+                    "default": 1000,
+                    "description": "Maximum number of records yielded per downstream batch.",
+                    "minimum": 1,
                     "type": "integer"
                   },
                   "checkpoint_store_connection_string": {
@@ -137,6 +139,18 @@
                   "event_hub_name": {
                     "description": "The name of the Azure Event Hub.",
                     "type": "string"
+                  },
+                  "flush_ms": {
+                    "default": 1000,
+                    "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.",
+                    "minimum": 1,
+                    "type": ["integer", "null"]
+                  },
+                  "max_batch_size": {
+                    "default": 300,
+                    "description": "Maximum number of events to receive in each SDK callback. Renamed from the previous batch_size which used to mean this. Defaults to 300.",
+                    "minimum": 1,
+                    "type": "integer"
                   }
                 },
                 "required": [
@@ -258,9 +272,9 @@
                 "properties": {
                   "batch_size": {
                     "default": 1000,
-                    "description": "Number of records to read per batch",
+                    "description": "Maximum number of records yielded per downstream batch.",
                     "minimum": 1,
-                    "type": "number"
+                    "type": "integer"
                   },
                   "delimiter": {
                     "default": ",",
@@ -366,6 +380,18 @@
                 "description": "Receives HTTP requests and process the data.",
                 "examples": [{ "host": "localhost", "port": 8080 }],
                 "properties": {
+                  "batch_size": {
+                    "default": 1000,
+                    "description": "Maximum number of records yielded per downstream batch.",
+                    "minimum": 1,
+                    "type": "integer"
+                  },
+                  "flush_ms": {
+                    "default": 1000,
+                    "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.",
+                    "minimum": 1,
+                    "type": ["integer", "null"]
+                  },
                   "host": {
                     "default": "0.0.0.0",
                     "description": "Host to listen",
@@ -696,6 +722,12 @@
                 "description": "Read data from parquet",
                 "examples": [{ "file": "data.parquet" }],
                 "properties": {
+                  "batch_size": {
+                    "default": 1000,
+                    "description": "Maximum number of records yielded per downstream batch.",
+                    "minimum": 1,
+                    "type": "integer"
+                  },
                   "file": {
                     "description": "Filename. Can contain a regexp or glob expression",
                     "type": "string"
@@ -825,10 +857,22 @@
                 "additionalProperties": false,
                 "description": "Read from Redis stream",
                 "properties": {
+                  "batch_size": {
+                    "default": 1000,
+                    "description": "Maximum number of records yielded per downstream batch.",
+                    "minimum": 1,
+                    "type": "integer"
+                  },
                   "connection": {
                     "description": "Connection name",
                     "type": "string"
                   },
+                  "flush_ms": {
+                    "default": 1000,
+                    "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.",
+                    "minimum": 1,
+                    "type": ["integer", "null"]
+                  },
                   "snapshot": {
                     "default": false,
                     "description": "Snapshot current entries and quit",
@@ -1022,6 +1066,12 @@
                   }
                 ],
                 "properties": {
+                  "batch_size": {
+                    "default": 1000,
+                    "description": "Maximum number of records yielded per downstream batch.",
+                    "minimum": 1,
+                    "type": "integer"
+                  },
                   "columns": {
                     "examples": [["fname", { "lname": "last_name" }]],
                     "items": {
@@ -1037,6 +1087,12 @@
                     "title": "The connection to use for loading",
                     "type": "string"
                   },
+                  "fetch_size": {
+                    "default": 10000,
+                    "description": "Driver-level rows fetched per round-trip. Defaults to 10000.",
+                    "minimum": 1,
+                    "type": "integer"
+                  },
                   "schema": {
                     "description": "If left blank, the default schema of this connection will be used as defined in the connections.dy.yaml",
                     "examples": ["dbo"],
@@ -1370,11 +1426,13 @@
           "then": {
             "properties": {
               "with": {
+                "additionalProperties": false,
                 "description": "Read from the standard input",
                 "properties": {
                   "batch_size": {
                     "default": 1000,
-                    "description": "Number of records to process in a single batch",
+                    "description": "Maximum number of records yielded per downstream batch.",
+                    "minimum": 1,
                     "type": "integer"
                   }
                 },

From 606946568d2375076bd4253b7bf291fd0bfd7385 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 17:19:12 +0300
Subject: [PATCH 22/38] Resolve $inherit before jsonschema2mk so block docs
 include batch_size (#400)

---
 docs/reference/blocks/azure_read_event_hub.md |  4 +++
 docs/reference/blocks/files_read_csv.md       |  1 +
 docs/reference/blocks/http_receiver.md        |  2 ++
 docs/reference/blocks/parquet_read.md         |  1 +
 docs/reference/blocks/redis_read_stream.md    |  4 +++
 docs/reference/blocks/relational_read.md      |  1 +
 docs/reference/blocks/std_read.md             | 13 ++++++-
 scripts/generate-docs.sh                      | 34 ++++++++++++++++++-
 8 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/docs/reference/blocks/azure_read_event_hub.md b/docs/reference/blocks/azure_read_event_hub.md
index 578b968c..fc3f8e5b 100644
--- a/docs/reference/blocks/azure_read_event_hub.md
+++ b/docs/reference/blocks/azure_read_event_hub.md
@@ -12,6 +12,8 @@ Read from Azure Event Hub
 
 |Name|Type|Description|Required|
 |----|----|-----------|--------|
+|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.<br/>Default: `1000`<br/>Minimum: `1`<br/>|no|
+|**flush\_ms**|`integer`, `null`|If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.<br/>Default: `1000`<br/>Minimum: `1`<br/>|no|
 |**event\_hub\_connection\_string**|`string`|The connection string for the Azure Event Hub namespace.<br/>|yes|
 |**event\_hub\_consumer\_group\_name**|`string`|The name of the consumer group to read events from.<br/>|yes|
 |**event\_hub\_name**|`string`|The name of the Azure Event Hub.<br/>|yes|
@@ -23,6 +25,8 @@ Read from Azure Event Hub
 **Example**
 
 ```yaml
+batch_size: 1000
+flush_ms: 1000
 max_batch_size: 300
 
 ```
diff --git a/docs/reference/blocks/files_read_csv.md b/docs/reference/blocks/files_read_csv.md
index 8948865a..44833e34 100644
--- a/docs/reference/blocks/files_read_csv.md
+++ b/docs/reference/blocks/files_read_csv.md
@@ -12,6 +12,7 @@ Read data from CSV
 
 |Name|Type|Description|Required|
 |----|----|-----------|--------|
+|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.<br/>Default: `1000`<br/>Minimum: `1`<br/>|no|
 |**file**|`string`|Filename. Can contain a regexp or glob expression<br/>|yes|
 |**encoding**|`string`|Encoding to use for reading the file<br/>Default: `"utf-8"`<br/>|no|
 |[**fields**](#fields)<br/>(List of columns to use)|`string[]`|List of columns to use for extract<br/>Minimal Length: `1`<br/>|no|
diff --git a/docs/reference/blocks/http_receiver.md b/docs/reference/blocks/http_receiver.md
index 749cadb4..fa2c4cf2 100644
--- a/docs/reference/blocks/http_receiver.md
+++ b/docs/reference/blocks/http_receiver.md
@@ -12,6 +12,8 @@ Receives HTTP requests and process the data.
 
 |Name|Type|Description|Required|
 |----|----|-----------|--------|
+|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.<br/>Default: `1000`<br/>Minimum: `1`<br/>||
+|**flush\_ms**|`integer`, `null`|If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.<br/>Default: `1000`<br/>Minimum: `1`<br/>||
 |**host**|`string`|Host to listen<br/>Default: `"0.0.0.0"`<br/>||
 |**port**|`integer`|Port to listen<br/>Default: `8080`<br/>||
 
diff --git a/docs/reference/blocks/parquet_read.md b/docs/reference/blocks/parquet_read.md
index ba08da29..10f9f2b6 100644
--- a/docs/reference/blocks/parquet_read.md
+++ b/docs/reference/blocks/parquet_read.md
@@ -12,6 +12,7 @@ Read data from parquet
 
 |Name|Type|Description|Required|
 |----|----|-----------|--------|
+|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.<br/>Default: `1000`<br/>Minimum: `1`<br/>|no|
 |**file**|`string`|Filename. Can contain a regexp or glob expression<br/>|yes|
 
 **Additional Properties:** not allowed  
diff --git a/docs/reference/blocks/redis_read_stream.md b/docs/reference/blocks/redis_read_stream.md
index 3c3b6043..31c0b265 100644
--- a/docs/reference/blocks/redis_read_stream.md
+++ b/docs/reference/blocks/redis_read_stream.md
@@ -12,6 +12,8 @@ Read from Redis stream
 
 |Name|Type|Description|Required|
 |----|----|-----------|--------|
+|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.<br/>Default: `1000`<br/>Minimum: `1`<br/>|no|
+|**flush\_ms**|`integer`, `null`|If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.<br/>Default: `1000`<br/>Minimum: `1`<br/>|no|
 |**connection**|`string`|Connection name<br/>|yes|
 |**stream\_name**<br/>(Source stream name)|`string`|Source stream name<br/>|yes|
 |**snapshot**<br/>(Snapshot current entries and quit)|`boolean`|Snapshot current entries and quit<br/>Default: `false`<br/>|no|
@@ -20,6 +22,8 @@ Read from Redis stream
 **Example**
 
 ```yaml
+batch_size: 1000
+flush_ms: 1000
 snapshot: false
 
 ```
diff --git a/docs/reference/blocks/relational_read.md b/docs/reference/blocks/relational_read.md
index 4bb5248c..b439eb1b 100644
--- a/docs/reference/blocks/relational_read.md
+++ b/docs/reference/blocks/relational_read.md
@@ -12,6 +12,7 @@ Read a table from an SQL-compatible data store
 
 |Name|Type|Description|Required|
 |----|----|-----------|--------|
+|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.<br/>Default: `1000`<br/>Minimum: `1`<br/>|no|
 |**connection**<br/>(The connection to use for loading)|`string`|Logical connection name as defined in the connections.dy.yaml<br/>|yes|
 |**schema**<br/>(The table schema of the table)|`string`|If left blank, the default schema of this connection will be used as defined in the connections.dy.yaml<br/>|no|
 |**table**<br/>(The table name)|`string`|Table name<br/>|yes|
diff --git a/docs/reference/blocks/std_read.md b/docs/reference/blocks/std_read.md
index bee360f2..e2d9481c 100644
--- a/docs/reference/blocks/std_read.md
+++ b/docs/reference/blocks/std_read.md
@@ -8,7 +8,18 @@ grand_parent: Reference
 Read from the standard input
 
 
-**No properties.**
+**Properties**
+
+|Name|Type|Description|Required|
+|----|----|-----------|--------|
+|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.<br/>Default: `1000`<br/>Minimum: `1`<br/>||
 
 **Additional Properties:** not allowed  
+**Example**
+
+```yaml
+batch_size: 1000
+
+```
+
 
diff --git a/scripts/generate-docs.sh b/scripts/generate-docs.sh
index 631bd1ed..7fabd705 100755
--- a/scripts/generate-docs.sh
+++ b/scripts/generate-docs.sh
@@ -37,6 +37,24 @@ done
 rm -rf ./docs/reference/blocks
 mkdir ./docs/reference/blocks
 
+# Pick a Python that can import datayoga_core via PYTHONPATH=core/src.
+if [ -x "./core/.venv/bin/python" ]; then
+  DOC_PYTHON="./core/.venv/bin/python"
+elif [ -x "./venv/bin/python" ]; then
+  DOC_PYTHON="./venv/bin/python"
+else
+  DOC_PYTHON="python3"
+fi
+
+# Track temp files so we can clean them up on exit.
+RESOLVED_TMP_FILES=()
+cleanup_resolved_tmps() {
+  for tmp in "${RESOLVED_TMP_FILES[@]}"; do
+    [ -f "${tmp}" ] && rm -f "${tmp}"
+  done
+}
+trap cleanup_resolved_tmps EXIT
+
 blocks_dir="./core/src/datayoga_core/blocks"
 for schema in $(find ${blocks_dir} -name '*.schema.json' | sort)
 do
@@ -46,7 +64,21 @@ do
   block_package="$(echo ${block_package} | cut -c2- | sed 's/\//_/g')"
   [ ! -z "${block_package}" ] && block_package="${block_package}_"
 
-  npx jsonschema2mk --schema ${schema} --extension yaml-examples \
+  # Resolve $inherit fragments so jsonschema2mk sees the inherited properties
+  # (batch_size, flush_ms, etc.). jsonschema2mk does not understand our custom
+  # $inherit extension, so we materialize a resolved copy first.
+  resolved_tmp="$(mktemp --suffix=.schema.json)"
+  RESOLVED_TMP_FILES+=("${resolved_tmp}")
+  PYTHONPATH=core/src "${DOC_PYTHON}" -c "
+import json, sys
+from datayoga_core.schema_utils import resolve_inherits
+from datayoga_core import utils
+schema = utils.read_json('${schema}')
+resolved = resolve_inherits(schema)
+sys.stdout.write(json.dumps(resolved))
+" > "${resolved_tmp}"
+
+  npx jsonschema2mk --schema "${resolved_tmp}" --extension yaml-examples \
     --extension front-matter --fm.parent "Blocks" --fm.grand_parent "Reference" > \
     "./docs/reference/blocks/${block_package}${doc_name}"
 done

From 9dbc5d37b9d9fddfc51feb129a8d914fdd2ec4fc Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 17:19:58 +0300
Subject: [PATCH 23/38] Document producer batching model in
 processing-strategies (#400)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/processing-strategies.md | 38 +++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/docs/processing-strategies.md b/docs/processing-strategies.md
index 8e9e83be..692d82c1 100644
--- a/docs/processing-strategies.md
+++ b/docs/processing-strategies.md
@@ -64,6 +64,44 @@ Rate limit allows to set guards for the frequency of processing in a given time
 
 The Rate limit strategy defines the number of requests per given time interval. For example, 5 requests a minute. When the limit is reached, processing for this Step will pause until the time period elapses to allow additional calls.
 
+## Producer Batching
+
+Every producer block (any block that reads from a source — `std/read`, `files/read_csv`, `parquet/read`, `relational/read`, `redis/read_stream`, `azure/read_event_hub`, `http/receiver`) accepts a `batch_size` property. The producer base class re-chunks the source's output into batches of at most `batch_size` records, regardless of how the source delivers them (per row, per row group, per `fetchmany`, per network message). Every batch is full except the final one at end-of-stream and, for streaming producers, any partial batch flushed by `flush_ms`.
+
+```yaml
+input:
+  uses: files.read_csv
+  with:
+    file: people.csv
+    batch_size: 500 # downstream steps process up to 500 records per call
+```
+
+Default: `1000`.
+
+### Streaming producers and `flush_ms`
+
+Streaming producers (`redis/read_stream`, `azure/read_event_hub`, `http/receiver`) also accept `flush_ms`. If no new records arrive within that many milliseconds, any partial batch is flushed downstream instead of being held until `batch_size` is reached.
+
+```yaml
+input:
+  uses: redis.read_stream
+  with:
+    connection: my_redis
+    stream_name: events
+    batch_size: 1000
+    flush_ms: 500 # emit a partial batch after 500ms of inactivity
+```
+
+Default: `1000` ms. Set to `null` to disable time-based flushing (records are held until `batch_size` or end-of-stream).
+
+### `relational/read` and `fetch_size`
+
+`relational/read` exposes an extra `fetch_size` property that controls how many rows are pulled from the database driver per round-trip, independent of the pipeline `batch_size`. Default: `10000`. Lower it to reduce memory pressure when rows are wide; raise it to reduce the number of DB round-trips when memory allows.
+
+### `azure/read_event_hub` migration note
+
+In earlier versions, `batch_size` on `azure/read_event_hub` controlled the SDK callback batch size, not the pipeline batch size. As of #400 that property has been renamed to `max_batch_size` to match the SDK semantics, and `batch_size` now consistently means the pipeline batch size, as it does for every other producer.
+
 ## Mix and Match
 
 The processing strategies can be mixed to fit the specific use case. For example, reading records from a Stream one by one, pushing into a parallel processor to perform a transformation, batched and fanned out to multiple processes to load into a relational database in bulk

From 5c2eab4d8a13fb1bff20e426739617198437c6da Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 17:25:55 +0300
Subject: [PATCH 24/38] Clean up http/receiver test teardown (#400)

Explicitly aclose() the producer async generator on consumer exit so the
underlying pump task and aiohttp server are torn down cleanly. Removes a
"Task was destroyed but it is pending!" warning at test teardown.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../blocks/http/receiver/tests/test_http_receiver.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
index 613d91d7..85a40435 100644
--- a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
+++ b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
@@ -21,12 +21,16 @@ async def test_http_receiver_batches_incoming_requests():
     block.init()
 
     received = []
+    gen = block.produce()
 
     async def consumer():
-        async for batch in block.produce():
-            received.append(batch)
-            if sum(len(b) for b in received) >= 60:
-                return
+        try:
+            async for batch in gen:
+                received.append(batch)
+                if sum(len(b) for b in received) >= 60:
+                    return
+        finally:
+            await gen.aclose()
 
     consumer_task = asyncio.create_task(consumer())
     await asyncio.sleep(0.2)  # let server start

From 05f4b015e73bf359f34b83f00df7ee3ab48ce043 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 17:39:38 +0300
Subject: [PATCH 25/38] Add docstrings to all methods touched in this PR (#400)

One-line docstrings on every method, helper, and inner function added or
modified by this PR: Producer base class, all 7 migrated producers, and
the per-block test helpers.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../datayoga_core/blocks/azure/read_event_hub/block.py   | 6 ++++++
 .../blocks/azure/read_event_hub/tests/test_event_hub.py  | 1 +
 core/src/datayoga_core/blocks/files/read_csv/block.py    | 3 +++
 .../blocks/files/read_csv/tests/test_read_csv.py         | 2 ++
 core/src/datayoga_core/blocks/http/receiver/block.py     | 5 +++++
 .../blocks/http/receiver/tests/test_http_receiver.py     | 2 ++
 core/src/datayoga_core/blocks/parquet/read/block.py      | 3 +++
 .../blocks/parquet/read/tests/test_parquet_read.py       | 2 ++
 core/src/datayoga_core/blocks/redis/read_stream/block.py | 5 +++++
 .../redis/read_stream/tests/test_redis_read_stream.py    | 1 +
 core/src/datayoga_core/blocks/relational/read/block.py   | 5 +++++
 .../blocks/relational/read/tests/test_relational_read.py | 8 +++++++-
 core/src/datayoga_core/blocks/std/read/block.py          | 9 +++++++++
 .../datayoga_core/blocks/std/read/tests/test_std_read.py | 1 +
 core/src/datayoga_core/producer.py                       | 4 ++++
 core/src/datayoga_core/tests/test_producer_batching.py   | 6 ++++++
 16 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/block.py b/core/src/datayoga_core/blocks/azure/read_event_hub/block.py
index 77f76d7d..ba4173ba 100644
--- a/core/src/datayoga_core/blocks/azure/read_event_hub/block.py
+++ b/core/src/datayoga_core/blocks/azure/read_event_hub/block.py
@@ -19,6 +19,7 @@ class Block(DyProducer):
     DEFAULT_FLUSH_MS = 1000
 
     def init(self, context: Optional[Context] = None):
+        """Constructs the Event Hub consumer client and the internal message queue."""
         logger.debug(f"Initializing {self.get_block_name()}")
         self.max_batch_size = int(self.properties.get("max_batch_size", 300))
         self.consumer_client = EventHubConsumerClient.from_connection_string(
@@ -33,6 +34,7 @@ def init(self, context: Optional[Context] = None):
         self.messages: asyncio.Queue = asyncio.Queue()
 
     async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        """Starts the receive loop and yields one chunk per drained-queue snapshot."""
         logger.debug(f"Running {self.get_block_name()}")
         logger.debug("Starting event receiving process")
         asyncio.create_task(self.receive_batch())
@@ -45,6 +47,7 @@ async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
             yield chunk
 
     async def receive_batch(self):
+        """Runs the Azure SDK receive loop, dispatching each batch to `on_event_batch`."""
         await self.consumer_client.receive_batch(
             on_event_batch=self.on_event_batch,
             max_batch_size=self.max_batch_size,
@@ -52,6 +55,7 @@ async def receive_batch(self):
         )
 
     async def on_event_batch(self, partition_context: PartitionContext, events: List[EventData]):
+        """SDK callback: parses each event body as JSON and enqueues it for delivery."""
         logger.debug(f"Received batch of events from partition: {partition_context.partition_id}")
         for event in events:
             try:
@@ -64,6 +68,7 @@ async def on_event_batch(self, partition_context: PartitionContext, events: List
                 logger.error(e)
 
     async def complete_events(self, msg_ids: List[str]):
+        """Updates the partition checkpoint for each previously-delivered message id."""
         for msg_id in msg_ids:
             logger.debug(f"Acking {msg_id} event")
             event, partition_context = self.events.pop(msg_id, (None, None))
@@ -73,4 +78,5 @@ async def complete_events(self, msg_ids: List[str]):
                 logger.warning(f"Couldn't find event {msg_id} for acknowledging")
 
     def ack(self, msg_ids: List[str]):
+        """Schedules checkpoint updates for the given message ids."""
         asyncio.create_task(self.complete_events(msg_ids))
diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
index 074b7c36..f0f06cd0 100644
--- a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
+++ b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
@@ -5,6 +5,7 @@
 
 
 def _minimal_props(extra=None):
+    """Returns a minimal set of properties accepted by the Event Hub block schema."""
     base = {
         "event_hub_connection_string": "Endpoint=sb://x/;SharedAccessKeyName=k;SharedAccessKey=v;EntityPath=eh",
         "event_hub_consumer_group_name": "$Default",
diff --git a/core/src/datayoga_core/blocks/files/read_csv/block.py b/core/src/datayoga_core/blocks/files/read_csv/block.py
index 336450dc..8e94f1f7 100644
--- a/core/src/datayoga_core/blocks/files/read_csv/block.py
+++ b/core/src/datayoga_core/blocks/files/read_csv/block.py
@@ -13,8 +13,10 @@
 
 
 class Block(DyProducer, metaclass=ABCMeta):
+    """Producer block that reads records from a CSV file."""
 
     def init(self, context: Optional[Context] = None):
+        """Initializes the block: resolves the CSV file path and reader options."""
         logger.debug(f"Initializing {self.get_block_name()}")
         csv_file = self.properties["file"]
         if os.path.isabs(csv_file) or context is None:
@@ -29,6 +31,7 @@ def init(self, context: Optional[Context] = None):
         self.quotechar = self.properties.get("quotechar", "\"")
 
     async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        """Yields successive `batch_size`-sized chunks of CSV rows."""
         logger.debug("Reading CSV")
         batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
 
diff --git a/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py
index 16cb9b17..22651bb1 100644
--- a/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py
+++ b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py
@@ -6,6 +6,7 @@
 
 
 async def _drain(producer):
+    """Collects all batches emitted by a producer until end-of-stream."""
     out = []
     async for batch in producer.produce():
         out.append(batch)
@@ -14,6 +15,7 @@ async def _drain(producer):
 
 @pytest.fixture
 def csv_path(tmp_path) -> Path:
+    """Writes a 2500-row CSV with a single header row to a temp path."""
     p = tmp_path / "data.csv"
     rows = ["fname,lname"] + [f"first{i},last{i}" for i in range(2500)]
     p.write_text("\n".join(rows) + "\n", encoding="utf-8")
diff --git a/core/src/datayoga_core/blocks/http/receiver/block.py b/core/src/datayoga_core/blocks/http/receiver/block.py
index 3f5b1833..ab0fa60a 100644
--- a/core/src/datayoga_core/blocks/http/receiver/block.py
+++ b/core/src/datayoga_core/blocks/http/receiver/block.py
@@ -15,19 +15,24 @@
 
 
 class Block(DyProducer, metaclass=ABCMeta):
+    """Producer block that exposes an HTTP endpoint and emits POSTed JSON bodies."""
+
     port: int
     host: str
     DEFAULT_FLUSH_MS = 1000
 
     def init(self, context: Optional[Context] = None):
+        """Reads host/port from properties; the HTTP server is started in produce_chunks."""
         logger.debug(f"Initializing {self.get_block_name()}")
         self.port = int(self.properties.get("port", 8080))
         self.host = self.properties.get("host", "0.0.0.0")
 
     async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        """Starts the HTTP server, then yields one chunk per drained queue snapshot."""
         queue: Queue = Queue(maxsize=1000)
 
         async def handler(request: BaseRequest) -> Response:
+            """Parses the incoming HTTP body as JSON and enqueues it for delivery."""
             try:
                 queue.put_nowait(orjson.loads(await request.read()))
                 return HTTPOk()
diff --git a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
index 85a40435..9f93360e 100644
--- a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
+++ b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
@@ -7,6 +7,7 @@
 
 
 def _free_port():
+    """Returns an unused TCP port on localhost."""
     import socket
     with socket.socket() as s:
         s.bind(("127.0.0.1", 0))
@@ -24,6 +25,7 @@ async def test_http_receiver_batches_incoming_requests():
     gen = block.produce()
 
     async def consumer():
+        """Drains the producer until 60 records have arrived, then closes the generator."""
         try:
             async for batch in gen:
                 received.append(batch)
diff --git a/core/src/datayoga_core/blocks/parquet/read/block.py b/core/src/datayoga_core/blocks/parquet/read/block.py
index 1c7128c6..f82604ee 100644
--- a/core/src/datayoga_core/blocks/parquet/read/block.py
+++ b/core/src/datayoga_core/blocks/parquet/read/block.py
@@ -12,8 +12,10 @@
 
 
 class Block(DyProducer, metaclass=ABCMeta):
+    """Producer block that reads records from a Parquet file."""
 
     def init(self, context: Optional[Context] = None):
+        """Initializes the block: resolves the Parquet file path."""
         logger.debug(f"Initializing {self.get_block_name()}")
         parquet_file = self.properties["file"]
         if os.path.isabs(parquet_file) or context is None:
@@ -23,6 +25,7 @@ def init(self, context: Optional[Context] = None):
         logger.debug(f"file: {self.file}")
 
     async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        """Yields one chunk per Parquet row group; the base class re-chunks to `batch_size`."""
         logger.debug("Reading parquet")
         pf = ParquetFile(self.file)
         counter = iter(count())
diff --git a/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py
index ab6d8517..a04bc3fe 100644
--- a/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py
+++ b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py
@@ -7,6 +7,7 @@
 
 
 async def _drain(producer):
+    """Collects all batches emitted by a producer until end-of-stream."""
     out = []
     async for batch in producer.produce():
         out.append(batch)
@@ -15,6 +16,7 @@ async def _drain(producer):
 
 @pytest.fixture
 def parquet_path(tmp_path) -> Path:
+    """Writes a 2500-row Parquet file with three row groups (1000, 1000, 500)."""
     p = tmp_path / "data.parquet"
     df = pd.DataFrame({"i": list(range(2500))})
     from fastparquet import write as fp_write
diff --git a/core/src/datayoga_core/blocks/redis/read_stream/block.py b/core/src/datayoga_core/blocks/redis/read_stream/block.py
index 136d0963..aa464743 100644
--- a/core/src/datayoga_core/blocks/redis/read_stream/block.py
+++ b/core/src/datayoga_core/blocks/redis/read_stream/block.py
@@ -11,9 +11,12 @@
 
 
 class Block(DyProducer):
+    """Producer block that reads messages from a Redis stream consumer group."""
+
     DEFAULT_FLUSH_MS = 1000
 
     def init(self, context: Optional[Context] = None):
+        """Connects to Redis and ensures the consumer group exists on the target stream."""
         logger.debug(f"Initializing {self.get_block_name()}")
         connection_details = Connection.get_connection_details(self.properties["connection"], context)
         self.redis_client = redis_utils.get_client(connection_details)
@@ -27,6 +30,7 @@ def init(self, context: Optional[Context] = None):
             self.redis_client.xgroup_create(self.stream, self.consumer_group, 0)
 
     async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        """Reads pending then new stream messages via XREADGROUP, yielding each response as a chunk."""
         logger.debug(f"Running {self.get_block_name()}")
         batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
         read_pending = True
@@ -57,6 +61,7 @@ async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
             read_pending = False
 
     def ack(self, msg_ids: List[str]):
+        """Acknowledges the given message ids with XACK on the stream consumer group."""
         for msg_id in msg_ids:
             logger.info(f"Acking {msg_id} message in {self.stream} stream of {self.consumer_group} consumer group")
             self.redis_client.xack(self.stream, self.consumer_group, msg_id)
diff --git a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
index f45b8d67..f06936d4 100644
--- a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
+++ b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
@@ -6,6 +6,7 @@
 
 
 def _mk_block(properties, redis_client):
+    """Builds a redis/read_stream Block bypassing its real init() (mocks the Redis client)."""
     block = Block.__new__(Block)
     block.properties = properties
     block.redis_client = redis_client
diff --git a/core/src/datayoga_core/blocks/relational/read/block.py b/core/src/datayoga_core/blocks/relational/read/block.py
index 2b04f3c3..4dd8f026 100644
--- a/core/src/datayoga_core/blocks/relational/read/block.py
+++ b/core/src/datayoga_core/blocks/relational/read/block.py
@@ -11,9 +11,12 @@
 
 
 class Block(DyProducer):
+    """Producer block that reads rows from a SQL-compatible relational database."""
+
     DEFAULT_FETCH_SIZE = 10000
 
     def init(self, context: Optional[Context] = None):
+        """Initializes the engine, autoloads the target table, and opens a connection."""
         self.engine, self.db_type = relational_utils.get_engine(
             self.properties["connection"],
             context,
@@ -33,6 +36,7 @@ def init(self, context: Optional[Context] = None):
         self.connection = self.engine.connect()
 
     async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        """Yields each `fetchmany(fetch_size)` result as a chunk; the base class re-chunks to `batch_size`."""
         fetch_size = int(self.properties.get("fetch_size", self.DEFAULT_FETCH_SIZE))
         result = self.connection.execution_options(stream_results=True).execute(self.tbl.select())
         while True:
@@ -42,5 +46,6 @@ async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
             yield [utils.add_uid(dict(row._asdict())) for row in rows]
 
     def stop(self):
+        """Closes the database connection and disposes of the engine."""
         self.connection.close()
         self.engine.dispose()
diff --git a/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py
index 0fba4629..3e59315b 100644
--- a/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py
+++ b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py
@@ -6,6 +6,7 @@
 
 
 async def _drain(producer):
+    """Collects all batches emitted by a producer until end-of-stream."""
     out = []
     async for batch in producer.produce():
         out.append(batch)
@@ -13,7 +14,7 @@ async def _drain(producer):
 
 
 def _fake_result(rows):
-    """Build a fake SQLAlchemy result that returns rows in fetchmany chunks."""
+    """Builds a fake SQLAlchemy result that returns rows in fetchmany chunks."""
     state = {"i": 0}
 
     def fetchmany(n):
@@ -29,14 +30,19 @@ def fetchmany(n):
 
 
 class _Row:
+    """Stand-in for a SQLAlchemy Row exposing only `_asdict()`."""
+
     def __init__(self, d):
+        """Stores the underlying dict that `_asdict()` will return."""
         self._d = d
 
     def _asdict(self):
+        """Returns the stored dict, matching SQLAlchemy Row's API."""
         return self._d
 
 
 def _mk_block(properties, fake_result):
+    """Builds a relational/read Block without running its real init() (mocks engine/connection)."""
     block = Block.__new__(Block)
     block.properties = properties
     block.connection = MagicMock()
diff --git a/core/src/datayoga_core/blocks/std/read/block.py b/core/src/datayoga_core/blocks/std/read/block.py
index 1c51839d..8ff15811 100644
--- a/core/src/datayoga_core/blocks/std/read/block.py
+++ b/core/src/datayoga_core/blocks/std/read/block.py
@@ -12,10 +12,17 @@
 
 
 class Block(DyProducer):
+    """Producer block that reads JSON records from standard input."""
+
     def init(self, context: Optional[Context] = None):
+        """Initializes the block."""
         logger.debug(f"Initializing {self.get_block_name()}")
 
     async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
+        """Reads all stdin records and yields them as a single chunk.
+
+        The base class re-chunks the output to `batch_size` records per batch.
+        """
         if select.select([sys.stdin], [], [], 0.0)[0]:
             all_records: List[Dict[str, Any]] = []
             for line in sys.stdin:
@@ -29,10 +36,12 @@ async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
 
     @staticmethod
     def get_records(data: str) -> List[Dict[str, Any]]:
+        """Parses a JSON string into a list of records (wraps single objects in a list)."""
         records = orjson.loads(data)
         if isinstance(records, dict):
             records = [records]
         return records
 
     def get_message(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        """Returns the record with a generated message id field added."""
         return {self.MSG_ID_FIELD: str(uuid.uuid4()), **record}
diff --git a/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py
index 609f0915..b588af69 100644
--- a/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py
+++ b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py
@@ -7,6 +7,7 @@
 
 
 async def _drain(producer):
+    """Collects all batches emitted by a producer until end-of-stream."""
     out = []
     async for batch in producer.produce():
         out.append(batch)
diff --git a/core/src/datayoga_core/producer.py b/core/src/datayoga_core/producer.py
index 2b61390d..8b672433 100644
--- a/core/src/datayoga_core/producer.py
+++ b/core/src/datayoga_core/producer.py
@@ -9,7 +9,10 @@
 
 
 class Message:
+    """A message produced by a producer block."""
+
     def __init__(self, msg_id: str, value: Dict[str, Any]):
+        """Initializes a message with an id and a payload value."""
         self.msg_id = msg_id
         self.value = value
 
@@ -54,6 +57,7 @@ async def produce(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
         EOS = object()
 
         async def pump():
+            """Drains produce_chunks() into the queue; signals EOS on exit."""
             try:
                 async for chunk in self.produce_chunks():
                     if chunk:
diff --git a/core/src/datayoga_core/tests/test_producer_batching.py b/core/src/datayoga_core/tests/test_producer_batching.py
index 59601786..695d7e1c 100644
--- a/core/src/datayoga_core/tests/test_producer_batching.py
+++ b/core/src/datayoga_core/tests/test_producer_batching.py
@@ -8,6 +8,7 @@
 
 
 def _msg(i: int) -> dict:
+    """Builds a record carrying the producer MSG_ID_FIELD and a numeric value."""
     return {Producer.MSG_ID_FIELD: str(i), "v": i}
 
 
@@ -15,6 +16,7 @@ class FakeProducer(Producer):
     """Producer driven by a scripted list of chunks plus optional sleeps."""
 
     def __init__(self, properties=None, *, chunks=None, sleep_before=None):
+        """Configures the scripted chunks and optional per-chunk sleep delays."""
         # schema for a FakeProducer; declare batch_size/flush_ms so validation passes
         self._test_schema = {
             "type": "object",
@@ -28,12 +30,15 @@ def __init__(self, properties=None, *, chunks=None, sleep_before=None):
         super().__init__(properties or {})
 
     def get_json_schema(self):
+        """Returns the in-memory test schema (avoids reading from disk)."""
         return self._test_schema
 
     def init(self, context: Optional[Context] = None):
+        """No-op init; FakeProducer doesn't need any setup."""
         pass
 
     async def produce_chunks(self) -> AsyncGenerator[List[Message], None]:
+        """Yields the scripted chunks, optionally sleeping before each one."""
         for i, chunk in enumerate(self._chunks):
             if i < len(self._sleep_before) and self._sleep_before[i]:
                 await asyncio.sleep(self._sleep_before[i])
@@ -41,6 +46,7 @@ async def produce_chunks(self) -> AsyncGenerator[List[Message], None]:
 
 
 async def _drain(producer: Producer):
+    """Collects all batches emitted by a producer until end-of-stream."""
     out = []
     async for batch in producer.produce():
         out.append(batch)

From 0b8d8f77638d17813b11ea60e8841e4f542060b9 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 17:43:10 +0300
Subject: [PATCH 26/38] Fix CI: isort formatting + stdlib-only $inherit
 resolver in docs script (#400)

- Run isort/autopep8 on test files; collapse the blank line between third-party
  imports (pytest, etc.) and datayoga_core imports that isort flagged.
- Rewrite the $inherit resolution in scripts/generate-docs.sh to use only the
  Python standard library, so the docs CI job (which installs only node) no
  longer hits ModuleNotFoundError on prometheus_client when importing
  datayoga_core.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../read_event_hub/tests/test_event_hub.py    |  3 +-
 .../files/read_csv/tests/test_read_csv.py     |  1 -
 .../http/receiver/tests/test_http_receiver.py |  1 -
 .../parquet/read/tests/test_parquet_read.py   |  1 -
 .../tests/test_redis_read_stream.py           |  1 -
 .../read/tests/test_relational_read.py        |  1 -
 .../blocks/std/read/tests/test_std_read.py    |  3 +-
 .../tests/test_producer_batching.py           |  1 -
 .../tests/test_schema_inherit.py              |  2 -
 scripts/generate-docs.sh                      | 45 ++++++++++++-------
 10 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
index f0f06cd0..17cff570 100644
--- a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
+++ b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
@@ -1,7 +1,6 @@
 import pytest
-from jsonschema import ValidationError
-
 from datayoga_core.blocks.azure.read_event_hub.block import Block
+from jsonschema import ValidationError
 
 
 def _minimal_props(extra=None):
diff --git a/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py
index 22651bb1..55fd548e 100644
--- a/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py
+++ b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py
@@ -1,7 +1,6 @@
 from pathlib import Path
 
 import pytest
-
 from datayoga_core.blocks.files.read_csv.block import Block
 
 
diff --git a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
index 9f93360e..4673801d 100644
--- a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
+++ b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
@@ -2,7 +2,6 @@
 
 import aiohttp
 import pytest
-
 from datayoga_core.blocks.http.receiver.block import Block
 
 
diff --git a/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py
index a04bc3fe..b33a3d03 100644
--- a/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py
+++ b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py
@@ -2,7 +2,6 @@
 
 import pandas as pd
 import pytest
-
 from datayoga_core.blocks.parquet.read.block import Block
 
 
diff --git a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
index f06936d4..5d46e99e 100644
--- a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
+++ b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
@@ -1,7 +1,6 @@
 from unittest.mock import MagicMock
 
 import pytest
-
 from datayoga_core.blocks.redis.read_stream.block import Block
 
 
diff --git a/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py
index 3e59315b..6dafd72e 100644
--- a/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py
+++ b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py
@@ -1,7 +1,6 @@
 from unittest.mock import MagicMock
 
 import pytest
-
 from datayoga_core.blocks.relational.read.block import Block
 
 
diff --git a/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py
index b588af69..d9698d16 100644
--- a/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py
+++ b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py
@@ -2,7 +2,6 @@
 
 import orjson
 import pytest
-
 from datayoga_core.blocks.std.read.block import Block
 
 
@@ -24,7 +23,7 @@ async def test_std_read_batches_to_batch_size():
 
     with patch("datayoga_core.blocks.std.read.block.select.select",
                return_value=([object()], [], [])), \
-         patch("datayoga_core.blocks.std.read.block.sys.stdin", fake_stdin):
+            patch("datayoga_core.blocks.std.read.block.sys.stdin", fake_stdin):
         batches = await _drain(block)
 
     assert [len(b) for b in batches] == [1000, 1000, 500]
diff --git a/core/src/datayoga_core/tests/test_producer_batching.py b/core/src/datayoga_core/tests/test_producer_batching.py
index 695d7e1c..34413037 100644
--- a/core/src/datayoga_core/tests/test_producer_batching.py
+++ b/core/src/datayoga_core/tests/test_producer_batching.py
@@ -2,7 +2,6 @@
 from typing import AsyncGenerator, List, Optional
 
 import pytest
-
 from datayoga_core.context import Context
 from datayoga_core.producer import Message, Producer
 
diff --git a/core/src/datayoga_core/tests/test_schema_inherit.py b/core/src/datayoga_core/tests/test_schema_inherit.py
index c22ea2c8..46f448cd 100644
--- a/core/src/datayoga_core/tests/test_schema_inherit.py
+++ b/core/src/datayoga_core/tests/test_schema_inherit.py
@@ -1,10 +1,8 @@
 from pathlib import Path
 
 import pytest
-
 from datayoga_core.schema_utils import resolve_inherits
 
-
 SCHEMAS_DIR = (
     Path(__file__).resolve().parent.parent / "resources" / "schemas"
 )
diff --git a/scripts/generate-docs.sh b/scripts/generate-docs.sh
index 7fabd705..03aa51ae 100755
--- a/scripts/generate-docs.sh
+++ b/scripts/generate-docs.sh
@@ -37,15 +37,6 @@ done
 rm -rf ./docs/reference/blocks
 mkdir ./docs/reference/blocks
 
-# Pick a Python that can import datayoga_core via PYTHONPATH=core/src.
-if [ -x "./core/.venv/bin/python" ]; then
-  DOC_PYTHON="./core/.venv/bin/python"
-elif [ -x "./venv/bin/python" ]; then
-  DOC_PYTHON="./venv/bin/python"
-else
-  DOC_PYTHON="python3"
-fi
-
 # Track temp files so we can clean them up on exit.
 RESOLVED_TMP_FILES=()
 cleanup_resolved_tmps() {
@@ -56,6 +47,7 @@ cleanup_resolved_tmps() {
 trap cleanup_resolved_tmps EXIT
 
 blocks_dir="./core/src/datayoga_core/blocks"
+schemas_dir="./core/src/datayoga_core/resources/schemas"
 for schema in $(find ${blocks_dir} -name '*.schema.json' | sort)
 do
   doc_name="$(awk -F/ '{ print $(NF-1) }' <<<${schema}).md"
@@ -67,16 +59,35 @@ do
   # Resolve $inherit fragments so jsonschema2mk sees the inherited properties
   # (batch_size, flush_ms, etc.). jsonschema2mk does not understand our custom
   # $inherit extension, so we materialize a resolved copy first.
+  # Self-contained Python (stdlib only) so this works in CI without installing
+  # datayoga_core's runtime dependencies.
   resolved_tmp="$(mktemp --suffix=.schema.json)"
   RESOLVED_TMP_FILES+=("${resolved_tmp}")
-  PYTHONPATH=core/src "${DOC_PYTHON}" -c "
-import json, sys
-from datayoga_core.schema_utils import resolve_inherits
-from datayoga_core import utils
-schema = utils.read_json('${schema}')
-resolved = resolve_inherits(schema)
-sys.stdout.write(json.dumps(resolved))
-" > "${resolved_tmp}"
+  python3 - "${schema}" "${schemas_dir}" > "${resolved_tmp}" <<'PYEOF'
+import json
+import os
+import sys
+
+schema_path, schemas_dir = sys.argv[1], sys.argv[2]
+with open(schema_path) as f:
+    schema = json.load(f)
+inherits = schema.get("$inherit") or []
+if inherits:
+    if not isinstance(inherits, list) or not all(isinstance(n, str) for n in inherits):
+        raise SystemExit(f"$inherit must be a list of strings, got {inherits!r}")
+    merged = {}
+    for name in inherits:
+        fragment_path = os.path.join(schemas_dir, f"{name}.schema.json")
+        with open(fragment_path) as f:
+            fragment = json.load(f)
+        if fragment.get("$inherit"):
+            raise SystemExit(f"Nested $inherit in fragment '{name}' is not supported")
+        merged.update(fragment.get("properties", {}))
+    merged.update(schema.get("properties", {}))
+    schema["properties"] = merged
+    schema.pop("$inherit", None)
+json.dump(schema, sys.stdout)
+PYEOF
 
   npx jsonschema2mk --schema "${resolved_tmp}" --extension yaml-examples \
     --extension front-matter --fm.parent "Blocks" --fm.grand_parent "Reference" > \

From 4e3b3fc974441348da2b0c00314b897725aed8c9 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 17:51:25 +0300
Subject: [PATCH 27/38] Add azure-eventhub deps to test extras for CI (#400)

unit-tests CI does `pip install .[test]`. The azure/read_event_hub test
module imports the block, which imports azure.eventhub at module load.
Without azure-eventhub in the test extras, pytest's collection fails on
ModuleNotFoundError. Other producer test modules (parquet, redis, http,
relational) already work because their backing deps are in [test].

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 core/pyproject.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/core/pyproject.toml b/core/pyproject.toml
index 9939fd53..2a55ee25 100644
--- a/core/pyproject.toml
+++ b/core/pyproject.toml
@@ -66,6 +66,8 @@ sqlserver = ["pymssql", "SQLAlchemy"]
 
 test = [
         "aiohttp",
+        "azure-eventhub",
+        "azure-eventhub-checkpointstoreblob-aio",
         "cassandra-driver",
         "fastparquet",
         "ibm_db_sa",

From a3d927595d981e1484e396633122ae9aae0d368d Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 18:38:50 +0300
Subject: [PATCH 28/38] Format superpowers spec and plan with prettier (#400)

The formatting-check CI job runs `prettier --check` on all .md files, including these two.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 ...026-05-28-producer-batching-unification.md | 30 ++++++++++++--
 ...28-producer-batching-unification-design.md | 40 ++++++++++---------
 2 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/docs/superpowers/plans/2026-05-28-producer-batching-unification.md b/docs/superpowers/plans/2026-05-28-producer-batching-unification.md
index a53f0e0f..d4fa4415 100644
--- a/docs/superpowers/plans/2026-05-28-producer-batching-unification.md
+++ b/docs/superpowers/plans/2026-05-28-producer-batching-unification.md
@@ -16,6 +16,7 @@
 ## File Structure
 
 **Created:**
+
 - `core/src/datayoga_core/resources/schemas/batchable.schema.json` — fragment exposing `batch_size`
 - `core/src/datayoga_core/resources/schemas/streamable.schema.json` — fragment exposing `flush_ms` (combined with batchable)
 - `core/src/datayoga_core/schema_utils.py` — `$inherit` resolver used by Block + Job
@@ -36,6 +37,7 @@
 - `core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py`
 
 **Modified:**
+
 - `core/src/datayoga_core/producer.py` — adds `produce_chunks` and a default `produce()` that re-chunks
 - `core/src/datayoga_core/block.py` — `get_json_schema()` runs through `$inherit` resolver
 - `core/src/datayoga_core/job.py` — `get_json_schema()` loop runs each loaded schema through the resolver
@@ -64,6 +66,7 @@
 Adds the `$inherit` convention and the two shared fragments. After this task, schemas referencing `batchable` / `streamable` get the fragments' properties merged in at load time.
 
 **Files:**
+
 - Create: `core/src/datayoga_core/resources/schemas/batchable.schema.json`
 - Create: `core/src/datayoga_core/resources/schemas/streamable.schema.json`
 - Create: `core/src/datayoga_core/schema_utils.py`
@@ -198,6 +201,7 @@ def test_unknown_fragment_raises():
 - [ ] **Step 1.5: Run test to verify it fails**
 
 Run:
+
 ```bash
 cd core && python -m pytest src/datayoga_core/tests/test_schema_inherit.py -v
 ```
@@ -266,6 +270,7 @@ def resolve_inherits(schema: Dict[str, Any], schemas_dir: str = None) -> Dict[st
 - [ ] **Step 1.7: Run test to verify it passes**
 
 Run:
+
 ```bash
 cd core && python -m pytest src/datayoga_core/tests/test_schema_inherit.py -v
 ```
@@ -305,6 +310,7 @@ Note: the `from datayoga_core.schema_utils import resolve_inherits` line is insi
 Modify `core/src/datayoga_core/job.py`. Inside the `for block_type, schema_path in block_info:` loop (around line 240–243), apply the resolver to each loaded schema.
 
 Find this block:
+
 ```python
         for block_type, schema_path in block_info:
             block_types.append(block_type)
@@ -314,6 +320,7 @@ Find this block:
 ```
 
 Replace with:
+
 ```python
         from datayoga_core.schema_utils import resolve_inherits
         for block_type, schema_path in block_info:
@@ -353,6 +360,7 @@ git commit -m "Add \$inherit schema fragment resolver (#400)"
 Add `produce_chunks()` and a default `produce()` that re-chunks. Existing subclasses override `produce()` directly and are unaffected until migrated in later tasks.
 
 **Files:**
+
 - Create: `core/src/datayoga_core/tests/test_producer_batching.py`
 - Modify: `core/src/datayoga_core/producer.py`
 
@@ -492,6 +500,7 @@ async def test_consumer_cancellation_cleans_up_pump():
 - [ ] **Step 2.2: Run tests to verify they fail**
 
 Run:
+
 ```bash
 cd core && python -m pytest src/datayoga_core/tests/test_producer_batching.py -v
 ```
@@ -602,6 +611,7 @@ class Producer(Block):
 ```
 
 Key differences from the current file:
+
 - `produce()` is no longer `@abstractmethod` — it has a default implementation.
 - `produce_chunks()` is the new override hook (not formally `@abstractmethod` so legacy subclasses still validate).
 - `Message` class unchanged.
@@ -609,6 +619,7 @@ Key differences from the current file:
 - [ ] **Step 2.4: Run tests to verify they pass**
 
 Run:
+
 ```bash
 cd core && python -m pytest src/datayoga_core/tests/test_producer_batching.py -v
 ```
@@ -640,6 +651,7 @@ git commit -m "Producer base class re-chunks via produce_chunks (#400)"
 `std/read` already has `batch_size` and a custom `process_batch` accumulator. Replace it with a `produce_chunks` that yields one chunk; the base class re-chunks.
 
 **Files:**
+
 - Modify: `core/src/datayoga_core/blocks/std/read/block.py`
 - Modify: `core/src/datayoga_core/blocks/std/read/block.schema.json`
 
@@ -689,6 +701,7 @@ async def test_std_read_batches_to_batch_size():
 - [ ] **Step 3.2: Run test to verify it fails**
 
 Run:
+
 ```bash
 cd core && python -m pytest src/datayoga_core/blocks/std/read/tests/test_std_read.py -v
 ```
@@ -792,6 +805,7 @@ git commit -m "Migrate std/read to produce_chunks (#400, #296)"
 Replace the `produce()` override and `islice` loop with a `produce_chunks` that yields one chunk per `batch_size` rows. The base class re-chunks to the configured `batch_size`.
 
 **Files:**
+
 - Modify: `core/src/datayoga_core/blocks/files/read_csv/block.py`
 - Modify: `core/src/datayoga_core/blocks/files/read_csv/block.schema.json`
 
@@ -849,7 +863,7 @@ async def test_csv_default_batch_size(csv_path):
 cd core && python -m pytest src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py -v
 ```
 
-Expected: FAIL — current `produce()` works but the tests may pass coincidentally because `files/read_csv` already batches. That's fine; the test exists to *protect* the contract. Proceed to the migration anyway and confirm the test still passes afterward.
+Expected: the tests may already PASS — the current `produce()` in `files/read_csv` already batches, so a failure is not guaranteed at this step. That's fine; the test exists to _protect_ the contract. Proceed to the migration anyway and confirm the test still passes afterward.
 
 - [ ] **Step 4.3: Migrate `files/read_csv` to `produce_chunks`**
 
@@ -1010,6 +1024,7 @@ git commit -m "Migrate files/read_csv to produce_chunks (#400)"
 Today `parquet/read` iterates each row of each row group and yields a single-record list per iteration. Migrate it to yield each row group as a single chunk; the base class re-chunks to `batch_size`.
 
 **Files:**
+
 - Modify: `core/src/datayoga_core/blocks/parquet/read/block.py`
 - Modify: `core/src/datayoga_core/blocks/parquet/read/block.schema.json`
 
@@ -1171,6 +1186,7 @@ git commit -m "Migrate parquet/read to produce_chunks, fix one-by-one yield (#40
 Today `relational/read` does `fetchmany(10000)` then yields one row at a time. Migrate to `produce_chunks` that yields each `fetchmany` result. Add an optional `fetch_size` property; default to 10000 to preserve today's DB round-trip count.
 
 **Files:**
+
 - Modify: `core/src/datayoga_core/blocks/relational/read/block.py`
 - Modify: `core/src/datayoga_core/blocks/relational/read/block.schema.json`
 
@@ -1426,6 +1442,7 @@ git commit -m "Migrate relational/read to produce_chunks, add fetch_size (#400,
 The receiver currently yields one record per HTTP request. Migrate to drain the queue per chunk; `flush_ms` ensures partial batches flush during low-traffic periods.
 
 **Files:**
+
 - Modify: `core/src/datayoga_core/blocks/http/receiver/block.py`
 - Modify: `core/src/datayoga_core/blocks/http/receiver/block.schema.json`
 
@@ -1617,6 +1634,7 @@ git commit -m "Migrate http/receiver to produce_chunks (#400)"
 The redis stream producer yields one record at a time today. Migrate so it requests `count=batch_size` from `xreadgroup` and yields each response as a chunk; `flush_ms` flushes partial batches during low-volume periods.
 
 **Files:**
+
 - Modify: `core/src/datayoga_core/blocks/redis/read_stream/block.py`
 - Modify: `core/src/datayoga_core/blocks/redis/read_stream/block.schema.json`
 
@@ -1823,9 +1841,10 @@ git commit -m "Migrate redis/read_stream to batched xreadgroup (#400, #377)"
 
 ## Task 9: Migrate `azure/read_event_hub` (rename `batch_size` → `max_batch_size`)
 
-Today `batch_size` controls the SDK callback size, not the pipeline batch size. Rename to `max_batch_size`, add `additionalProperties: false`, and use the streamable fragment so the *new* `batch_size` means pipeline batch size.
+Today `batch_size` controls the SDK callback size, not the pipeline batch size. Rename to `max_batch_size`, add `additionalProperties: false`, and use the streamable fragment so the _new_ `batch_size` means pipeline batch size.
 
 **Files:**
+
 - Modify: `core/src/datayoga_core/blocks/azure/read_event_hub/block.py`
 - Modify: `core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json`
 
@@ -2070,6 +2089,7 @@ git commit -m "Migrate azure/read_event_hub; rename batch_size -> max_batch_size
 The aggregated `schemas/job.schema.json` and the per-block markdown in `docs/reference/blocks/` are generated by scripts. After the per-block schema changes, regenerate them.
 
 **Files:**
+
 - Modify: `schemas/job.schema.json`
 - Modify: `docs/reference/blocks/std_read.md`, `files_read_csv.md`, `parquet_read.md`, `relational_read.md`, `redis_read_stream.md`, `http_receiver.md`, `azure_read_event_hub.md` (autogenerated)
 
@@ -2109,6 +2129,7 @@ git commit -m "Regenerate JSON schemas and reference docs after producer batchin
 ## Task 11: Document the producer batching model in processing-strategies
 
 **Files:**
+
 - Modify: `docs/processing-strategies.md`
 
 - [ ] **Step 11.1: Add a section on producer batching**
@@ -2125,7 +2146,7 @@ input:
   uses: files.read_csv
   with:
     file: people.csv
-    batch_size: 500   # downstream steps process 500 records per call
+    batch_size: 500 # downstream steps process 500 records per call
 ```
 
 Default: `1000`.
@@ -2141,7 +2162,7 @@ input:
     connection: my_redis
     stream_name: events
     batch_size: 1000
-    flush_ms: 500   # emit a partial batch after 500ms of inactivity
+    flush_ms: 500 # emit a partial batch after 500ms of inactivity
 ```
 
 Default: `1000` ms. Set to `null` to disable time-based flushing (records are held until `batch_size` or end-of-stream).
@@ -2173,6 +2194,7 @@ cd core && python -m pytest src/datayoga_core/ -v
 ```
 
 Expected: all tests pass. Notably:
+
 - `test_producer_batching.py` (7 tests)
 - `test_schema_inherit.py` (5 tests)
 - `test_std_read.py`, `test_read_csv.py`, `test_parquet_read.py`, `test_relational_read.py`, `test_http_receiver.py`, `test_redis_read_stream.py`, `test_event_hub.py` (12 tests total)
diff --git a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
index 81692cdc..7ef27825 100644
--- a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
+++ b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
@@ -9,15 +9,15 @@
 
 Seven producer blocks each handle (or fail to handle) batching differently:
 
-| Producer | Bounded/Streaming | `batch_size` today | Behavior |
-|---|---|---|---|
-| `std/read` | bounded | yes, default 1000 *(on `batch_size_in_std_read_block` branch)* | custom `process_batch` accumulator |
-| `files/read_csv` | bounded | yes, default 1000 | own `islice(reader, batch_size)` loop |
-| `relational/read` | bounded | **no** — hardcoded `fetchmany(10000)` | yields one row at a time downstream (bug) |
-| `parquet/read` | bounded | **no** | yields one row at a time (bug) |
-| `redis/read_stream` | streaming | **no** | yields one record at a time (bug #377) |
-| `azure/read_event_hub` | streaming | yes, default 300, **but** controls *SDK callback batch size*, not pipeline batch size | drains internal queue in unbounded batches |
-| `http/receiver` | streaming | **no** | yields one record per HTTP request (bug) |
+| Producer               | Bounded/Streaming | `batch_size` today                                                                    | Behavior                                   |
+| ---------------------- | ----------------- | ------------------------------------------------------------------------------------- | ------------------------------------------ |
+| `std/read`             | bounded           | yes, default 1000 _(on `batch_size_in_std_read_block` branch)_                        | custom `process_batch` accumulator         |
+| `files/read_csv`       | bounded           | yes, default 1000                                                                     | own `islice(reader, batch_size)` loop      |
+| `relational/read`      | bounded           | **no** — hardcoded `fetchmany(10000)`                                                 | yields one row at a time downstream (bug)  |
+| `parquet/read`         | bounded           | **no**                                                                                | yields one row at a time (bug)             |
+| `redis/read_stream`    | streaming         | **no**                                                                                | yields one record at a time (bug #377)     |
+| `azure/read_event_hub` | streaming         | yes, default 300, **but** controls _SDK callback batch size_, not pipeline batch size | drains internal queue in unbounded batches |
+| `http/receiver`        | streaming         | **no**                                                                                | yields one record per HTTP request (bug)   |
 
 Four are actively buggy (yielding single records into the pipeline when batches are intended). One uses `batch_size` with a different semantic. Each producer that has implemented batching has done it differently.
 
@@ -115,7 +115,7 @@ async def produce(self) -> AsyncGenerator[List[Message], None]:
             await pump_task
 ```
 
-Why a queue and not `asyncio.wait_for(anext(gen), timeout)`: cancelling `__anext__` on an async generator with side effects (open connections, partial reads) can leave it in a broken state. Cancelling the *pump task* boundary is safe; the generator finishes its current chunk before the pump's `try/finally` runs.
+Why a queue and not `asyncio.wait_for(anext(gen), timeout)`: cancelling `__anext__` on an async generator with side effects (open connections, partial reads) can leave it in a broken state. Cancelling the _pump task_ boundary is safe; the generator finishes its current chunk before the pump's `try/finally` runs.
 
 `flush_ms = None` ⇒ `timeout = None` ⇒ `queue.get()` waits forever ⇒ no time-based flush. Bounded sources don't set `flush_ms` and aren't affected.
 
@@ -124,6 +124,7 @@ Why a queue and not `asyncio.wait_for(anext(gen), timeout)`: cancelling `__anext
 Two shared fragments in `core/src/datayoga_core/resources/schemas/`:
 
 `batchable.schema.json`:
+
 ```json
 {
   "type": "object",
@@ -139,6 +140,7 @@ Two shared fragments in `core/src/datayoga_core/resources/schemas/`:
 ```
 
 `streamable.schema.json`:
+
 ```json
 {
   "type": "object",
@@ -324,15 +326,15 @@ class Block(DyProducer):
 
 ### Defaults summary
 
-| Producer | `batch_size` | `flush_ms` | Other |
-|---|---|---|---|
-| `std/read` | 1000 | — | — |
-| `files/read_csv` | 1000 | — | — |
-| `relational/read` | 1000 | — | optional `fetch_size`, defaults to 10000 |
-| `parquet/read` | 1000 | — | — |
-| `redis/read_stream` | 1000 | 1000 | — |
-| `azure/read_event_hub` | 1000 | 1000 | `max_batch_size` 300 (renamed from old `batch_size`) |
-| `http/receiver` | 1000 | 1000 | — |
+| Producer               | `batch_size` | `flush_ms` | Other                                                |
+| ---------------------- | ------------ | ---------- | ---------------------------------------------------- |
+| `std/read`             | 1000         | —          | —                                                    |
+| `files/read_csv`       | 1000         | —          | —                                                    |
+| `relational/read`      | 1000         | —          | optional `fetch_size`, defaults to 10000             |
+| `parquet/read`         | 1000         | —          | —                                                    |
+| `redis/read_stream`    | 1000         | 1000       | —                                                    |
+| `azure/read_event_hub` | 1000         | 1000       | `max_batch_size` 300 (renamed from old `batch_size`) |
+| `http/receiver`        | 1000         | 1000       | —                                                    |
 
 ## Tests
 

From 5734e033637fca4461527f0538fc1c798a92e10b Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 19:04:39 +0300
Subject: [PATCH 29/38] Address Copilot review: propagate source errors +
 bounded backpressure (#400)

Two correctness fixes flagged by the Copilot PR reviewer:

1. Source errors no longer become silent EOS. The pump now captures
   non-cancellation exceptions and re-raises them on the consumer side
   after flushing the partial buffer, so a Redis disconnect, broken CSV,
   or DB read error fails the job loudly instead of being treated as
   end-of-stream against truncated input.

2. The internal queue is now bounded (maxsize=1), restoring the
   backpressure the old yield-driven model had. Without this, large
   bounded sources (parquet, relational, csv) could pre-load the entire
   table/file into memory while downstream was processing batch 1. The
   pump's `finally: put(EOS)` is skipped on cancellation to avoid
   deadlocking against a full queue.

Also: corrected processing-strategies docs to say "up to batch_size"
instead of "exactly batch_size", since partial batches fire on EOS and
flush_ms timeout.

Three new tests:
- test_source_errors_propagate_instead_of_silent_eos
- test_source_error_flushes_buffer_before_raising
- test_pump_does_not_outrun_consumer_unboundedly

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 core/src/datayoga_core/producer.py            |  33 ++++-
 .../tests/test_producer_batching.py           | 122 ++++++++++++++++++
 docs/processing-strategies.md                 |   2 +-
 ...28-producer-batching-unification-design.md |   5 +-
 4 files changed, 153 insertions(+), 9 deletions(-)

diff --git a/core/src/datayoga_core/producer.py b/core/src/datayoga_core/producer.py
index 8b672433..dc5b05d5 100644
--- a/core/src/datayoga_core/producer.py
+++ b/core/src/datayoga_core/producer.py
@@ -44,30 +44,47 @@ async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
         yield  # pragma: no cover
 
     async def produce(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
-        """Re-chunks `produce_chunks()` output to exact batch_size batches.
+        """Re-chunks `produce_chunks()` output into batches of up to `batch_size`.
+
+        Each batch is exactly `batch_size` except for the last batch on
+        end-of-stream and any partial batch flushed by `flush_ms` inactivity.
 
         Reads `batch_size` and `flush_ms` from properties lazily so subclasses
         don't need to remember to call `super().init()`.
+
+        Source errors raised by `produce_chunks()` propagate to the caller (the
+        job aborts) rather than being treated as a silent end-of-stream. The
+        background pump uses a bounded queue so source reads cannot outpace
+        downstream consumption — the existing backpressure is preserved.
         """
         batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
         flush_ms = self.properties.get("flush_ms", self.DEFAULT_FLUSH_MS)
         timeout = (flush_ms / 1000) if flush_ms else None
 
-        queue: asyncio.Queue = asyncio.Queue()
+        # maxsize=1 keeps the pump at most one queued chunk (plus one pending put)
+        # ahead of the consumer, restoring the backpressure of the old yield-driven model.
+        queue: asyncio.Queue = asyncio.Queue(maxsize=1)
         EOS = object()
+        pump_error: List[BaseException] = []  # length 0 or 1
 
         async def pump():
-            """Drains produce_chunks() into the queue; signals EOS on exit."""
+            """Drains produce_chunks() into the queue; signals EOS on exit and captures errors."""
+            cancelled = False
             try:
                 async for chunk in self.produce_chunks():
                     if chunk:
                         await queue.put(chunk)
             except asyncio.CancelledError:
+                cancelled = True
                 raise
-            except Exception as exc:
-                logger.exception("produce_chunks raised; ending stream: %s", exc)
+            except BaseException as exc:
+                pump_error.append(exc)
             finally:
-                await queue.put(EOS)
+                # Skip the EOS put when cancelled — the consumer's finally is
+                # awaiting us, the queue may be full (maxsize=1), and putting
+                # would deadlock. The consumer won't read EOS anyway.
+                if not cancelled:
+                    await queue.put(EOS)
 
         pump_task = asyncio.create_task(pump())
         buffer: List[Dict[str, Any]] = []
@@ -84,6 +101,10 @@ async def pump():
                 if item is EOS:
                     if buffer:
                         yield buffer
+                    if pump_error:
+                        # Re-raise the source error so the job fails loudly
+                        # instead of treating a truncated read as success.
+                        raise pump_error[0]
                     return
 
                 buffer.extend(item)
diff --git a/core/src/datayoga_core/tests/test_producer_batching.py b/core/src/datayoga_core/tests/test_producer_batching.py
index 34413037..337cc3c6 100644
--- a/core/src/datayoga_core/tests/test_producer_batching.py
+++ b/core/src/datayoga_core/tests/test_producer_batching.py
@@ -129,3 +129,125 @@ async def test_consumer_cancellation_cleans_up_pump():
     # If pump task wasn't cleaned up we'd see a "Task was destroyed but it is
     # pending!" warning here. Sleep briefly so the loop has a chance to surface it.
     await asyncio.sleep(0.1)
+
+
+class _BoomProducer(Producer):
+    """Producer whose produce_chunks() raises after emitting some chunks."""
+
+    def __init__(self, properties, *, before_error, error):
+        """Configures how many chunks to emit before raising."""
+        self._test_schema = {
+            "type": "object",
+            "properties": {"batch_size": {"type": "integer", "minimum": 1}},
+        }
+        self._before_error = before_error
+        self._error = error
+        super().__init__(properties)
+
+    def get_json_schema(self):
+        """Returns the in-memory test schema (avoids reading from disk)."""
+        return self._test_schema
+
+    def init(self, context: Optional[Context] = None):
+        """No-op init; _BoomProducer doesn't need any setup."""
+        pass
+
+    async def produce_chunks(self) -> AsyncGenerator[List[Message], None]:
+        """Emits the scripted lead-in chunks, then raises the configured exception."""
+        for chunk in self._before_error:
+            yield chunk
+        raise self._error
+
+
+@pytest.mark.asyncio
+async def test_source_errors_propagate_instead_of_silent_eos():
+    """A failing source must abort the consumer, not look like clean EOS."""
+    p = _BoomProducer(
+        {"batch_size": 100},
+        before_error=[[_msg(1), _msg(2)]],
+        error=RuntimeError("source connection lost"),
+    )
+    with pytest.raises(RuntimeError, match="source connection lost"):
+        async for _ in p.produce():
+            pass
+
+
+@pytest.mark.asyncio
+async def test_source_error_flushes_buffer_before_raising():
+    """Partial buffer is yielded before the error propagates, so already-read
+    records aren't dropped on top of the error."""
+    p = _BoomProducer(
+        {"batch_size": 1000},
+        before_error=[[_msg(1), _msg(2), _msg(3)]],
+        error=RuntimeError("disk read failed"),
+    )
+    received = []
+    with pytest.raises(RuntimeError, match="disk read failed"):
+        async for batch in p.produce():
+            received.append(batch)
+    assert [len(b) for b in received] == [3]
+
+
+class _CountingProducer(Producer):
+    """Producer that records how many chunks it has been allowed to emit.
+
+    Used to prove the base class applies backpressure (with maxsize=1 the pump
+    stays at most ~2 chunks ahead of the consumer: one queued, one being put).
+    """
+
+    def __init__(self, properties, *, num_chunks, chunk_size, on_emit):
+        """Configures how many fixed-size chunks to emit and a per-emit hook."""
+        self._test_schema = {
+            "type": "object",
+            "properties": {"batch_size": {"type": "integer", "minimum": 1}},
+        }
+        self._num_chunks = num_chunks
+        self._chunk_size = chunk_size
+        self._on_emit = on_emit
+        super().__init__(properties)
+
+    def get_json_schema(self):
+        """Returns the in-memory test schema (avoids reading from disk)."""
+        return self._test_schema
+
+    def init(self, context: Optional[Context] = None):
+        """No-op init; _CountingProducer doesn't need any setup."""
+        pass
+
+    async def produce_chunks(self) -> AsyncGenerator[List[Message], None]:
+        """Yields num_chunks fixed-size chunks, calling on_emit after each yield."""
+        for i in range(self._num_chunks):
+            yield [_msg(i * self._chunk_size + j) for j in range(self._chunk_size)]
+            self._on_emit(i + 1)
+
+
+@pytest.mark.asyncio
+async def test_pump_does_not_outrun_consumer_unboundedly():
+    """With the default bounded queue, the pump stays close to the consumer.
+
+    Without backpressure, the pump would emit all 1000 chunks before the
+    consumer reads any. With maxsize=1 the pump can be at most ~2 chunks
+    ahead at any moment (one being put, one queued).
+    """
+    emitted_count = [0]
+
+    def record_emit(n):
+        emitted_count[0] = n
+
+    p = _CountingProducer(
+        {"batch_size": 100},
+        num_chunks=1000,
+        chunk_size=100,
+        on_emit=record_emit,
+    )
+
+    gen = p.produce()
+    # Pull one batch and observe how far ahead the pump got.
+    await gen.__anext__()
+    # Yield once so the pump gets a chance to advance after the consumer
+    # took one chunk off the queue.
+    await asyncio.sleep(0)
+    ahead = emitted_count[0]
+    await gen.aclose()
+    # Pump should be at most a handful of chunks ahead, not all 1000.
+    assert ahead <= 5, f"pump emitted {ahead} chunks while consumer pulled 1"
diff --git a/docs/processing-strategies.md b/docs/processing-strategies.md
index 692d82c1..2cf186b3 100644
--- a/docs/processing-strategies.md
+++ b/docs/processing-strategies.md
@@ -66,7 +66,7 @@ The Rate limit strategy defines the number of requests per given time interval.
 
 ## Producer Batching
 
-Every producer block (any block that reads from a source — `std/read`, `files/read_csv`, `parquet/read`, `relational/read`, `redis/read_stream`, `azure/read_event_hub`, `http/receiver`) accepts a `batch_size` property. The producer base class re-chunks the source's output into batches of exactly `batch_size` records, regardless of how the source delivers them (per row, per row group, per `fetchmany`, per network message).
+Every producer block (any block that reads from a source — `std/read`, `files/read_csv`, `parquet/read`, `relational/read`, `redis/read_stream`, `azure/read_event_hub`, `http/receiver`) accepts a `batch_size` property. The producer base class re-chunks the source's output into batches of up to `batch_size` records, regardless of how the source delivers them (per row, per row group, per `fetchmany`, per network message). The last batch on end-of-stream and any partial batch flushed by `flush_ms` may be smaller.
 
 ```yaml
 input:
diff --git a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
index 7ef27825..2b96ce05 100644
--- a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
+++ b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
@@ -53,13 +53,14 @@ class Producer(Block):
 
     async def produce(self) -> AsyncGenerator[List[Message], None]:
         """Public entry point. Reads chunks from produce_chunks() and re-emits
-        in exact batch_size slices, with optional time-based flush."""
+        in batches of up to batch_size (smaller on EOS or flush_ms), with
+        bounded backpressure and source-error propagation."""
         ...
 ```
 
 Subclasses override `produce_chunks` instead of `produce`. They emit chunks of any size — whatever's natural to the source (a Parquet row group, a `fetchmany` result, an `xreadgroup` response, an Event Hub callback batch, a single record).
 
-The base class accumulates chunks and re-emits them in exact `batch_size` slices, flushing whatever's left on end-of-stream.
+The base class accumulates chunks and re-emits them in batches of up to `batch_size`, flushing whatever's left on end-of-stream and (for streaming sources) on `flush_ms` inactivity.
 
 ### `batch_size` and `flush_ms` are read lazily
 

From 0dda422520b89c8d8ad017a9f77b8616aab6c988 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 19:10:18 +0300
Subject: [PATCH 30/38] Add docstrings to all test functions added in this PR
 (#400)

One-liner docstrings on every test_* function: producer batching, schema
inherit, and per-block read tests. Matches the docstring coverage applied
to production code earlier in this PR.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../blocks/files/read_csv/tests/test_read_csv.py         | 2 ++
 .../blocks/http/receiver/tests/test_http_receiver.py     | 1 +
 .../blocks/parquet/read/tests/test_parquet_read.py       | 2 ++
 .../redis/read_stream/tests/test_redis_read_stream.py    | 2 ++
 .../blocks/relational/read/tests/test_relational_read.py | 3 +++
 .../datayoga_core/blocks/std/read/tests/test_std_read.py | 1 +
 core/src/datayoga_core/tests/test_producer_batching.py   | 7 +++++++
 core/src/datayoga_core/tests/test_schema_inherit.py      | 9 +++++++++
 8 files changed, 27 insertions(+)

diff --git a/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py
index 55fd548e..a479910a 100644
--- a/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py
+++ b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py
@@ -23,6 +23,7 @@ def csv_path(tmp_path) -> Path:
 
 @pytest.mark.asyncio
 async def test_csv_batches_to_batch_size(csv_path):
+    """2500 CSV rows with batch_size=1000 yields batches of [1000, 1000, 500]."""
     block = Block({"file": str(csv_path), "batch_size": 1000})
     block.init()
     batches = await _drain(block)
@@ -33,6 +34,7 @@ async def test_csv_batches_to_batch_size(csv_path):
 
 @pytest.mark.asyncio
 async def test_csv_default_batch_size(csv_path):
+    """Without batch_size in properties, the default 1000 is applied."""
     block = Block({"file": str(csv_path)})
     block.init()
     batches = await _drain(block)
diff --git a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
index 4673801d..ee187f71 100644
--- a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
+++ b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
@@ -15,6 +15,7 @@ def _free_port():
 
 @pytest.mark.asyncio
 async def test_http_receiver_batches_incoming_requests():
+    """60 POSTs with batch_size=50 + flush_ms=200 yield at least one full batch of 50."""
     port = _free_port()
     block = Block({"host": "127.0.0.1", "port": port,
                    "batch_size": 50, "flush_ms": 200})
diff --git a/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py
index b33a3d03..546f77d9 100644
--- a/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py
+++ b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py
@@ -25,6 +25,7 @@ def parquet_path(tmp_path) -> Path:
 
 @pytest.mark.asyncio
 async def test_parquet_batches_to_batch_size(parquet_path):
+    """2500 rows across three row groups, batch_size=1000 -> [1000, 1000, 500]."""
     block = Block({"file": str(parquet_path), "batch_size": 1000})
     block.init()
     batches = await _drain(block)
@@ -36,6 +37,7 @@ async def test_parquet_batches_to_batch_size(parquet_path):
 
 @pytest.mark.asyncio
 async def test_parquet_rechunks_across_row_groups(parquet_path):
+    """Batches honor batch_size regardless of underlying row-group boundaries."""
     # row groups are [1000, 1000, 500]; batch_size=750 should give batches of
     # [750, 750, 750, 250] regardless of row group boundaries.
     block = Block({"file": str(parquet_path), "batch_size": 750})
diff --git a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
index 5d46e99e..5c4a43f7 100644
--- a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
+++ b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
@@ -18,6 +18,7 @@ def _mk_block(properties, redis_client):
 
 @pytest.mark.asyncio
 async def test_redis_uses_count_equal_to_batch_size():
+    """xreadgroup is called with count=batch_size (closes #377)."""
     redis = MagicMock()
     payload_a = (b"1-0", {b"data": b'{"i": 1}'})
     payload_b = (b"2-0", {b"data": b'{"i": 2}'})
@@ -38,6 +39,7 @@ async def test_redis_uses_count_equal_to_batch_size():
 
 @pytest.mark.asyncio
 async def test_redis_yields_records_as_a_batch_not_one_by_one():
+    """A 5-record xreadgroup response yields one batch of 5, not five batches of 1."""
     redis = MagicMock()
     pages = [(f"{i}-0".encode(), {b"data": f'{{"i": {i}}}'.encode()}) for i in range(5)]
     redis.xreadgroup.side_effect = [
diff --git a/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py
index 6dafd72e..47528712 100644
--- a/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py
+++ b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py
@@ -53,6 +53,7 @@ def _mk_block(properties, fake_result):
 
 @pytest.mark.asyncio
 async def test_relational_read_yields_batches_not_rows():
+    """2500 rows with batch_size=1000 yield [1000, 1000, 500], not 2500 single-row batches."""
     rows = [_Row({"i": i}) for i in range(2500)]
     fake_result = _fake_result(rows)
     block = _mk_block({"batch_size": 1000}, fake_result)
@@ -62,6 +63,7 @@ async def test_relational_read_yields_batches_not_rows():
 
 @pytest.mark.asyncio
 async def test_relational_read_fetch_size_independent_of_batch_size():
+    """fetch_size controls driver round-trips; batch_size controls downstream batches; the two are independent."""
     rows = [_Row({"i": i}) for i in range(5000)]
     fake_result = _fake_result(rows)
     block = _mk_block({"batch_size": 1000, "fetch_size": 2500}, fake_result)
@@ -76,6 +78,7 @@ async def test_relational_read_fetch_size_independent_of_batch_size():
 
 @pytest.mark.asyncio
 async def test_relational_read_default_fetch_size_is_10000():
+    """When fetch_size is omitted, the driver-level fetchmany is called with 10000."""
     rows = [_Row({"i": i}) for i in range(500)]
     fake_result = _fake_result(rows)
     block = _mk_block({}, fake_result)
diff --git a/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py
index d9698d16..6ec3d933 100644
--- a/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py
+++ b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py
@@ -15,6 +15,7 @@ async def _drain(producer):
 
 @pytest.mark.asyncio
 async def test_std_read_batches_to_batch_size():
+    """2500 stdin records with batch_size=1000 yield batches of [1000, 1000, 500]."""
     payload = [{"i": i} for i in range(2500)]
     fake_stdin = [orjson.dumps(payload).decode()]
 
diff --git a/core/src/datayoga_core/tests/test_producer_batching.py b/core/src/datayoga_core/tests/test_producer_batching.py
index 337cc3c6..5ab4f98a 100644
--- a/core/src/datayoga_core/tests/test_producer_batching.py
+++ b/core/src/datayoga_core/tests/test_producer_batching.py
@@ -54,6 +54,7 @@ async def _drain(producer: Producer):
 
 @pytest.mark.asyncio
 async def test_rechunks_one_large_chunk():
+    """One 5000-record chunk + batch_size=1000 yields five batches of 1000."""
     chunks = [[_msg(i) for i in range(5000)]]
     p = FakeProducer({"batch_size": 1000}, chunks=chunks)
     batches = await _drain(p)
@@ -62,6 +63,7 @@ async def test_rechunks_one_large_chunk():
 
 @pytest.mark.asyncio
 async def test_accumulates_small_chunks_and_flushes_on_eos():
+    """Small chunks (200+300+400=900) are accumulated; the partial batch flushes on EOS."""
     chunks = [[_msg(i) for i in range(200)],
               [_msg(i) for i in range(200, 500)],
               [_msg(i) for i in range(500, 900)]]
@@ -72,6 +74,7 @@ async def test_accumulates_small_chunks_and_flushes_on_eos():
 
 @pytest.mark.asyncio
 async def test_partial_final_batch_on_eos():
+    """1500 records + batch_size=1000 yields [1000, 500] — the trailing partial fires on EOS."""
     chunks = [[_msg(i) for i in range(1500)]]
     p = FakeProducer({"batch_size": 1000}, chunks=chunks)
     batches = await _drain(p)
@@ -80,6 +83,7 @@ async def test_partial_final_batch_on_eos():
 
 @pytest.mark.asyncio
 async def test_empty_chunks_are_ignored():
+    """Empty chunks from produce_chunks() don't produce empty batches."""
     chunks = [[], [_msg(1), _msg(2)], [], [_msg(3)]]
     p = FakeProducer({"batch_size": 10}, chunks=chunks)
     batches = await _drain(p)
@@ -88,6 +92,7 @@ async def test_empty_chunks_are_ignored():
 
 @pytest.mark.asyncio
 async def test_flush_ms_emits_partial_on_inactivity():
+    """With flush_ms set, a partial batch is emitted on source inactivity, not held to EOS."""
     # one chunk of 2 records, then a 300ms wait before EOS; flush_ms=100 should
     # flush the partial batch of 2 well before EOS.
     chunks = [[_msg(1), _msg(2)], [_msg(3)]]
@@ -109,6 +114,7 @@ async def test_flush_ms_emits_partial_on_inactivity():
 
 @pytest.mark.asyncio
 async def test_no_flush_ms_holds_records_until_eos():
+    """Without flush_ms, accumulated records stay buffered until batch_size or EOS."""
     chunks = [[_msg(1)], [_msg(2)]]
     sleeps = [0, 0.1]
     p = FakeProducer({"batch_size": 100}, chunks=chunks, sleep_before=sleeps)
@@ -118,6 +124,7 @@ async def test_no_flush_ms_holds_records_until_eos():
 
 @pytest.mark.asyncio
 async def test_consumer_cancellation_cleans_up_pump():
+    """Closing the producer generator cancels the pump cleanly (no orphaned task warnings)."""
     chunks = [[_msg(i)] for i in range(1000)]
     p = FakeProducer({"batch_size": 10, "flush_ms": 50}, chunks=chunks,
                      sleep_before=[0.05] * 1000)
diff --git a/core/src/datayoga_core/tests/test_schema_inherit.py b/core/src/datayoga_core/tests/test_schema_inherit.py
index 46f448cd..f01a1dfe 100644
--- a/core/src/datayoga_core/tests/test_schema_inherit.py
+++ b/core/src/datayoga_core/tests/test_schema_inherit.py
@@ -9,6 +9,7 @@
 
 
 def test_inherit_merges_fragment_properties():
+    """A schema with $inherit:[batchable] picks up batch_size from the fragment."""
     schema = {
         "title": "demo",
         "type": "object",
@@ -25,6 +26,7 @@ def test_inherit_merges_fragment_properties():
 
 
 def test_inherit_local_property_wins_over_fragment():
+    """When local schema redefines an inherited property, the local version takes precedence."""
     schema = {
         "type": "object",
         "$inherit": ["batchable"],
@@ -37,6 +39,7 @@ def test_inherit_local_property_wins_over_fragment():
 
 
 def test_inherit_streamable_brings_both_props():
+    """$inherit:[streamable] exposes both batch_size and flush_ms on the schema."""
     schema = {"type": "object", "$inherit": ["streamable"], "properties": {}}
     resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
     assert "batch_size" in resolved["properties"]
@@ -44,6 +47,7 @@ def test_inherit_streamable_brings_both_props():
 
 
 def test_schema_without_inherit_unchanged():
+    """Schemas without $inherit pass through resolve_inherits unmodified."""
     schema = {
         "type": "object",
         "properties": {"foo": {"type": "string"}},
@@ -54,24 +58,28 @@ def test_schema_without_inherit_unchanged():
 
 
 def test_unknown_fragment_raises():
+    """$inherit referencing a missing fragment file raises FileNotFoundError."""
     schema = {"type": "object", "$inherit": ["nope"], "properties": {}}
     with pytest.raises(FileNotFoundError):
         resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
 
 
 def test_inherit_string_value_raises_type_error():
+    """$inherit must be a list; passing a string raises TypeError loudly."""
     schema = {"type": "object", "$inherit": "batchable", "properties": {}}
     with pytest.raises(TypeError):
         resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
 
 
 def test_inherit_non_string_items_raises_type_error():
+    """Non-string items in the $inherit list raise TypeError."""
     schema = {"type": "object", "$inherit": ["batchable", 123], "properties": {}}
     with pytest.raises(TypeError):
         resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
 
 
 def test_inherit_empty_list_returns_unchanged():
+    """An empty $inherit list is a no-op; the schema is returned as-is."""
     schema = {"type": "object", "$inherit": [], "properties": {"foo": {}}}
     resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
     # Early-return path: schema is returned as-is (no mutation, no key removal).
@@ -79,6 +87,7 @@ def test_inherit_empty_list_returns_unchanged():
 
 
 def test_nested_inherit_raises_value_error(tmp_path):
+    """A fragment that itself contains $inherit raises ValueError (no nested inheritance)."""
     # Build a fragment dir with a fragment that has its own $inherit.
     (tmp_path / "parent.schema.json").write_text(
         '{"properties": {"x": {"type": "string"}}}'

From c1e2e7130fd90f8779ceef5481975c2dca91a845 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 19:27:21 +0300
Subject: [PATCH 31/38] Drop meaningless
 test_max_batch_size_defaults_to_300_when_omitted (#400)

Audit pass: the test literally asserted
  int(block.properties.get("max_batch_size", 300)) == 300
which only exercises dict.get's default argument, not Block code. Removed.
Remaining 4 azure tests cover validation, schema shape, accepted properties,
and the documented breaking-change behavior.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../blocks/azure/read_event_hub/tests/test_event_hub.py     | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
index 17cff570..0506ee7b 100644
--- a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
+++ b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
@@ -30,12 +30,6 @@ def test_max_batch_size_accepted():
     assert block.properties["batch_size"] == 100
 
 
-def test_max_batch_size_defaults_to_300_when_omitted():
-    """The block's init() reads max_batch_size with a default of 300."""
-    block = Block(_minimal_props())
-    assert int(block.properties.get("max_batch_size", 300)) == 300
-
-
 def test_renamed_schema_has_additional_properties_false():
     """Schema after rename: max_batch_size + streamable's batch_size/flush_ms,
     no unknown properties allowed."""

From 056f8cf5df32657bb9cf1285de52ed0e4ce75d31 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 19:31:17 +0300
Subject: [PATCH 32/38] Drop retrospective implementation plan; keep design
 spec (#400)

The 2225-line plan was execution scaffolding for the work that now ships.
The spec remains as the architectural record.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 ...026-05-28-producer-batching-unification.md | 2225 -----------------
 1 file changed, 2225 deletions(-)
 delete mode 100644 docs/superpowers/plans/2026-05-28-producer-batching-unification.md

diff --git a/docs/superpowers/plans/2026-05-28-producer-batching-unification.md b/docs/superpowers/plans/2026-05-28-producer-batching-unification.md
deleted file mode 100644
index d4fa4415..00000000
--- a/docs/superpowers/plans/2026-05-28-producer-batching-unification.md
+++ /dev/null
@@ -1,2225 +0,0 @@
-# Producer Batching Unification Implementation Plan
-
-> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
-
-**Goal:** Move batching out of individual producer blocks into the `Producer` base class so every read block has consistent `batch_size` behavior, and three buggy producers stop yielding single records.
-
-**Architecture:** The `Producer` base class gets a new abstract-by-convention hook `produce_chunks()` that yields lists of any size. Its `produce()` method becomes a re-chunker that emits exact `batch_size` batches, with an optional `flush_ms` timeout-flush for streaming sources. Schema fragments (`batchable.schema.json`, `streamable.schema.json`) provide the shared `batch_size`/`flush_ms` definitions, resolved at load time via a `$inherit` convention. Each of the 7 producer blocks migrates to override `produce_chunks` instead of `produce`.
-
-**Tech Stack:** Python 3.7+, asyncio, jsonschema, pytest (asyncio mode), SQLAlchemy, redis-py, aiohttp, azure-eventhub.
-
-**Spec:** `docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md`
-**Issue:** #400
-
----
-
-## File Structure
-
-**Created:**
-
-- `core/src/datayoga_core/resources/schemas/batchable.schema.json` — fragment exposing `batch_size`
-- `core/src/datayoga_core/resources/schemas/streamable.schema.json` — fragment exposing both `batch_size` and `flush_ms` (a superset of `batchable`, for streaming sources)
-- `core/src/datayoga_core/schema_utils.py` — `$inherit` resolver used by Block + Job
-- `core/src/datayoga_core/tests/__init__.py` — empty, makes the tests package importable
-- `core/src/datayoga_core/tests/test_schema_inherit.py` — tests for the `$inherit` resolver
-- `core/src/datayoga_core/tests/test_producer_batching.py` — base-class batching tests
-- `core/src/datayoga_core/blocks/parquet/read/tests/__init__.py` (if package missing)
-- `core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py`
-- `core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py`
-- `core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py`
-- `core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py`
-- `core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py`
-- `core/src/datayoga_core/blocks/http/receiver/tests/__init__.py`
-- `core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py`
-- `core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py`
-- `core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py`
-- `core/src/datayoga_core/blocks/relational/read/tests/__init__.py`
-- `core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py`
-
-**Modified:**
-
-- `core/src/datayoga_core/producer.py` — adds `produce_chunks` and a default `produce()` that re-chunks
-- `core/src/datayoga_core/block.py` — `get_json_schema()` runs through `$inherit` resolver
-- `core/src/datayoga_core/job.py` — `get_json_schema()` loop runs each loaded schema through the resolver
-- `core/src/datayoga_core/blocks/std/read/block.py` — replace `process_batch` with `produce_chunks`
-- `core/src/datayoga_core/blocks/std/read/block.schema.json` — use `$inherit: ["batchable"]`
-- `core/src/datayoga_core/blocks/files/read_csv/block.py` — `produce_chunks` (drop `islice` loop in `produce`)
-- `core/src/datayoga_core/blocks/files/read_csv/block.schema.json` — drop inline `batch_size`, add `$inherit`
-- `core/src/datayoga_core/blocks/parquet/read/block.py` — `produce_chunks` per row group
-- `core/src/datayoga_core/blocks/parquet/read/block.schema.json` — add `$inherit`
-- `core/src/datayoga_core/blocks/relational/read/block.py` — `produce_chunks` with `fetch_size`
-- `core/src/datayoga_core/blocks/relational/read/block.schema.json` — add `$inherit` + `fetch_size` property
-- `core/src/datayoga_core/blocks/redis/read_stream/block.py` — `produce_chunks` with `count=batch_size`
-- `core/src/datayoga_core/blocks/redis/read_stream/block.schema.json` — `$inherit: ["streamable"]`
-- `core/src/datayoga_core/blocks/http/receiver/block.py` — `produce_chunks` drains queue
-- `core/src/datayoga_core/blocks/http/receiver/block.schema.json` — `$inherit: ["streamable"]`
-- `core/src/datayoga_core/blocks/azure/read_event_hub/block.py` — `produce_chunks`, rename `batch_size` → `max_batch_size`
-- `core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json` — rename property, add `additionalProperties: false`, `$inherit: ["streamable"]`
-- `schemas/job.schema.json` — regenerated at the end
-- `docs/reference/blocks/*.md` — regenerated at the end
-- `docs/processing-strategies.md` — new section on producer batching
-
----
-
-## Task 1: Schema fragment loader
-
-Adds the `$inherit` convention and the two shared fragments. After this task, schemas referencing `batchable` / `streamable` get the fragments' properties merged in at load time.
-
-**Files:**
-
-- Create: `core/src/datayoga_core/resources/schemas/batchable.schema.json`
-- Create: `core/src/datayoga_core/resources/schemas/streamable.schema.json`
-- Create: `core/src/datayoga_core/schema_utils.py`
-- Create: `core/src/datayoga_core/tests/__init__.py`
-- Create: `core/src/datayoga_core/tests/test_schema_inherit.py`
-- Modify: `core/src/datayoga_core/block.py` (lines 44–59)
-- Modify: `core/src/datayoga_core/job.py` (lines 223–244)
-
-- [ ] **Step 1.1: Create the `batchable` fragment**
-
-Create `core/src/datayoga_core/resources/schemas/batchable.schema.json`:
-
-```json
-{
-  "title": "batchable",
-  "description": "Producer batching mixin: declares batch_size for producers that yield records in batches.",
-  "type": "object",
-  "properties": {
-    "batch_size": {
-      "type": "integer",
-      "minimum": 1,
-      "description": "Maximum number of records yielded per downstream batch.",
-      "default": 1000
-    }
-  }
-}
-```
-
-- [ ] **Step 1.2: Create the `streamable` fragment**
-
-Create `core/src/datayoga_core/resources/schemas/streamable.schema.json`:
-
-```json
-{
-  "title": "streamable",
-  "description": "Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources.",
-  "type": "object",
-  "properties": {
-    "batch_size": {
-      "type": "integer",
-      "minimum": 1,
-      "description": "Maximum number of records yielded per downstream batch.",
-      "default": 1000
-    },
-    "flush_ms": {
-      "type": ["integer", "null"],
-      "minimum": 1,
-      "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.",
-      "default": 1000
-    }
-  }
-}
-```
-
-- [ ] **Step 1.3: Create empty tests package**
-
-If `core/src/datayoga_core/tests/__init__.py` does not exist, create it as an empty file. (Several test modules in this plan live in `core/src/datayoga_core/tests/`; the directory must be importable.)
-
-```bash
-test -f core/src/datayoga_core/tests/__init__.py || touch core/src/datayoga_core/tests/__init__.py
-```
-
-- [ ] **Step 1.4: Write the failing test for `$inherit` resolution**
-
-Create `core/src/datayoga_core/tests/test_schema_inherit.py`:
-
-```python
-import json
-from pathlib import Path
-
-import pytest
-
-from datayoga_core.schema_utils import resolve_inherits
-
-
-SCHEMAS_DIR = (
-    Path(__file__).resolve().parent.parent / "resources" / "schemas"
-)
-
-
-def test_inherit_merges_fragment_properties():
-    schema = {
-        "title": "demo",
-        "type": "object",
-        "$inherit": ["batchable"],
-        "properties": {"foo": {"type": "string"}},
-        "additionalProperties": False,
-    }
-    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
-    assert "$inherit" not in resolved
-    assert "batch_size" in resolved["properties"]
-    assert resolved["properties"]["batch_size"]["default"] == 1000
-    assert resolved["properties"]["foo"] == {"type": "string"}
-    assert resolved["additionalProperties"] is False
-
-
-def test_inherit_local_property_wins_over_fragment():
-    schema = {
-        "type": "object",
-        "$inherit": ["batchable"],
-        "properties": {
-            "batch_size": {"type": "integer", "minimum": 1, "default": 50}
-        },
-    }
-    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
-    assert resolved["properties"]["batch_size"]["default"] == 50
-
-
-def test_inherit_streamable_brings_both_props():
-    schema = {"type": "object", "$inherit": ["streamable"], "properties": {}}
-    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
-    assert "batch_size" in resolved["properties"]
-    assert "flush_ms" in resolved["properties"]
-
-
-def test_schema_without_inherit_unchanged():
-    schema = {
-        "type": "object",
-        "properties": {"foo": {"type": "string"}},
-        "additionalProperties": False,
-    }
-    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
-    assert resolved == schema
-
-
-def test_unknown_fragment_raises():
-    schema = {"type": "object", "$inherit": ["nope"], "properties": {}}
-    with pytest.raises(FileNotFoundError):
-        resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
-```
-
-- [ ] **Step 1.5: Run test to verify it fails**
-
-Run:
-
-```bash
-cd core && python -m pytest src/datayoga_core/tests/test_schema_inherit.py -v
-```
-
-Expected: FAIL with `ModuleNotFoundError: No module named 'datayoga_core.schema_utils'`.
-
-- [ ] **Step 1.6: Implement the resolver**
-
-Create `core/src/datayoga_core/schema_utils.py`:
-
-```python
-"""Schema composition helpers.
-
-Producers and other blocks can declare `"$inherit": ["batchable"]` at the
-top of their block.schema.json to pull in shared property definitions from
-the fragments in resources/schemas/. `resolve_inherits` merges the
-fragments' `properties` into the local schema (local properties win), then
-removes the `$inherit` key. Schemas without `$inherit` are returned as-is.
-"""
-from __future__ import annotations
-
-import copy
-from os import path
-from typing import Any, Dict, List
-
-from datayoga_core import utils
-
-
-def resolve_inherits(schema: Dict[str, Any], schemas_dir: str = None) -> Dict[str, Any]:
-    """Merge any fragments listed in $inherit into the schema's properties.
-
-    Args:
-        schema: The schema to resolve. Mutated in place and also returned.
-        schemas_dir: Directory containing the fragment files. Defaults to
-            the bundled/non-bundled resources/schemas directory.
-
-    Returns:
-        The mutated schema with $inherit removed and fragment properties merged.
-    """
-    inherits: List[str] = schema.get("$inherit") or []
-    if not inherits:
-        return schema
-
-    if schemas_dir is None:
-        schemas_dir = utils.get_resource_path("schemas")
-
-    merged_properties: Dict[str, Any] = {}
-    for fragment_name in inherits:
-        fragment_path = path.join(schemas_dir, f"{fragment_name}.schema.json")
-        if not path.isfile(fragment_path):
-            raise FileNotFoundError(
-                f"Schema fragment '{fragment_name}' not found at {fragment_path}"
-            )
-        fragment = utils.read_json(fragment_path)
-        merged_properties.update(copy.deepcopy(fragment.get("properties", {})))
-
-    # Local properties take precedence over inherited ones.
-    local_properties = schema.get("properties", {})
-    merged_properties.update(local_properties)
-
-    schema["properties"] = merged_properties
-    schema.pop("$inherit", None)
-    return schema
-```
-
-- [ ] **Step 1.7: Run test to verify it passes**
-
-Run:
-
-```bash
-cd core && python -m pytest src/datayoga_core/tests/test_schema_inherit.py -v
-```
-
-Expected: 5 passed.
-
-- [ ] **Step 1.8: Wire resolver into `Block.get_json_schema`**
-
-Modify `core/src/datayoga_core/block.py`. After loading the schema (currently `return utils.read_json(json_schema_file)` on line 59), pass it through the resolver.
-
-Replace lines 44–59 with:
-
-```python
-    def get_json_schema(self) -> Dict[str, Any]:
-        """Returns the JSON Schema for this block.
-
-        Returns:
-            Dict[str, Any]: JSON Schema.
-        """
-        json_schema_file = path.join(
-            utils.get_bundled_dir(),
-            os.path.relpath(
-                os.path.dirname(sys.modules[self.__module__].__file__),
-                start=os.path.dirname(__file__)),
-            "block.schema.json") if utils.is_bundled() else path.join(
-            os.path.dirname(os.path.realpath(sys.modules[self.__module__].__file__)),
-            "block.schema.json")
-        logger.debug(f"loading schema from {json_schema_file}")
-        from datayoga_core.schema_utils import resolve_inherits
-        return resolve_inherits(utils.read_json(json_schema_file))
-```
-
-Note: the `from datayoga_core.schema_utils import resolve_inherits` line is inside the function to avoid a circular import (schema_utils imports from utils, utils imports from block).
-
-- [ ] **Step 1.9: Wire resolver into `Job.get_json_schema`**
-
-Modify `core/src/datayoga_core/job.py`. Inside the `for block_type, schema_path in block_info:` loop (around line 240–243), apply the resolver to each loaded schema.
-
-Find this block:
-
-```python
-        for block_type, schema_path in block_info:
-            block_types.append(block_type)
-            # load schema file
-            schema = utils.read_json(f"{schema_path}")
-            # append to the array of allOf for the full schema
-```
-
-Replace with:
-
-```python
-        from datayoga_core.schema_utils import resolve_inherits
-        for block_type, schema_path in block_info:
-            block_types.append(block_type)
-            # load schema file
-            schema = resolve_inherits(utils.read_json(f"{schema_path}"))
-            # append to the array of allOf for the full schema
-```
-
-- [ ] **Step 1.10: Verify existing block validation still passes**
-
-Run the full core test suite to make sure nothing regressed (no producer is using `$inherit` yet, so behavior should be unchanged):
-
-```bash
-cd core && python -m pytest src/datayoga_core/ -x -q
-```
-
-Expected: all existing tests pass; the 5 new `test_schema_inherit.py` tests also pass.
-
-- [ ] **Step 1.11: Commit**
-
-```bash
-git add core/src/datayoga_core/resources/schemas/batchable.schema.json \
-        core/src/datayoga_core/resources/schemas/streamable.schema.json \
-        core/src/datayoga_core/schema_utils.py \
-        core/src/datayoga_core/tests/__init__.py \
-        core/src/datayoga_core/tests/test_schema_inherit.py \
-        core/src/datayoga_core/block.py \
-        core/src/datayoga_core/job.py
-git commit -m "Add \$inherit schema fragment resolver (#400)"
-```
-
----
-
-## Task 2: Producer base class with batching
-
-Add `produce_chunks()` and a default `produce()` that re-chunks. Existing subclasses override `produce()` directly and are unaffected until migrated in later tasks.
-
-**Files:**
-
-- Create: `core/src/datayoga_core/tests/test_producer_batching.py`
-- Modify: `core/src/datayoga_core/producer.py`
-
-- [ ] **Step 2.1: Write the failing tests**
-
-Create `core/src/datayoga_core/tests/test_producer_batching.py`:
-
-```python
-import asyncio
-from typing import AsyncGenerator, List, Optional
-
-import pytest
-
-from datayoga_core.context import Context
-from datayoga_core.producer import Message, Producer
-
-
-def _msg(i: int) -> dict:
-    return {Producer.MSG_ID_FIELD: str(i), "v": i}
-
-
-class FakeProducer(Producer):
-    """Producer driven by a scripted list of chunks plus optional sleeps."""
-
-    def __init__(self, properties=None, *, chunks=None, sleep_before=None):
-        # schema for a FakeProducer; declare batch_size/flush_ms so validation passes
-        self._test_schema = {
-            "type": "object",
-            "properties": {
-                "batch_size": {"type": "integer", "minimum": 1},
-                "flush_ms": {"type": ["integer", "null"], "minimum": 1},
-            },
-        }
-        self._chunks = chunks or []
-        self._sleep_before = sleep_before or []
-        super().__init__(properties or {})
-
-    def get_json_schema(self):
-        return self._test_schema
-
-    def init(self, context: Optional[Context] = None):
-        pass
-
-    async def produce_chunks(self) -> AsyncGenerator[List[Message], None]:
-        for i, chunk in enumerate(self._chunks):
-            if i < len(self._sleep_before) and self._sleep_before[i]:
-                await asyncio.sleep(self._sleep_before[i])
-            yield chunk
-
-
-async def _drain(producer: Producer):
-    out = []
-    async for batch in producer.produce():
-        out.append(batch)
-    return out
-
-
-@pytest.mark.asyncio
-async def test_rechunks_one_large_chunk():
-    chunks = [[_msg(i) for i in range(5000)]]
-    p = FakeProducer({"batch_size": 1000}, chunks=chunks)
-    batches = await _drain(p)
-    assert [len(b) for b in batches] == [1000, 1000, 1000, 1000, 1000]
-
-
-@pytest.mark.asyncio
-async def test_accumulates_small_chunks_and_flushes_on_eos():
-    chunks = [[_msg(i) for i in range(200)],
-              [_msg(i) for i in range(200, 500)],
-              [_msg(i) for i in range(500, 900)]]
-    p = FakeProducer({"batch_size": 1000}, chunks=chunks)
-    batches = await _drain(p)
-    assert [len(b) for b in batches] == [900]
-
-
-@pytest.mark.asyncio
-async def test_partial_final_batch_on_eos():
-    chunks = [[_msg(i) for i in range(1500)]]
-    p = FakeProducer({"batch_size": 1000}, chunks=chunks)
-    batches = await _drain(p)
-    assert [len(b) for b in batches] == [1000, 500]
-
-
-@pytest.mark.asyncio
-async def test_empty_chunks_are_ignored():
-    chunks = [[], [_msg(1), _msg(2)], [], [_msg(3)]]
-    p = FakeProducer({"batch_size": 10}, chunks=chunks)
-    batches = await _drain(p)
-    assert [len(b) for b in batches] == [3]
-
-
-@pytest.mark.asyncio
-async def test_flush_ms_emits_partial_on_inactivity():
-    # one chunk of 2 records, then a 300ms wait before EOS; flush_ms=100 should
-    # flush the partial batch of 2 well before EOS.
-    chunks = [[_msg(1), _msg(2)], [_msg(3)]]
-    sleeps = [0, 0.3]
-    p = FakeProducer({"batch_size": 100, "flush_ms": 100},
-                     chunks=chunks, sleep_before=sleeps)
-
-    received = []
-    started = asyncio.get_event_loop().time()
-    timings = []
-    async for batch in p.produce():
-        timings.append(asyncio.get_event_loop().time() - started)
-        received.append(batch)
-
-    assert [len(b) for b in received] == [2, 1]
-    # first flush happens because of inactivity (~100ms), not waiting for chunk 2
-    assert timings[0] < 0.25, f"expected first flush before 250ms, got {timings[0]}"
-
-
-@pytest.mark.asyncio
-async def test_no_flush_ms_holds_records_until_eos():
-    chunks = [[_msg(1)], [_msg(2)]]
-    sleeps = [0, 0.1]
-    p = FakeProducer({"batch_size": 100}, chunks=chunks, sleep_before=sleeps)
-    batches = await _drain(p)
-    assert [len(b) for b in batches] == [2]  # combined on EOS, never flushed mid-stream
-
-
-@pytest.mark.asyncio
-async def test_consumer_cancellation_cleans_up_pump():
-    chunks = [[_msg(i)] for i in range(1000)]
-    p = FakeProducer({"batch_size": 10, "flush_ms": 50}, chunks=chunks,
-                     sleep_before=[0.05] * 1000)
-
-    gen = p.produce()
-    first = await gen.__anext__()
-    assert len(first) >= 1
-    await gen.aclose()
-    # If pump task wasn't cleaned up we'd see a "Task was destroyed but it is
-    # pending!" warning here. Sleep briefly so the loop has a chance to surface it.
-    await asyncio.sleep(0.1)
-```
-
-- [ ] **Step 2.2: Run tests to verify they fail**
-
-Run:
-
-```bash
-cd core && python -m pytest src/datayoga_core/tests/test_producer_batching.py -v
-```
-
-Expected: All 7 tests FAIL with `TypeError: Can't instantiate abstract class FakeProducer with abstract methods produce` (because `produce` is currently abstract and `FakeProducer` doesn't override it; it overrides `produce_chunks` which doesn't exist yet).
-
-- [ ] **Step 2.3: Implement the new `Producer` base class**
-
-Replace the contents of `core/src/datayoga_core/producer.py` with:
-
-```python
-import asyncio
-import logging
-from contextlib import suppress
-from typing import Any, AsyncGenerator, Dict, List
-
-from .block import Block
-
-logger = logging.getLogger("dy")
-
-
-class Message:
-    def __init__(self, msg_id: str, value: Dict[str, Any]):
-        self.msg_id = msg_id
-        self.value = value
-
-
-class Producer(Block):
-    """Base class for producer (read) blocks.
-
-    Subclasses override `produce_chunks()` to yield chunks of any size from
-    the source. The default `produce()` re-chunks them to exactly `batch_size`
-    records per batch (smaller on flush_ms timeout or end-of-stream).
-
-    Legacy subclasses may still override `produce()` directly. They bypass
-    the base-class batching and `produce_chunks` is not called.
-    """
-
-    DEFAULT_BATCH_SIZE = 1000
-    DEFAULT_FLUSH_MS = None  # streaming subclasses override to enable timeout flush
-
-    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
-        """Yield natural-size chunks from the source.
-
-        Subclasses should override this method. The base-class `produce()`
-        will re-chunk the output to exact `batch_size` slices.
-        """
-        raise NotImplementedError(
-            f"{type(self).__name__} must override produce_chunks() or produce()"
-        )
-        # Make this an async generator for type-checking purposes.
-        yield  # pragma: no cover
-
-    async def produce(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
-        """Re-chunks `produce_chunks()` output to exact batch_size batches.
-
-        Reads `batch_size` and `flush_ms` from properties lazily so subclasses
-        don't need to remember to call `super().init()`.
-        """
-        batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
-        flush_ms = self.properties.get("flush_ms", self.DEFAULT_FLUSH_MS)
-        timeout = (flush_ms / 1000) if flush_ms else None
-
-        queue: asyncio.Queue = asyncio.Queue()
-        EOS = object()
-
-        async def pump():
-            try:
-                async for chunk in self.produce_chunks():
-                    if chunk:
-                        await queue.put(chunk)
-            except asyncio.CancelledError:
-                raise
-            except Exception as exc:
-                logger.exception("produce_chunks raised; ending stream: %s", exc)
-            finally:
-                await queue.put(EOS)
-
-        pump_task = asyncio.create_task(pump())
-        buffer: List[Dict[str, Any]] = []
-        try:
-            while True:
-                try:
-                    item = await asyncio.wait_for(queue.get(), timeout=timeout)
-                except asyncio.TimeoutError:
-                    if buffer:
-                        yield buffer
-                        buffer = []
-                    continue
-
-                if item is EOS:
-                    if buffer:
-                        yield buffer
-                    return
-
-                buffer.extend(item)
-                while len(buffer) >= batch_size:
-                    yield buffer[:batch_size]
-                    buffer = buffer[batch_size:]
-        finally:
-            pump_task.cancel()
-            with suppress(asyncio.CancelledError, Exception):
-                await pump_task
-
-    def ack(self, msg_ids: List[str]):
-        """Sends acknowledge for the message IDs of records that have been processed."""
-        pass
-```
-
-Key differences from the current file:
-
-- `produce()` is no longer `@abstractmethod` — it has a default implementation.
-- `produce_chunks()` is the new override hook (not formally `@abstractmethod` so legacy subclasses still validate).
-- `Message` class unchanged.
-
-- [ ] **Step 2.4: Run tests to verify they pass**
-
-Run:
-
-```bash
-cd core && python -m pytest src/datayoga_core/tests/test_producer_batching.py -v
-```
-
-Expected: 7 passed.
-
-- [ ] **Step 2.5: Run the full core test suite to confirm no regressions**
-
-Existing producers all still override `produce()`, so their behavior is unchanged.
-
-```bash
-cd core && python -m pytest src/datayoga_core/ -x -q
-```
-
-Expected: all tests pass (including the new `test_producer_batching` and `test_schema_inherit`).
-
-- [ ] **Step 2.6: Commit**
-
-```bash
-git add core/src/datayoga_core/producer.py \
-        core/src/datayoga_core/tests/test_producer_batching.py
-git commit -m "Producer base class re-chunks via produce_chunks (#400)"
-```
-
----
-
-## Task 3: Migrate `std/read`
-
-`std/read` already has `batch_size` and a custom `process_batch` accumulator. Replace it with a `produce_chunks` that yields one chunk; the base class re-chunks.
-
-**Files:**
-
-- Modify: `core/src/datayoga_core/blocks/std/read/block.py`
-- Modify: `core/src/datayoga_core/blocks/std/read/block.schema.json`
-
-- [ ] **Step 3.1: Write the failing test**
-
-There is no existing `tests/` directory under `std/read`. The std/read producer is exercised indirectly by integration tests, but we add a unit test for batching here.
-
-Create `core/src/datayoga_core/blocks/std/read/tests/__init__.py` (empty file) and `core/src/datayoga_core/blocks/std/read/tests/test_std_read.py`:
-
-```python
-import asyncio
-from unittest.mock import patch
-
-import orjson
-import pytest
-
-from datayoga_core.blocks.std.read.block import Block
-
-
-async def _drain(producer):
-    out = []
-    async for batch in producer.produce():
-        out.append(batch)
-    return out
-
-
-@pytest.mark.asyncio
-async def test_std_read_batches_to_batch_size():
-    payload = [{"i": i} for i in range(2500)]
-    fake_stdin = [orjson.dumps(payload).decode()]
-
-    block = Block({"batch_size": 1000})
-    block.init()
-
-    with patch("datayoga_core.blocks.std.read.block.select.select",
-               return_value=([object()], [], [])), \
-         patch("datayoga_core.blocks.std.read.block.sys.stdin", fake_stdin):
-        batches = await _drain(block)
-
-    assert [len(b) for b in batches] == [1000, 1000, 500]
-    # records carry their MSG_ID_FIELD and original payload values
-    flat = [r for b in batches for r in b]
-    assert flat[0]["i"] == 0
-    assert all(Block.MSG_ID_FIELD in r for r in flat)
-```
-
-- [ ] **Step 3.2: Run test to verify it fails**
-
-Run:
-
-```bash
-cd core && python -m pytest src/datayoga_core/blocks/std/read/tests/test_std_read.py -v
-```
-
-Expected: FAIL — the current implementation overrides `produce()` directly and batches through its own `process_batch` helper, so the new base-class `produce()` machinery is never exercised. The test may also fail because the batch logic from the `batch_size_in_std_read_block` branch does not interact cleanly with the test mocks. (The point of this step is to drive the migration; the failure shape is secondary.)
-
-- [ ] **Step 3.3: Migrate `std/read` to `produce_chunks`**
-
-Replace the contents of `core/src/datayoga_core/blocks/std/read/block.py` with:
-
-```python
-import logging
-import select
-import sys
-import uuid
-from typing import Any, AsyncGenerator, Dict, List, Optional
-
-import orjson
-from datayoga_core.context import Context
-from datayoga_core.producer import Producer as DyProducer
-
-logger = logging.getLogger("dy")
-
-
-class Block(DyProducer):
-    def init(self, context: Optional[Context] = None):
-        logger.debug(f"Initializing {self.get_block_name()}")
-
-    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
-        if select.select([sys.stdin], [], [], 0.0)[0]:
-            all_records: List[Dict[str, Any]] = []
-            for line in sys.stdin:
-                all_records.extend(self.get_records(line))
-        else:
-            print("Enter data to process:")
-            all_records = self.get_records(input())
-
-        if all_records:
-            yield [self.get_message(record) for record in all_records]
-
-    @staticmethod
-    def get_records(data: str) -> List[Dict[str, Any]]:
-        records = orjson.loads(data)
-        if isinstance(records, dict):
-            records = [records]
-        return records
-
-    def get_message(self, record: Dict[str, Any]) -> Dict[str, Any]:
-        return {self.MSG_ID_FIELD: str(uuid.uuid4()), **record}
-```
-
-The `process_batch`, `batch_size` init read, and `produce` override are all gone. The base class handles batching.
-
-- [ ] **Step 3.4: Update the schema to use the fragment**
-
-Replace the contents of `core/src/datayoga_core/blocks/std/read/block.schema.json` with:
-
-```json
-{
-  "title": "std.read",
-  "description": "Read from the standard input",
-  "type": "object",
-  "$inherit": ["batchable"],
-  "properties": {},
-  "additionalProperties": false
-}
-```
-
-The `batch_size` declaration now comes from the fragment.
-
-- [ ] **Step 3.5: Run test to verify it passes**
-
-```bash
-cd core && python -m pytest src/datayoga_core/blocks/std/read/tests/test_std_read.py -v
-```
-
-Expected: PASS.
-
-- [ ] **Step 3.6: Run the full core suite**
-
-```bash
-cd core && python -m pytest src/datayoga_core/ -x -q
-```
-
-Expected: all tests pass.
-
-- [ ] **Step 3.7: Commit**
-
-```bash
-git add core/src/datayoga_core/blocks/std/read/block.py \
-        core/src/datayoga_core/blocks/std/read/block.schema.json \
-        core/src/datayoga_core/blocks/std/read/tests/__init__.py \
-        core/src/datayoga_core/blocks/std/read/tests/test_std_read.py
-git commit -m "Migrate std/read to produce_chunks (#400, #296)"
-```
-
----
-
-## Task 4: Migrate `files/read_csv`
-
-Replace the `produce()` override and `islice` loop with a `produce_chunks` that yields one chunk per `batch_size` rows. The base class re-chunks to the configured `batch_size`.
-
-**Files:**
-
-- Modify: `core/src/datayoga_core/blocks/files/read_csv/block.py`
-- Modify: `core/src/datayoga_core/blocks/files/read_csv/block.schema.json`
-
-- [ ] **Step 4.1: Write the failing test**
-
-Create `core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py`:
-
-```python
-from pathlib import Path
-
-import pytest
-
-from datayoga_core.blocks.files.read_csv.block import Block
-
-
-async def _drain(producer):
-    out = []
-    async for batch in producer.produce():
-        out.append(batch)
-    return out
-
-
-@pytest.fixture
-def csv_path(tmp_path) -> Path:
-    p = tmp_path / "data.csv"
-    rows = ["fname,lname"] + [f"first{i},last{i}" for i in range(2500)]
-    p.write_text("\n".join(rows) + "\n", encoding="utf-8")
-    return p
-
-
-@pytest.mark.asyncio
-async def test_csv_batches_to_batch_size(csv_path):
-    block = Block({"file": str(csv_path), "batch_size": 1000, "skip": 1})
-    block.init()
-    batches = await _drain(block)
-    assert [len(b) for b in batches] == [1000, 1000, 500]
-    # message ids are populated
-    assert all(Block.MSG_ID_FIELD in r for b in batches for r in b)
-    # first row content
-    assert batches[0][0]["fname"] == "first0"
-
-
-@pytest.mark.asyncio
-async def test_csv_default_batch_size(csv_path):
-    block = Block({"file": str(csv_path), "skip": 1})
-    block.init()
-    batches = await _drain(block)
-    # default batch_size is 1000
-    assert [len(b) for b in batches] == [1000, 1000, 500]
-```
-
-- [ ] **Step 4.2: Run test to verify it fails**
-
-```bash
-cd core && python -m pytest src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py -v
-```
-
-Expected: PASS or FAIL — `files/read_csv` already batches in its current `produce()`, so these tests may pass before the migration. That's fine; the tests exist to _protect_ the contract. Proceed to the migration anyway and confirm the tests still pass afterward.
-
-- [ ] **Step 4.3: Migrate `files/read_csv` to `produce_chunks`**
-
-Replace the contents of `core/src/datayoga_core/blocks/files/read_csv/block.py` with:
-
-```python
-import logging
-import os
-from abc import ABCMeta
-from contextlib import suppress
-from csv import DictReader
-from itertools import count, islice
-from typing import Any, AsyncGenerator, Dict, List, Optional
-
-from datayoga_core.context import Context
-from datayoga_core.producer import Producer as DyProducer
-
-logger = logging.getLogger("dy")
-
-
-class Block(DyProducer, metaclass=ABCMeta):
-
-    def init(self, context: Optional[Context] = None):
-        logger.debug(f"Initializing {self.get_block_name()}")
-        csv_file = self.properties["file"]
-        if os.path.isabs(csv_file) or context is None:
-            self.file = csv_file
-        else:
-            self.file = os.path.join(context.properties.get("data_path"), csv_file)
-        logger.debug(f"file: {self.file}")
-        self.encoding = self.properties.get("encoding", "utf-8")
-        self.fields = self.properties.get("fields")
-        self.skip = self.properties.get("skip", 0)
-        self.delimiter = self.properties.get("delimiter", ",")
-        self.quotechar = self.properties.get("quotechar", "\"")
-
-    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
-        logger.debug("Reading CSV")
-        batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
-
-        with open(self.file, "r", encoding=self.encoding) as read_obj:
-            reader = DictReader(read_obj, fieldnames=self.fields,
-                                delimiter=self.delimiter, quotechar=self.quotechar)
-            for _ in range(self.skip):
-                with suppress(StopIteration):
-                    next(reader)
-            counter = iter(count())
-            while True:
-                chunk = [
-                    {self.MSG_ID_FIELD: f"{next(counter)}", **record}
-                    for record in islice(reader, batch_size)
-                ]
-                if not chunk:
-                    return
-                yield chunk
-```
-
-`init()` no longer sets `self.batch_size`; `batch_size` is read lazily from the properties in `produce_chunks`.
-
-- [ ] **Step 4.4: Update the schema**
-
-Replace `core/src/datayoga_core/blocks/files/read_csv/block.schema.json` with:
-
-```json
-{
-  "title": "files.read_csv",
-  "description": "Read data from CSV",
-  "type": "object",
-  "$inherit": ["batchable"],
-  "properties": {
-    "file": {
-      "description": "Filename. Can contain a regexp or glob expression",
-      "type": "string"
-    },
-    "encoding": {
-      "description": "Encoding to use for reading the file",
-      "type": "string",
-      "default": "utf-8"
-    },
-    "fields": {
-      "type": "array",
-      "title": "List of columns to use",
-      "description": "List of columns to use for extract",
-      "default": null,
-      "examples": [["fname", "lname"]],
-      "minLength": 1,
-      "additionalItems": true,
-      "items": {
-        "type": "string",
-        "description": "field name",
-        "examples": ["fname"]
-      }
-    },
-    "skip": {
-      "description": "Number of lines to skip",
-      "type": "number",
-      "minimum": 0,
-      "default": 0
-    },
-    "delimiter": {
-      "description": "Delimiter to use for splitting the csv records",
-      "type": "string",
-      "minLength": 1,
-      "maxLength": 1,
-      "default": ","
-    },
-    "quotechar": {
-      "description": "A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '",
-      "type": "string",
-      "minLength": 1,
-      "maxLength": 1,
-      "default": "\""
-    }
-  },
-  "additionalProperties": false,
-  "required": ["file"],
-  "examples": [
-    {
-      "file": "archive.csv",
-      "delimiter": ";"
-    }
-  ]
-}
-```
-
-The `batch_size` inline property is removed; it comes from the `batchable` fragment.
-
-- [ ] **Step 4.5: Run test to verify it passes**
-
-```bash
-cd core && python -m pytest src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py -v
-```
-
-Expected: 2 passed.
-
-- [ ] **Step 4.6: Run the full core suite**
-
-```bash
-cd core && python -m pytest src/datayoga_core/ -x -q
-```
-
-Expected: all tests pass.
-
-- [ ] **Step 4.7: Commit**
-
-```bash
-git add core/src/datayoga_core/blocks/files/read_csv/block.py \
-        core/src/datayoga_core/blocks/files/read_csv/block.schema.json \
-        core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py \
-        core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py
-git commit -m "Migrate files/read_csv to produce_chunks (#400)"
-```
-
----
-
-## Task 5: Migrate `parquet/read` (fixes one-by-one bug)
-
-Today `parquet/read` iterates each row of each row group and yields a single-record list per iteration. Migrate it to yield each row group as a single chunk; the base class re-chunks to `batch_size`.
-
-**Files:**
-
-- Modify: `core/src/datayoga_core/blocks/parquet/read/block.py`
-- Modify: `core/src/datayoga_core/blocks/parquet/read/block.schema.json`
-
-- [ ] **Step 5.1: Write the failing test**
-
-Create `core/src/datayoga_core/blocks/parquet/read/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py`:
-
-```python
-from pathlib import Path
-
-import pandas as pd
-import pytest
-
-from datayoga_core.blocks.parquet.read.block import Block
-
-
-async def _drain(producer):
-    out = []
-    async for batch in producer.produce():
-        out.append(batch)
-    return out
-
-
-@pytest.fixture
-def parquet_path(tmp_path) -> Path:
-    p = tmp_path / "data.parquet"
-    df = pd.DataFrame({"i": list(range(2500))})
-    # row_group_offsets=1000 creates 3 row groups (1000, 1000, 500)
-    from fastparquet import write as fp_write
-    fp_write(str(p), df, row_group_offsets=1000)
-    return p
-
-
-@pytest.mark.asyncio
-async def test_parquet_batches_to_batch_size(parquet_path):
-    block = Block({"file": str(parquet_path), "batch_size": 1000})
-    block.init()
-    batches = await _drain(block)
-    assert [len(b) for b in batches] == [1000, 1000, 500]
-    flat = [r for b in batches for r in b]
-    assert flat[0]["i"] == 0
-    assert all(Block.MSG_ID_FIELD in r for r in flat)
-
-
-@pytest.mark.asyncio
-async def test_parquet_rechunks_across_row_groups(parquet_path):
-    # row groups are [1000, 1000, 500]; batch_size=750 should give batches of
-    # [750, 750, 750, 250] regardless of row group boundaries.
-    block = Block({"file": str(parquet_path), "batch_size": 750})
-    block.init()
-    batches = await _drain(block)
-    assert [len(b) for b in batches] == [750, 750, 750, 250]
-```
-
-- [ ] **Step 5.2: Run test to verify it fails**
-
-```bash
-cd core && python -m pytest src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py -v
-```
-
-Expected: FAIL — current implementation yields batches of size 1, so the assertions fail.
-
-- [ ] **Step 5.3: Migrate `parquet/read`**
-
-Replace the contents of `core/src/datayoga_core/blocks/parquet/read/block.py` with:
-
-```python
-import logging
-import os
-from abc import ABCMeta
-from itertools import count
-from typing import Any, AsyncGenerator, Dict, List, Optional
-
-from datayoga_core.context import Context
-from datayoga_core.producer import Producer as DyProducer
-from fastparquet import ParquetFile
-
-logger = logging.getLogger("dy")
-
-
-class Block(DyProducer, metaclass=ABCMeta):
-
-    def init(self, context: Optional[Context] = None):
-        logger.debug(f"Initializing {self.get_block_name()}")
-        parquet_file = self.properties["file"]
-        if os.path.isabs(parquet_file) or context is None:
-            self.file = parquet_file
-        else:
-            self.file = os.path.join(context.properties.get("data_path"), parquet_file)
-        logger.debug(f"file: {self.file}")
-
-    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
-        logger.debug("Reading parquet")
-        pf = ParquetFile(self.file)
-        counter = iter(count())
-        for df in pf.iter_row_groups():
-            yield [
-                {self.MSG_ID_FIELD: str(next(counter)), **row.to_dict()}
-                for _, row in df.iterrows()
-            ]
-```
-
-- [ ] **Step 5.4: Update the schema**
-
-Replace `core/src/datayoga_core/blocks/parquet/read/block.schema.json` with:
-
-```json
-{
-  "title": "parquet.read",
-  "description": "Read data from parquet",
-  "type": "object",
-  "$inherit": ["batchable"],
-  "properties": {
-    "file": {
-      "description": "Filename. Can contain a regexp or glob expression",
-      "type": "string"
-    }
-  },
-  "additionalProperties": false,
-  "required": ["file"],
-  "examples": [
-    {
-      "file": "data.parquet"
-    }
-  ]
-}
-```
-
-- [ ] **Step 5.5: Run test to verify it passes**
-
-```bash
-cd core && python -m pytest src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py -v
-```
-
-Expected: 2 passed.
-
-- [ ] **Step 5.6: Run the full core suite**
-
-```bash
-cd core && python -m pytest src/datayoga_core/ -x -q
-```
-
-Expected: all tests pass.
-
-- [ ] **Step 5.7: Commit**
-
-```bash
-git add core/src/datayoga_core/blocks/parquet/read/block.py \
-        core/src/datayoga_core/blocks/parquet/read/block.schema.json \
-        core/src/datayoga_core/blocks/parquet/read/tests/__init__.py \
-        core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py
-git commit -m "Migrate parquet/read to produce_chunks, fix one-by-one yield (#400, #293)"
-```
-
----
-
-## Task 6: Migrate `relational/read` (fix bug + add `fetch_size`)
-
-Today `relational/read` does `fetchmany(10000)` and then yields one row at a time. Migrate it to a `produce_chunks` method that yields each `fetchmany` result as a chunk. Add an optional `fetch_size` property; default it to 10000 to preserve today's DB round-trip count.
-
-**Files:**
-
-- Modify: `core/src/datayoga_core/blocks/relational/read/block.py`
-- Modify: `core/src/datayoga_core/blocks/relational/read/block.schema.json`
-
-- [ ] **Step 6.1: Write the failing test**
-
-Create `core/src/datayoga_core/blocks/relational/read/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py`:
-
-```python
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from datayoga_core.blocks.relational.read.block import Block
-
-
-async def _drain(producer):
-    out = []
-    async for batch in producer.produce():
-        out.append(batch)
-    return out
-
-
-def _fake_result(rows):
-    """Build a fake SQLAlchemy result that returns rows in fetchmany chunks."""
-    state = {"i": 0}
-
-    def fetchmany(n):
-        i = state["i"]
-        chunk = rows[i:i + n]
-        state["i"] += len(chunk)
-        return chunk
-
-    res = MagicMock()
-    res.fetchmany.side_effect = fetchmany
-    res.execution_options.return_value = res
-    return res
-
-
-class _Row:
-    def __init__(self, d):
-        self._d = d
-
-    def _asdict(self):
-        return self._d
-
-
-@pytest.mark.asyncio
-async def test_relational_read_yields_batches_not_rows():
-    rows = [_Row({"i": i}) for i in range(2500)]
-    fake_result = _fake_result(rows)
-
-    block = Block.__new__(Block)
-    block.properties = {"batch_size": 1000}
-    block.connection = MagicMock()
-    block.tbl = MagicMock()
-    block.tbl.select.return_value = "SELECT *"
-    block.connection.execution_options.return_value.execute.return_value = fake_result
-
-    batches = await _drain(block)
-    assert [len(b) for b in batches] == [1000, 1000, 500]
-
-
-@pytest.mark.asyncio
-async def test_relational_read_fetch_size_independent_of_batch_size():
-    rows = [_Row({"i": i}) for i in range(5000)]
-    fake_result = _fake_result(rows)
-
-    block = Block.__new__(Block)
-    block.properties = {"batch_size": 1000, "fetch_size": 2500}
-    block.connection = MagicMock()
-    block.tbl = MagicMock()
-    block.tbl.select.return_value = "SELECT *"
-    block.connection.execution_options.return_value.execute.return_value = fake_result
-
-    batches = await _drain(block)
-    # Downstream batches are still batch_size=1000
-    assert [len(b) for b in batches] == [1000, 1000, 1000, 1000, 1000]
-    # Driver fetched in fetch_size=2500 chunks: 2500 rows, 2500 rows, then an empty fetch (3 calls)
-    fetch_sizes = [c.args[0] for c in fake_result.fetchmany.call_args_list]
-    assert fetch_sizes[0] == 2500
-    assert fetch_sizes[1] == 2500
-
-
-@pytest.mark.asyncio
-async def test_relational_read_default_fetch_size_is_10000():
-    rows = [_Row({"i": i}) for i in range(500)]
-    fake_result = _fake_result(rows)
-
-    block = Block.__new__(Block)
-    block.properties = {}
-    block.connection = MagicMock()
-    block.tbl = MagicMock()
-    block.tbl.select.return_value = "SELECT *"
-    block.connection.execution_options.return_value.execute.return_value = fake_result
-
-    await _drain(block)
-    fetch_sizes = [c.args[0] for c in fake_result.fetchmany.call_args_list]
-    assert fetch_sizes[0] == 10000
-```
-
-- [ ] **Step 6.2: Run test to verify it fails**
-
-```bash
-cd core && python -m pytest src/datayoga_core/blocks/relational/read/tests/test_relational_read.py -v
-```
-
-Expected: FAIL — the current `produce()` yields one row at a time, so `[len(b) for b in batches]` is `[1] * 2500`.
-
-- [ ] **Step 6.3: Migrate `relational/read`**
-
-Replace the contents of `core/src/datayoga_core/blocks/relational/read/block.py` with:
-
-```python
-import logging
-from typing import Any, AsyncGenerator, Dict, List, Optional
-
-import sqlalchemy as sa
-from datayoga_core import utils
-from datayoga_core.blocks.relational import utils as relational_utils
-from datayoga_core.context import Context
-from datayoga_core.producer import Producer as DyProducer
-
-logger = logging.getLogger("dy")
-
-
-class Block(DyProducer):
-    DEFAULT_FETCH_SIZE = 10000
-
-    def init(self, context: Optional[Context] = None):
-        self.engine, self.db_type = relational_utils.get_engine(
-            self.properties["connection"],
-            context,
-            autocommit=False,
-        )
-
-        self.schema = self.properties.get("schema")
-        self.table = self.properties.get("table")
-        self.opcode_field = self.properties.get("opcode_field")
-        self.load_strategy = self.properties.get("load_strategy")
-        self.keys = self.properties.get("keys")
-        self.mapping = self.properties.get("mapping")
-
-        self.tbl = sa.Table(self.table, sa.MetaData(schema=self.schema), autoload_with=self.engine)
-
-        logger.debug(f"Connecting to {self.db_type}")
-        self.connection = self.engine.connect()
-
-    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
-        fetch_size = int(self.properties.get("fetch_size", self.DEFAULT_FETCH_SIZE))
-        result = self.connection.execution_options(stream_results=True).execute(self.tbl.select())
-        while True:
-            rows = result.fetchmany(fetch_size)
-            if not rows:
-                return
-            yield [utils.add_uid(dict(row._asdict())) for row in rows]
-
-    def stop(self):
-        self.connection.close()
-        self.engine.dispose()
-```
-
-- [ ] **Step 6.4: Update the schema**
-
-Replace `core/src/datayoga_core/blocks/relational/read/block.schema.json` with:
-
-```json
-{
-  "title": "relational.read",
-  "description": "Read a table from an SQL-compatible data store",
-  "type": "object",
-  "$inherit": ["batchable"],
-  "additionalProperties": false,
-  "examples": [
-    {
-      "id": "read_snowflake",
-      "type": "relational.read",
-      "properties": {
-        "connection": "eu_datalake",
-        "table": "employees",
-        "schema": "dbo"
-      }
-    }
-  ],
-  "properties": {
-    "connection": {
-      "type": "string",
-      "title": "The connection to use for loading",
-      "description": "Logical connection name as defined in the connections.dy.yaml",
-      "examples": ["europe_db", "target", "eu_dwh"]
-    },
-    "schema": {
-      "type": "string",
-      "title": "The table schema of the table",
-      "description": "If left blank, the default schema of this connection will be used as defined in the connections.dy.yaml",
-      "examples": ["dbo"]
-    },
-    "table": {
-      "type": "string",
-      "title": "The table name",
-      "description": "Table name",
-      "examples": ["employees"]
-    },
-    "columns": {
-      "type": "array",
-      "title": "Optional subset of columns to load",
-      "items": {
-        "type": ["string", "object"],
-        "title": "name of column"
-      },
-      "examples": [["fname", { "lname": "last_name" }]]
-    },
-    "fetch_size": {
-      "type": "integer",
-      "minimum": 1,
-      "description": "Driver-level rows fetched per round-trip. Defaults to 10000.",
-      "default": 10000
-    }
-  },
-  "required": ["connection", "table"]
-}
-```
-
-- [ ] **Step 6.5: Run test to verify it passes**
-
-```bash
-cd core && python -m pytest src/datayoga_core/blocks/relational/read/tests/test_relational_read.py -v
-```
-
-Expected: 3 passed.
-
-- [ ] **Step 6.6: Run the full core suite**
-
-```bash
-cd core && python -m pytest src/datayoga_core/ -x -q
-```
-
-Expected: all tests pass.
-
-- [ ] **Step 6.7: Commit**
-
-```bash
-git add core/src/datayoga_core/blocks/relational/read/block.py \
-        core/src/datayoga_core/blocks/relational/read/block.schema.json \
-        core/src/datayoga_core/blocks/relational/read/tests/__init__.py \
-        core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py
-git commit -m "Migrate relational/read to produce_chunks, add fetch_size (#400, #295)"
-```
-
----
-
-## Task 7: Migrate `http/receiver` (fix one-by-one)
-
-The receiver currently yields one record per HTTP request. Migrate to drain the queue per chunk; `flush_ms` ensures partial batches flush during low-traffic periods.
-
-**Files:**
-
-- Modify: `core/src/datayoga_core/blocks/http/receiver/block.py`
-- Modify: `core/src/datayoga_core/blocks/http/receiver/block.schema.json`
-
-- [ ] **Step 7.1: Write the failing test**
-
-Create `core/src/datayoga_core/blocks/http/receiver/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py`:
-
-```python
-import asyncio
-
-import aiohttp
-import pytest
-
-from datayoga_core.blocks.http.receiver.block import Block
-
-
-def _free_port():
-    import socket
-    with socket.socket() as s:
-        s.bind(("127.0.0.1", 0))
-        return s.getsockname()[1]
-
-
-@pytest.mark.asyncio
-async def test_http_receiver_batches_incoming_requests():
-    port = _free_port()
-    block = Block({"host": "127.0.0.1", "port": port,
-                   "batch_size": 50, "flush_ms": 200})
-    block.init()
-
-    received = []
-
-    async def consumer():
-        async for batch in block.produce():
-            received.append(batch)
-            if sum(len(b) for b in received) >= 60:
-                return
-
-    consumer_task = asyncio.create_task(consumer())
-    await asyncio.sleep(0.2)  # let server start
-
-    async with aiohttp.ClientSession() as session:
-        for i in range(60):
-            async with session.post(f"http://127.0.0.1:{port}", json={"i": i}) as r:
-                assert r.status == 200
-
-    await asyncio.wait_for(consumer_task, timeout=5)
-
-    flat = [r for b in received for r in b]
-    assert len(flat) == 60
-    # Most records arrive in a full batch_size=50 batch; the rest arrive as a
-    # partial batch flushed by flush_ms.
-    assert any(len(b) == 50 for b in received)
-    assert all(Block.MSG_ID_FIELD in r for r in flat)
-```
-
-- [ ] **Step 7.2: Run test to verify it fails**
-
-```bash
-cd core && python -m pytest src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py -v
-```
-
-Expected: FAIL — current implementation yields one record per batch; `assert any(len(b) == 50 ...)` is false.
-
-- [ ] **Step 7.3: Migrate `http/receiver`**
-
-Replace the contents of `core/src/datayoga_core/blocks/http/receiver/block.py` with:
-
-```python
-import logging
-from abc import ABCMeta
-from asyncio import Queue
-from contextlib import suppress
-from itertools import count
-from typing import Any, AsyncGenerator, Dict, List, Optional
-
-import orjson
-from aiohttp.web import (BaseRequest, HTTPInternalServerError, HTTPOk,
-                         Response, Server, ServerRunner, TCPSite)
-from datayoga_core.context import Context
-from datayoga_core.producer import Producer as DyProducer
-
-logger = logging.getLogger("dy")
-
-
-class Block(DyProducer, metaclass=ABCMeta):
-    port: int
-    host: str
-    DEFAULT_FLUSH_MS = 1000
-
-    def init(self, context: Optional[Context] = None):
-        logger.debug(f"Initializing {self.get_block_name()}")
-        self.port = int(self.properties.get("port", 8080))
-        self.host = self.properties.get("host", "0.0.0.0")
-
-    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
-        queue: Queue = Queue(maxsize=1000)
-
-        async def handler(request: BaseRequest) -> Response:
-            try:
-                queue.put_nowait(orjson.loads(await request.read()))
-                return HTTPOk()
-            except Exception:
-                logger.exception("Got exception while parsing request:")
-                return HTTPInternalServerError()
-
-        runner = ServerRunner(Server(handler))
-        await runner.setup()
-        srv = TCPSite(runner, self.host, self.port)
-        await srv.start()
-        logger.info(f"Listening on {self.host}:{self.port}...")
-
-        try:
-            counter = iter(count())
-            while True:
-                first = await queue.get()
-                chunk = [{self.MSG_ID_FIELD: f"{next(counter)}", **first}]
-                while not queue.empty():
-                    record = queue.get_nowait()
-                    chunk.append({self.MSG_ID_FIELD: f"{next(counter)}", **record})
-                yield chunk
-        finally:
-            with suppress(Exception):
-                await srv.stop()
-```
-
-- [ ] **Step 7.4: Update the schema**
-
-Replace `core/src/datayoga_core/blocks/http/receiver/block.schema.json` with:
-
-```json
-{
-  "title": "http.receiver",
-  "description": "Receives HTTP requests and process the data.",
-  "type": "object",
-  "$inherit": ["streamable"],
-  "properties": {
-    "host": {
-      "description": "Host to listen",
-      "type": "string",
-      "default": "0.0.0.0"
-    },
-    "port": {
-      "description": "Port to listen",
-      "type": "integer",
-      "default": 8080
-    }
-  },
-  "additionalProperties": false,
-  "examples": [
-    {
-      "host": "localhost",
-      "port": 8080
-    }
-  ]
-}
-```
-
-- [ ] **Step 7.5: Run test to verify it passes**
-
-```bash
-cd core && python -m pytest src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py -v
-```
-
-Expected: 1 passed.
-
-- [ ] **Step 7.6: Run the full core suite**
-
-```bash
-cd core && python -m pytest src/datayoga_core/ -x -q
-```
-
-Expected: all tests pass.
-
-- [ ] **Step 7.7: Commit**
-
-```bash
-git add core/src/datayoga_core/blocks/http/receiver/block.py \
-        core/src/datayoga_core/blocks/http/receiver/block.schema.json \
-        core/src/datayoga_core/blocks/http/receiver/tests/__init__.py \
-        core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py
-git commit -m "Migrate http/receiver to produce_chunks (#400)"
-```
-
----
-
-## Task 8: Migrate `redis/read_stream` (closes #377)
-
-The redis stream producer yields one record at a time today. Migrate so it requests `count=batch_size` from `xreadgroup` and yields each response as a chunk; `flush_ms` flushes partial batches during low-volume periods.
-
-**Files:**
-
-- Modify: `core/src/datayoga_core/blocks/redis/read_stream/block.py`
-- Modify: `core/src/datayoga_core/blocks/redis/read_stream/block.schema.json`
-
-- [ ] **Step 8.1: Write the failing test**
-
-Create `core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py`:
-
-```python
-from unittest.mock import MagicMock
-
-import pytest
-
-from datayoga_core.blocks.redis.read_stream.block import Block
-
-
-def _mk_block(properties, redis_client):
-    block = Block.__new__(Block)
-    block.properties = properties
-    block.redis_client = redis_client
-    block.stream = "mystream"
-    block.snapshot = properties.get("_snapshot", True)
-    block.consumer_group = "g"
-    block.requesting_consumer = "c"
-    return block
-
-
-@pytest.mark.asyncio
-async def test_redis_uses_count_equal_to_batch_size():
-    redis = MagicMock()
-    # First call returns pending messages, second call returns "no new", which
-    # ends snapshot mode.
-    payload_a = (b"1-0", {b"data": b'{"i": 1}'})
-    payload_b = (b"2-0", {b"data": b'{"i": 2}'})
-    redis.xreadgroup.side_effect = [
-        [(b"mystream", [payload_a, payload_b])],  # pending
-        [(b"mystream", [])],                        # nothing new -> exit
-    ]
-
-    block = _mk_block({"batch_size": 250, "_snapshot": True}, redis)
-    batches = []
-    async for b in block.produce():
-        batches.append(b)
-
-    assert all(c.kwargs.get("count") == 250 or (len(c.args) >= 4 and c.args[3] == 250)
-               for c in redis.xreadgroup.call_args_list), \
-        "xreadgroup should be called with count=batch_size"
-
-
-@pytest.mark.asyncio
-async def test_redis_yields_records_as_a_batch_not_one_by_one():
-    redis = MagicMock()
-    pages = [(f"{i}-0".encode(), {b"data": f'{{"i": {i}}}'.encode()}) for i in range(5)]
-    redis.xreadgroup.side_effect = [
-        [(b"mystream", pages)],
-        [(b"mystream", [])],
-    ]
-
-    block = _mk_block({"batch_size": 100, "_snapshot": True}, redis)
-    batches = []
-    async for b in block.produce():
-        batches.append(b)
-
-    # 5 records arrive as one chunk; base class re-emits as one batch of 5.
-    assert [len(b) for b in batches] == [5]
-    assert batches[0][0]["i"] == 0
-```
-
-- [ ] **Step 8.2: Run test to verify it fails**
-
-```bash
-cd core && python -m pytest src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py -v
-```
-
-Expected: FAIL — current `xreadgroup` call passes `count=None`, and the producer yields one record at a time.
-
-- [ ] **Step 8.3: Migrate `redis/read_stream`**
-
-Replace the contents of `core/src/datayoga_core/blocks/redis/read_stream/block.py` with:
-
-```python
-import logging
-from typing import Any, AsyncGenerator, Dict, List, Optional
-
-import datayoga_core.blocks.redis.utils as redis_utils
-import orjson
-from datayoga_core.connection import Connection
-from datayoga_core.context import Context
-from datayoga_core.producer import Producer as DyProducer
-
-logger = logging.getLogger("dy")
-
-
-class Block(DyProducer):
-    DEFAULT_FLUSH_MS = 1000
-
-    def init(self, context: Optional[Context] = None):
-        logger.debug(f"Initializing {self.get_block_name()}")
-        connection_details = Connection.get_connection_details(self.properties["connection"], context)
-        self.redis_client = redis_utils.get_client(connection_details)
-        self.stream = self.properties["stream_name"]
-        self.snapshot = self.properties.get("snapshot", False)
-        self.consumer_group = f'datayoga_job_{context.properties.get("job_name", "") if context else ""}'
-        self.requesting_consumer = "dy_consumer_a"
-        stream_groups = self.redis_client.xinfo_groups(self.stream)
-        if next(filter(lambda x: x["name"] == self.consumer_group, stream_groups), None) is None:
-            logger.info(f"Creating a new {self.consumer_group} consumer group associated with the {self.stream}")
-            self.redis_client.xgroup_create(self.stream, self.consumer_group, 0)
-
-    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
-        logger.debug(f"Running {self.get_block_name()}")
-        batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
-        read_pending = True
-
-        while True:
-            streams = self.redis_client.xreadgroup(
-                self.consumer_group, self.requesting_consumer,
-                {self.stream: "0" if read_pending else ">"},
-                count=batch_size,
-                block=100 if self.snapshot else 0,
-            )
-
-            yielded_any = False
-            for stream in streams:
-                logger.debug(f"Messages in {self.stream} stream (pending: {read_pending}):\n\t{stream}")
-                chunk: List[Dict[str, Any]] = []
-                for key, value in stream[1]:
-                    payload = orjson.loads(value[next(iter(value))])
-                    payload[self.MSG_ID_FIELD] = key
-                    chunk.append(payload)
-                if chunk:
-                    yielded_any = True
-                    yield chunk
-
-            # Snapshot ends after a pending-read followed by a "no new" read.
-            if self.snapshot and not read_pending and not yielded_any:
-                return
-
-            read_pending = False
-
-    def ack(self, msg_ids: List[str]):
-        for msg_id in msg_ids:
-            logger.info(f"Acking {msg_id} message in {self.stream} stream of {self.consumer_group} consumer group")
-            self.redis_client.xack(self.stream, self.consumer_group, msg_id)
-```
-
-Note: snapshot termination is slightly tightened: the loop exits when a non-pending read returns no messages, matching the spec's intent. This is more robust than the original `if self.snapshot and not read_pending: break`.
-
-- [ ] **Step 8.4: Update the schema**
-
-Replace `core/src/datayoga_core/blocks/redis/read_stream/block.schema.json` with:
-
-```json
-{
-  "title": "redis.read_stream",
-  "description": "Read from Redis stream",
-  "type": "object",
-  "$inherit": ["streamable"],
-  "properties": {
-    "connection": { "description": "Connection name", "type": "string" },
-    "stream_name": {
-      "type": "string",
-      "title": "Source stream name",
-      "description": "Source stream name"
-    },
-    "snapshot": {
-      "type": "boolean",
-      "title": "Snapshot current entries and quit",
-      "description": "Snapshot current entries and quit",
-      "default": false
-    }
-  },
-  "additionalProperties": false,
-  "required": ["connection", "stream_name"]
-}
-```
-
-- [ ] **Step 8.5: Run test to verify it passes**
-
-```bash
-cd core && python -m pytest src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py -v
-```
-
-Expected: 2 passed.
-
-- [ ] **Step 8.6: Run the full core suite**
-
-```bash
-cd core && python -m pytest src/datayoga_core/ -x -q
-```
-
-Expected: all tests pass.
-
-- [ ] **Step 8.7: Commit**
-
-```bash
-git add core/src/datayoga_core/blocks/redis/read_stream/block.py \
-        core/src/datayoga_core/blocks/redis/read_stream/block.schema.json \
-        core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py \
-        core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
-git commit -m "Migrate redis/read_stream to batched xreadgroup (#400, #377)"
-```
-
----
-
-## Task 9: Migrate `azure/read_event_hub` (rename `batch_size` → `max_batch_size`)
-
-Today `batch_size` controls the SDK callback size, not the pipeline batch size. Rename to `max_batch_size`, add `additionalProperties: false`, and use the streamable fragment so the _new_ `batch_size` means pipeline batch size.
-
-**Files:**
-
-- Modify: `core/src/datayoga_core/blocks/azure/read_event_hub/block.py`
-- Modify: `core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json`
-
-- [ ] **Step 9.1: Write the failing test**
-
-Create `core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py`:
-
-```python
-import pytest
-from jsonschema import ValidationError
-
-from datayoga_core.blocks.azure.read_event_hub.block import Block
-
-
-def _minimal_props(extra=None):
-    base = {
-        "event_hub_connection_string": "Endpoint=sb://x/;SharedAccessKeyName=k;SharedAccessKey=v;EntityPath=eh",
-        "event_hub_consumer_group_name": "$Default",
-        "event_hub_name": "eh",
-        "checkpoint_store_connection_string": "DefaultEndpointsProtocol=https;AccountName=a;AccountKey=k==",
-        "checkpoint_store_container_name": "chk",
-    }
-    if extra:
-        base.update(extra)
-    return base
-
-
-def test_unknown_property_rejected_by_validation():
-    """additionalProperties: false catches typos like the legacy 'batch_sz'."""
-    with pytest.raises(ValidationError):
-        Block(_minimal_props({"batch_sz": 300}))
-
-
-def test_max_batch_size_accepted():
-    """The renamed SDK-level property is now max_batch_size."""
-    block = Block(_minimal_props({"max_batch_size": 500, "batch_size": 100}))
-    assert block.properties["max_batch_size"] == 500
-    assert block.properties["batch_size"] == 100
-
-
-def test_max_batch_size_defaults_to_300_when_omitted():
-    """init() reads max_batch_size with a default of 300 if not present."""
-    # We can't safely call init() in unit tests (it instantiates the Azure
-    # SDK client); read the property via the same path init() does.
-    block = Block(_minimal_props())
-    assert int(block.properties.get("max_batch_size", 300)) == 300
-
-
-def test_renamed_schema_has_additional_properties_false():
-    """Schema after rename: max_batch_size + streamable's batch_size/flush_ms,
-    no unknown properties allowed."""
-    block = Block(_minimal_props())
-    schema = block.get_json_schema()
-    assert schema.get("additionalProperties") is False
-    assert "max_batch_size" in schema["properties"]
-    assert "batch_size" in schema["properties"]  # from streamable fragment
-    assert "flush_ms" in schema["properties"]    # from streamable fragment
-
-
-def test_batch_size_300_is_silently_repurposed():
-    """A user upgrading from a pre-rename version with batch_size: 300 (which
-    used to mean SDK callback size) will see their YAML still validate, but
-    batch_size now means pipeline batch size. This is documented in the PR
-    description and processing-strategies.md as a breaking change."""
-    block = Block(_minimal_props({"batch_size": 300}))
-    # Schema validation passes — batch_size is a known property (now pipeline-meaning).
-    # The user must rename to max_batch_size: 300 to preserve old behavior.
-    assert block.properties["batch_size"] == 300
-    assert "max_batch_size" not in block.properties
-```
-
-- [ ] **Step 9.2: Run test to verify it fails**
-
-```bash
-cd core && python -m pytest src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py -v
-```
-
-Expected: most of the 5 tests FAIL — current schema has no `additionalProperties: false`, no `max_batch_size`, no `$inherit`.
-
-- [ ] **Step 9.3: Update the schema**
-
-Replace `core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json` with:
-
-```json
-{
-  "title": "azure.read_event_hub",
-  "description": "Read from Azure Event Hub",
-  "type": "object",
-  "$inherit": ["streamable"],
-  "properties": {
-    "event_hub_connection_string": {
-      "type": "string",
-      "description": "The connection string for the Azure Event Hub namespace."
-    },
-    "event_hub_consumer_group_name": {
-      "type": "string",
-      "description": "The name of the consumer group to read events from."
-    },
-    "event_hub_name": {
-      "type": "string",
-      "description": "The name of the Azure Event Hub."
-    },
-    "checkpoint_store_connection_string": {
-      "type": "string",
-      "description": "The connection string for the Azure Storage account used as the checkpoint store."
-    },
-    "checkpoint_store_container_name": {
-      "type": "string",
-      "description": "The name of the container within the checkpoint store to store the checkpoints."
-    },
-    "max_batch_size": {
-      "type": "integer",
-      "minimum": 1,
-      "description": "Maximum number of events to receive in each SDK callback. Renamed from the previous batch_size which used to mean this. Defaults to 300.",
-      "default": 300
-    }
-  },
-  "additionalProperties": false,
-  "required": [
-    "event_hub_connection_string",
-    "event_hub_consumer_group_name",
-    "event_hub_name",
-    "checkpoint_store_connection_string",
-    "checkpoint_store_container_name"
-  ]
-}
-```
-
-- [ ] **Step 9.4: Migrate the producer**
-
-Replace the contents of `core/src/datayoga_core/blocks/azure/read_event_hub/block.py` with:
-
-```python
-import asyncio
-import logging
-from typing import Any, AsyncGenerator, Dict, List, Optional
-
-import orjson
-from azure.eventhub import EventData, PartitionContext
-from azure.eventhub.aio import EventHubConsumerClient
-from azure.eventhub.extensions.checkpointstoreblobaio import \
-    BlobCheckpointStore
-from datayoga_core.context import Context
-from datayoga_core.producer import Producer as DyProducer
-
-logger = logging.getLogger("dy")
-
-
-class Block(DyProducer):
-    """Azure Event Hub block for reading events."""
-
-    DEFAULT_FLUSH_MS = 1000
-
-    def init(self, context: Optional[Context] = None):
-        logger.debug(f"Initializing {self.get_block_name()}")
-        self.max_batch_size = int(self.properties.get("max_batch_size", 300))
-        self.consumer_client = EventHubConsumerClient.from_connection_string(
-            conn_str=self.properties["event_hub_connection_string"],
-            consumer_group=self.properties["event_hub_consumer_group_name"],
-            eventhub_name=self.properties["event_hub_name"],
-            checkpoint_store=BlobCheckpointStore.from_connection_string(
-                self.properties["checkpoint_store_connection_string"],
-                self.properties["checkpoint_store_container_name"]),
-        )
-        self.events: Dict[Any, Any] = {}
-        self.messages: asyncio.Queue = asyncio.Queue()
-
-    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
-        logger.debug(f"Running {self.get_block_name()}")
-        logger.debug("Starting event receiving process")
-        asyncio.create_task(self.receive_batch())
-
-        while True:
-            first = await self.messages.get()
-            chunk = [first]
-            while not self.messages.empty():
-                chunk.append(self.messages.get_nowait())
-            yield chunk
-
-    async def receive_batch(self):
-        await self.consumer_client.receive_batch(
-            on_event_batch=self.on_event_batch,
-            max_batch_size=self.max_batch_size,
-            starting_position="-1",
-        )
-
-    async def on_event_batch(self, partition_context: PartitionContext, events: List[EventData]):
-        logger.debug(f"Received batch of events from partition: {partition_context.partition_id}")
-        for event in events:
-            try:
-                payload = orjson.loads(event.body_as_str(encoding="UTF-8"))
-                msg_id = event.system_properties[b"x-opt-sequence-number"]
-                self.events[msg_id] = (event, partition_context)
-                payload[self.MSG_ID_FIELD] = msg_id
-                await self.messages.put(payload)
-            except Exception as e:
-                logger.error(e)
-
-    async def complete_events(self, msg_ids: List[str]):
-        for msg_id in msg_ids:
-            logger.debug(f"Acking {msg_id} event")
-            event, partition_context = self.events.pop(msg_id, (None, None))
-            if event is not None:
-                await partition_context.update_checkpoint(event)
-            else:
-                logger.warning(f"Couldn't find event {msg_id} for acknowledging")
-
-    def ack(self, msg_ids: List[str]):
-        asyncio.create_task(self.complete_events(msg_ids))
-```
-
-- [ ] **Step 9.5: Run test to verify it passes**
-
-```bash
-cd core && python -m pytest src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py -v
-```
-
-Expected: 5 passed.
-
-- [ ] **Step 9.6: Run the full core suite**
-
-```bash
-cd core && python -m pytest src/datayoga_core/ -x -q
-```
-
-Expected: all tests pass.
-
-- [ ] **Step 9.7: Commit**
-
-```bash
-git add core/src/datayoga_core/blocks/azure/read_event_hub/block.py \
-        core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json \
-        core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py \
-        core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
-git commit -m "Migrate azure/read_event_hub; rename batch_size -> max_batch_size (#400, BREAKING)"
-```
-
----
-
-## Task 10: Regenerate autogenerated schemas and docs
-
-The aggregated `schemas/job.schema.json` and the per-block markdown in `docs/reference/blocks/` are generated by scripts. After the per-block schema changes, regenerate them.
-
-**Files:**
-
-- Modify: `schemas/job.schema.json`
-- Modify: `docs/reference/blocks/std_read.md`, `files_read_csv.md`, `parquet_read.md`, `relational_read.md`, `redis_read_stream.md`, `http_receiver.md`, `azure_read_event_hub.md` (autogenerated)
-
-- [ ] **Step 10.1: Regenerate the JSON schemas**
-
-```bash
-bash scripts/generate-jsonschemas.sh
-```
-
-Expected output: `JSON schemas generated successfully`.
-
-- [ ] **Step 10.2: Regenerate the reference docs**
-
-```bash
-bash scripts/generate-docs.sh
-```
-
-Expected: completes without error.
-
-- [ ] **Step 10.3: Inspect the diff**
-
-```bash
-git diff schemas/ docs/reference/blocks/ | head -200
-```
-
-Expected: `batch_size` (and `flush_ms` for streaming producers, `fetch_size` for relational/read, `max_batch_size` for event_hub) appear in the appropriate schema entries and docs.
-
-- [ ] **Step 10.4: Commit**
-
-```bash
-git add schemas/job.schema.json docs/reference/blocks/
-git commit -m "Regenerate JSON schemas and reference docs after producer batching (#400)"
-```
-
----
-
-## Task 11: Document the producer batching model in processing-strategies
-
-**Files:**
-
-- Modify: `docs/processing-strategies.md`
-
-- [ ] **Step 11.1: Add a section on producer batching**
-
-Append the following section to `docs/processing-strategies.md` (or replace an existing section if one already covers it):
-
-````markdown
-## Producer Batching
-
-Every producer block (any block that reads from a source — `std/read`, `files/read_csv`, `parquet/read`, `relational/read`, `redis/read_stream`, `azure/read_event_hub`, `http/receiver`) accepts a `batch_size` property. The producer base class re-chunks the source's output into batches of exactly `batch_size` records, regardless of how the source delivers them (per row, per row group, per `fetchmany`, per network message).
-
-```yaml
-input:
-  uses: files.read_csv
-  with:
-    file: people.csv
-    batch_size: 500 # downstream steps process 500 records per call
-```
-
-Default: `1000`.
-
-### Streaming producers and `flush_ms`
-
-Streaming producers (`redis/read_stream`, `azure/read_event_hub`, `http/receiver`) also accept `flush_ms`. If no new records arrive within that many milliseconds, any partial batch is flushed downstream instead of being held until `batch_size` is reached.
-
-```yaml
-input:
-  uses: redis.read_stream
-  with:
-    connection: my_redis
-    stream_name: events
-    batch_size: 1000
-    flush_ms: 500 # emit a partial batch after 500ms of inactivity
-```
-
-Default: `1000` ms. Set to `null` to disable time-based flushing (records are held until `batch_size` or end-of-stream).
-
-### `relational/read` and `fetch_size`
-
-`relational/read` exposes an extra `fetch_size` property that controls how many rows are pulled from the database driver per round-trip, independent of the pipeline `batch_size`. Default: `10000`. Tune lower for memory pressure with wide rows; tune higher if you want fewer DB round-trips and downstream processing is the bottleneck.
-
-### `azure/read_event_hub` migration note
-
-In earlier versions, `batch_size` on `azure/read_event_hub` controlled the SDK callback batch size, not the pipeline batch size. As of #400 it has been renamed to `max_batch_size` to match the SDK semantic, and `batch_size` now consistently means pipeline batch size as it does for every other producer.
-````
-
-- [ ] **Step 11.2: Commit**
-
-```bash
-git add docs/processing-strategies.md
-git commit -m "Document producer batching model in processing-strategies (#400)"
-```
-
----
-
-## Task 12: Full verification and push branch
-
-- [ ] **Step 12.1: Run full core test suite**
-
-```bash
-cd core && python -m pytest src/datayoga_core/ -v
-```
-
-Expected: all tests pass. Notably:
-
-- `test_producer_batching.py` (7 tests)
-- `test_schema_inherit.py` (5 tests)
-- `test_std_read.py`, `test_read_csv.py`, `test_parquet_read.py`, `test_relational_read.py`, `test_http_receiver.py`, `test_redis_read_stream.py`, `test_event_hub.py` (12 tests total)
-- All pre-existing tests still pass.
-
-- [ ] **Step 12.2: Inspect the branch's commit history**
-
-```bash
-git log --oneline 400-producer-batching-unification ^main
-```
-
-Expected: a clean sequence of commits — one per task — each referencing #400.
-
-- [ ] **Step 12.3: Push the branch**
-
-```bash
-git push -u origin 400-producer-batching-unification
-```
-
-Expected: branch pushed to remote.
-
-- [ ] **Step 12.4: Open a draft PR (deferred — confirm with user first)**
-
-Before opening the PR, ask the user whether to open it as draft or ready-for-review, and confirm the body content. Do not run `gh pr create` autonomously.
-
-The PR description should call out the breaking change explicitly (no CHANGELOG file exists in this repo, so the PR description is the canonical place):
-
-> **Breaking change:** `azure/read_event_hub.batch_size` has been renamed to `max_batch_size`. The name `batch_size` now means pipeline batch size on this block, consistent with every other producer. Users with `batch_size: <N>` in their YAML for `azure/read_event_hub` must rename it to `max_batch_size: <N>` to preserve the previous SDK callback size semantic; the literal `batch_size: <N>` will validate but with the new pipeline-level meaning.

From 78ef675c3858409109be8c28c5fdc47ff8440ddb Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 19:45:50 +0300
Subject: [PATCH 33/38] Switch from custom $inherit to standard JSON Schema
 composition (#400)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per review discussion: drop the custom $inherit extension and use
JSON Schema's standard allOf + $ref composition instead. The on-disk
schemas are now idiomatic JSON Schema, understood by any standards-
compliant tool.

Changes:
- Each producer block schema gains "$schema": draft/2019-09 and uses
  allOf: [{"$ref": "../../../resources/schemas/<fragment>.schema.json"}]
  to inherit batch_size (and flush_ms for streaming producers).
- additionalProperties: false -> unevaluatedProperties: false, which is
  composition-aware (the additionalProperties + allOf interaction is a
  known JSON Schema gotcha that rejects allOf-contributed properties).
- schema_utils.resolve_inherits -> resolve_refs: walks the schema,
  inlines local-file $refs recursively, detects cycles. The validation
  code path (Block.validate, Job.get_json_schema) stays unchanged —
  resolved schemas are flat.
- Tests in test_schema_inherit.py rewritten for the new mechanics:
  inlining, transitive resolution, cycle detection, non-local refs
  passthrough, default base-dir fallback.
- generate-docs.sh: walks standard $ref instead of $inherit, and also
  flattens allOf properties for jsonschema2mk's benefit (docs-only).
- Aggregate schemas/job.schema.json regenerated.

External $ref-aware tools (IDE plugins, OpenAPI exporters) can now
follow the schemas without our custom resolver. jsonschema2mk is the
one tool that doesn't grok $ref, so the docs generator keeps its
pre-resolution step.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 core/src/datayoga_core/block.py               |   4 +-
 .../azure/read_event_hub/block.schema.json    |   5 +-
 .../read_event_hub/tests/test_event_hub.py    |  17 +-
 .../blocks/files/read_csv/block.schema.json   |   5 +-
 .../blocks/http/receiver/block.schema.json    |   5 +-
 .../blocks/parquet/read/block.schema.json     |   5 +-
 .../redis/read_stream/block.schema.json       |   5 +-
 .../blocks/relational/read/block.schema.json  |   5 +-
 .../blocks/std/read/block.schema.json         |   5 +-
 core/src/datayoga_core/job.py                 |   4 +-
 .../resources/schemas/batchable.schema.json   |   1 +
 .../resources/schemas/streamable.schema.json  |   1 +
 core/src/datayoga_core/schema_utils.py        | 116 +++++----
 .../tests/test_schema_inherit.py              | 166 +++++++------
 docs/reference/blocks/azure_read_event_hub.md |   1 -
 docs/reference/blocks/files_read_csv.md       |   1 -
 docs/reference/blocks/http_receiver.md        |   1 -
 docs/reference/blocks/parquet_read.md         |   1 -
 docs/reference/blocks/redis_read_stream.md    |   1 -
 docs/reference/blocks/relational_read.md      |   1 -
 docs/reference/blocks/relational_write.md     |   9 -
 docs/reference/blocks/std_read.md             |   1 -
 ...28-producer-batching-unification-design.md |  41 ++--
 schemas/job.schema.json                       | 228 ++++++++++++------
 scripts/generate-docs.sh                      |  68 ++++--
 25 files changed, 413 insertions(+), 284 deletions(-)

diff --git a/core/src/datayoga_core/block.py b/core/src/datayoga_core/block.py
index a0b65e06..2dd6300b 100644
--- a/core/src/datayoga_core/block.py
+++ b/core/src/datayoga_core/block.py
@@ -57,8 +57,8 @@ def get_json_schema(self) -> Dict[str, Any]:
             "block.schema.json")
         logger.debug(f"loading schema from {json_schema_file}")
         # Lazy import: schema_utils -> utils -> block creates a circular import at module load.
-        from datayoga_core.schema_utils import resolve_inherits
-        return resolve_inherits(utils.read_json(json_schema_file))
+        from datayoga_core.schema_utils import resolve_refs
+        return resolve_refs(utils.read_json(json_schema_file), schema_path=json_schema_file)
 
     @abstractmethod
     def init(self, context: Optional[Context] = None):
diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json b/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json
index f663d383..f014b63f 100644
--- a/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json
+++ b/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json
@@ -1,8 +1,9 @@
 {
+  "$schema": "https://json-schema.org/draft/2019-09/schema",
   "title": "azure.read_event_hub",
   "description": "Read from Azure Event Hub",
   "type": "object",
-  "$inherit": ["streamable"],
+  "allOf": [{ "$ref": "../../../resources/schemas/streamable.schema.json" }],
   "properties": {
     "event_hub_connection_string": {
       "type": "string",
@@ -31,7 +32,7 @@
       "default": 300
     }
   },
-  "additionalProperties": false,
+  "unevaluatedProperties": false,
   "required": [
     "event_hub_connection_string",
     "event_hub_consumer_group_name",
diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
index 0506ee7b..b18fca3b 100644
--- a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
+++ b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py
@@ -18,7 +18,7 @@ def _minimal_props(extra=None):
 
 
 def test_unknown_property_rejected_by_validation():
-    """additionalProperties: false catches typos like 'batch_sz'."""
+    """unevaluatedProperties: false catches typos like 'batch_sz'."""
     with pytest.raises(ValidationError):
         Block(_minimal_props({"batch_sz": 300}))
 
@@ -30,15 +30,18 @@ def test_max_batch_size_accepted():
     assert block.properties["batch_size"] == 100
 
 
-def test_renamed_schema_has_additional_properties_false():
-    """Schema after rename: max_batch_size + streamable's batch_size/flush_ms,
-    no unknown properties allowed."""
+def test_renamed_schema_uses_unevaluated_properties_with_streamable():
+    """Schema after rename: max_batch_size locally, streamable contributes
+    batch_size + flush_ms via allOf $ref, and unevaluatedProperties=false
+    rejects anything else."""
     block = Block(_minimal_props())
     schema = block.get_json_schema()
-    assert schema.get("additionalProperties") is False
+    assert schema.get("unevaluatedProperties") is False
     assert "max_batch_size" in schema["properties"]
-    assert "batch_size" in schema["properties"]
-    assert "flush_ms" in schema["properties"]
+    # batch_size and flush_ms come from the inlined streamable fragment via allOf
+    fragment_props = schema["allOf"][0]["properties"]
+    assert "batch_size" in fragment_props
+    assert "flush_ms" in fragment_props
 
 
 def test_batch_size_300_is_silently_repurposed():
diff --git a/core/src/datayoga_core/blocks/files/read_csv/block.schema.json b/core/src/datayoga_core/blocks/files/read_csv/block.schema.json
index ca7d638b..dc837561 100644
--- a/core/src/datayoga_core/blocks/files/read_csv/block.schema.json
+++ b/core/src/datayoga_core/blocks/files/read_csv/block.schema.json
@@ -1,8 +1,9 @@
 {
+  "$schema": "https://json-schema.org/draft/2019-09/schema",
   "title": "files.read_csv",
   "description": "Read data from CSV",
   "type": "object",
-  "$inherit": ["batchable"],
+  "allOf": [{ "$ref": "../../../resources/schemas/batchable.schema.json" }],
   "properties": {
     "file": {
       "description": "Filename. Can contain a regexp or glob expression",
@@ -48,7 +49,7 @@
       "default": "\""
     }
   },
-  "additionalProperties": false,
+  "unevaluatedProperties": false,
   "required": ["file"],
   "examples": [
     {
diff --git a/core/src/datayoga_core/blocks/http/receiver/block.schema.json b/core/src/datayoga_core/blocks/http/receiver/block.schema.json
index a52edcc5..1f93ccd5 100644
--- a/core/src/datayoga_core/blocks/http/receiver/block.schema.json
+++ b/core/src/datayoga_core/blocks/http/receiver/block.schema.json
@@ -1,8 +1,9 @@
 {
+  "$schema": "https://json-schema.org/draft/2019-09/schema",
   "title": "http.receiver",
   "description": "Receives HTTP requests and process the data.",
   "type": "object",
-  "$inherit": ["streamable"],
+  "allOf": [{ "$ref": "../../../resources/schemas/streamable.schema.json" }],
   "properties": {
     "host": {
       "description": "Host to listen",
@@ -15,7 +16,7 @@
       "default": 8080
     }
   },
-  "additionalProperties": false,
+  "unevaluatedProperties": false,
   "examples": [
     {
       "host": "localhost",
diff --git a/core/src/datayoga_core/blocks/parquet/read/block.schema.json b/core/src/datayoga_core/blocks/parquet/read/block.schema.json
index 395b3edd..777c23c4 100644
--- a/core/src/datayoga_core/blocks/parquet/read/block.schema.json
+++ b/core/src/datayoga_core/blocks/parquet/read/block.schema.json
@@ -1,15 +1,16 @@
 {
+  "$schema": "https://json-schema.org/draft/2019-09/schema",
   "title": "parquet.read",
   "description": "Read data from parquet",
   "type": "object",
-  "$inherit": ["batchable"],
+  "allOf": [{ "$ref": "../../../resources/schemas/batchable.schema.json" }],
   "properties": {
     "file": {
       "description": "Filename. Can contain a regexp or glob expression",
       "type": "string"
     }
   },
-  "additionalProperties": false,
+  "unevaluatedProperties": false,
   "required": ["file"],
   "examples": [
     {
diff --git a/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json b/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json
index f7e0a948..4411149f 100644
--- a/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json
+++ b/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json
@@ -1,8 +1,9 @@
 {
+  "$schema": "https://json-schema.org/draft/2019-09/schema",
   "title": "redis.read_stream",
   "description": "Read from Redis stream",
   "type": "object",
-  "$inherit": ["streamable"],
+  "allOf": [{ "$ref": "../../../resources/schemas/streamable.schema.json" }],
   "properties": {
     "connection": { "description": "Connection name", "type": "string" },
     "stream_name": {
@@ -17,6 +18,6 @@
       "default": false
     }
   },
-  "additionalProperties": false,
+  "unevaluatedProperties": false,
   "required": ["connection", "stream_name"]
 }
diff --git a/core/src/datayoga_core/blocks/relational/read/block.schema.json b/core/src/datayoga_core/blocks/relational/read/block.schema.json
index df5bc8b2..29f5715a 100644
--- a/core/src/datayoga_core/blocks/relational/read/block.schema.json
+++ b/core/src/datayoga_core/blocks/relational/read/block.schema.json
@@ -1,9 +1,10 @@
 {
+  "$schema": "https://json-schema.org/draft/2019-09/schema",
   "title": "relational.read",
   "description": "Read a table from an SQL-compatible data store",
   "type": "object",
-  "$inherit": ["batchable"],
-  "additionalProperties": false,
+  "allOf": [{ "$ref": "../../../resources/schemas/batchable.schema.json" }],
+  "unevaluatedProperties": false,
   "examples": [
     {
       "id": "read_snowflake",
diff --git a/core/src/datayoga_core/blocks/std/read/block.schema.json b/core/src/datayoga_core/blocks/std/read/block.schema.json
index 2214ac05..5d825898 100644
--- a/core/src/datayoga_core/blocks/std/read/block.schema.json
+++ b/core/src/datayoga_core/blocks/std/read/block.schema.json
@@ -1,8 +1,9 @@
 {
+  "$schema": "https://json-schema.org/draft/2019-09/schema",
   "title": "std.read",
   "description": "Read from the standard input",
   "type": "object",
-  "$inherit": ["batchable"],
+  "allOf": [{ "$ref": "../../../resources/schemas/batchable.schema.json" }],
   "properties": {},
-  "additionalProperties": false
+  "unevaluatedProperties": false
 }
diff --git a/core/src/datayoga_core/job.py b/core/src/datayoga_core/job.py
index 9df8c267..710d84e6 100644
--- a/core/src/datayoga_core/job.py
+++ b/core/src/datayoga_core/job.py
@@ -238,11 +238,11 @@ def get_json_schema(whitelisted_blocks: Optional[List[str]] = None) -> Dict[str,
         block_types = []
         block_schemas = []
         # Lazy import: schema_utils -> utils -> block creates a circular import at module load.
-        from datayoga_core.schema_utils import resolve_inherits
+        from datayoga_core.schema_utils import resolve_refs
         for block_type, schema_path in block_info:
             block_types.append(block_type)
             # load schema file
-            schema = resolve_inherits(utils.read_json(f"{schema_path}"))
+            schema = resolve_refs(utils.read_json(f"{schema_path}"), schema_path=f"{schema_path}")
             # append to the array of allOf for the full schema
             # we use allOf for better error reporting
             block_schemas.append({
diff --git a/core/src/datayoga_core/resources/schemas/batchable.schema.json b/core/src/datayoga_core/resources/schemas/batchable.schema.json
index f158d4fb..c04fb8fa 100644
--- a/core/src/datayoga_core/resources/schemas/batchable.schema.json
+++ b/core/src/datayoga_core/resources/schemas/batchable.schema.json
@@ -1,4 +1,5 @@
 {
+  "$schema": "https://json-schema.org/draft/2019-09/schema",
   "title": "batchable",
   "description": "Producer batching mixin: declares batch_size for producers that yield records in batches.",
   "type": "object",
diff --git a/core/src/datayoga_core/resources/schemas/streamable.schema.json b/core/src/datayoga_core/resources/schemas/streamable.schema.json
index 761c6d65..0bdba461 100644
--- a/core/src/datayoga_core/resources/schemas/streamable.schema.json
+++ b/core/src/datayoga_core/resources/schemas/streamable.schema.json
@@ -1,4 +1,5 @@
 {
+  "$schema": "https://json-schema.org/draft/2019-09/schema",
   "title": "streamable",
   "description": "Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources.",
   "type": "object",
diff --git a/core/src/datayoga_core/schema_utils.py b/core/src/datayoga_core/schema_utils.py
index e009a984..c170b733 100644
--- a/core/src/datayoga_core/schema_utils.py
+++ b/core/src/datayoga_core/schema_utils.py
@@ -1,62 +1,88 @@
 """Schema composition helpers.
 
-Producers and other blocks can declare `"$inherit": ["batchable"]` at the
-top of their block.schema.json to pull in shared property definitions from
-the fragments in resources/schemas/. `resolve_inherits` merges the
-fragments' `properties` into the local schema (local properties win), then
-removes the `$inherit` key. Schemas without `$inherit` are returned as-is.
+Producer block schemas use standard JSON Schema composition via `$ref` +
+`allOf` (with `unevaluatedProperties: false` to allow inherited properties).
+At validation time we want to keep the simple `jsonschema.validate(instance,
+schema)` code path, so we resolve any local-file `$ref`s into the schema
+ahead of time. The on-disk schemas remain standard JSON Schema; only the
+in-memory form is flattened.
+
+Example: a block schema like
+
+    {"allOf": [{"$ref": "../../../resources/schemas/batchable.schema.json"}],
+     "properties": {...},
+     "unevaluatedProperties": false}
+
+becomes
+
+    {"allOf": [<contents of batchable.schema.json>],
+     "properties": {...},
+     "unevaluatedProperties": false}
+
+after `resolve_refs(schema, schema_path)`.
 """
 from __future__ import annotations
 
 import copy
 from os import path
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Set
 
 from datayoga_core import utils
 
 
-def resolve_inherits(schema: Dict[str, Any], schemas_dir: Optional[str] = None) -> Dict[str, Any]:
-    """Merge any fragments listed in $inherit into the schema's properties.
+def resolve_refs(schema: Dict[str, Any], schema_path: Optional[str] = None) -> Dict[str, Any]:
+    """Return a copy of `schema` with local-file `$ref`s inlined recursively.
 
     Args:
-        schema: The schema to resolve. Mutated in place and also returned.
-        schemas_dir: Directory containing the fragment files. Defaults to
+        schema: The schema to resolve.
+        schema_path: Filesystem path the schema was loaded from. Used to
+            resolve relative `$ref` paths. If None, refs are resolved against
             the bundled/non-bundled resources/schemas directory.
 
     Returns:
-        The mutated schema with $inherit removed and fragment properties merged.
+        A new schema with all local-file $refs replaced by the referenced
+        document's contents. Non-local refs (http://, #fragments) and
+        non-existent files pass through unchanged or raise depending on form.
+
+    Raises:
+        FileNotFoundError: A local-file $ref points at a file that doesn't exist.
+        ValueError: A circular $ref chain is detected.
     """
-    inherits = schema.get("$inherit")
-    if inherits is None or inherits == []:
-        return schema
-    if not isinstance(inherits, list) or not all(isinstance(name, str) for name in inherits):
-        raise TypeError(
-            f"$inherit must be a list of fragment names (strings), got {inherits!r}"
-        )
-
-    if schemas_dir is None:
-        schemas_dir = utils.get_resource_path("schemas")
-
-    merged_properties: Dict[str, Any] = {}
-    for fragment_name in inherits:
-        fragment_path = path.join(schemas_dir, f"{fragment_name}.schema.json")
-        if not path.isfile(fragment_path):
-            raise FileNotFoundError(
-                f"Schema fragment '{fragment_name}' not found at {fragment_path}"
-            )
-        fragment = utils.read_json(fragment_path)
-        if fragment.get("$inherit"):
-            raise ValueError(
-                f"Schema fragment '{fragment_name}' itself contains $inherit; "
-                "nested inheritance is not supported. Inline the parent fragment's "
-                "properties or restructure the hierarchy."
-            )
-        merged_properties.update(copy.deepcopy(fragment.get("properties", {})))
-
-    # Local properties take precedence over inherited ones.
-    local_properties = schema.get("properties", {})
-    merged_properties.update(local_properties)
-
-    schema["properties"] = merged_properties
-    schema.pop("$inherit", None)
-    return schema
+    if schema_path is not None:
+        base_dir = path.dirname(path.abspath(schema_path))
+    else:
+        base_dir = utils.get_resource_path("schemas")
+
+    return _resolve_node(schema, base_dir, visited=set())
+
+
+def _resolve_node(node: Any, base_dir: str, visited: Set[str]) -> Any:
+    if isinstance(node, dict):
+        ref = node.get("$ref")
+        if isinstance(ref, str) and _is_local_file_ref(ref):
+            target = path.normpath(path.join(base_dir, ref))
+            if target in visited:
+                raise ValueError(f"Circular $ref detected resolving '{ref}' at {target}")
+            if not path.isfile(target):
+                raise FileNotFoundError(
+                    f"$ref target not found: '{ref}' resolved to {target}"
+                )
+            fragment = utils.read_json(target)
+            visited.add(target)
+            try:
+                resolved = _resolve_node(fragment, path.dirname(target), visited)
+            finally:
+                visited.discard(target)
+            return resolved
+        return {k: _resolve_node(v, base_dir, visited) for k, v in node.items()}
+    if isinstance(node, list):
+        return [_resolve_node(item, base_dir, visited) for item in node]
+    return copy.copy(node)
+
+
+def _is_local_file_ref(ref: str) -> bool:
+    """A $ref is a local file ref if it looks like a path to a .json/.schema.json
+    file with no URI scheme and no in-document fragment."""
+    if ref.startswith("#") or "://" in ref:
+        return False
+    return ref.endswith(".json")
diff --git a/core/src/datayoga_core/tests/test_schema_inherit.py b/core/src/datayoga_core/tests/test_schema_inherit.py
index f01a1dfe..dce5024d 100644
--- a/core/src/datayoga_core/tests/test_schema_inherit.py
+++ b/core/src/datayoga_core/tests/test_schema_inherit.py
@@ -1,100 +1,118 @@
+"""Tests for the $ref pre-resolver in `schema_utils.resolve_refs`.
+
+Block schemas use standard JSON Schema composition (`allOf` + `$ref` to
+local fragment files). We pre-resolve those refs at load time so the
+in-memory schema is self-contained.
+"""
+import json
 from pathlib import Path
 
 import pytest
-from datayoga_core.schema_utils import resolve_inherits
+from datayoga_core.schema_utils import resolve_refs
+
+SCHEMAS_DIR = Path(__file__).resolve().parent.parent / "resources" / "schemas"
+BATCHABLE = SCHEMAS_DIR / "batchable.schema.json"
 
-SCHEMAS_DIR = (
-    Path(__file__).resolve().parent.parent / "resources" / "schemas"
-)
 
+def test_resolve_refs_inlines_local_ref(tmp_path):
+    """A {'$ref': 'localfile.json'} node is replaced inline with the file's contents."""
+    fragment = {"type": "object", "properties": {"x": {"type": "integer"}}}
+    frag_path = tmp_path / "frag.schema.json"
+    frag_path.write_text(json.dumps(fragment))
 
-def test_inherit_merges_fragment_properties():
-    """A schema with $inherit:[batchable] picks up batch_size from the fragment."""
     schema = {
-        "title": "demo",
         "type": "object",
-        "$inherit": ["batchable"],
-        "properties": {"foo": {"type": "string"}},
-        "additionalProperties": False,
+        "allOf": [{"$ref": "frag.schema.json"}],
+        "properties": {"y": {"type": "string"}},
     }
-    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
-    assert "$inherit" not in resolved
-    assert "batch_size" in resolved["properties"]
-    assert resolved["properties"]["batch_size"]["default"] == 1000
-    assert resolved["properties"]["foo"] == {"type": "string"}
-    assert resolved["additionalProperties"] is False
+    schema_path = tmp_path / "host.schema.json"
+    resolved = resolve_refs(schema, schema_path=str(schema_path))
 
+    assert resolved["allOf"][0] == fragment
+    assert "$ref" not in json.dumps(resolved)
 
-def test_inherit_local_property_wins_over_fragment():
-    """When local schema redefines an inherited property, the local version takes precedence."""
-    schema = {
-        "type": "object",
-        "$inherit": ["batchable"],
-        "properties": {
-            "batch_size": {"type": "integer", "minimum": 1, "default": 50}
-        },
-    }
-    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
-    assert resolved["properties"]["batch_size"]["default"] == 50
+
+def test_resolve_refs_no_ref_passthrough(tmp_path):
+    """Schemas with no `$ref` come out structurally equal."""
+    schema = {"type": "object", "properties": {"x": {"type": "string"}}}
+    resolved = resolve_refs(schema, schema_path=str(tmp_path / "host.schema.json"))
+    assert resolved == schema
 
 
-def test_inherit_streamable_brings_both_props():
-    """$inherit:[streamable] exposes both batch_size and flush_ms on the schema."""
-    schema = {"type": "object", "$inherit": ["streamable"], "properties": {}}
-    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
-    assert "batch_size" in resolved["properties"]
-    assert "flush_ms" in resolved["properties"]
+def test_resolve_refs_resolves_transitively(tmp_path):
+    """A fragment that itself contains `$ref` is resolved all the way."""
+    leaf = {"type": "object", "properties": {"leaf_prop": {"type": "integer"}}}
+    (tmp_path / "leaf.schema.json").write_text(json.dumps(leaf))
 
+    middle = {"allOf": [{"$ref": "leaf.schema.json"}]}
+    (tmp_path / "middle.schema.json").write_text(json.dumps(middle))
 
-def test_schema_without_inherit_unchanged():
-    """Schemas without $inherit pass through resolve_inherits unmodified."""
-    schema = {
-        "type": "object",
-        "properties": {"foo": {"type": "string"}},
-        "additionalProperties": False,
-    }
-    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
-    assert resolved == schema
+    schema = {"allOf": [{"$ref": "middle.schema.json"}]}
+    resolved = resolve_refs(schema, schema_path=str(tmp_path / "host.schema.json"))
 
+    # middle's $ref to leaf was resolved as part of the resolution of host's $ref to middle
+    assert resolved == {"allOf": [{"allOf": [leaf]}]}
 
-def test_unknown_fragment_raises():
-    """$inherit referencing a missing fragment file raises FileNotFoundError."""
-    schema = {"type": "object", "$inherit": ["nope"], "properties": {}}
-    with pytest.raises(FileNotFoundError):
-        resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
 
+def test_resolve_refs_missing_file_raises(tmp_path):
+    """A `$ref` pointing at a missing local file raises FileNotFoundError."""
+    schema = {"allOf": [{"$ref": "does_not_exist.schema.json"}]}
+    with pytest.raises(FileNotFoundError, match="does_not_exist.schema.json"):
+        resolve_refs(schema, schema_path=str(tmp_path / "host.schema.json"))
 
-def test_inherit_string_value_raises_type_error():
-    """$inherit must be a list; passing a string raises TypeError loudly."""
-    schema = {"type": "object", "$inherit": "batchable", "properties": {}}
-    with pytest.raises(TypeError):
-        resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
 
+def test_resolve_refs_detects_circular(tmp_path):
+    """A → B → A cycle raises ValueError, not infinite recursion."""
+    (tmp_path / "a.schema.json").write_text('{"allOf": [{"$ref": "b.schema.json"}]}')
+    (tmp_path / "b.schema.json").write_text('{"allOf": [{"$ref": "a.schema.json"}]}')
 
-def test_inherit_non_string_items_raises_type_error():
-    """Non-string items in the $inherit list raise TypeError."""
-    schema = {"type": "object", "$inherit": ["batchable", 123], "properties": {}}
-    with pytest.raises(TypeError):
-        resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
+    schema = {"allOf": [{"$ref": "a.schema.json"}]}
+    with pytest.raises(ValueError, match="Circular"):
+        resolve_refs(schema, schema_path=str(tmp_path / "host.schema.json"))
 
 
-def test_inherit_empty_list_returns_unchanged():
-    """An empty $inherit list is a no-op; the schema is returned as-is."""
-    schema = {"type": "object", "$inherit": [], "properties": {"foo": {}}}
-    resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR))
-    # Early-return path: schema is returned as-is (no mutation, no key removal).
-    assert resolved is schema
+def test_resolve_refs_ignores_non_local_refs(tmp_path):
+    """`$ref` values like '#/$defs/x' or 'http://...' are left untouched."""
+    schema = {
+        "allOf": [
+            {"$ref": "#/$defs/internal"},
+            {"$ref": "https://json-schema.org/draft/2019-09/schema"},
+        ],
+        "$defs": {"internal": {"type": "integer"}},
+    }
+    resolved = resolve_refs(schema, schema_path=str(tmp_path / "host.schema.json"))
+    assert resolved == schema
 
 
-def test_nested_inherit_raises_value_error(tmp_path):
-    """A fragment that itself contains $inherit raises ValueError (no nested inheritance)."""
-    # Build a fragment dir with a fragment that has its own $inherit.
-    (tmp_path / "parent.schema.json").write_text(
-        '{"properties": {"x": {"type": "string"}}}'
-    )
-    (tmp_path / "child.schema.json").write_text(
-        '{"$inherit": ["parent"], "properties": {"y": {"type": "string"}}}'
+def test_resolve_refs_against_real_fragment():
+    """resolve_refs against the actual batchable fragment in the repo works."""
+    # Simulate loading a block schema whose path is at depth blocks/X/Y/.
+    schema = {
+        "$schema": "https://json-schema.org/draft/2019-09/schema",
+        "type": "object",
+        "allOf": [{"$ref": "../../../resources/schemas/batchable.schema.json"}],
+        "properties": {"connection": {"type": "string"}},
+        "unevaluatedProperties": False,
+    }
+    # Pick any real block path so the relative $ref resolves.
+    block_path = (
+        Path(__file__).resolve().parent.parent
+        / "blocks" / "std" / "read" / "block.schema.json"
     )
-    schema = {"$inherit": ["child"], "type": "object", "properties": {}}
-    with pytest.raises(ValueError, match="nested inheritance is not supported"):
-        resolve_inherits(schema, schemas_dir=str(tmp_path))
+    resolved = resolve_refs(schema, schema_path=str(block_path))
+    # The batchable fragment is inlined inside allOf
+    assert resolved["allOf"][0]["properties"]["batch_size"]["default"] == 1000
+
+
+def test_resolve_refs_default_base_dir():
+    """When schema_path is None, refs resolve against resources/schemas/."""
+    schema = {"allOf": [{"$ref": "batchable.schema.json"}]}
+    resolved = resolve_refs(schema)
+    assert resolved["allOf"][0]["properties"]["batch_size"]["default"] == 1000
+
+
+def test_resolve_refs_default_base_dir_with_missing_file():
+    """Without schema_path, refs pointing at unknown files in the resources dir raise."""
+    schema = {"allOf": [{"$ref": "nope.schema.json"}]}
+    with pytest.raises(FileNotFoundError):
+        resolve_refs(schema)
diff --git a/docs/reference/blocks/azure_read_event_hub.md b/docs/reference/blocks/azure_read_event_hub.md
index fc3f8e5b..72bb4ef6 100644
--- a/docs/reference/blocks/azure_read_event_hub.md
+++ b/docs/reference/blocks/azure_read_event_hub.md
@@ -21,7 +21,6 @@ Read from Azure Event Hub
 |**checkpoint\_store\_container\_name**|`string`|The name of the container within the checkpoint store to store the checkpoints.<br/>|yes|
 |**max\_batch\_size**|`integer`|Maximum number of events to receive in each SDK callback. Renamed from the previous batch_size which used to mean this. Defaults to 300.<br/>Default: `300`<br/>Minimum: `1`<br/>|no|
 
-**Additional Properties:** not allowed  
 **Example**
 
 ```yaml
diff --git a/docs/reference/blocks/files_read_csv.md b/docs/reference/blocks/files_read_csv.md
index 44833e34..4a03458e 100644
--- a/docs/reference/blocks/files_read_csv.md
+++ b/docs/reference/blocks/files_read_csv.md
@@ -20,7 +20,6 @@ Read data from CSV
 |**delimiter**|`string`|Delimiter to use for splitting the csv records<br/>Default: `","`<br/>Minimal Length: `1`<br/>Maximal Length: `1`<br/>|no|
 |**quotechar**|`string`|A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '<br/>Default: `"\""`<br/>Minimal Length: `1`<br/>Maximal Length: `1`<br/>|no|
 
-**Additional Properties:** not allowed  
 **Example**
 
 ```yaml
diff --git a/docs/reference/blocks/http_receiver.md b/docs/reference/blocks/http_receiver.md
index fa2c4cf2..1cad6824 100644
--- a/docs/reference/blocks/http_receiver.md
+++ b/docs/reference/blocks/http_receiver.md
@@ -17,7 +17,6 @@ Receives HTTP requests and process the data.
 |**host**|`string`|Host to listen<br/>Default: `"0.0.0.0"`<br/>||
 |**port**|`integer`|Port to listen<br/>Default: `8080`<br/>||
 
-**Additional Properties:** not allowed  
 **Example**
 
 ```yaml
diff --git a/docs/reference/blocks/parquet_read.md b/docs/reference/blocks/parquet_read.md
index 10f9f2b6..19a1c1b3 100644
--- a/docs/reference/blocks/parquet_read.md
+++ b/docs/reference/blocks/parquet_read.md
@@ -15,7 +15,6 @@ Read data from parquet
 |**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.<br/>Default: `1000`<br/>Minimum: `1`<br/>|no|
 |**file**|`string`|Filename. Can contain a regexp or glob expression<br/>|yes|
 
-**Additional Properties:** not allowed  
 **Example**
 
 ```yaml
diff --git a/docs/reference/blocks/redis_read_stream.md b/docs/reference/blocks/redis_read_stream.md
index 31c0b265..317e4497 100644
--- a/docs/reference/blocks/redis_read_stream.md
+++ b/docs/reference/blocks/redis_read_stream.md
@@ -18,7 +18,6 @@ Read from Redis stream
 |**stream\_name**<br/>(Source stream name)|`string`|Source stream name<br/>|yes|
 |**snapshot**<br/>(Snapshot current entries and quit)|`boolean`|Snapshot current entries and quit<br/>Default: `false`<br/>|no|
 
-**Additional Properties:** not allowed  
 **Example**
 
 ```yaml
diff --git a/docs/reference/blocks/relational_read.md b/docs/reference/blocks/relational_read.md
index b439eb1b..409d6adb 100644
--- a/docs/reference/blocks/relational_read.md
+++ b/docs/reference/blocks/relational_read.md
@@ -19,7 +19,6 @@ Read a table from an SQL-compatible data store
 |[**columns**](#columns)<br/>(Optional subset of columns to load)|`array`||no|
 |**fetch\_size**|`integer`|Driver-level rows fetched per round-trip. Defaults to 10000.<br/>Default: `10000`<br/>Minimum: `1`<br/>|no|
 
-**Additional Properties:** not allowed  
 **Example**
 
 ```yaml
diff --git a/docs/reference/blocks/relational_write.md b/docs/reference/blocks/relational_write.md
index 34e54fed..a8ebabfb 100644
--- a/docs/reference/blocks/relational_write.md
+++ b/docs/reference/blocks/relational_write.md
@@ -24,15 +24,6 @@ Write into a SQL-compatible data store
 |[**inactive\_record\_mapping**](#inactive_record_mapping)<br/>(Used for \`TYPE2\` load\_strategy\. The columns mapping to use to close out an active record)|`array`|A list of columns to use. Use any valid SQL expression for the source. If 'target' is omitted, will default to the name of the source column<br/>Default: <br/>|no|
 
 **Additional Properties:** not allowed  
-   
-
-**No properties.**
-
-   
-**Not [required1]:** 
-**No properties.**
-
-
 **Example**
 
 ```yaml
diff --git a/docs/reference/blocks/std_read.md b/docs/reference/blocks/std_read.md
index e2d9481c..9f858f42 100644
--- a/docs/reference/blocks/std_read.md
+++ b/docs/reference/blocks/std_read.md
@@ -14,7 +14,6 @@ Read from the standard input
 |----|----|-----------|--------|
 |**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.<br/>Default: `1000`<br/>Minimum: `1`<br/>||
 
-**Additional Properties:** not allowed  
 **Example**
 
 ```yaml
diff --git a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
index 2b96ce05..f2d20436 100644
--- a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
+++ b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
@@ -120,44 +120,31 @@ Why a queue and not `asyncio.wait_for(anext(gen), timeout)`: cancelling `__anext
 
 `flush_ms = None` ⇒ `timeout = None` ⇒ `queue.get()` waits forever ⇒ no time-based flush. Bounded sources don't set `flush_ms` and aren't affected.
 
-### Schema fragments
+### Schema composition (standard JSON Schema)
 
-Two shared fragments in `core/src/datayoga_core/resources/schemas/`:
+Two shared fragments in `core/src/datayoga_core/resources/schemas/` declare the common properties:
 
-`batchable.schema.json`:
+- `batchable.schema.json` declares `batch_size`.
+- `streamable.schema.json` declares both `batch_size` and `flush_ms`.
+
+Each block schema uses standard JSON Schema composition: `allOf` + `$ref` to the fragment file, plus `unevaluatedProperties: false` (rather than `additionalProperties: false`) so the fragment-contributed properties are recognized as evaluated. Example:
 
 ```json
 {
+  "$schema": "https://json-schema.org/draft/2019-09/schema",
+  "title": "std.read",
   "type": "object",
-  "properties": {
-    "batch_size": {
-      "type": "integer",
-      "minimum": 1,
-      "description": "Maximum number of records yielded per downstream batch",
-      "default": 1000
-    }
-  }
+  "allOf": [{ "$ref": "../../../resources/schemas/batchable.schema.json" }],
+  "properties": {},
+  "unevaluatedProperties": false
 }
 ```
 
-`streamable.schema.json`:
+At load time, `schema_utils.resolve_refs(schema, schema_path)` walks the schema, finds any local-file `$ref` (relative path, ends in `.json`, no URI scheme, no in-document fragment), and inlines the referenced file's contents in place. The resulting in-memory schema is self-contained with respect to files on disk — no local-file `$ref`s remain (in-document `#/...` pointers and URL refs are deliberately left untouched) — so `Block.validate()` keeps using the simple `jsonschema.validate(instance, schema)` code path. The on-disk schemas remain standards-compliant; the resolution is purely a runtime detail to avoid threading a `RefResolver` through every validation site.
 
-```json
-{
-  "type": "object",
-  "allOf": [{ "$ref": "batchable.schema.json" }],
-  "properties": {
-    "flush_ms": {
-      "type": ["integer", "null"],
-      "minimum": 1,
-      "description": "If set, flush a partial batch after this many ms of inactivity. null/omitted = wait until batch_size or end-of-stream.",
-      "default": 1000
-    }
-  }
-}
-```
+`unevaluatedProperties: false` (introduced in draft 2019-09) is what makes composition + strict property validation work: with `additionalProperties: false`, a property contributed by an `allOf` member would be rejected as "additional" at the parent level. `unevaluatedProperties` is composition-aware.
 
-Bounded producer schemas `$ref` `batchable`; streaming producer schemas `$ref` `streamable`. The fragments are the single source of truth for the description, validation, and default.
+External tools that ARE `$ref`-aware (IDE schema validators, OpenAPI exporters) read the on-disk schemas correctly without our resolver. The `jsonschema2mk` docs generator is not `$ref`-aware, so `scripts/generate-docs.sh` pre-resolves `$ref` and flattens `allOf` properties for docs rendering only.
 
 ### Per-producer changes
 
diff --git a/schemas/job.schema.json b/schemas/job.schema.json
index ad0f20b9..d23ccc8e 100644
--- a/schemas/job.schema.json
+++ b/schemas/job.schema.json
@@ -111,15 +111,31 @@
           "then": {
             "properties": {
               "with": {
-                "additionalProperties": false,
+                "$schema": "https://json-schema.org/draft/2019-09/schema",
+                "allOf": [
+                  {
+                    "$schema": "https://json-schema.org/draft/2019-09/schema",
+                    "description": "Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources.",
+                    "properties": {
+                      "batch_size": {
+                        "default": 1000,
+                        "description": "Maximum number of records yielded per downstream batch.",
+                        "minimum": 1,
+                        "type": "integer"
+                      },
+                      "flush_ms": {
+                        "default": 1000,
+                        "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.",
+                        "minimum": 1,
+                        "type": ["integer", "null"]
+                      }
+                    },
+                    "title": "streamable",
+                    "type": "object"
+                  }
+                ],
                 "description": "Read from Azure Event Hub",
                 "properties": {
-                  "batch_size": {
-                    "default": 1000,
-                    "description": "Maximum number of records yielded per downstream batch.",
-                    "minimum": 1,
-                    "type": "integer"
-                  },
                   "checkpoint_store_connection_string": {
                     "description": "The connection string for the Azure Storage account used as the checkpoint store.",
                     "type": "string"
@@ -140,12 +156,6 @@
                     "description": "The name of the Azure Event Hub.",
                     "type": "string"
                   },
-                  "flush_ms": {
-                    "default": 1000,
-                    "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.",
-                    "minimum": 1,
-                    "type": ["integer", "null"]
-                  },
                   "max_batch_size": {
                     "default": 300,
                     "description": "Maximum number of events to receive in each SDK callback. Renamed from the previous batch_size which used to mean this. Defaults to 300.",
@@ -161,7 +171,8 @@
                   "checkpoint_store_container_name"
                 ],
                 "title": "azure.read_event_hub",
-                "type": "object"
+                "type": "object",
+                "unevaluatedProperties": false
               }
             }
           }
@@ -266,16 +277,26 @@
           "then": {
             "properties": {
               "with": {
-                "additionalProperties": false,
+                "$schema": "https://json-schema.org/draft/2019-09/schema",
+                "allOf": [
+                  {
+                    "$schema": "https://json-schema.org/draft/2019-09/schema",
+                    "description": "Producer batching mixin: declares batch_size for producers that yield records in batches.",
+                    "properties": {
+                      "batch_size": {
+                        "default": 1000,
+                        "description": "Maximum number of records yielded per downstream batch.",
+                        "minimum": 1,
+                        "type": "integer"
+                      }
+                    },
+                    "title": "batchable",
+                    "type": "object"
+                  }
+                ],
                 "description": "Read data from CSV",
                 "examples": [{ "delimiter": ";", "file": "archive.csv" }],
                 "properties": {
-                  "batch_size": {
-                    "default": 1000,
-                    "description": "Maximum number of records yielded per downstream batch.",
-                    "minimum": 1,
-                    "type": "integer"
-                  },
                   "delimiter": {
                     "default": ",",
                     "description": "Delimiter to use for splitting the csv records",
@@ -322,7 +343,8 @@
                 },
                 "required": ["file"],
                 "title": "files.read_csv",
-                "type": "object"
+                "type": "object",
+                "unevaluatedProperties": false
               }
             }
           }
@@ -376,22 +398,32 @@
           "then": {
             "properties": {
               "with": {
-                "additionalProperties": false,
+                "$schema": "https://json-schema.org/draft/2019-09/schema",
+                "allOf": [
+                  {
+                    "$schema": "https://json-schema.org/draft/2019-09/schema",
+                    "description": "Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources.",
+                    "properties": {
+                      "batch_size": {
+                        "default": 1000,
+                        "description": "Maximum number of records yielded per downstream batch.",
+                        "minimum": 1,
+                        "type": "integer"
+                      },
+                      "flush_ms": {
+                        "default": 1000,
+                        "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.",
+                        "minimum": 1,
+                        "type": ["integer", "null"]
+                      }
+                    },
+                    "title": "streamable",
+                    "type": "object"
+                  }
+                ],
                 "description": "Receives HTTP requests and process the data.",
                 "examples": [{ "host": "localhost", "port": 8080 }],
                 "properties": {
-                  "batch_size": {
-                    "default": 1000,
-                    "description": "Maximum number of records yielded per downstream batch.",
-                    "minimum": 1,
-                    "type": "integer"
-                  },
-                  "flush_ms": {
-                    "default": 1000,
-                    "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.",
-                    "minimum": 1,
-                    "type": ["integer", "null"]
-                  },
                   "host": {
                     "default": "0.0.0.0",
                     "description": "Host to listen",
@@ -404,7 +436,8 @@
                   }
                 },
                 "title": "http.receiver",
-                "type": "object"
+                "type": "object",
+                "unevaluatedProperties": false
               }
             }
           }
@@ -718,16 +751,26 @@
           "then": {
             "properties": {
               "with": {
-                "additionalProperties": false,
+                "$schema": "https://json-schema.org/draft/2019-09/schema",
+                "allOf": [
+                  {
+                    "$schema": "https://json-schema.org/draft/2019-09/schema",
+                    "description": "Producer batching mixin: declares batch_size for producers that yield records in batches.",
+                    "properties": {
+                      "batch_size": {
+                        "default": 1000,
+                        "description": "Maximum number of records yielded per downstream batch.",
+                        "minimum": 1,
+                        "type": "integer"
+                      }
+                    },
+                    "title": "batchable",
+                    "type": "object"
+                  }
+                ],
                 "description": "Read data from parquet",
                 "examples": [{ "file": "data.parquet" }],
                 "properties": {
-                  "batch_size": {
-                    "default": 1000,
-                    "description": "Maximum number of records yielded per downstream batch.",
-                    "minimum": 1,
-                    "type": "integer"
-                  },
                   "file": {
                     "description": "Filename. Can contain a regexp or glob expression",
                     "type": "string"
@@ -735,7 +778,8 @@
                 },
                 "required": ["file"],
                 "title": "parquet.read",
-                "type": "object"
+                "type": "object",
+                "unevaluatedProperties": false
               }
             }
           }
@@ -854,25 +898,35 @@
           "then": {
             "properties": {
               "with": {
-                "additionalProperties": false,
+                "$schema": "https://json-schema.org/draft/2019-09/schema",
+                "allOf": [
+                  {
+                    "$schema": "https://json-schema.org/draft/2019-09/schema",
+                    "description": "Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources.",
+                    "properties": {
+                      "batch_size": {
+                        "default": 1000,
+                        "description": "Maximum number of records yielded per downstream batch.",
+                        "minimum": 1,
+                        "type": "integer"
+                      },
+                      "flush_ms": {
+                        "default": 1000,
+                        "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.",
+                        "minimum": 1,
+                        "type": ["integer", "null"]
+                      }
+                    },
+                    "title": "streamable",
+                    "type": "object"
+                  }
+                ],
                 "description": "Read from Redis stream",
                 "properties": {
-                  "batch_size": {
-                    "default": 1000,
-                    "description": "Maximum number of records yielded per downstream batch.",
-                    "minimum": 1,
-                    "type": "integer"
-                  },
                   "connection": {
                     "description": "Connection name",
                     "type": "string"
                   },
-                  "flush_ms": {
-                    "default": 1000,
-                    "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.",
-                    "minimum": 1,
-                    "type": ["integer", "null"]
-                  },
                   "snapshot": {
                     "default": false,
                     "description": "Snapshot current entries and quit",
@@ -887,7 +941,8 @@
                 },
                 "required": ["connection", "stream_name"],
                 "title": "redis.read_stream",
-                "type": "object"
+                "type": "object",
+                "unevaluatedProperties": false
               }
             }
           }
@@ -1052,7 +1107,23 @@
           "then": {
             "properties": {
               "with": {
-                "additionalProperties": false,
+                "$schema": "https://json-schema.org/draft/2019-09/schema",
+                "allOf": [
+                  {
+                    "$schema": "https://json-schema.org/draft/2019-09/schema",
+                    "description": "Producer batching mixin: declares batch_size for producers that yield records in batches.",
+                    "properties": {
+                      "batch_size": {
+                        "default": 1000,
+                        "description": "Maximum number of records yielded per downstream batch.",
+                        "minimum": 1,
+                        "type": "integer"
+                      }
+                    },
+                    "title": "batchable",
+                    "type": "object"
+                  }
+                ],
                 "description": "Read a table from an SQL-compatible data store",
                 "examples": [
                   {
@@ -1066,12 +1137,6 @@
                   }
                 ],
                 "properties": {
-                  "batch_size": {
-                    "default": 1000,
-                    "description": "Maximum number of records yielded per downstream batch.",
-                    "minimum": 1,
-                    "type": "integer"
-                  },
                   "columns": {
                     "examples": [["fname", { "lname": "last_name" }]],
                     "items": {
@@ -1108,7 +1173,8 @@
                 },
                 "required": ["connection", "table"],
                 "title": "relational.read",
-                "type": "object"
+                "type": "object",
+                "unevaluatedProperties": false
               }
             }
           }
@@ -1426,18 +1492,28 @@
           "then": {
             "properties": {
               "with": {
-                "additionalProperties": false,
-                "description": "Read from the standard input",
-                "properties": {
-                  "batch_size": {
-                    "default": 1000,
-                    "description": "Maximum number of records yielded per downstream batch.",
-                    "minimum": 1,
-                    "type": "integer"
+                "$schema": "https://json-schema.org/draft/2019-09/schema",
+                "allOf": [
+                  {
+                    "$schema": "https://json-schema.org/draft/2019-09/schema",
+                    "description": "Producer batching mixin: declares batch_size for producers that yield records in batches.",
+                    "properties": {
+                      "batch_size": {
+                        "default": 1000,
+                        "description": "Maximum number of records yielded per downstream batch.",
+                        "minimum": 1,
+                        "type": "integer"
+                      }
+                    },
+                    "title": "batchable",
+                    "type": "object"
                   }
-                },
+                ],
+                "description": "Read from the standard input",
+                "properties": {},
                 "title": "std.read",
-                "type": "object"
+                "type": "object",
+                "unevaluatedProperties": false
               }
             }
           }
diff --git a/scripts/generate-docs.sh b/scripts/generate-docs.sh
index 03aa51ae..4d58f199 100755
--- a/scripts/generate-docs.sh
+++ b/scripts/generate-docs.sh
@@ -47,7 +47,6 @@ cleanup_resolved_tmps() {
 trap cleanup_resolved_tmps EXIT
 
 blocks_dir="./core/src/datayoga_core/blocks"
-schemas_dir="./core/src/datayoga_core/resources/schemas"
 for schema in $(find ${blocks_dir} -name '*.schema.json' | sort)
 do
   doc_name="$(awk -F/ '{ print $(NF-1) }' <<<${schema}).md"
@@ -56,36 +55,63 @@ do
   block_package="$(echo ${block_package} | cut -c2- | sed 's/\//_/g')"
   [ ! -z "${block_package}" ] && block_package="${block_package}_"
 
-  # Resolve $inherit fragments so jsonschema2mk sees the inherited properties
-  # (batch_size, flush_ms, etc.). jsonschema2mk does not understand our custom
-  # $inherit extension, so we materialize a resolved copy first.
+  # Materialize a docs-friendly copy of the schema:
+  #   1. Resolve local-file $ref nodes by inlining the referenced JSON.
+  #   2. Flatten allOf-contributed properties into the top-level `properties`
+  #      so jsonschema2mk renders a single property table per block.
   # Self-contained Python (stdlib only) so this works in CI without installing
-  # datayoga_core's runtime dependencies.
+  # datayoga_core's runtime dependencies. Pre-resolve at doc-gen time only;
+  # the on-disk schemas remain standard JSON Schema.
   resolved_tmp="$(mktemp --suffix=.schema.json)"
   RESOLVED_TMP_FILES+=("${resolved_tmp}")
-  python3 - "${schema}" "${schemas_dir}" > "${resolved_tmp}" <<'PYEOF'
+  python3 - "${schema}" > "${resolved_tmp}" <<'PYEOF'
 import json
 import os
 import sys
 
-schema_path, schemas_dir = sys.argv[1], sys.argv[2]
-with open(schema_path) as f:
-    schema = json.load(f)
-inherits = schema.get("$inherit") or []
-if inherits:
-    if not isinstance(inherits, list) or not all(isinstance(n, str) for n in inherits):
-        raise SystemExit(f"$inherit must be a list of strings, got {inherits!r}")
+
+def resolve_node(node, base_dir, visited):
+    if isinstance(node, dict):
+        ref = node.get("$ref")
+        if isinstance(ref, str) and not ref.startswith("#") and "://" not in ref and ref.endswith(".json"):
+            target = os.path.normpath(os.path.join(base_dir, ref))
+            if target in visited:
+                raise SystemExit(f"Circular $ref at {target}")
+            if not os.path.isfile(target):
+                raise SystemExit(f"$ref target not found: {ref} -> {target}")
+            with open(target) as f:
+                fragment = json.load(f)
+            visited.add(target)
+            try:
+                return resolve_node(fragment, os.path.dirname(target), visited)
+            finally:
+                visited.discard(target)
+        return {k: resolve_node(v, base_dir, visited) for k, v in node.items()}
+    if isinstance(node, list):
+        return [resolve_node(item, base_dir, visited) for item in node]
+    return node
+
+
+def flatten_allof_properties(schema):
+    """Inline `allOf[*].properties` into the top-level `properties`, removing
+    the allOf. Docs-only transformation so jsonschema2mk renders one table."""
+    if not isinstance(schema, dict) or "allOf" not in schema:
+        return schema
     merged = {}
-    for name in inherits:
-        fragment_path = os.path.join(schemas_dir, f"{name}.schema.json")
-        with open(fragment_path) as f:
-            fragment = json.load(f)
-        if fragment.get("$inherit"):
-            raise SystemExit(f"Nested $inherit in fragment '{name}' is not supported")
-        merged.update(fragment.get("properties", {}))
+    for member in schema.get("allOf", []):
+        if isinstance(member, dict):
+            merged.update(member.get("properties", {}))
     merged.update(schema.get("properties", {}))
     schema["properties"] = merged
-    schema.pop("$inherit", None)
+    schema.pop("allOf", None)
+    return schema
+
+
+schema_path = sys.argv[1]
+with open(schema_path) as f:
+    schema = json.load(f)
+schema = resolve_node(schema, os.path.dirname(os.path.abspath(schema_path)), set())
+schema = flatten_allof_properties(schema)
 json.dump(schema, sys.stdout)
 PYEOF
 

From eae77e35a0522dbf3768a98174b8974f38ea958f Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 19:57:10 +0300
Subject: [PATCH 34/38] Update spec to reflect what shipped (#400)

Five corrections after PR review and migration to standard JSON Schema:
- Status: Design -> Implemented in PR #401.
- flush_ms code snippet now shows the bounded queue (maxsize=1) and
  source-error propagation that Copilot review surfaced, plus the
  cancelled-flag dance that the bounded queue requires.
- Producer ABC risk note: produce_chunks is the new override hook but
  not formally @abstractmethod, so legacy produce() overrides keep
  working (correcting an earlier overstatement).
- Event Hub schema risk note: we use unevaluatedProperties: false,
  not additionalProperties: false (composition-aware).
- Drop CHANGELOG mention (no CHANGELOG in this repo; PR description
  carries the breaking change note).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 ...28-producer-batching-unification-design.md | 42 +++++++++++++------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
index f2d20436..f5411371 100644
--- a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
+++ b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
@@ -1,9 +1,9 @@
 # Producer batching unification
 
-**Status:** Design — pending implementation
+**Status:** Implemented in PR #401
 **Date:** 2026-05-28
 **Issue:** #400
-**Closes:** #294, #295, #296, #377 (as a side effect of the refactor)
+**Closes:** #293, #294, #295, #296, #377 (as a side effect of the refactor)
 
 ## Problem
 
@@ -70,27 +70,39 @@ The base class accumulates chunks and re-emits them in batches of up to `batch_s
 
 For streaming sources, partial batches must flush on inactivity, otherwise a low-traffic stream could hold records indefinitely.
 
-Implementation uses an internal queue + background pump task, mirroring the pattern already in `azure/read_event_hub`:
+Implementation uses an internal **bounded** queue + background pump task. The pump captures source errors and re-raises on the consumer side, so failures aren't silently treated as EOS:
 
 ```python
-async def produce(self) -> AsyncGenerator[List[Message], None]:
+async def produce(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
     batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
     flush_ms = self.properties.get("flush_ms", self.DEFAULT_FLUSH_MS)
-    timeout = (flush_ms / 1000) if flush_ms is not None else None
+    timeout = (flush_ms / 1000) if flush_ms else None
 
-    queue: asyncio.Queue[Optional[List[Message]]] = asyncio.Queue()
+    # maxsize=1 preserves the natural backpressure the old yield-driven model
+    # had: the pump can be at most one chunk ahead of the consumer.
+    queue: asyncio.Queue = asyncio.Queue(maxsize=1)
     EOS = object()
+    pump_error: List[BaseException] = []  # captured non-cancellation errors
 
     async def pump():
+        cancelled = False
         try:
             async for chunk in self.produce_chunks():
                 if chunk:
                     await queue.put(chunk)
+        except asyncio.CancelledError:
+            cancelled = True
+            raise
+        except BaseException as exc:
+            pump_error.append(exc)
         finally:
-            await queue.put(EOS)
+            # Skip the EOS put on cancellation — the consumer's finally is
+            # awaiting us and the queue may be full; putting would deadlock.
+            if not cancelled:
+                await queue.put(EOS)
 
     pump_task = asyncio.create_task(pump())
-    buffer: List[Message] = []
+    buffer: List[Dict[str, Any]] = []
     try:
         while True:
             try:
@@ -104,6 +116,8 @@ async def produce(self) -> AsyncGenerator[List[Message], None]:
             if item is EOS:
                 if buffer:
                     yield buffer
+                if pump_error:
+                    raise pump_error[0]  # propagate source error to caller
                 return
 
             buffer.extend(item)
@@ -112,12 +126,16 @@ async def produce(self) -> AsyncGenerator[List[Message], None]:
                 buffer = buffer[batch_size:]
     finally:
         pump_task.cancel()
-        with suppress(asyncio.CancelledError):
+        with suppress(asyncio.CancelledError, Exception):
             await pump_task
 ```
 
 Why a queue and not `asyncio.wait_for(anext(gen), timeout)`: cancelling `__anext__` on an async generator with side effects (open connections, partial reads) can leave it in a broken state. Cancelling the _pump task_ boundary is safe; the generator finishes its current chunk before the pump's `try/finally` runs.
 
+Why `maxsize=1` and the `cancelled` flag: an unbounded queue removes backpressure — the pump could pre-load an entire parquet or relational table into memory while the consumer is processing batch 1 (flagged by Copilot review). Bounding at 1 keeps memory flat at the cost of a deadlock when the consumer is cancelled mid-flow (the pump's `finally: put(EOS)` blocks against a full queue). The `cancelled` flag skips the EOS put on cancellation, since the consumer is gone and EOS doesn't need to be delivered.
+
+Why `pump_error`: catching all exceptions in the pump and letting it terminate via EOS would silently truncate input on a source failure (Redis disconnect, broken CSV, DB error) — the consumer would see clean end-of-stream against partial data. Capturing the exception and re-raising on the consumer side makes the job fail loudly instead (also flagged by Copilot review).
+
 `flush_ms = None` ⇒ `timeout = None` ⇒ `queue.get()` waits forever ⇒ no time-based flush. Bounded sources don't set `flush_ms` and aren't affected.
 
 ### Schema composition (standard JSON Schema)
@@ -351,15 +369,15 @@ A `FakeProducer` whose `produce_chunks` yields scripted chunks. Cases:
 
 - Update `docs/reference/blocks/*_read.md` for each affected producer (`batch_size`, `flush_ms`, `fetch_size`, `max_batch_size` where applicable).
 - Add a section in `docs/processing-strategies.md` explaining the producer batching model: chunked subclass output, base-class re-chunking, `flush_ms` for streaming sources.
-- CHANGELOG entry calling out:
+- PR description carries the breaking-change note (no CHANGELOG file in this repo):
   - New `batch_size`/`flush_ms` on previously non-batching producers.
   - **Breaking:** `azure/read_event_hub.batch_size` renamed to `max_batch_size`; the name `batch_size` now means pipeline batch size.
 
 ## Risks and trade-offs
 
-1. **`Producer` ABC change.** `produce_chunks` is now the abstract method. Any external/downstream custom producer subclassing `Producer` and overriding `produce()` directly will break. Acceptable given datayoga's surface area; called out in CHANGELOG.
+1. **`Producer` ABC change.** `produce_chunks` is the new override hook (raises NotImplementedError by default; not formally `@abstractmethod` so legacy subclasses that still override `produce()` directly continue to validate). All 7 in-tree producers were migrated to override `produce_chunks`; external/downstream subclassers that override `produce()` directly continue to work but bypass the base-class batching. Called out in the PR description.
 
-2. **Event Hub silent-semantic-change risk.** The breaking rename is intentional. Adding `additionalProperties: false` to the Event Hub schema (which it lacks today) is part of this change so that old `batch_size: 300` configs fail validation loudly, not get silently ignored.
+2. **Event Hub silent-semantic-change risk.** The breaking rename is intentional. Adding `unevaluatedProperties: false` to the Event Hub schema (which lacked any `additionalProperties` declaration before) catches typos loudly. The literal `batch_size: 300` still validates after the rename but now means pipeline batch size, not SDK callback size — that semantic shift is documented in the PR description and the processing-strategies docs.
 
 3. **`flush_ms` semantics on Job shutdown.** When the producer is being cancelled (`Job.shutdown` → `Step.stop`), the pump's `try/finally` ensures `EOS` is queued. The `produce()` loop sees `EOS` and flushes the final partial batch. Verified by the `test_producer_batching` shutdown case.
 

From 2d2bcc4f29bc4a38caac3cb2255ce6b0694ef719 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Thu, 28 May 2026 19:57:44 +0300
Subject: [PATCH 35/38] Spec: correct Event Hub migration note (#400)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Event Hub migration section said the schema gains
additionalProperties: false; it actually gains unevaluatedProperties: false
(composition-aware). Also reframed the "rejects batch_size: 300 loudly"
claim, which was always wrong — that literal property name still validates
after the rename, just with new semantics. What
additionalProperties/unevaluatedProperties catches is typos.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../specs/2026-05-28-producer-batching-unification-design.md  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
index f5411371..ffc1c7c9 100644
--- a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
+++ b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md
@@ -292,9 +292,9 @@ class Block(DyProducer):
             yield chunk
 ```
 
-**Migration:** Users with `batch_size: 300` in YAML thinking it controls SDK callbacks must rename to `max_batch_size: 300`. No backward-compat shim. Called out in CHANGELOG.
+**Migration:** Users with `batch_size: 300` in YAML thinking it controls SDK callbacks must rename to `max_batch_size: 300`. No backward-compat shim. The literal `batch_size: 300` still validates after the rename but now means pipeline batch size, not SDK callback size — that semantic shift is documented in the PR description.
 
-The schema for `azure/read_event_hub` also gains `additionalProperties: false` (it doesn't have it today). Without this, an old `batch_size: 300` in YAML would silently be ignored as an unknown property after the rename. With it, validation fails loudly with a clear error.
+The schema for `azure/read_event_hub` also gains `unevaluatedProperties: false` (it had no `additionalProperties` declaration before). Typos like `batch_sz: 300` now fail validation loudly with a clear error.
 
 **`http/receiver`** (streaming)
 

From 26a271b3fd4ae64efd4cfede68f92631e561dbce Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Sun, 31 May 2026 13:29:20 +0300
Subject: [PATCH 36/38] Rename test_schema_inherit.py -> test_schema_refs.py
 (#400)

The file now tests schema_utils.resolve_refs (standard JSON Schema \$ref
resolution), not the old custom \$inherit extension. Filename was stale.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../tests/{test_schema_inherit.py => test_schema_refs.py}         | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename core/src/datayoga_core/tests/{test_schema_inherit.py => test_schema_refs.py} (100%)

diff --git a/core/src/datayoga_core/tests/test_schema_inherit.py b/core/src/datayoga_core/tests/test_schema_refs.py
similarity index 100%
rename from core/src/datayoga_core/tests/test_schema_inherit.py
rename to core/src/datayoga_core/tests/test_schema_refs.py

From a58336229871691a8abfe14232d085655c435261 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Sun, 31 May 2026 14:05:00 +0300
Subject: [PATCH 37/38] Fix redis/read_stream PEL pagination regression (#400)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Copilot re-review caught a real regression: my PR set count=batch_size on
all xreadgroup calls, including the pending-entry-list reads (id="0").
Combined with the unconditional flip to read_pending=False after the first
call, this meant: only the first batch_size pending entries got processed
per job session; anything beyond was stranded until restart.

The "stay in pending mode until empty" attempt I tried first didn't work:
XREADGROUP id="0" always returns from the start of the PEL (since the
producer doesn't ack inside produce_chunks), so a smaller count just makes
us re-read the same first page forever.

Fix: revert to the pre-PR semantic for the pending read — count=None drains
the entire PEL in one call. Keep count=batch_size for the new-message read
(id=">") so the #377 batching contract still applies to live streams. The
producer's yield-as-a-chunk behavior (the actual fix for #377) is unchanged.

Updated tests:
- test_redis_new_message_read_uses_count_equal_to_batch_size: pending
  call uses count=None, new-message call uses count=batch_size
- test_redis_drains_full_pel_in_one_call_even_when_larger_than_batch_size:
  20 pending entries drain in a single call; base class re-chunks to
  batch_size=5 -> four batches of 5

Real-Redis smoke against Redis 7 with PEL=25, batch_size=5: produces
batches=[5,5,5,5,5], all PEL delivered, no re-reads.

The other Copilot comment (sync redis-py with block=0 freezes the asyncio
event loop) is a real architectural concern but pre-existing — same
behavior in pre-PR code. Deferred to a follow-up issue if needed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../blocks/redis/read_stream/block.py         | 15 +++-
 .../tests/test_redis_read_stream.py           | 71 ++++++++++++++++---
 2 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/core/src/datayoga_core/blocks/redis/read_stream/block.py b/core/src/datayoga_core/blocks/redis/read_stream/block.py
index aa464743..ad166a8c 100644
--- a/core/src/datayoga_core/blocks/redis/read_stream/block.py
+++ b/core/src/datayoga_core/blocks/redis/read_stream/block.py
@@ -30,7 +30,15 @@ def init(self, context: Optional[Context] = None):
             self.redis_client.xgroup_create(self.stream, self.consumer_group, 0)
 
     async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
-        """Reads pending then new stream messages via XREADGROUP, yielding each response as a chunk."""
+        """Reads pending then new stream messages via XREADGROUP, yielding each response as a chunk.
+
+        Pending entries (id="0") are drained in a single unbounded XREADGROUP
+        call (count=None) — this matches pre-PR behavior. Paginating PEL via
+        count is not safe with a non-acking producer because XREADGROUP id="0"
+        always returns from the start of PEL, so a smaller count would just
+        re-read the same first page forever. New-message reads (id=">") use
+        count=batch_size to bound the Redis network response size.
+        """
         logger.debug(f"Running {self.get_block_name()}")
         batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
         read_pending = True
@@ -39,7 +47,7 @@ async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
             streams = self.redis_client.xreadgroup(
                 self.consumer_group, self.requesting_consumer,
                 {self.stream: "0" if read_pending else ">"},
-                count=batch_size,
+                count=None if read_pending else batch_size,
                 block=100 if self.snapshot else 0,
             )
 
@@ -58,6 +66,9 @@ async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
             if self.snapshot and not read_pending and not yielded_any:
                 return
 
+            # Flip unconditionally after the first pending-read call: count=None
+            # drained the entire PEL in that single call, so there's no more
+            # pending work to do this session.
             read_pending = False
 
     def ack(self, msg_ids: List[str]):
diff --git a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
index 5c4a43f7..de003da8 100644
--- a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
+++ b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py
@@ -16,15 +16,30 @@ def _mk_block(properties, redis_client):
     return block
 
 
+def _xreadgroup_count(call):
+    """Extract the count arg from an xreadgroup call regardless of kw/positional."""
+    if "count" in call.kwargs:
+        return call.kwargs["count"]
+    if len(call.args) >= 4:
+        return call.args[3]
+    return None
+
+
+def _xreadgroup_id(call):
+    """Extract the stream-id dict value from an xreadgroup call."""
+    streams = call.kwargs.get("streams") or (call.args[2] if len(call.args) >= 3 else {})
+    return next(iter(streams.values())) if streams else None
+
+
 @pytest.mark.asyncio
-async def test_redis_uses_count_equal_to_batch_size():
-    """xreadgroup is called with count=batch_size (closes #377)."""
+async def test_redis_new_message_read_uses_count_equal_to_batch_size():
+    """xreadgroup for new messages (id='>') uses count=batch_size (closes #377)."""
     redis = MagicMock()
     payload_a = (b"1-0", {b"data": b'{"i": 1}'})
     payload_b = (b"2-0", {b"data": b'{"i": 2}'})
     redis.xreadgroup.side_effect = [
-        [(b"mystream", [payload_a, payload_b])],  # pending
-        [(b"mystream", [])],                       # nothing new -> exit
+        [(b"mystream", [payload_a, payload_b])],  # pending (drained in one call, count=None)
+        [(b"mystream", [])],                       # new-read empty -> exit
     ]
 
     block = _mk_block({"batch_size": 250, "_snapshot": True}, redis)
@@ -32,9 +47,18 @@ async def test_redis_uses_count_equal_to_batch_size():
     async for b in block.produce():
         batches.append(b)
 
-    assert all(c.kwargs.get("count") == 250 or (len(c.args) >= 4 and c.args[3] == 250)
-               for c in redis.xreadgroup.call_args_list), \
-        "xreadgroup should be called with count=batch_size"
+    # First call is pending (id="0"); it uses count=None (drain).
+    pending_call = redis.xreadgroup.call_args_list[0]
+    assert _xreadgroup_id(pending_call) == "0"
+    assert _xreadgroup_count(pending_call) is None, \
+        "pending read should use count=None to drain PEL in one call"
+
+    # Subsequent new-message calls (id=">") use count=batch_size.
+    new_calls = [c for c in redis.xreadgroup.call_args_list if _xreadgroup_id(c) == ">"]
+    assert new_calls, "expected at least one new-message read"
+    for c in new_calls:
+        assert _xreadgroup_count(c) == 250, \
+            f"new-message read should use count=batch_size, got count={_xreadgroup_count(c)}"
 
 
 @pytest.mark.asyncio
@@ -43,8 +67,8 @@ async def test_redis_yields_records_as_a_batch_not_one_by_one():
     redis = MagicMock()
     pages = [(f"{i}-0".encode(), {b"data": f'{{"i": {i}}}'.encode()}) for i in range(5)]
     redis.xreadgroup.side_effect = [
-        [(b"mystream", pages)],
-        [(b"mystream", [])],
+        [(b"mystream", pages)],     # pending drained in one call
+        [(b"mystream", [])],         # new-read empty -> exit
     ]
 
     block = _mk_block({"batch_size": 100, "_snapshot": True}, redis)
@@ -54,3 +78,32 @@ async def test_redis_yields_records_as_a_batch_not_one_by_one():
 
     assert [len(b) for b in batches] == [5]
     assert batches[0][0]["i"] == 0
+
+
+@pytest.mark.asyncio
+async def test_redis_drains_full_pel_in_one_call_even_when_larger_than_batch_size():
+    """Pending reads use count=None so the entire PEL drains in a single call.
+    The base class re-chunks the result to batch_size. This avoids the
+    Copilot-flagged pagination bug where count=batch_size + XREADGROUP id='0'
+    would re-read the same first page forever (since the producer doesn't ack
+    inside produce_chunks)."""
+    redis = MagicMock()
+    # Simulate a PEL of 20 entries returned in one xreadgroup call.
+    pel = [(f"{i}-0".encode(), {b"data": f'{{"i": {i}}}'.encode()}) for i in range(20)]
+    redis.xreadgroup.side_effect = [
+        [(b"mystream", pel)],         # entire PEL in one call (count=None)
+        [(b"mystream", [])],          # new-read empty -> exit
+    ]
+
+    block = _mk_block({"batch_size": 5, "_snapshot": True}, redis)
+    batches = []
+    async for b in block.produce():
+        batches.append(b)
+
+    # All 20 pending entries are delivered; the base class re-chunks them
+    # to batch_size=5 → four batches of 5.
+    assert [len(b) for b in batches] == [5, 5, 5, 5]
+    # Only ONE pending read was made (PEL drained in one shot).
+    pending_calls = [c for c in redis.xreadgroup.call_args_list if _xreadgroup_id(c) == "0"]
+    assert len(pending_calls) == 1, \
+        f"expected exactly 1 pending read (count=None drains all), got {len(pending_calls)}"

From 9c2c59b18525364d67854f6f6eeb8b0556afc155 Mon Sep 17 00:00:00 2001
From: spicy-sauce <yshirizli@gmail.com>
Date: Sun, 31 May 2026 14:40:47 +0300
Subject: [PATCH 38/38] Add property-based tests, external-cancel test, mypy
 fix (#400)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three final verification additions:

1. Property-based rechunker tests (hypothesis): probe ~1000 generated
   chunk-size/batch-size combinations to verify the rechunker's
   invariants — record conservation, order preservation, all batches
   well-formed, no empty emissions, partial-only-at-end. Adds hypothesis
   to test extras. Catches the class of bug Copilot flagged where my
   existing tests only covered specific inputs, not the contract.

2. test_external_task_cancellation_cleans_up_pump: simulates the
   Job.shutdown / Job.run cancellation path (cancelling the outer task
   that iterates produce()) and verifies no producer pump task is
   orphaned afterward. The spec claims this works; now there's a test.

3. mypy fix: Producer.DEFAULT_FLUSH_MS was inferred as None-only,
   making subclass overrides with int fail strict type-checking. Now
   typed as Optional[int]. mypy clean on all 9 changed source files.

90 tests pass (was 84: +5 property tests, +1 external-cancel test).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 core/pyproject.toml                           |   2 +
 core/src/datayoga_core/producer.py            |   6 +-
 .../tests/test_producer_batching.py           |  36 +++++
 .../test_producer_batching_properties.py      | 141 ++++++++++++++++++
 4 files changed, 182 insertions(+), 3 deletions(-)
 create mode 100644 core/src/datayoga_core/tests/test_producer_batching_properties.py

diff --git a/core/pyproject.toml b/core/pyproject.toml
index 2a55ee25..2aeca188 100644
--- a/core/pyproject.toml
+++ b/core/pyproject.toml
@@ -28,6 +28,7 @@ python = "^3.8"
 PyYAML = "^6.0"
 sqlglot = "^10.4.3"
 
+hypothesis = { version = "^6.0", optional = true }
 mock = { version = "^4.0.3", optional = true }
 pytest = { version = "^7.1.2", optional = true }
 pytest-aioresponses = { version = "^0.2.0", optional = true }
@@ -70,6 +71,7 @@ test = [
         "azure-eventhub-checkpointstoreblob-aio",
         "cassandra-driver",
         "fastparquet",
+        "hypothesis",
         "ibm_db_sa",
         "mock",
         "oracledb",
diff --git a/core/src/datayoga_core/producer.py b/core/src/datayoga_core/producer.py
index dc5b05d5..a199d446 100644
--- a/core/src/datayoga_core/producer.py
+++ b/core/src/datayoga_core/producer.py
@@ -1,7 +1,7 @@
 import asyncio
 import logging
 from contextlib import suppress
-from typing import Any, AsyncGenerator, Dict, List
+from typing import Any, AsyncGenerator, Dict, List, Optional
 
 from .block import Block
 
@@ -28,8 +28,8 @@ class Producer(Block):
     the base-class batching and `produce_chunks` is not called.
     """
 
-    DEFAULT_BATCH_SIZE = 1000
-    DEFAULT_FLUSH_MS = None  # streaming subclasses override to enable timeout flush
+    DEFAULT_BATCH_SIZE: int = 1000
+    DEFAULT_FLUSH_MS: Optional[int] = None  # streaming subclasses override to enable timeout flush
 
     async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
         """Yield natural-size chunks from the source.
diff --git a/core/src/datayoga_core/tests/test_producer_batching.py b/core/src/datayoga_core/tests/test_producer_batching.py
index 5ab4f98a..c3fe94b0 100644
--- a/core/src/datayoga_core/tests/test_producer_batching.py
+++ b/core/src/datayoga_core/tests/test_producer_batching.py
@@ -138,6 +138,42 @@ async def test_consumer_cancellation_cleans_up_pump():
     await asyncio.sleep(0.1)
 
 
+@pytest.mark.asyncio
+async def test_external_task_cancellation_cleans_up_pump():
+    """When the task iterating produce() is cancelled (e.g., Job.run is cancelled
+    by the runtime), the producer's pump task must clean up. This is the
+    Job-shutdown scenario: an external cancellation propagates through the
+    async-for loop into the producer generator's finally."""
+    chunks = [[_msg(i)] for i in range(10_000)]
+    p = FakeProducer({"batch_size": 5, "flush_ms": 50}, chunks=chunks,
+                     sleep_before=[0.01] * 10_000)
+
+    async def consume():
+        # Mirrors Job.run's iteration pattern.
+        async for batch in p.produce():
+            pass  # downstream processing would happen here
+
+    consumer_task = asyncio.create_task(consume())
+    await asyncio.sleep(0.05)  # let the producer ramp up — some batches arrive
+    consumer_task.cancel()
+    with pytest.raises(asyncio.CancelledError):
+        await consumer_task
+    # Give the loop a moment to settle any pending finalizers.
+    await asyncio.sleep(0.1)
+
+    # No producer pump task should remain after cancellation. We identify the
+    # pump specifically by Producer.produce.<locals>.pump in its qualname,
+    # since the test's own name happens to contain "pump".
+    remaining = [t for t in asyncio.all_tasks() if not t.done()]
+    pump_tasks = [
+        t for t in remaining
+        if "Producer.produce" in (t.get_coro().__qualname__ or "")
+    ]
+    assert not pump_tasks, \
+        f"orphaned producer pump tasks after cancellation: " \
+        f"{[t.get_coro().__qualname__ for t in pump_tasks]}"
+
+
 class _BoomProducer(Producer):
     """Producer whose produce_chunks() raises after emitting some chunks."""
 
diff --git a/core/src/datayoga_core/tests/test_producer_batching_properties.py b/core/src/datayoga_core/tests/test_producer_batching_properties.py
new file mode 100644
index 00000000..d288c3a1
--- /dev/null
+++ b/core/src/datayoga_core/tests/test_producer_batching_properties.py
@@ -0,0 +1,141 @@
+"""Property-based tests for the Producer base-class rechunker.
+
+Where `test_producer_batching.py` asserts specific outputs for specific inputs,
+this file uses Hypothesis to generate arbitrary chunk-size sequences and probe
+the rechunker's invariants. This catches the class of bug where the code works
+for the inputs you tested but breaks somewhere in the wider input space.
+"""
+import asyncio
+from typing import AsyncGenerator, Dict, List, Optional
+
+import pytest
+from datayoga_core.context import Context
+from datayoga_core.producer import Producer
+from hypothesis import given, settings
+from hypothesis import strategies as st
+
+
+class _ScriptedProducer(Producer):
+    """Test producer whose output follows a script of chunk sizes; record
+    payloads are consecutive integers numbered across all chunks."""
+
+    def __init__(self, properties, *, chunk_sizes):
+        """Record the chunk script and schema, then run the base initializer."""
+        # Both attributes are set before super().__init__() is invoked.
+        self._chunk_sizes = chunk_sizes
+        batch_size_rule = {"type": "integer", "minimum": 1}
+        schema = {"type": "object", "properties": {"batch_size": batch_size_rule}}
+        self._test_schema = schema
+        super().__init__(properties)
+
+    def get_json_schema(self):
+        """Return the schema kept in memory (nothing is read from disk)."""
+        return self._test_schema
+
+    def init(self, context: Optional[Context] = None):
+        """Intentionally does nothing."""
+        return None
+
+    async def produce_chunks(self) -> AsyncGenerator[List[Dict], None]:
+        """Emit one chunk per scripted size, numbering records consecutively."""
+        start = 0
+        for size in self._chunk_sizes:
+            stop = start + size
+            yield [{Producer.MSG_ID_FIELD: str(n), "v": n}
+                   for n in range(start, stop)]
+            start = stop
+
+
+async def _drain(producer: Producer):
+    """Collect every batch yielded by producer.produce() into a list, keeping
+    the order in which the batches arrived."""
+    collected = [batch async for batch in producer.produce()]
+    return collected
+
+
+# Hypothesis strategies: 0-20 chunks of 0-200 records each; batch_size in 1-300.
+chunk_sizes_strategy = st.lists(
+    st.integers(min_value=0, max_value=200),
+    min_size=0,
+    max_size=20,
+)
+batch_size_strategy = st.integers(min_value=1, max_value=300)
+
+
+@given(chunk_sizes=chunk_sizes_strategy, batch_size=batch_size_strategy)
+@settings(max_examples=200, deadline=2000)
+def test_property_record_conservation(chunk_sizes, batch_size):
+    """Re-chunking conserves records: the number of records received across
+    all batches equals the number produce_chunks was scripted to emit."""
+    producer = _ScriptedProducer({"batch_size": batch_size}, chunk_sizes=chunk_sizes)
+    received = sum(map(len, asyncio.run(_drain(producer))))
+    scripted = sum(chunk_sizes)
+    assert received == scripted, (
+        f"chunk_sizes={chunk_sizes}, batch_size={batch_size}: "
+        f"expected {scripted} records, got {received}"
+    )
+
+
+@given(chunk_sizes=chunk_sizes_strategy, batch_size=batch_size_strategy)
+@settings(max_examples=200, deadline=2000)
+def test_property_record_order_preserved(chunk_sizes, batch_size):
+    """Flattening the downstream batches reproduces the emission order of
+    produce_chunks: payloads 0, 1, 2, ... with no reordering."""
+    producer = _ScriptedProducer({"batch_size": batch_size}, chunk_sizes=chunk_sizes)
+    seen = []
+    for batch in asyncio.run(_drain(producer)):
+        seen.extend(record["v"] for record in batch)
+    assert seen == list(range(sum(chunk_sizes))), \
+        f"chunk_sizes={chunk_sizes}, batch_size={batch_size}: order mismatch"
+
+
+@given(chunk_sizes=chunk_sizes_strategy, batch_size=batch_size_strategy)
+@settings(max_examples=200, deadline=2000)
+def test_property_batch_sizes_well_formed(chunk_sizes, batch_size):
+    """No batch is empty and none is longer than batch_size; every batch before
+    the last is exactly batch_size long (only the final one may be partial)."""
+    producer = _ScriptedProducer({"batch_size": batch_size}, chunk_sizes=chunk_sizes)
+    batches = asyncio.run(_drain(producer))
+    for idx, size in enumerate(map(len, batches)):
+        assert size > 0, f"batch {idx} is empty: {batches}"
+        assert size <= batch_size, f"batch {idx} exceeds batch_size: {size} > {batch_size}"
+    # flush_ms is not set, so no time-based flush is expected here: only the
+    # final batch is allowed to fall short of batch_size.
+    for idx, size in enumerate(map(len, batches[:-1])):
+        assert size == batch_size, \
+            f"batch {idx} is partial mid-stream: len={size}, batch_size={batch_size}"
+
+
+@given(chunk_sizes=chunk_sizes_strategy, batch_size=batch_size_strategy)
+@settings(max_examples=200, deadline=2000)
+def test_property_no_empty_emissions(chunk_sizes, batch_size):
+    """Empty chunks coming out of produce_chunks never surface downstream as
+    empty batches."""
+    # Put a zero-length chunk in front of every generated chunk.
+    padded_sizes = [
+        entry
+        for size in chunk_sizes
+        for entry in (0, size)
+    ]
+    producer = _ScriptedProducer({"batch_size": batch_size}, chunk_sizes=padded_sizes)
+    for idx, batch in enumerate(asyncio.run(_drain(producer))):
+        assert len(batch) > 0, f"empty batch emitted at index {idx}"
+
+
+@given(num_records=st.integers(min_value=0, max_value=500),
+       batch_size=st.integers(min_value=1, max_value=100))
+@settings(max_examples=100, deadline=2000)
+def test_property_partial_final_batch_only(num_records, batch_size):
+    """A single chunk of num_records records is split into full batches,
+    followed by at most one shorter batch holding the remainder."""
+    producer = _ScriptedProducer({"batch_size": batch_size}, chunk_sizes=[num_records])
+    batches = asyncio.run(_drain(producer))
+    if num_records == 0:
+        assert batches == [], "expected no batches for empty source"
+        return
+    # One chunk in, so the only short batch allowed is the trailing one.
+    full_count, leftover = divmod(num_records, batch_size)
+    expected_sizes = [batch_size] * full_count
+    if leftover:
+        expected_sizes.append(leftover)
+    assert [len(batch) for batch in batches] == expected_sizes