From 124bb931fc4e41c010a59ee717c318b10aaa909e Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Tue, 1 Apr 2025 13:43:22 +0300 Subject: [PATCH 01/38] Use sa.sql.text for SQL statement execution in Block class --- core/src/datayoga_core/blocks/relational/write/block.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/src/datayoga_core/blocks/relational/write/block.py b/core/src/datayoga_core/blocks/relational/write/block.py index 1348d420..39c1ac7d 100644 --- a/core/src/datayoga_core/blocks/relational/write/block.py +++ b/core/src/datayoga_core/blocks/relational/write/block.py @@ -10,7 +10,6 @@ from datayoga_core.context import Context from datayoga_core.opcode import OpCode from datayoga_core.result import BlockResult, Result, Status -from sqlalchemy import text from sqlalchemy.exc import OperationalError logger = logging.getLogger("dy") @@ -210,7 +209,7 @@ def process_records( def execute(self, statement: Any, records: List[Dict[str, Any]]): """Executes a SQL statement with given records.""" if isinstance(statement, str): - statement = text(statement) + statement = sa.sql.text(statement) logger.debug(f"Executing {statement} on {records}") connected = False From 4773f9aed61249b297b740fd75b7d756fc26accb Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Tue, 1 Apr 2025 13:43:47 +0300 Subject: [PATCH 02/38] Add batch processing capability to Block class and update schema for batch_size --- .../datayoga_core/blocks/std/read/block.py | 29 ++++++++++++++++--- .../blocks/std/read/block.schema.json | 10 ++++++- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/core/src/datayoga_core/blocks/std/read/block.py b/core/src/datayoga_core/blocks/std/read/block.py index b649c88b..184423a1 100644 --- a/core/src/datayoga_core/blocks/std/read/block.py +++ b/core/src/datayoga_core/blocks/std/read/block.py @@ -16,19 +16,40 @@ class Block(DyProducer): def init(self, context: Optional[Context] = None): logger.debug(f"Initializing 
{self.get_block_name()}") + self.batch_size = int(self.properties.get("batch_size", 1000)) + logger.info(f"!Using batch size: {self.batch_size}") + + async def process_batch(self, records: List[Dict[str, Any]]) -> AsyncGenerator[List[Message], None]: + """Process records and yield batches according to batch_size""" + batch = [] + for record in records: + batch.append(self.get_message(record)) + + # When batch is full, yield it + if len(batch) >= self.batch_size: + logger.info(f"Yielding batch of {len(batch)} records") + yield batch + batch = [] + + # Yield any remaining records + if batch: + logger.info(f"Yielding final batch of {len(batch)} records") + yield batch async def produce(self) -> AsyncGenerator[List[Message], None]: if select.select([sys.stdin, ], [], [], 0.0)[0]: # piped data exists + all_records = [] for data in sys.stdin: - for record in self.get_records(data): - yield [self.get_message(record)] + all_records.extend(self.get_records(data)) else: # interactive mode print("Enter data to process:") data = input() - for record in self.get_records(data): - yield [self.get_message(record)] + all_records = self.get_records(data) + + async for batch in self.process_batch(all_records): + yield batch @staticmethod def get_records(data: str) -> List[Dict[str, Any]]: diff --git a/core/src/datayoga_core/blocks/std/read/block.schema.json b/core/src/datayoga_core/blocks/std/read/block.schema.json index 11453dbf..38ad05af 100644 --- a/core/src/datayoga_core/blocks/std/read/block.schema.json +++ b/core/src/datayoga_core/blocks/std/read/block.schema.json @@ -1,4 +1,12 @@ { "title": "std.read", - "description": "Read from the standard input" + "description": "Read from the standard input", + "type": "object", + "properties": { + "batch_size": { + "type": "integer", + "description": "Number of records to process in a single batch", + "default": 1000 + } + } } From c02ab0485bc580febefad6c54716ed218c815399 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 1 Apr 
2025 10:45:27 +0000 Subject: [PATCH 03/38] update json schemas --- schemas/job.schema.json | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/schemas/job.schema.json b/schemas/job.schema.json index d9031291..340e0e68 100644 --- a/schemas/job.schema.json +++ b/schemas/job.schema.json @@ -176,7 +176,15 @@ "properties": { "with": { "title": "std.read", - "description": "Read from the standard input" + "description": "Read from the standard input", + "type": "object", + "properties": { + "batch_size": { + "type": "integer", + "description": "Number of records to process in a single batch", + "default": 1000 + } + } } } } From 3a43bc1bb95400a02fa5b66f1c48de1b85354039 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 1 Apr 2025 10:45:48 +0000 Subject: [PATCH 04/38] update autogenerated docs --- docs/reference/blocks/std_read.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/reference/blocks/std_read.md b/docs/reference/blocks/std_read.md index 32b6904b..aca1c24a 100644 --- a/docs/reference/blocks/std_read.md +++ b/docs/reference/blocks/std_read.md @@ -8,4 +8,17 @@ grand_parent: Reference Read from the standard input +**Properties** + +|Name|Type|Description|Required| +|----|----|-----------|--------| +|**batch\_size**|`integer`|Number of records to process in a single batch
Default: `1000`
|| + +**Example** + +```yaml +batch_size: 1000 + +``` + From 0b51d63147ec5b7716d233edd5bebac2638c03b6 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Tue, 1 Apr 2025 13:52:49 +0300 Subject: [PATCH 05/38] Fix log message to remove unnecessary exclamation mark in batch size initialization --- core/src/datayoga_core/blocks/std/read/block.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/datayoga_core/blocks/std/read/block.py b/core/src/datayoga_core/blocks/std/read/block.py index 184423a1..e0b60b13 100644 --- a/core/src/datayoga_core/blocks/std/read/block.py +++ b/core/src/datayoga_core/blocks/std/read/block.py @@ -17,7 +17,7 @@ class Block(DyProducer): def init(self, context: Optional[Context] = None): logger.debug(f"Initializing {self.get_block_name()}") self.batch_size = int(self.properties.get("batch_size", 1000)) - logger.info(f"!Using batch size: {self.batch_size}") + logger.info(f"Using batch size: {self.batch_size}") async def process_batch(self, records: List[Dict[str, Any]]) -> AsyncGenerator[List[Message], None]: """Process records and yield batches according to batch_size""" From 8f4f992a77106536b42d321b32dfca015e6fb0c1 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Sun, 28 Dec 2025 14:42:17 +0200 Subject: [PATCH 06/38] update json schemas --- schemas/job.schema.json | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/schemas/job.schema.json b/schemas/job.schema.json index 74b3e45e..1b2a2533 100644 --- a/schemas/job.schema.json +++ b/schemas/job.schema.json @@ -1371,7 +1371,15 @@ "properties": { "with": { "description": "Read from the standard input", - "title": "std.read" + "properties": { + "batch_size": { + "default": 1000, + "description": "Number of records to process in a single batch", + "type": "integer" + } + }, + "title": "std.read", + "type": "object" } } } From 807d61ad8a0f1a50b3fc45572e7d6e12d934a3a8 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Sun, 28 Dec 2025 15:19:30 +0200 Subject: 
[PATCH 07/38] Increase timeout for integration tests from 10 to 15 minutes --- .github/workflows/integration-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 7b4b819c..66b9ee0d 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -8,7 +8,7 @@ concurrency: jobs: integration-tests: runs-on: ubuntu-22.04 - timeout-minutes: 10 + timeout-minutes: 15 steps: - name: Check out repository code From 633d9bfb4420f814ed543dabd416d89f92eb3ddb Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 16:14:10 +0300 Subject: [PATCH 08/38] Add design spec for producer batching unification (#400) Brainstormed design for unifying batch handling across all 7 producer blocks (std/read, files/read_csv, relational/read, parquet/read, redis/read_stream, azure/read_event_hub, http/receiver). Closes the gap behind #294, #295, #296, #377 by making the Producer base class own batching via a new produce_chunks() hook. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- ...28-producer-batching-unification-design.md | 385 ++++++++++++++++++ 1 file changed, 385 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md diff --git a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md new file mode 100644 index 00000000..81692cdc --- /dev/null +++ b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md @@ -0,0 +1,385 @@ +# Producer batching unification + +**Status:** Design — pending implementation +**Date:** 2026-05-28 +**Issue:** #400 +**Closes:** #294, #295, #296, #377 (as a side effect of the refactor) + +## Problem + +Seven producer blocks each handle (or fail to handle) batching differently: + +| Producer | Bounded/Streaming | `batch_size` today | Behavior | +|---|---|---|---| +| `std/read` | bounded | yes, default 1000 *(on `batch_size_in_std_read_block` branch)* | custom `process_batch` accumulator | +| `files/read_csv` | bounded | yes, default 1000 | own `islice(reader, batch_size)` loop | +| `relational/read` | bounded | **no** — hardcoded `fetchmany(10000)` | yields one row at a time downstream (bug) | +| `parquet/read` | bounded | **no** | yields one row at a time (bug) | +| `redis/read_stream` | streaming | **no** | yields one record at a time (bug #377) | +| `azure/read_event_hub` | streaming | yes, default 300, **but** controls *SDK callback batch size*, not pipeline batch size | drains internal queue in unbounded batches | +| `http/receiver` | streaming | **no** | yields one record per HTTP request (bug) | + +Four are actively buggy (yielding single records into the pipeline when batches are intended). One uses `batch_size` with a different semantic. Each producer that has implemented batching has done it differently. 
+ +The duplication is the root cause of issues #294, #295, #296, and #377 — all four are the same gap, in different blocks. + +## Goal + +Make the `Producer` base class own batching. Subclasses describe how to fetch records; the base class controls the size and timing of batches yielded to the pipeline. + +After the change: + +- `batch_size` means the same thing in every producer: the maximum number of records yielded per downstream batch. +- Adding a new producer cannot reintroduce the "yield single records" bug — there's no place for it to happen. +- Streaming producers get an optional `flush_ms` so partial batches flush on inactivity instead of being held indefinitely. + +Non-goals: changing the `Job`/`Step` pipeline, adding new sources, restructuring the Result/payload model (that's #245). + +## Design + +### Base-class contract + +```python +# core/src/datayoga_core/producer.py + +class Producer(Block): + DEFAULT_BATCH_SIZE = 1000 + DEFAULT_FLUSH_MS = None # streaming subclasses override + + @abstractmethod + async def produce_chunks(self) -> AsyncGenerator[List[Message], None]: + """Yield natural chunks of any size. Base class re-chunks to batch_size.""" + raise NotImplementedError + + async def produce(self) -> AsyncGenerator[List[Message], None]: + """Public entry point. Reads chunks from produce_chunks() and re-emits + in exact batch_size slices, with optional time-based flush.""" + ... +``` + +Subclasses override `produce_chunks` instead of `produce`. They emit chunks of any size — whatever's natural to the source (a Parquet row group, a `fetchmany` result, an `xreadgroup` response, an Event Hub callback batch, a single record). + +The base class accumulates chunks and re-emits them in exact `batch_size` slices, flushing whatever's left on end-of-stream. + +### `batch_size` and `flush_ms` are read lazily + +`produce()` reads `self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)` on first call, not in `init()`. 
This avoids the "subclass forgot `super().init(context)`" footgun. + +### `flush_ms` implementation + +For streaming sources, partial batches must flush on inactivity, otherwise a low-traffic stream could hold records indefinitely. + +Implementation uses an internal queue + background pump task, mirroring the pattern already in `azure/read_event_hub`: + +```python +async def produce(self) -> AsyncGenerator[List[Message], None]: + batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)) + flush_ms = self.properties.get("flush_ms", self.DEFAULT_FLUSH_MS) + timeout = (flush_ms / 1000) if flush_ms is not None else None + + queue: asyncio.Queue[Optional[List[Message]]] = asyncio.Queue() + EOS = object() + + async def pump(): + try: + async for chunk in self.produce_chunks(): + if chunk: + await queue.put(chunk) + finally: + await queue.put(EOS) + + pump_task = asyncio.create_task(pump()) + buffer: List[Message] = [] + try: + while True: + try: + item = await asyncio.wait_for(queue.get(), timeout=timeout) + except asyncio.TimeoutError: + if buffer: + yield buffer + buffer = [] + continue + + if item is EOS: + if buffer: + yield buffer + return + + buffer.extend(item) + while len(buffer) >= batch_size: + yield buffer[:batch_size] + buffer = buffer[batch_size:] + finally: + pump_task.cancel() + with suppress(asyncio.CancelledError): + await pump_task +``` + +Why a queue and not `asyncio.wait_for(anext(gen), timeout)`: cancelling `__anext__` on an async generator with side effects (open connections, partial reads) can leave it in a broken state. Cancelling the *pump task* boundary is safe; the generator finishes its current chunk before the pump's `try/finally` runs. + +`flush_ms = None` ⇒ `timeout = None` ⇒ `queue.get()` waits forever ⇒ no time-based flush. Bounded sources don't set `flush_ms` and aren't affected. 
+ +### Schema fragments + +Two shared fragments in `core/src/datayoga_core/resources/schemas/`: + +`batchable.schema.json`: +```json +{ + "type": "object", + "properties": { + "batch_size": { + "type": "integer", + "minimum": 1, + "description": "Maximum number of records yielded per downstream batch", + "default": 1000 + } + } +} +``` + +`streamable.schema.json`: +```json +{ + "type": "object", + "allOf": [{ "$ref": "batchable.schema.json" }], + "properties": { + "flush_ms": { + "type": ["integer", "null"], + "minimum": 1, + "description": "If set, flush a partial batch after this many ms of inactivity. null = wait until batch_size or end-of-stream; omitted = use the default.", + "default": 1000 + } + } +} +``` + +Bounded producer schemas `$ref` `batchable`; streaming producer schemas `$ref` `streamable`. The fragments are the single source of truth for the description, validation, and default. + +### Per-producer changes + +**`std/read`** (bounded) + +Replace `process_batch` with a single-chunk yield. Base class slices. + +```python +async def produce_chunks(self): + if select.select([sys.stdin], [], [], 0.0)[0]: + all_records = [r for line in sys.stdin for r in self.get_records(line)] + else: + print("Enter data to process:") + all_records = self.get_records(input()) + if all_records: + yield [self.get_message(r) for r in all_records] +``` + +**`files/read_csv`** (bounded) + +Moves the `islice` loop from `produce` into `produce_chunks`, yielding chunks of up to `batch_size` rows. Base class re-emits. 
+ +```python +async def produce_chunks(self): + batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)) + with open(self.file, "r", encoding=self.encoding) as f: + reader = DictReader(f, fieldnames=self.fields, + delimiter=self.delimiter, quotechar=self.quotechar) + for _ in range(self.skip): + next(reader, None) + counter = iter(count()) + while True: + chunk = [{self.MSG_ID_FIELD: f"{next(counter)}", **r} + for r in islice(reader, batch_size)] + if not chunk: + return + yield chunk +``` + +**`relational/read`** (bounded) + +`batch_size` uses the framework default (1000). `fetch_size` defaults to **10000** to preserve today's driver-roundtrip count as the no-config baseline. Result: strict improvement vs. today (downstream goes from 1-record batches to 1000-record batches; DB roundtrips stay at 10000). + +```python +class Block(DyProducer): + DEFAULT_FETCH_SIZE = 10000 + + async def produce_chunks(self): + fetch_size = int(self.properties.get("fetch_size", self.DEFAULT_FETCH_SIZE)) + result = self.connection.execution_options(stream_results=True).execute(self.tbl.select()) + while True: + rows = result.fetchmany(fetch_size) + if not rows: + return + yield [utils.add_uid(dict(r._asdict())) for r in rows] +``` + +Schema adds optional `fetch_size` with default 10000. + +**`parquet/read`** (bounded) + +Fix one-by-one yield. Each row group becomes one chunk; base class re-emits in `batch_size` slices. + +```python +async def produce_chunks(self): + pf = ParquetFile(self.file) + counter = iter(count()) + for df in pf.iter_row_groups(): + yield [{self.MSG_ID_FIELD: str(next(counter)), **row.to_dict()} + for _, row in df.iterrows()] +``` + +**`redis/read_stream`** (streaming, closes #377) + +Use `count=batch_size` on `xreadgroup`. Yield each batch as a chunk. Class overrides `DEFAULT_FLUSH_MS = 1000`. 
+ +```python +class Block(DyProducer): + DEFAULT_FLUSH_MS = 1000 + +async def produce_chunks(self): + batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)) + read_pending = True + while True: + streams = self.redis_client.xreadgroup( + self.consumer_group, self.requesting_consumer, + {self.stream: "0" if read_pending else ">"}, + count=batch_size, + block=100 if self.snapshot else 0, # streaming blocks forever; snapshot polls briefly + ) + for stream in streams: + chunk = [] + for key, value in stream[1]: + payload = orjson.loads(value[next(iter(value))]) + payload[self.MSG_ID_FIELD] = key + chunk.append(payload) + if chunk: + yield chunk + if self.snapshot and not read_pending: + return + read_pending = False +``` + +`flush_ms` (default 1000) ensures partial batches flush during low-volume periods. The pump task can sit blocked inside `xreadgroup` indefinitely — that's fine, because the pump and the consumer side of the base-class queue are decoupled. When a single message finally arrives, it lands in the queue immediately and `flush_ms` flushes the partial batch downstream. + +**`azure/read_event_hub`** (streaming, breaking change) + +Existing `batch_size` property → renamed `max_batch_size` (matches SDK semantic, default 300). New `batch_size` (pipeline semantic, default 1000) comes from the streamable fragment. + +```python +class Block(DyProducer): + DEFAULT_FLUSH_MS = 1000 + + def init(self, context=None): + self.max_batch_size = int(self.properties.get("max_batch_size", 300)) + # ... existing client setup ... 
+ self.events = {} + self.messages = asyncio.Queue() + + async def produce_chunks(self): + asyncio.create_task(self.receive_batch()) # uses self.max_batch_size + while True: + msg = await self.messages.get() + chunk = [msg] + while not self.messages.empty(): + chunk.append(self.messages.get_nowait()) + yield chunk +``` + +**Migration:** Users with `batch_size: 300` in YAML thinking it controls SDK callbacks must rename to `max_batch_size: 300`. No backward-compat shim. Called out in CHANGELOG. + +The schema for `azure/read_event_hub` also gains `additionalProperties: false` (it doesn't have it today). Without this, an old `batch_size: 300` in YAML would silently be ignored as an unknown property after the rename. With it, validation fails loudly with a clear error. + +**`http/receiver`** (streaming) + +Drain the queue per chunk; `flush_ms` flushes partial batches when traffic is low. + +```python +class Block(DyProducer): + DEFAULT_FLUSH_MS = 1000 + + async def produce_chunks(self): + queue: Queue = Queue(maxsize=1000) + async def handler(request): + try: + queue.put_nowait(orjson.loads(await request.read())) + return HTTPOk() + except Exception: + logger.exception("Got exception while parsing request:") + return HTTPInternalServerError() + runner = ServerRunner(Server(handler)) + await runner.setup() + srv = TCPSite(runner, self.host, self.port) + await srv.start() + try: + counter = iter(count()) + while True: + msg = await queue.get() + chunk = [{self.MSG_ID_FIELD: f"{next(counter)}", **msg}] + while not queue.empty(): + chunk.append({self.MSG_ID_FIELD: f"{next(counter)}", **queue.get_nowait()}) + yield chunk + finally: + with suppress(Exception): + await srv.stop() +``` + +### Defaults summary + +| Producer | `batch_size` | `flush_ms` | Other | +|---|---|---|---| +| `std/read` | 1000 | — | — | +| `files/read_csv` | 1000 | — | — | +| `relational/read` | 1000 | — | optional `fetch_size`, defaults to 10000 | +| `parquet/read` | 1000 | — | — | +| 
`redis/read_stream` | 1000 | 1000 | — | +| `azure/read_event_hub` | 1000 | 1000 | `max_batch_size` 300 (renamed from old `batch_size`) | +| `http/receiver` | 1000 | 1000 | — | + +## Tests + +**New base-class tests** (`core/src/datayoga_core/tests/test_producer_batching.py`): + +A `FakeProducer` whose `produce_chunks` yields scripted chunks. Cases: + +- One 5000-record chunk + `batch_size=1000` → five batches of 1000. +- Three chunks of [200, 300, 400] + `batch_size=1000` → one batch of 900 on EOS (no empty trailing). +- 1500 records + `batch_size=1000` → batches of [1000, 500]. +- `flush_ms=100` with a producer that sleeps 200ms between chunks → partial batches flush on inactivity. +- `flush_ms=None` holds records indefinitely (asserted with a timeout that the next batch doesn't arrive early). +- Empty chunk yields are ignored (no empty batches emitted). +- Pump-task cleanup: cancelling the consumer cancels the pump cleanly (no warnings, no leaks). + +**Per-producer tests:** + +- `std/read`, `files/read_csv` — existing tests adapted; assert batch counts/sizes match `batch_size`. +- `relational/read` — assert it yields batches (not single rows); assert `fetch_size` controls driver calls independently of `batch_size`. +- `parquet/read` — multi-row-group file; batches honor `batch_size` regardless of row-group boundaries. +- `redis/read_stream` — assert `xreadgroup` called with `count=batch_size`. The `redis_to_relational` integration test (mentioned in #377) provides the end-to-end signal; it depends on the batch-fallback in `relational/write` shipped in commit `7e5b6f7`, which is already in place. +- `azure/read_event_hub` — assert validation rejects legacy `batch_size: 500` with no `max_batch_size`; assert `max_batch_size: 500, batch_size: 100` results in SDK callbacks of 500 and downstream batches of 100. +- `http/receiver` — send N records via webhook; assert they land in batches of `batch_size`, or partial batches after `flush_ms`. 
+ +## Documentation + +- Update `docs/reference/blocks/*_read.md` for each affected producer (`batch_size`, `flush_ms`, `fetch_size`, `max_batch_size` where applicable). +- Add a section in `docs/processing-strategies.md` explaining the producer batching model: chunked subclass output, base-class re-chunking, `flush_ms` for streaming sources. +- CHANGELOG entry calling out: + - New `batch_size`/`flush_ms` on previously non-batching producers. + - **Breaking:** `azure/read_event_hub.batch_size` renamed to `max_batch_size`; the name `batch_size` now means pipeline batch size. + +## Risks and trade-offs + +1. **`Producer` ABC change.** `produce_chunks` is now the abstract method. Any external/downstream custom producer subclassing `Producer` and overriding `produce()` directly will break. Acceptable given datayoga's surface area; called out in CHANGELOG. + +2. **Event Hub silent-semantic-change risk.** The breaking rename is intentional. Adding `additionalProperties: false` to the Event Hub schema (which it lacks today) is part of this change so that old `batch_size: 300` configs fail validation loudly, not get silently ignored. + +3. **`flush_ms` semantics on Job shutdown.** When the producer is being cancelled (`Job.shutdown` → `Step.stop`), the pump's `try/finally` ensures `EOS` is queued. The `produce()` loop sees `EOS` and flushes the final partial batch. Verified by the `test_producer_batching` shutdown case. + +4. **`relational/read` defaults.** `fetch_size` defaults to 10000 to preserve today's DB roundtrip count. `batch_size` defaults to 1000, matching the framework default. Net effect vs. today: downstream batches grow from 1 to 1000 (huge improvement); DB roundtrips unchanged. Users with memory pressure on large rows can set a smaller `fetch_size` explicitly. Documented in the block's reference page. + +5. **Re-chunking cost.** Lists are sliced with `buffer[:n]` / `buffer[n:]` — O(len(buffer)) per batch, because `buffer[n:]` copies the remainder. 
Negligible relative to per-record block work; no benchmark required. + +## Out of scope + +- Changing the `Result`/payload internal field representation (issue #245). +- Adding new connector blocks (Snowflake #392, Kafka, S3 #351, RabbitMQ #265, Kinesis #264). +- Pulling Prometheus out of core (#336). +- Backpressure / queue sizing changes to the `Step` pipeline. From 0696f9c8e014086c0b1d22f999b0bbe0047efe05 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 16:28:33 +0300 Subject: [PATCH 09/38] Add implementation plan for producer batching unification (#400) Task-by-task TDD plan covering: schema fragment loader, Producer base class, and per-producer migrations (std/read, files/read_csv, parquet/read, relational/read, redis/read_stream, http/receiver, azure/read_event_hub), plus autogen + docs. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...026-05-28-producer-batching-unification.md | 2203 +++++++++++++++++ 1 file changed, 2203 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-28-producer-batching-unification.md diff --git a/docs/superpowers/plans/2026-05-28-producer-batching-unification.md b/docs/superpowers/plans/2026-05-28-producer-batching-unification.md new file mode 100644 index 00000000..a53f0e0f --- /dev/null +++ b/docs/superpowers/plans/2026-05-28-producer-batching-unification.md @@ -0,0 +1,2203 @@ +# Producer Batching Unification Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Move batching out of individual producer blocks into the `Producer` base class so every read block has consistent `batch_size` behavior, and four buggy producers stop yielding single records. + +**Architecture:** The `Producer` base class gets a new abstract-by-convention hook `produce_chunks()` that yields lists of any size. 
Its `produce()` method becomes a re-chunker that emits exact `batch_size` batches, with an optional `flush_ms` timeout-flush for streaming sources. Schema fragments (`batchable.schema.json`, `streamable.schema.json`) provide the shared `batch_size`/`flush_ms` definitions, resolved at load time via a `$inherit` convention. Each of the 7 producer blocks migrates to override `produce_chunks` instead of `produce`. + +**Tech Stack:** Python 3.7+, asyncio, jsonschema, pytest (asyncio mode), SQLAlchemy, redis-py, aiohttp, azure-eventhub. + +**Spec:** `docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md` +**Issue:** #400 + +--- + +## File Structure + +**Created:** +- `core/src/datayoga_core/resources/schemas/batchable.schema.json` — fragment exposing `batch_size` +- `core/src/datayoga_core/resources/schemas/streamable.schema.json` — fragment exposing `flush_ms` (combined with batchable) +- `core/src/datayoga_core/schema_utils.py` — `$inherit` resolver used by Block + Job +- `core/src/datayoga_core/tests/__init__.py` — empty, makes the tests package importable +- `core/src/datayoga_core/tests/test_schema_inherit.py` — tests for the `$inherit` resolver +- `core/src/datayoga_core/tests/test_producer_batching.py` — base-class batching tests +- `core/src/datayoga_core/blocks/parquet/read/tests/__init__.py` (if package missing) +- `core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py` +- `core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py` +- `core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py` +- `core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py` +- `core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py` +- `core/src/datayoga_core/blocks/http/receiver/tests/__init__.py` +- `core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py` +- `core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py` +- 
`core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py` +- `core/src/datayoga_core/blocks/relational/read/tests/__init__.py` +- `core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py` + +**Modified:** +- `core/src/datayoga_core/producer.py` — adds `produce_chunks` and a default `produce()` that re-chunks +- `core/src/datayoga_core/block.py` — `get_json_schema()` runs through `$inherit` resolver +- `core/src/datayoga_core/job.py` — `get_json_schema()` loop runs each loaded schema through the resolver +- `core/src/datayoga_core/blocks/std/read/block.py` — replace `process_batch` with `produce_chunks` +- `core/src/datayoga_core/blocks/std/read/block.schema.json` — use `$inherit: ["batchable"]` +- `core/src/datayoga_core/blocks/files/read_csv/block.py` — `produce_chunks` (drop `islice` loop in `produce`) +- `core/src/datayoga_core/blocks/files/read_csv/block.schema.json` — drop inline `batch_size`, add `$inherit` +- `core/src/datayoga_core/blocks/parquet/read/block.py` — `produce_chunks` per row group +- `core/src/datayoga_core/blocks/parquet/read/block.schema.json` — add `$inherit` +- `core/src/datayoga_core/blocks/relational/read/block.py` — `produce_chunks` with `fetch_size` +- `core/src/datayoga_core/blocks/relational/read/block.schema.json` — add `$inherit` + `fetch_size` property +- `core/src/datayoga_core/blocks/redis/read_stream/block.py` — `produce_chunks` with `count=batch_size` +- `core/src/datayoga_core/blocks/redis/read_stream/block.schema.json` — `$inherit: ["streamable"]` +- `core/src/datayoga_core/blocks/http/receiver/block.py` — `produce_chunks` drains queue +- `core/src/datayoga_core/blocks/http/receiver/block.schema.json` — `$inherit: ["streamable"]` +- `core/src/datayoga_core/blocks/azure/read_event_hub/block.py` — `produce_chunks`, rename `batch_size` → `max_batch_size` +- `core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json` — rename property, add `additionalProperties: false`, `$inherit: 
["streamable"]` +- `schemas/job.schema.json` — regenerated at the end +- `docs/reference/blocks/*.md` — regenerated at the end +- `docs/processing-strategies.md` — new section on producer batching + +--- + +## Task 1: Schema fragment loader + +Adds the `$inherit` convention and the two shared fragments. After this task, schemas referencing `batchable` / `streamable` get the fragments' properties merged in at load time. + +**Files:** +- Create: `core/src/datayoga_core/resources/schemas/batchable.schema.json` +- Create: `core/src/datayoga_core/resources/schemas/streamable.schema.json` +- Create: `core/src/datayoga_core/schema_utils.py` +- Create: `core/src/datayoga_core/tests/__init__.py` +- Create: `core/src/datayoga_core/tests/test_schema_inherit.py` +- Modify: `core/src/datayoga_core/block.py` (lines 44–59) +- Modify: `core/src/datayoga_core/job.py` (lines 223–244) + +- [ ] **Step 1.1: Create the `batchable` fragment** + +Create `core/src/datayoga_core/resources/schemas/batchable.schema.json`: + +```json +{ + "title": "batchable", + "description": "Producer batching mixin: declares batch_size for producers that yield records in batches.", + "type": "object", + "properties": { + "batch_size": { + "type": "integer", + "minimum": 1, + "description": "Maximum number of records yielded per downstream batch.", + "default": 1000 + } + } +} +``` + +- [ ] **Step 1.2: Create the `streamable` fragment** + +Create `core/src/datayoga_core/resources/schemas/streamable.schema.json`: + +```json +{ + "title": "streamable", + "description": "Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources.", + "type": "object", + "properties": { + "batch_size": { + "type": "integer", + "minimum": 1, + "description": "Maximum number of records yielded per downstream batch.", + "default": 1000 + }, + "flush_ms": { + "type": ["integer", "null"], + "minimum": 1, + "description": "If set, flush a partial batch after this many ms of inactivity. 
null or omitted = wait until batch_size or end-of-stream.", + "default": 1000 + } + } +} +``` + +- [ ] **Step 1.3: Create empty tests package** + +If `core/src/datayoga_core/tests/__init__.py` does not exist, create it as an empty file. (Several test modules in this plan live in `core/src/datayoga_core/tests/`; the directory must be importable.) + +```bash +test -f core/src/datayoga_core/tests/__init__.py || touch core/src/datayoga_core/tests/__init__.py +``` + +- [ ] **Step 1.4: Write the failing test for `$inherit` resolution** + +Create `core/src/datayoga_core/tests/test_schema_inherit.py`: + +```python +import json +from pathlib import Path + +import pytest + +from datayoga_core.schema_utils import resolve_inherits + + +SCHEMAS_DIR = ( + Path(__file__).resolve().parent.parent / "resources" / "schemas" +) + + +def test_inherit_merges_fragment_properties(): + schema = { + "title": "demo", + "type": "object", + "$inherit": ["batchable"], + "properties": {"foo": {"type": "string"}}, + "additionalProperties": False, + } + resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) + assert "$inherit" not in resolved + assert "batch_size" in resolved["properties"] + assert resolved["properties"]["batch_size"]["default"] == 1000 + assert resolved["properties"]["foo"] == {"type": "string"} + assert resolved["additionalProperties"] is False + + +def test_inherit_local_property_wins_over_fragment(): + schema = { + "type": "object", + "$inherit": ["batchable"], + "properties": { + "batch_size": {"type": "integer", "minimum": 1, "default": 50} + }, + } + resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) + assert resolved["properties"]["batch_size"]["default"] == 50 + + +def test_inherit_streamable_brings_both_props(): + schema = {"type": "object", "$inherit": ["streamable"], "properties": {}} + resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) + assert "batch_size" in resolved["properties"] + assert "flush_ms" in resolved["properties"] 
def resolve_inherits(schema: Dict[str, Any], schemas_dir: str = None) -> Dict[str, Any]:
    """Merge the fragments listed in ``$inherit`` into the schema's properties.

    Fragments are looked up as ``<schemas_dir>/<name>.schema.json``. Their
    ``properties`` are merged in declaration order (later fragments override
    earlier ones) and the schema's own ``properties`` override all of them.
    Only ``properties`` are merged; other fragment keys are ignored.

    Args:
        schema: The schema to resolve. Mutated in place and also returned.
        schemas_dir: Directory containing the fragment files. When omitted,
            the directory returned by ``utils.get_resource_path("schemas")``
            is used.

    Returns:
        The same schema object, with ``$inherit`` removed and the fragment
        properties merged in.

    Raises:
        FileNotFoundError: If a listed fragment file does not exist. The
            schema is left unmodified in that case.
    """
    declared = schema.get("$inherit")
    if isinstance(declared, str):
        # Tolerate `"$inherit": "batchable"`; iterating the bare string would
        # otherwise look for one fragment per character.
        declared = [declared]
    inherits: List[str] = list(declared or [])

    if not inherits:
        # Nothing to merge, but an empty/null marker must not leak into the
        # resolved schema.
        schema.pop("$inherit", None)
        return schema

    if schemas_dir is None:
        schemas_dir = utils.get_resource_path("schemas")

    merged_properties: Dict[str, Any] = {}
    for fragment_name in inherits:
        fragment_path = path.join(schemas_dir, f"{fragment_name}.schema.json")
        if not path.isfile(fragment_path):
            raise FileNotFoundError(
                f"Schema fragment '{fragment_name}' not found at {fragment_path}"
            )
        fragment = utils.read_json(fragment_path)
        # Deep-copy so later edits to the resolved schema cannot alias
        # whatever object read_json returned.
        merged_properties.update(copy.deepcopy(fragment.get("properties", {})))

    # Local properties take precedence over inherited ones.
    merged_properties.update(schema.get("properties") or {})

    # Mutate only after every fragment resolved, so a failure above leaves
    # the caller's schema untouched.
    schema["properties"] = merged_properties
    schema.pop("$inherit", None)
    return schema
+ """ + json_schema_file = path.join( + utils.get_bundled_dir(), + os.path.relpath( + os.path.dirname(sys.modules[self.__module__].__file__), + start=os.path.dirname(__file__)), + "block.schema.json") if utils.is_bundled() else path.join( + os.path.dirname(os.path.realpath(sys.modules[self.__module__].__file__)), + "block.schema.json") + logger.debug(f"loading schema from {json_schema_file}") + from datayoga_core.schema_utils import resolve_inherits + return resolve_inherits(utils.read_json(json_schema_file)) +``` + +Note: the `from datayoga_core.schema_utils import resolve_inherits` line is inside the function to avoid a circular import (schema_utils imports from utils, utils imports from block). + +- [ ] **Step 1.9: Wire resolver into `Job.get_json_schema`** + +Modify `core/src/datayoga_core/job.py`. Inside the `for block_type, schema_path in block_info:` loop (around line 240–243), apply the resolver to each loaded schema. + +Find this block: +```python + for block_type, schema_path in block_info: + block_types.append(block_type) + # load schema file + schema = utils.read_json(f"{schema_path}") + # append to the array of allOf for the full schema +``` + +Replace with: +```python + from datayoga_core.schema_utils import resolve_inherits + for block_type, schema_path in block_info: + block_types.append(block_type) + # load schema file + schema = resolve_inherits(utils.read_json(f"{schema_path}")) + # append to the array of allOf for the full schema +``` + +- [ ] **Step 1.10: Verify existing block validation still passes** + +Run the full core test suite to make sure nothing regressed (no producer is using `$inherit` yet, so behavior should be unchanged): + +```bash +cd core && python -m pytest src/datayoga_core/ -x -q +``` + +Expected: all existing tests pass; the 5 new `test_schema_inherit.py` tests also pass. 
def _msg(i: int) -> dict:
    """Build a minimal record carrying a message ID and an integer payload."""
    return {Producer.MSG_ID_FIELD: str(i), "v": i}


class FakeProducer(Producer):
    """Producer driven by a scripted list of chunks plus optional sleeps.

    `chunks[i]` is yielded after sleeping `sleep_before[i]` seconds (when that
    entry exists and is non-zero).
    """

    def __init__(self, properties=None, *, chunks=None, sleep_before=None):
        # schema for a FakeProducer; declare batch_size/flush_ms so validation passes
        self._test_schema = {
            "type": "object",
            "properties": {
                "batch_size": {"type": "integer", "minimum": 1},
                "flush_ms": {"type": ["integer", "null"], "minimum": 1},
            },
        }
        self._chunks = chunks or []
        self._sleep_before = sleep_before or []
        super().__init__(properties or {})

    def get_json_schema(self):
        return self._test_schema

    def init(self, context: Optional[Context] = None):
        pass

    async def produce_chunks(self) -> AsyncGenerator[List[Message], None]:
        for i, chunk in enumerate(self._chunks):
            if i < len(self._sleep_before) and self._sleep_before[i]:
                await asyncio.sleep(self._sleep_before[i])
            yield chunk


async def _drain(producer: Producer):
    """Collect every batch emitted by `producer.produce()`."""
    out = []
    async for batch in producer.produce():
        out.append(batch)
    return out


@pytest.mark.asyncio
async def test_rechunks_one_large_chunk():
    chunks = [[_msg(i) for i in range(5000)]]
    p = FakeProducer({"batch_size": 1000}, chunks=chunks)
    batches = await _drain(p)
    assert [len(b) for b in batches] == [1000, 1000, 1000, 1000, 1000]


@pytest.mark.asyncio
async def test_accumulates_small_chunks_and_flushes_on_eos():
    chunks = [[_msg(i) for i in range(200)],
              [_msg(i) for i in range(200, 500)],
              [_msg(i) for i in range(500, 900)]]
    p = FakeProducer({"batch_size": 1000}, chunks=chunks)
    batches = await _drain(p)
    assert [len(b) for b in batches] == [900]


@pytest.mark.asyncio
async def test_partial_final_batch_on_eos():
    chunks = [[_msg(i) for i in range(1500)]]
    p = FakeProducer({"batch_size": 1000}, chunks=chunks)
    batches = await _drain(p)
    assert [len(b) for b in batches] == [1000, 500]


@pytest.mark.asyncio
async def test_empty_chunks_are_ignored():
    chunks = [[], [_msg(1), _msg(2)], [], [_msg(3)]]
    p = FakeProducer({"batch_size": 10}, chunks=chunks)
    batches = await _drain(p)
    assert [len(b) for b in batches] == [3]


@pytest.mark.asyncio
async def test_flush_ms_emits_partial_on_inactivity():
    # One chunk of 2 records arrives immediately; the second chunk (1 record)
    # only arrives after a 300ms pause. flush_ms=100 should flush the partial
    # batch of 2 well before the second chunk shows up.
    chunks = [[_msg(1), _msg(2)], [_msg(3)]]
    sleeps = [0, 0.3]
    p = FakeProducer({"batch_size": 100, "flush_ms": 100},
                     chunks=chunks, sleep_before=sleeps)

    loop = asyncio.get_running_loop()
    received = []
    started = loop.time()
    timings = []
    async for batch in p.produce():
        timings.append(loop.time() - started)
        received.append(batch)

    assert [len(b) for b in received] == [2, 1]
    # first flush happens because of inactivity (~100ms), not waiting for chunk 2
    assert timings[0] < 0.25, f"expected first flush before 250ms, got {timings[0]}"


@pytest.mark.asyncio
async def test_no_flush_ms_holds_records_until_eos():
    chunks = [[_msg(1)], [_msg(2)]]
    sleeps = [0, 0.1]
    p = FakeProducer({"batch_size": 100}, chunks=chunks, sleep_before=sleeps)
    batches = await _drain(p)
    assert [len(b) for b in batches] == [2]  # combined on EOS, never flushed mid-stream


@pytest.mark.asyncio
async def test_consumer_cancellation_cleans_up_pump():
    chunks = [[_msg(i)] for i in range(1000)]
    p = FakeProducer({"batch_size": 10, "flush_ms": 50}, chunks=chunks,
                     sleep_before=[0.05] * 1000)

    # Snapshot the tasks that exist before the producer starts so that only
    # tasks created by produce() are considered below.
    tasks_before = asyncio.all_tasks()

    gen = p.produce()
    first = await gen.__anext__()
    assert len(first) >= 1
    await gen.aclose()
    # Give the loop one turn to settle any cancellation callbacks.
    await asyncio.sleep(0)

    # all_tasks() only reports unfinished tasks, so anything new here is a
    # background task that produce() failed to cancel and await.
    leaked = asyncio.all_tasks() - tasks_before
    assert not leaked, f"produce() left background tasks running: {leaked}"
class Message:
    """Envelope pairing a message ID with the record payload it identifies."""

    def __init__(self, msg_id: str, value: Dict[str, Any]):
        self.msg_id = msg_id
        self.value = value


class Producer(Block):
    """Base class for producer (read) blocks.

    Subclasses override `produce_chunks()` to yield chunks of any size from
    the source. The default `produce()` re-chunks them to exactly `batch_size`
    records per batch (smaller on flush_ms timeout, end-of-stream, or just
    before a source error is re-raised).

    Legacy subclasses may still override `produce()` directly. They bypass
    the base-class batching and `produce_chunks` is not called.
    """

    DEFAULT_BATCH_SIZE = 1000
    DEFAULT_FLUSH_MS = None  # streaming subclasses override to enable timeout flush
    # Maximum number of source chunks held between the pump task and the
    # consumer. Bounding the hand-off queue makes the pump wait for the
    # consumer instead of draining the whole source into memory up front.
    MAX_PENDING_CHUNKS = 8

    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
        """Yield natural-size chunks from the source.

        Subclasses should override this method. The base-class `produce()`
        will re-chunk the output to exact `batch_size` slices.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must override produce_chunks() or produce()"
        )
        # Make this an async generator for type-checking purposes.
        yield  # pragma: no cover

    async def produce(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
        """Re-chunk `produce_chunks()` output into batches of `batch_size`.

        Reads `batch_size` and `flush_ms` from properties lazily so subclasses
        don't need to remember to call `super().init()`.

        Yields:
            Lists of at most `batch_size` records. A shorter list is yielded
            when `flush_ms` elapses with records buffered, at end-of-stream,
            and before a source failure is re-raised.

        Raises:
            ValueError: If `batch_size` is smaller than 1.
            Exception: Whatever `produce_chunks()` raised, after the records
                buffered so far have been yielded.
        """
        batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
        if batch_size < 1:
            # A zero batch size would make the slicing loop below spin
            # forever, yielding empty batches.
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
        flush_ms = self.properties.get("flush_ms", self.DEFAULT_FLUSH_MS)
        timeout = (flush_ms / 1000) if flush_ms else None

        queue: asyncio.Queue = asyncio.Queue(maxsize=self.MAX_PENDING_CHUNKS)
        EOS = object()

        async def pump() -> None:
            """Forward source chunks to the queue, then EOS or the failure."""
            source = self.produce_chunks()
            try:
                async for chunk in source:
                    if chunk:
                        # Blocks while the queue is full: this is the backpressure.
                        await queue.put(chunk)
            except asyncio.CancelledError:
                # The consumer went away; nobody is left to read a sentinel.
                raise
            except Exception as exc:
                logger.exception("produce_chunks raised; ending stream: %s", exc)
                # Hand the failure to the consumer so a truncated read is not
                # mistaken for a clean end-of-stream.
                await queue.put(exc)
            else:
                await queue.put(EOS)
            finally:
                # If we were cancelled while parked on queue.put(), the source
                # generator is still suspended at its `yield`; close it so its
                # own cleanup (open files, cursors) runs now.
                aclose = getattr(source, "aclose", None)
                if aclose is not None:
                    with suppress(Exception):
                        await aclose()

        pump_task = asyncio.create_task(pump())
        get_task = None
        buffer: List[Dict[str, Any]] = []
        try:
            while True:
                if get_task is None:
                    get_task = asyncio.create_task(queue.get())
                # asyncio.wait() leaves the getter running on timeout, so an
                # item that arrives right at the deadline is picked up by the
                # next iteration instead of being dropped with a cancelled get.
                done, _ = await asyncio.wait({get_task}, timeout=timeout)
                if not done:
                    # Inactivity timeout: flush whatever has accumulated.
                    if buffer:
                        yield buffer
                        buffer = []
                    continue

                item = get_task.result()
                get_task = None

                if item is EOS:
                    if buffer:
                        yield buffer
                    return

                if isinstance(item, BaseException):
                    # Records read before the failure are still valid.
                    if buffer:
                        yield buffer
                        buffer = []
                    raise item

                buffer.extend(item)
                start = 0
                while len(buffer) - start >= batch_size:
                    yield buffer[start:start + batch_size]
                    start += batch_size
                if start:
                    # Drop the emitted prefix once instead of re-copying the
                    # remaining tail after every batch.
                    del buffer[:start]
        finally:
            pending = [task for task in (get_task, pump_task) if task is not None]
            for task in pending:
                task.cancel()
            for task in pending:
                with suppress(asyncio.CancelledError, Exception):
                    await task

    def ack(self, msg_ids: List[str]):
        """Sends acknowledge for the message IDs of records that have been processed."""
        pass
async def _drain(producer):
    """Return every batch emitted by `producer.produce()` as a list."""
    return [batch async for batch in producer.produce()]


@pytest.mark.asyncio
async def test_std_read_batches_to_batch_size():
    """A single piped line holding 2500 records is split into 1000/1000/500."""
    source_records = [{"i": i} for i in range(2500)]
    # One JSON array on one "line" of stdin.
    piped_lines = [orjson.dumps(source_records).decode()]

    block = Block({"batch_size": 1000})
    block.init()

    # Report stdin as readable so the block takes the piped-data branch.
    readable = ([object()], [], [])
    with patch("datayoga_core.blocks.std.read.block.select.select", return_value=readable):
        with patch("datayoga_core.blocks.std.read.block.sys.stdin", piped_lines):
            batches = await _drain(block)

    sizes = [len(batch) for batch in batches]
    assert sizes == [1000, 1000, 500]

    # records carry their MSG_ID_FIELD and original payload values
    records = []
    for batch in batches:
        records.extend(batch)
    assert records[0]["i"] == 0
    for record in records:
        assert Block.MSG_ID_FIELD in record
class Block(DyProducer):
    """Producer that reads JSON records from standard input.

    Each input line must hold either one JSON object or a JSON array of
    objects. Batching to `batch_size` is left to the base class.
    """

    def init(self, context: Optional[Context] = None):
        logger.debug(f"Initializing {self.get_block_name()}")

    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
        """Yield one chunk of messages per non-empty input line.

        Piped input is consumed line by line, so memory use follows the
        largest single line rather than the whole stream. In interactive mode
        a single line is prompted for and yielded.
        """
        if select.select([sys.stdin], [], [], 0.0)[0]:
            # piped data exists
            for line in sys.stdin:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) are not JSON;
                    # skip them instead of failing the whole read.
                    continue
                records = self.get_records(line)
                if records:
                    yield [self.get_message(record) for record in records]
        else:
            # interactive mode
            print("Enter data to process:")
            records = self.get_records(input())
            if records:
                yield [self.get_message(record) for record in records]

    @staticmethod
    def get_records(data: str) -> List[Dict[str, Any]]:
        """Parse `data` as JSON; a single object is wrapped in a list."""
        records = orjson.loads(data)
        if isinstance(records, dict):
            records = [records]
        return records

    def get_message(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Return `record` with a freshly generated UUID under MSG_ID_FIELD."""
        return {self.MSG_ID_FIELD: str(uuid.uuid4()), **record}
+ +- [ ] **Step 3.4: Update the schema to use the fragment** + +Replace the contents of `core/src/datayoga_core/blocks/std/read/block.schema.json` with: + +```json +{ + "title": "std.read", + "description": "Read from the standard input", + "type": "object", + "$inherit": ["batchable"], + "properties": {}, + "additionalProperties": false +} +``` + +The `batch_size` declaration now comes from the fragment. + +- [ ] **Step 3.5: Run test to verify it passes** + +```bash +cd core && python -m pytest src/datayoga_core/blocks/std/read/tests/test_std_read.py -v +``` + +Expected: PASS. + +- [ ] **Step 3.6: Run the full core suite** + +```bash +cd core && python -m pytest src/datayoga_core/ -x -q +``` + +Expected: all tests pass. + +- [ ] **Step 3.7: Commit** + +```bash +git add core/src/datayoga_core/blocks/std/read/block.py \ + core/src/datayoga_core/blocks/std/read/block.schema.json \ + core/src/datayoga_core/blocks/std/read/tests/__init__.py \ + core/src/datayoga_core/blocks/std/read/tests/test_std_read.py +git commit -m "Migrate std/read to produce_chunks (#400, #296)" +``` + +--- + +## Task 4: Migrate `files/read_csv` + +Replace the `produce()` override and `islice` loop with a `produce_chunks` that yields one chunk per `batch_size` rows. The base class re-chunks to the configured `batch_size`. 
# Column names are passed explicitly together with skip=1. Without `fields`,
# DictReader consumes the header line itself, so skip=1 would discard the
# first *data* row and the file would yield 2499 records starting at "first1".
CSV_FIELDS = ["fname", "lname"]


async def _drain(producer):
    """Collect every batch emitted by `producer.produce()`."""
    out = []
    async for batch in producer.produce():
        out.append(batch)
    return out


@pytest.fixture
def csv_path(tmp_path) -> Path:
    """Write a CSV with one header line followed by 2500 data rows."""
    p = tmp_path / "data.csv"
    rows = ["fname,lname"] + [f"first{i},last{i}" for i in range(2500)]
    p.write_text("\n".join(rows) + "\n", encoding="utf-8")
    return p


@pytest.mark.asyncio
async def test_csv_batches_to_batch_size(csv_path):
    block = Block({"file": str(csv_path), "batch_size": 1000,
                   "fields": CSV_FIELDS, "skip": 1})
    block.init()
    batches = await _drain(block)
    assert [len(b) for b in batches] == [1000, 1000, 500]
    # message ids are populated
    assert all(Block.MSG_ID_FIELD in r for b in batches for r in b)
    # first row content: the header was skipped, the first data row was not
    assert batches[0][0]["fname"] == "first0"


@pytest.mark.asyncio
async def test_csv_default_batch_size(csv_path):
    block = Block({"file": str(csv_path), "fields": CSV_FIELDS, "skip": 1})
    block.init()
    batches = await _drain(block)
    # default batch_size is 1000
    assert [len(b) for b in batches] == [1000, 1000, 500]
class Block(DyProducer, metaclass=ABCMeta):
    """Producer that reads records from a CSV file."""

    def init(self, context: Optional[Context] = None):
        """Resolve the file location and cache the reader options."""
        logger.debug(f"Initializing {self.get_block_name()}")
        csv_file = self.properties["file"]
        data_path = None if context is None else context.properties.get("data_path")
        if os.path.isabs(csv_file) or not data_path:
            # Without a configured data_path there is nothing to join
            # against; use the path as given rather than failing in
            # os.path.join(None, ...).
            self.file = csv_file
        else:
            self.file = os.path.join(data_path, csv_file)
        logger.debug(f"file: {self.file}")
        self.encoding = self.properties.get("encoding", "utf-8")
        self.fields = self.properties.get("fields")
        # The schema types `skip` as "number", so a float such as 1.0 can
        # arrive here; range() only accepts integers.
        self.skip = int(self.properties.get("skip", 0))
        self.delimiter = self.properties.get("delimiter", ",")
        self.quotechar = self.properties.get("quotechar", "\"")

    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
        """Yield the file's rows in chunks of up to `batch_size` records.

        Each record is the row dict produced by `csv.DictReader` plus a
        running zero-based counter (as a string) under MSG_ID_FIELD. The
        first `skip` rows returned by the reader are discarded.

        Raises:
            ValueError: If `batch_size` is smaller than 1.
        """
        logger.debug("Reading CSV")
        batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE))
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        # newline="" is required by the csv module so that newlines embedded
        # in quoted fields are parsed as part of the field.
        with open(self.file, "r", encoding=self.encoding, newline="") as read_obj:
            reader = DictReader(read_obj, fieldnames=self.fields,
                                delimiter=self.delimiter, quotechar=self.quotechar)
            for _ in range(self.skip):
                if next(reader, None) is None:
                    # Fewer rows than `skip`: nothing left to read.
                    break
            counter = count()
            while True:
                chunk = [
                    {self.MSG_ID_FIELD: f"{next(counter)}", **record}
                    for record in islice(reader, batch_size)
                ]
                if not chunk:
                    return
                yield chunk
+ +- [ ] **Step 4.4: Update the schema** + +Replace `core/src/datayoga_core/blocks/files/read_csv/block.schema.json` with: + +```json +{ + "title": "files.read_csv", + "description": "Read data from CSV", + "type": "object", + "$inherit": ["batchable"], + "properties": { + "file": { + "description": "Filename. Can contain a regexp or glob expression", + "type": "string" + }, + "encoding": { + "description": "Encoding to use for reading the file", + "type": "string", + "default": "utf-8" + }, + "fields": { + "type": "array", + "title": "List of columns to use", + "description": "List of columns to use for extract", + "default": null, + "examples": [["fname", "lname"]], + "minLength": 1, + "additionalItems": true, + "items": { + "type": "string", + "description": "field name", + "examples": ["fname"] + } + }, + "skip": { + "description": "Number of lines to skip", + "type": "number", + "minimum": 0, + "default": 0 + }, + "delimiter": { + "description": "Delimiter to use for splitting the csv records", + "type": "string", + "minLength": 1, + "maxLength": 1, + "default": "," + }, + "quotechar": { + "description": "A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '", + "type": "string", + "minLength": 1, + "maxLength": 1, + "default": "\"" + } + }, + "additionalProperties": false, + "required": ["file"], + "examples": [ + { + "file": "archive.csv", + "delimiter": ";" + } + ] +} +``` + +The `batch_size` inline property is removed; it comes from the `batchable` fragment. + +- [ ] **Step 4.5: Run test to verify it passes** + +```bash +cd core && python -m pytest src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py -v +``` + +Expected: 2 passed. + +- [ ] **Step 4.6: Run the full core suite** + +```bash +cd core && python -m pytest src/datayoga_core/ -x -q +``` + +Expected: all tests pass. 
+ +- [ ] **Step 4.7: Commit** + +```bash +git add core/src/datayoga_core/blocks/files/read_csv/block.py \ + core/src/datayoga_core/blocks/files/read_csv/block.schema.json \ + core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py \ + core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py +git commit -m "Migrate files/read_csv to produce_chunks (#400)" +``` + +--- + +## Task 5: Migrate `parquet/read` (fixes one-by-one bug) + +Today `parquet/read` iterates each row of each row group and yields a single-record list per iteration. Migrate it to yield each row group as a single chunk; the base class re-chunks to `batch_size`. + +**Files:** +- Modify: `core/src/datayoga_core/blocks/parquet/read/block.py` +- Modify: `core/src/datayoga_core/blocks/parquet/read/block.schema.json` + +- [ ] **Step 5.1: Write the failing test** + +Create `core/src/datayoga_core/blocks/parquet/read/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py`: + +```python +from pathlib import Path + +import pandas as pd +import pytest + +from datayoga_core.blocks.parquet.read.block import Block + + +async def _drain(producer): + out = [] + async for batch in producer.produce(): + out.append(batch) + return out + + +@pytest.fixture +def parquet_path(tmp_path) -> Path: + p = tmp_path / "data.parquet" + df = pd.DataFrame({"i": list(range(2500))}) + # row_group_offsets=1000 creates 3 row groups (1000, 1000, 500) + from fastparquet import write as fp_write + fp_write(str(p), df, row_group_offsets=1000) + return p + + +@pytest.mark.asyncio +async def test_parquet_batches_to_batch_size(parquet_path): + block = Block({"file": str(parquet_path), "batch_size": 1000}) + block.init() + batches = await _drain(block) + assert [len(b) for b in batches] == [1000, 1000, 500] + flat = [r for b in batches for r in b] + assert flat[0]["i"] == 0 + assert all(Block.MSG_ID_FIELD in r for r in flat) + + +@pytest.mark.asyncio +async def 
class Block(DyProducer, metaclass=ABCMeta):
    """Producer that reads records from a parquet file, one row group at a time."""

    def init(self, context: Optional[Context] = None):
        """Resolve the file location from the block properties."""
        logger.debug(f"Initializing {self.get_block_name()}")
        parquet_file = self.properties["file"]
        data_path = None if context is None else context.properties.get("data_path")
        if os.path.isabs(parquet_file) or not data_path:
            # Without a configured data_path there is nothing to join
            # against; use the path as given rather than failing in
            # os.path.join(None, ...).
            self.file = parquet_file
        else:
            self.file = os.path.join(data_path, parquet_file)
        logger.debug(f"file: {self.file}")

    async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]:
        """Yield each row group as one chunk of record dicts.

        Every record carries a running zero-based counter (as a string)
        under MSG_ID_FIELD. Re-chunking to `batch_size` is done by the base
        class.
        """
        logger.debug("Reading parquet")
        pf = ParquetFile(self.file)
        counter = count()
        for df in pf.iter_row_groups():
            # to_dict(orient="records") converts column-wise. iterrows()
            # would build a Series per row, which coerces mixed-dtype rows to
            # a common dtype (e.g. ints become floats) and is far slower.
            yield [
                {self.MSG_ID_FIELD: str(next(counter)), **record}
                for record in df.to_dict(orient="records")
            ]
from parquet", + "type": "object", + "$inherit": ["batchable"], + "properties": { + "file": { + "description": "Filename. Can contain a regexp or glob expression", + "type": "string" + } + }, + "additionalProperties": false, + "required": ["file"], + "examples": [ + { + "file": "data.parquet" + } + ] +} +``` + +- [ ] **Step 5.5: Run test to verify it passes** + +```bash +cd core && python -m pytest src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py -v +``` + +Expected: 2 passed. + +- [ ] **Step 5.6: Run the full core suite** + +```bash +cd core && python -m pytest src/datayoga_core/ -x -q +``` + +Expected: all tests pass. + +- [ ] **Step 5.7: Commit** + +```bash +git add core/src/datayoga_core/blocks/parquet/read/block.py \ + core/src/datayoga_core/blocks/parquet/read/block.schema.json \ + core/src/datayoga_core/blocks/parquet/read/tests/__init__.py \ + core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py +git commit -m "Migrate parquet/read to produce_chunks, fix one-by-one yield (#400, #293)" +``` + +--- + +## Task 6: Migrate `relational/read` (fix bug + add `fetch_size`) + +Today `relational/read` does `fetchmany(10000)` then yields one row at a time. Migrate to `produce_chunks` that yields each `fetchmany` result. Add an optional `fetch_size` property; default to 10000 to preserve today's DB round-trip count. 
+ +**Files:** +- Modify: `core/src/datayoga_core/blocks/relational/read/block.py` +- Modify: `core/src/datayoga_core/blocks/relational/read/block.schema.json` + +- [ ] **Step 6.1: Write the failing test** + +Create `core/src/datayoga_core/blocks/relational/read/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py`: + +```python +from unittest.mock import MagicMock, patch + +import pytest + +from datayoga_core.blocks.relational.read.block import Block + + +async def _drain(producer): + out = [] + async for batch in producer.produce(): + out.append(batch) + return out + + +def _fake_result(rows): + """Build a fake SQLAlchemy result that returns rows in fetchmany chunks.""" + state = {"i": 0} + + def fetchmany(n): + i = state["i"] + chunk = rows[i:i + n] + state["i"] += len(chunk) + return chunk + + res = MagicMock() + res.fetchmany.side_effect = fetchmany + res.execution_options.return_value = res + return res + + +class _Row: + def __init__(self, d): + self._d = d + + def _asdict(self): + return self._d + + +@pytest.mark.asyncio +async def test_relational_read_yields_batches_not_rows(): + rows = [_Row({"i": i}) for i in range(2500)] + fake_result = _fake_result(rows) + + block = Block.__new__(Block) + block.properties = {"batch_size": 1000} + block.connection = MagicMock() + block.tbl = MagicMock() + block.tbl.select.return_value = "SELECT *" + block.connection.execution_options.return_value.execute.return_value = fake_result + + batches = await _drain(block) + assert [len(b) for b in batches] == [1000, 1000, 500] + + +@pytest.mark.asyncio +async def test_relational_read_fetch_size_independent_of_batch_size(): + rows = [_Row({"i": i}) for i in range(5000)] + fake_result = _fake_result(rows) + + block = Block.__new__(Block) + block.properties = {"batch_size": 1000, "fetch_size": 2500} + block.connection = MagicMock() + block.tbl = MagicMock() + block.tbl.select.return_value = "SELECT *" + 
block.connection.execution_options.return_value.execute.return_value = fake_result + + batches = await _drain(block) + # Downstream batches are still batch_size=1000 + assert [len(b) for b in batches] == [1000, 1000, 1000, 1000, 1000] + # Driver fetched in fetch_size=2500 chunks: three fetchmany calls returning 2500, 2500, and 0 rows + fetch_sizes = [c.args[0] for c in fake_result.fetchmany.call_args_list] + assert fetch_sizes[0] == 2500 + assert fetch_sizes[1] == 2500 + + +@pytest.mark.asyncio +async def test_relational_read_default_fetch_size_is_10000(): + rows = [_Row({"i": i}) for i in range(500)] + fake_result = _fake_result(rows) + + block = Block.__new__(Block) + block.properties = {} + block.connection = MagicMock() + block.tbl = MagicMock() + block.tbl.select.return_value = "SELECT *" + block.connection.execution_options.return_value.execute.return_value = fake_result + + await _drain(block) + fetch_sizes = [c.args[0] for c in fake_result.fetchmany.call_args_list] + assert fetch_sizes[0] == 10000 +``` + +- [ ] **Step 6.2: Run test to verify it fails** + +```bash +cd core && python -m pytest src/datayoga_core/blocks/relational/read/tests/test_relational_read.py -v +``` + +Expected: FAIL — the current `produce()` yields one row at a time, so `[len(b) for b in batches]` is `[1] * 2500`.
+ +- [ ] **Step 6.3: Migrate `relational/read`** + +Replace the contents of `core/src/datayoga_core/blocks/relational/read/block.py` with: + +```python +import logging +from typing import Any, AsyncGenerator, Dict, List, Optional + +import sqlalchemy as sa +from datayoga_core import utils +from datayoga_core.blocks.relational import utils as relational_utils +from datayoga_core.context import Context +from datayoga_core.producer import Producer as DyProducer + +logger = logging.getLogger("dy") + + +class Block(DyProducer): + DEFAULT_FETCH_SIZE = 10000 + + def init(self, context: Optional[Context] = None): + self.engine, self.db_type = relational_utils.get_engine( + self.properties["connection"], + context, + autocommit=False, + ) + + self.schema = self.properties.get("schema") + self.table = self.properties.get("table") + self.opcode_field = self.properties.get("opcode_field") + self.load_strategy = self.properties.get("load_strategy") + self.keys = self.properties.get("keys") + self.mapping = self.properties.get("mapping") + + self.tbl = sa.Table(self.table, sa.MetaData(schema=self.schema), autoload_with=self.engine) + + logger.debug(f"Connecting to {self.db_type}") + self.connection = self.engine.connect() + + async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + fetch_size = int(self.properties.get("fetch_size", self.DEFAULT_FETCH_SIZE)) + result = self.connection.execution_options(stream_results=True).execute(self.tbl.select()) + while True: + rows = result.fetchmany(fetch_size) + if not rows: + return + yield [utils.add_uid(dict(row._asdict())) for row in rows] + + def stop(self): + self.connection.close() + self.engine.dispose() +``` + +- [ ] **Step 6.4: Update the schema** + +Replace `core/src/datayoga_core/blocks/relational/read/block.schema.json` with: + +```json +{ + "title": "relational.read", + "description": "Read a table from an SQL-compatible data store", + "type": "object", + "$inherit": ["batchable"], + 
"additionalProperties": false, + "examples": [ + { + "id": "read_snowflake", + "type": "relational.read", + "properties": { + "connection": "eu_datalake", + "table": "employees", + "schema": "dbo" + } + } + ], + "properties": { + "connection": { + "type": "string", + "title": "The connection to use for loading", + "description": "Logical connection name as defined in the connections.dy.yaml", + "examples": ["europe_db", "target", "eu_dwh"] + }, + "schema": { + "type": "string", + "title": "The table schema of the table", + "description": "If left blank, the default schema of this connection will be used as defined in the connections.dy.yaml", + "examples": ["dbo"] + }, + "table": { + "type": "string", + "title": "The table name", + "description": "Table name", + "examples": ["employees"] + }, + "columns": { + "type": "array", + "title": "Optional subset of columns to load", + "items": { + "type": ["string", "object"], + "title": "name of column" + }, + "examples": [["fname", { "lname": "last_name" }]] + }, + "fetch_size": { + "type": "integer", + "minimum": 1, + "description": "Driver-level rows fetched per round-trip. Defaults to 10000.", + "default": 10000 + } + }, + "required": ["connection", "table"] +} +``` + +- [ ] **Step 6.5: Run test to verify it passes** + +```bash +cd core && python -m pytest src/datayoga_core/blocks/relational/read/tests/test_relational_read.py -v +``` + +Expected: 3 passed. + +- [ ] **Step 6.6: Run the full core suite** + +```bash +cd core && python -m pytest src/datayoga_core/ -x -q +``` + +Expected: all tests pass. 
+ +- [ ] **Step 6.7: Commit** + +```bash +git add core/src/datayoga_core/blocks/relational/read/block.py \ + core/src/datayoga_core/blocks/relational/read/block.schema.json \ + core/src/datayoga_core/blocks/relational/read/tests/__init__.py \ + core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py +git commit -m "Migrate relational/read to produce_chunks, add fetch_size (#400, #295)" +``` + +--- + +## Task 7: Migrate `http/receiver` (fix one-by-one) + +The receiver currently yields one record per HTTP request. Migrate to drain the queue per chunk; `flush_ms` ensures partial batches flush during low-traffic periods. + +**Files:** +- Modify: `core/src/datayoga_core/blocks/http/receiver/block.py` +- Modify: `core/src/datayoga_core/blocks/http/receiver/block.schema.json` + +- [ ] **Step 7.1: Write the failing test** + +Create `core/src/datayoga_core/blocks/http/receiver/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py`: + +```python +import asyncio + +import aiohttp +import pytest + +from datayoga_core.blocks.http.receiver.block import Block + + +def _free_port(): + import socket + with socket.socket() as s: + s.bind(("127.0.0.1", 0)) + return s.getsockname()[1] + + +@pytest.mark.asyncio +async def test_http_receiver_batches_incoming_requests(): + port = _free_port() + block = Block({"host": "127.0.0.1", "port": port, + "batch_size": 50, "flush_ms": 200}) + block.init() + + received = [] + + async def consumer(): + async for batch in block.produce(): + received.append(batch) + if sum(len(b) for b in received) >= 60: + return + + consumer_task = asyncio.create_task(consumer()) + await asyncio.sleep(0.2) # let server start + + async with aiohttp.ClientSession() as session: + for i in range(60): + async with session.post(f"http://127.0.0.1:{port}", json={"i": i}) as r: + assert r.status == 200 + + await asyncio.wait_for(consumer_task, timeout=5) + + flat = [r for b in received for r in b] + 
assert len(flat) == 60 + # Most records arrive in a full batch_size=50 batch; the rest arrive as a + # partial batch flushed by flush_ms. + assert any(len(b) == 50 for b in received) + assert all(Block.MSG_ID_FIELD in r for r in flat) +``` + +- [ ] **Step 7.2: Run test to verify it fails** + +```bash +cd core && python -m pytest src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py -v +``` + +Expected: FAIL — current implementation yields one record per batch; `assert any(len(b) == 50 ...)` is false. + +- [ ] **Step 7.3: Migrate `http/receiver`** + +Replace the contents of `core/src/datayoga_core/blocks/http/receiver/block.py` with: + +```python +import logging +from abc import ABCMeta +from asyncio import Queue +from contextlib import suppress +from itertools import count +from typing import Any, AsyncGenerator, Dict, List, Optional + +import orjson +from aiohttp.web import (BaseRequest, HTTPInternalServerError, HTTPOk, + Response, Server, ServerRunner, TCPSite) +from datayoga_core.context import Context +from datayoga_core.producer import Producer as DyProducer + +logger = logging.getLogger("dy") + + +class Block(DyProducer, metaclass=ABCMeta): + port: int + host: str + DEFAULT_FLUSH_MS = 1000 + + def init(self, context: Optional[Context] = None): + logger.debug(f"Initializing {self.get_block_name()}") + self.port = int(self.properties.get("port", 8080)) + self.host = self.properties.get("host", "0.0.0.0") + + async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + queue: Queue = Queue(maxsize=1000) + + async def handler(request: BaseRequest) -> Response: + try: + queue.put_nowait(orjson.loads(await request.read())) + return HTTPOk() + except Exception: + logger.exception("Got exception while parsing request:") + return HTTPInternalServerError() + + runner = ServerRunner(Server(handler)) + await runner.setup() + srv = TCPSite(runner, self.host, self.port) + await srv.start() + logger.info(f"Listening on 
{self.host}:{self.port}...") + + try: + counter = iter(count()) + while True: + first = await queue.get() + chunk = [{self.MSG_ID_FIELD: f"{next(counter)}", **first}] + while not queue.empty(): + record = queue.get_nowait() + chunk.append({self.MSG_ID_FIELD: f"{next(counter)}", **record}) + yield chunk + finally: + with suppress(Exception): + await srv.stop() +``` + +- [ ] **Step 7.4: Update the schema** + +Replace `core/src/datayoga_core/blocks/http/receiver/block.schema.json` with: + +```json +{ + "title": "http.receiver", + "description": "Receives HTTP requests and process the data.", + "type": "object", + "$inherit": ["streamable"], + "properties": { + "host": { + "description": "Host to listen", + "type": "string", + "default": "0.0.0.0" + }, + "port": { + "description": "Port to listen", + "type": "integer", + "default": 8080 + } + }, + "additionalProperties": false, + "examples": [ + { + "host": "localhost", + "port": 8080 + } + ] +} +``` + +- [ ] **Step 7.5: Run test to verify it passes** + +```bash +cd core && python -m pytest src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py -v +``` + +Expected: 1 passed. + +- [ ] **Step 7.6: Run the full core suite** + +```bash +cd core && python -m pytest src/datayoga_core/ -x -q +``` + +Expected: all tests pass. + +- [ ] **Step 7.7: Commit** + +```bash +git add core/src/datayoga_core/blocks/http/receiver/block.py \ + core/src/datayoga_core/blocks/http/receiver/block.schema.json \ + core/src/datayoga_core/blocks/http/receiver/tests/__init__.py \ + core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py +git commit -m "Migrate http/receiver to produce_chunks (#400)" +``` + +--- + +## Task 8: Migrate `redis/read_stream` (closes #377) + +The redis stream producer yields one record at a time today. Migrate so it requests `count=batch_size` from `xreadgroup` and yields each response as a chunk; `flush_ms` flushes partial batches during low-volume periods. 
+ +**Files:** +- Modify: `core/src/datayoga_core/blocks/redis/read_stream/block.py` +- Modify: `core/src/datayoga_core/blocks/redis/read_stream/block.schema.json` + +- [ ] **Step 8.1: Write the failing test** + +Create `core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py`: + +```python +from unittest.mock import MagicMock + +import pytest + +from datayoga_core.blocks.redis.read_stream.block import Block + + +def _mk_block(properties, redis_client): + block = Block.__new__(Block) + block.properties = properties + block.redis_client = redis_client + block.stream = "mystream" + block.snapshot = properties.get("_snapshot", True) + block.consumer_group = "g" + block.requesting_consumer = "c" + return block + + +@pytest.mark.asyncio +async def test_redis_uses_count_equal_to_batch_size(): + redis = MagicMock() + # First call returns pending messages, second call returns "no new", which + # ends snapshot mode. 
+ payload_a = (b"1-0", {b"data": b'{"i": 1}'}) + payload_b = (b"2-0", {b"data": b'{"i": 2}'}) + redis.xreadgroup.side_effect = [ + [(b"mystream", [payload_a, payload_b])], # pending + [(b"mystream", [])], # nothing new -> exit + ] + + block = _mk_block({"batch_size": 250, "_snapshot": True}, redis) + batches = [] + async for b in block.produce(): + batches.append(b) + + assert all(c.kwargs.get("count") == 250 or (len(c.args) >= 4 and c.args[3] == 250) + for c in redis.xreadgroup.call_args_list), \ + "xreadgroup should be called with count=batch_size" + + +@pytest.mark.asyncio +async def test_redis_yields_records_as_a_batch_not_one_by_one(): + redis = MagicMock() + pages = [(f"{i}-0".encode(), {b"data": f'{{"i": {i}}}'.encode()}) for i in range(5)] + redis.xreadgroup.side_effect = [ + [(b"mystream", pages)], + [(b"mystream", [])], + ] + + block = _mk_block({"batch_size": 100, "_snapshot": True}, redis) + batches = [] + async for b in block.produce(): + batches.append(b) + + # 5 records arrive as one chunk; base class re-emits as one batch of 5. + assert [len(b) for b in batches] == [5] + assert batches[0][0]["i"] == 0 +``` + +- [ ] **Step 8.2: Run test to verify it fails** + +```bash +cd core && python -m pytest src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py -v +``` + +Expected: FAIL — current `xreadgroup` call passes `count=None`, and the producer yields one record at a time. 
+ +- [ ] **Step 8.3: Migrate `redis/read_stream`** + +Replace the contents of `core/src/datayoga_core/blocks/redis/read_stream/block.py` with: + +```python +import logging +from typing import Any, AsyncGenerator, Dict, List, Optional + +import datayoga_core.blocks.redis.utils as redis_utils +import orjson +from datayoga_core.connection import Connection +from datayoga_core.context import Context +from datayoga_core.producer import Producer as DyProducer + +logger = logging.getLogger("dy") + + +class Block(DyProducer): + DEFAULT_FLUSH_MS = 1000 + + def init(self, context: Optional[Context] = None): + logger.debug(f"Initializing {self.get_block_name()}") + connection_details = Connection.get_connection_details(self.properties["connection"], context) + self.redis_client = redis_utils.get_client(connection_details) + self.stream = self.properties["stream_name"] + self.snapshot = self.properties.get("snapshot", False) + self.consumer_group = f'datayoga_job_{context.properties.get("job_name", "") if context else ""}' + self.requesting_consumer = "dy_consumer_a" + stream_groups = self.redis_client.xinfo_groups(self.stream) + if next(filter(lambda x: x["name"] == self.consumer_group, stream_groups), None) is None: + logger.info(f"Creating a new {self.consumer_group} consumer group associated with the {self.stream}") + self.redis_client.xgroup_create(self.stream, self.consumer_group, 0) + + async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + logger.debug(f"Running {self.get_block_name()}") + batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)) + read_pending = True + + while True: + streams = self.redis_client.xreadgroup( + self.consumer_group, self.requesting_consumer, + {self.stream: "0" if read_pending else ">"}, + count=batch_size, + block=100 if self.snapshot else 0, + ) + + yielded_any = False + for stream in streams: + logger.debug(f"Messages in {self.stream} stream (pending: {read_pending}):\n\t{stream}") + chunk: 
List[Dict[str, Any]] = [] + for key, value in stream[1]: + payload = orjson.loads(value[next(iter(value))]) + payload[self.MSG_ID_FIELD] = key + chunk.append(payload) + if chunk: + yielded_any = True + yield chunk + + # Snapshot ends after a pending-read followed by a "no new" read. + if self.snapshot and not read_pending and not yielded_any: + return + + read_pending = False + + def ack(self, msg_ids: List[str]): + for msg_id in msg_ids: + logger.info(f"Acking {msg_id} message in {self.stream} stream of {self.consumer_group} consumer group") + self.redis_client.xack(self.stream, self.consumer_group, msg_id) +``` + +Note: snapshot termination is slightly tightened: the loop exits when a non-pending read returns no messages, matching the spec's intent. This is more robust than the original `if self.snapshot and not read_pending: break`. + +- [ ] **Step 8.4: Update the schema** + +Replace `core/src/datayoga_core/blocks/redis/read_stream/block.schema.json` with: + +```json +{ + "title": "redis.read_stream", + "description": "Read from Redis stream", + "type": "object", + "$inherit": ["streamable"], + "properties": { + "connection": { "description": "Connection name", "type": "string" }, + "stream_name": { + "type": "string", + "title": "Source stream name", + "description": "Source stream name" + }, + "snapshot": { + "type": "boolean", + "title": "Snapshot current entries and quit", + "description": "Snapshot current entries and quit", + "default": false + } + }, + "additionalProperties": false, + "required": ["connection", "stream_name"] +} +``` + +- [ ] **Step 8.5: Run test to verify it passes** + +```bash +cd core && python -m pytest src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py -v +``` + +Expected: 2 passed. + +- [ ] **Step 8.6: Run the full core suite** + +```bash +cd core && python -m pytest src/datayoga_core/ -x -q +``` + +Expected: all tests pass. 
+ +- [ ] **Step 8.7: Commit** + +```bash +git add core/src/datayoga_core/blocks/redis/read_stream/block.py \ + core/src/datayoga_core/blocks/redis/read_stream/block.schema.json \ + core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py \ + core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py +git commit -m "Migrate redis/read_stream to batched xreadgroup (#400, #377)" +``` + +--- + +## Task 9: Migrate `azure/read_event_hub` (rename `batch_size` → `max_batch_size`) + +Today `batch_size` controls the SDK callback size, not the pipeline batch size. Rename to `max_batch_size`, add `additionalProperties: false`, and use the streamable fragment so the *new* `batch_size` means pipeline batch size. + +**Files:** +- Modify: `core/src/datayoga_core/blocks/azure/read_event_hub/block.py` +- Modify: `core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json` + +- [ ] **Step 9.1: Write the failing test** + +Create `core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py`: + +```python +import pytest +from jsonschema import ValidationError + +from datayoga_core.blocks.azure.read_event_hub.block import Block + + +def _minimal_props(extra=None): + base = { + "event_hub_connection_string": "Endpoint=sb://x/;SharedAccessKeyName=k;SharedAccessKey=v;EntityPath=eh", + "event_hub_consumer_group_name": "$Default", + "event_hub_name": "eh", + "checkpoint_store_connection_string": "DefaultEndpointsProtocol=https;AccountName=a;AccountKey=k==", + "checkpoint_store_container_name": "chk", + } + if extra: + base.update(extra) + return base + + +def test_unknown_property_rejected_by_validation(): + """additionalProperties: false catches typos like the legacy 'batch_sz'.""" + with pytest.raises(ValidationError): + Block(_minimal_props({"batch_sz": 300})) + + +def test_max_batch_size_accepted(): + """The renamed SDK-level property is now 
max_batch_size.""" + block = Block(_minimal_props({"max_batch_size": 500, "batch_size": 100})) + assert block.properties["max_batch_size"] == 500 + assert block.properties["batch_size"] == 100 + + +def test_max_batch_size_defaults_to_300_when_omitted(): + """init() reads max_batch_size with a default of 300 if not present.""" + # We can't safely call init() in unit tests (it instantiates the Azure + # SDK client); read the property via the same path init() does. + block = Block(_minimal_props()) + assert int(block.properties.get("max_batch_size", 300)) == 300 + + +def test_renamed_schema_has_additional_properties_false(): + """Schema after rename: max_batch_size + streamable's batch_size/flush_ms, + no unknown properties allowed.""" + block = Block(_minimal_props()) + schema = block.get_json_schema() + assert schema.get("additionalProperties") is False + assert "max_batch_size" in schema["properties"] + assert "batch_size" in schema["properties"] # from streamable fragment + assert "flush_ms" in schema["properties"] # from streamable fragment + + +def test_batch_size_300_is_silently_repurposed(): + """A user upgrading from a pre-rename version with batch_size: 300 (which + used to mean SDK callback size) will see their YAML still validate, but + batch_size now means pipeline batch size. This is documented in the PR + description and processing-strategies.md as a breaking change.""" + block = Block(_minimal_props({"batch_size": 300})) + # Schema validation passes — batch_size is a known property (now pipeline-meaning). + # The user must rename to max_batch_size: 300 to preserve old behavior. 
+ assert block.properties["batch_size"] == 300 + assert "max_batch_size" not in block.properties +``` + +- [ ] **Step 9.2: Run test to verify it fails** + +```bash +cd core && python -m pytest src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py -v +``` + +Expected: most of the 5 tests FAIL — current schema has no `additionalProperties: false`, no `max_batch_size`, no `$inherit`. + +- [ ] **Step 9.3: Update the schema** + +Replace `core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json` with: + +```json +{ + "title": "azure.read_event_hub", + "description": "Read from Azure Event Hub", + "type": "object", + "$inherit": ["streamable"], + "properties": { + "event_hub_connection_string": { + "type": "string", + "description": "The connection string for the Azure Event Hub namespace." + }, + "event_hub_consumer_group_name": { + "type": "string", + "description": "The name of the consumer group to read events from." + }, + "event_hub_name": { + "type": "string", + "description": "The name of the Azure Event Hub." + }, + "checkpoint_store_connection_string": { + "type": "string", + "description": "The connection string for the Azure Storage account used as the checkpoint store." + }, + "checkpoint_store_container_name": { + "type": "string", + "description": "The name of the container within the checkpoint store to store the checkpoints." + }, + "max_batch_size": { + "type": "integer", + "minimum": 1, + "description": "Maximum number of events to receive in each SDK callback. Renamed from the previous batch_size which used to mean this. 
Defaults to 300.", + "default": 300 + } + }, + "additionalProperties": false, + "required": [ + "event_hub_connection_string", + "event_hub_consumer_group_name", + "event_hub_name", + "checkpoint_store_connection_string", + "checkpoint_store_container_name" + ] +} +``` + +- [ ] **Step 9.4: Migrate the producer** + +Replace the contents of `core/src/datayoga_core/blocks/azure/read_event_hub/block.py` with: + +```python +import asyncio +import logging +from typing import Any, AsyncGenerator, Dict, List, Optional + +import orjson +from azure.eventhub import EventData, PartitionContext +from azure.eventhub.aio import EventHubConsumerClient +from azure.eventhub.extensions.checkpointstoreblobaio import \ + BlobCheckpointStore +from datayoga_core.context import Context +from datayoga_core.producer import Producer as DyProducer + +logger = logging.getLogger("dy") + + +class Block(DyProducer): + """Azure Event Hub block for reading events.""" + + DEFAULT_FLUSH_MS = 1000 + + def init(self, context: Optional[Context] = None): + logger.debug(f"Initializing {self.get_block_name()}") + self.max_batch_size = int(self.properties.get("max_batch_size", 300)) + self.consumer_client = EventHubConsumerClient.from_connection_string( + conn_str=self.properties["event_hub_connection_string"], + consumer_group=self.properties["event_hub_consumer_group_name"], + eventhub_name=self.properties["event_hub_name"], + checkpoint_store=BlobCheckpointStore.from_connection_string( + self.properties["checkpoint_store_connection_string"], + self.properties["checkpoint_store_container_name"]), + ) + self.events: Dict[Any, Any] = {} + self.messages: asyncio.Queue = asyncio.Queue() + + async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + logger.debug(f"Running {self.get_block_name()}") + logger.debug("Starting event receiving process") + asyncio.create_task(self.receive_batch()) + + while True: + first = await self.messages.get() + chunk = [first] + while not 
self.messages.empty(): + chunk.append(self.messages.get_nowait()) + yield chunk + + async def receive_batch(self): + await self.consumer_client.receive_batch( + on_event_batch=self.on_event_batch, + max_batch_size=self.max_batch_size, + starting_position="-1", + ) + + async def on_event_batch(self, partition_context: PartitionContext, events: List[EventData]): + logger.debug(f"Received batch of events from partition: {partition_context.partition_id}") + for event in events: + try: + payload = orjson.loads(event.body_as_str(encoding="UTF-8")) + msg_id = event.system_properties[b"x-opt-sequence-number"] + self.events[msg_id] = (event, partition_context) + payload[self.MSG_ID_FIELD] = msg_id + await self.messages.put(payload) + except Exception as e: + logger.error(e) + + async def complete_events(self, msg_ids: List[str]): + for msg_id in msg_ids: + logger.debug(f"Acking {msg_id} event") + event, partition_context = self.events.pop(msg_id, (None, None)) + if event is not None: + await partition_context.update_checkpoint(event) + else: + logger.warning(f"Couldn't find event {msg_id} for acknowledging") + + def ack(self, msg_ids: List[str]): + asyncio.create_task(self.complete_events(msg_ids)) +``` + +- [ ] **Step 9.5: Run test to verify it passes** + +```bash +cd core && python -m pytest src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py -v +``` + +Expected: 5 passed. + +- [ ] **Step 9.6: Run the full core suite** + +```bash +cd core && python -m pytest src/datayoga_core/ -x -q +``` + +Expected: all tests pass. 
+ +- [ ] **Step 9.7: Commit** + +```bash +git add core/src/datayoga_core/blocks/azure/read_event_hub/block.py \ + core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json \ + core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py \ + core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py +git commit -m "Migrate azure/read_event_hub; rename batch_size -> max_batch_size (#400, BREAKING)" +``` + +--- + +## Task 10: Regenerate autogenerated schemas and docs + +The aggregated `schemas/job.schema.json` and the per-block markdown in `docs/reference/blocks/` are generated by scripts. After the per-block schema changes, regenerate them. + +**Files:** +- Modify: `schemas/job.schema.json` +- Modify: `docs/reference/blocks/std_read.md`, `files_read_csv.md`, `parquet_read.md`, `relational_read.md`, `redis_read_stream.md`, `http_receiver.md`, `azure_read_event_hub.md` (autogenerated) + +- [ ] **Step 10.1: Regenerate the JSON schemas** + +```bash +bash scripts/generate-jsonschemas.sh +``` + +Expected output: `JSON schemas generated successfully`. + +- [ ] **Step 10.2: Regenerate the reference docs** + +```bash +bash scripts/generate-docs.sh +``` + +Expected: completes without error. + +- [ ] **Step 10.3: Inspect the diff** + +```bash +git diff schemas/ docs/reference/blocks/ | head -200 +``` + +Expected: `batch_size` (and `flush_ms` for streaming producers, `fetch_size` for relational/read, `max_batch_size` for event_hub) appear in the appropriate schema entries and docs. 
+ +- [ ] **Step 10.4: Commit** + +```bash +git add schemas/job.schema.json docs/reference/blocks/ +git commit -m "Regenerate JSON schemas and reference docs after producer batching (#400)" +``` + +--- + +## Task 11: Document the producer batching model in processing-strategies + +**Files:** +- Modify: `docs/processing-strategies.md` + +- [ ] **Step 11.1: Add a section on producer batching** + +Append the following section to `docs/processing-strategies.md` (or replace an existing section if one already covers it): + +````markdown +## Producer Batching + +Every producer block (any block that reads from a source — `std/read`, `files/read_csv`, `parquet/read`, `relational/read`, `redis/read_stream`, `azure/read_event_hub`, `http/receiver`) accepts a `batch_size` property. The producer base class re-chunks the source's output into batches of up to `batch_size` records, regardless of how the source delivers them (per row, per row group, per `fetchmany`, per network message). Every batch is full except the final one at end-of-input and, for streaming producers, any partial batch flushed early by `flush_ms`. + +```yaml +input: + uses: files.read_csv + with: + file: people.csv + batch_size: 500 # downstream steps process up to 500 records per call +``` + +Default: `1000`. + +### Streaming producers and `flush_ms` + +Streaming producers (`redis/read_stream`, `azure/read_event_hub`, `http/receiver`) also accept `flush_ms`. If no new records arrive within that many milliseconds, any partial batch is flushed downstream instead of being held until `batch_size` is reached. + +```yaml +input: + uses: redis.read_stream + with: + connection: my_redis + stream_name: events + batch_size: 1000 + flush_ms: 500 # emit a partial batch after 500ms of inactivity +``` + +Default: `1000` ms. Set to `null` to disable time-based flushing (records are held until `batch_size` or end-of-stream). + +### `relational/read` and `fetch_size` + +`relational/read` exposes an extra `fetch_size` property that controls how many rows are pulled from the database driver per round-trip, independent of the pipeline `batch_size`.
Default: `10000`. Tune lower for memory pressure with wide rows; tune higher if you want fewer DB round-trips and downstream processing is the bottleneck. + +### `azure/read_event_hub` migration note + +In earlier versions, `batch_size` on `azure/read_event_hub` controlled the SDK callback batch size, not the pipeline batch size. As of #400 it has been renamed to `max_batch_size` to match the SDK semantic, and `batch_size` now consistently means pipeline batch size as it does for every other producer. +```` + +- [ ] **Step 11.2: Commit** + +```bash +git add docs/processing-strategies.md +git commit -m "Document producer batching model in processing-strategies (#400)" +``` + +--- + +## Task 12: Full verification and push branch + +- [ ] **Step 12.1: Run full core test suite** + +```bash +cd core && python -m pytest src/datayoga_core/ -v +``` + +Expected: all tests pass. Notably: +- `test_producer_batching.py` (7 tests) +- `test_schema_inherit.py` (5 tests) +- `test_std_read.py`, `test_read_csv.py`, `test_parquet_read.py`, `test_relational_read.py`, `test_http_receiver.py`, `test_redis_read_stream.py`, `test_event_hub.py` (12 tests total) +- All pre-existing tests still pass. + +- [ ] **Step 12.2: Inspect the branch's commit history** + +```bash +git log --oneline 400-producer-batching-unification ^main +``` + +Expected: a clean sequence of commits — one per task — each referencing #400. + +- [ ] **Step 12.3: Push the branch** + +```bash +git push -u origin 400-producer-batching-unification +``` + +Expected: branch pushed to remote. + +- [ ] **Step 12.4: Open a draft PR (deferred — confirm with user first)** + +Before opening the PR, ask the user whether to open it as draft or ready-for-review, and confirm the body content. Do not run `gh pr create` autonomously. 
+ +The PR description should call out the breaking change explicitly (no CHANGELOG file exists in this repo, so the PR description is the canonical place): + +> **Breaking change:** `azure/read_event_hub.batch_size` has been renamed to `max_batch_size`. The name `batch_size` now means pipeline batch size on this block, consistent with every other producer. Users with `batch_size: <N>` in their YAML for `azure/read_event_hub` must rename it to `max_batch_size: <N>` to preserve the previous SDK callback size semantic; the literal `batch_size: <N>` will validate but with the new pipeline-level meaning. From 5c178b6074c00db8d3bc2cbe1e82d9b7e0d857c8 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 16:35:23 +0300 Subject: [PATCH 10/38] Add $inherit schema fragment resolver (#400) --- core/src/datayoga_core/block.py | 3 +- core/src/datayoga_core/job.py | 3 +- .../resources/schemas/batchable.schema.json | 13 ++++ .../resources/schemas/streamable.schema.json | 19 ++++++ core/src/datayoga_core/schema_utils.py | 52 ++++++++++++++++ core/src/datayoga_core/tests/__init__.py | 0 .../tests/test_schema_inherit.py | 62 +++++++++++++++++++ 7 files changed, 150 insertions(+), 2 deletions(-) create mode 100644 core/src/datayoga_core/resources/schemas/batchable.schema.json create mode 100644 core/src/datayoga_core/resources/schemas/streamable.schema.json create mode 100644 core/src/datayoga_core/schema_utils.py create mode 100644 core/src/datayoga_core/tests/__init__.py create mode 100644 core/src/datayoga_core/tests/test_schema_inherit.py diff --git a/core/src/datayoga_core/block.py b/core/src/datayoga_core/block.py index 29363953..2a83036d 100644 --- a/core/src/datayoga_core/block.py +++ b/core/src/datayoga_core/block.py @@ -56,7 +56,8 @@ def get_json_schema(self) -> Dict[str, Any]: os.path.dirname(os.path.realpath(sys.modules[self.__module__].__file__)), "block.schema.json") logger.debug(f"loading schema from {json_schema_file}") - return utils.read_json(json_schema_file) + 
from datayoga_core.schema_utils import resolve_inherits + return resolve_inherits(utils.read_json(json_schema_file)) @abstractmethod def init(self, context: Optional[Context] = None): diff --git a/core/src/datayoga_core/job.py b/core/src/datayoga_core/job.py index 082dde7c..6fac1132 100644 --- a/core/src/datayoga_core/job.py +++ b/core/src/datayoga_core/job.py @@ -237,10 +237,11 @@ def get_json_schema(whitelisted_blocks: Optional[List[str]] = None) -> Dict[str, # Now build the sorted lists block_types = [] block_schemas = [] + from datayoga_core.schema_utils import resolve_inherits for block_type, schema_path in block_info: block_types.append(block_type) # load schema file - schema = utils.read_json(f"{schema_path}") + schema = resolve_inherits(utils.read_json(f"{schema_path}")) # append to the array of allOf for the full schema # we use allOf for better error reporting block_schemas.append({ diff --git a/core/src/datayoga_core/resources/schemas/batchable.schema.json b/core/src/datayoga_core/resources/schemas/batchable.schema.json new file mode 100644 index 00000000..f158d4fb --- /dev/null +++ b/core/src/datayoga_core/resources/schemas/batchable.schema.json @@ -0,0 +1,13 @@ +{ + "title": "batchable", + "description": "Producer batching mixin: declares batch_size for producers that yield records in batches.", + "type": "object", + "properties": { + "batch_size": { + "type": "integer", + "minimum": 1, + "description": "Maximum number of records yielded per downstream batch.", + "default": 1000 + } + } +} diff --git a/core/src/datayoga_core/resources/schemas/streamable.schema.json b/core/src/datayoga_core/resources/schemas/streamable.schema.json new file mode 100644 index 00000000..761c6d65 --- /dev/null +++ b/core/src/datayoga_core/resources/schemas/streamable.schema.json @@ -0,0 +1,19 @@ +{ + "title": "streamable", + "description": "Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources.", + "type": "object", + 
"properties": { + "batch_size": { + "type": "integer", + "minimum": 1, + "description": "Maximum number of records yielded per downstream batch.", + "default": 1000 + }, + "flush_ms": { + "type": ["integer", "null"], + "minimum": 1, + "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.", + "default": 1000 + } + } +} diff --git a/core/src/datayoga_core/schema_utils.py b/core/src/datayoga_core/schema_utils.py new file mode 100644 index 00000000..77bdee45 --- /dev/null +++ b/core/src/datayoga_core/schema_utils.py @@ -0,0 +1,52 @@ +"""Schema composition helpers. + +Producers and other blocks can declare `"$inherit": ["batchable"]` at the +top of their block.schema.json to pull in shared property definitions from +the fragments in resources/schemas/. `resolve_inherits` merges the +fragments' `properties` into the local schema (local properties win), then +removes the `$inherit` key. Schemas without `$inherit` are returned as-is. +""" +from __future__ import annotations + +import copy +from os import path +from typing import Any, Dict, List + +from datayoga_core import utils + + +def resolve_inherits(schema: Dict[str, Any], schemas_dir: str = None) -> Dict[str, Any]: + """Merge any fragments listed in $inherit into the schema's properties. + + Args: + schema: The schema to resolve. Mutated in place and also returned. + schemas_dir: Directory containing the fragment files. Defaults to + the bundled/non-bundled resources/schemas directory. + + Returns: + The mutated schema with $inherit removed and fragment properties merged. 
+ """ + inherits: List[str] = schema.get("$inherit") or [] + if not inherits: + return schema + + if schemas_dir is None: + schemas_dir = utils.get_resource_path("schemas") + + merged_properties: Dict[str, Any] = {} + for fragment_name in inherits: + fragment_path = path.join(schemas_dir, f"{fragment_name}.schema.json") + if not path.isfile(fragment_path): + raise FileNotFoundError( + f"Schema fragment '{fragment_name}' not found at {fragment_path}" + ) + fragment = utils.read_json(fragment_path) + merged_properties.update(copy.deepcopy(fragment.get("properties", {}))) + + # Local properties take precedence over inherited ones. + local_properties = schema.get("properties", {}) + merged_properties.update(local_properties) + + schema["properties"] = merged_properties + schema.pop("$inherit", None) + return schema diff --git a/core/src/datayoga_core/tests/__init__.py b/core/src/datayoga_core/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/core/src/datayoga_core/tests/test_schema_inherit.py b/core/src/datayoga_core/tests/test_schema_inherit.py new file mode 100644 index 00000000..77178c37 --- /dev/null +++ b/core/src/datayoga_core/tests/test_schema_inherit.py @@ -0,0 +1,62 @@ +import json +from pathlib import Path + +import pytest + +from datayoga_core.schema_utils import resolve_inherits + + +SCHEMAS_DIR = ( + Path(__file__).resolve().parent.parent / "resources" / "schemas" +) + + +def test_inherit_merges_fragment_properties(): + schema = { + "title": "demo", + "type": "object", + "$inherit": ["batchable"], + "properties": {"foo": {"type": "string"}}, + "additionalProperties": False, + } + resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) + assert "$inherit" not in resolved + assert "batch_size" in resolved["properties"] + assert resolved["properties"]["batch_size"]["default"] == 1000 + assert resolved["properties"]["foo"] == {"type": "string"} + assert resolved["additionalProperties"] is False + + +def 
test_inherit_local_property_wins_over_fragment(): + schema = { + "type": "object", + "$inherit": ["batchable"], + "properties": { + "batch_size": {"type": "integer", "minimum": 1, "default": 50} + }, + } + resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) + assert resolved["properties"]["batch_size"]["default"] == 50 + + +def test_inherit_streamable_brings_both_props(): + schema = {"type": "object", "$inherit": ["streamable"], "properties": {}} + resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) + assert "batch_size" in resolved["properties"] + assert "flush_ms" in resolved["properties"] + + +def test_schema_without_inherit_unchanged(): + schema = { + "type": "object", + "properties": {"foo": {"type": "string"}}, + "additionalProperties": False, + } + resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) + assert resolved == schema + + +def test_unknown_fragment_raises(): + schema = {"type": "object", "$inherit": ["nope"], "properties": {}} + with pytest.raises(FileNotFoundError): + resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) From cb126c665fbdbf4ba4f52f6236dc3280825cd634 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 16:45:16 +0300 Subject: [PATCH 11/38] Tighten $inherit resolver: reject non-list, guard against nested (#400) --- core/src/datayoga_core/block.py | 1 + core/src/datayoga_core/job.py | 1 + core/src/datayoga_core/schema_utils.py | 18 +++++++--- .../tests/test_schema_inherit.py | 33 ++++++++++++++++++- 4 files changed, 48 insertions(+), 5 deletions(-) diff --git a/core/src/datayoga_core/block.py b/core/src/datayoga_core/block.py index 2a83036d..a0b65e06 100644 --- a/core/src/datayoga_core/block.py +++ b/core/src/datayoga_core/block.py @@ -56,6 +56,7 @@ def get_json_schema(self) -> Dict[str, Any]: os.path.dirname(os.path.realpath(sys.modules[self.__module__].__file__)), "block.schema.json") logger.debug(f"loading schema from {json_schema_file}") + # Lazy import: schema_utils -> utils 
-> block creates a circular import at module load. from datayoga_core.schema_utils import resolve_inherits return resolve_inherits(utils.read_json(json_schema_file)) diff --git a/core/src/datayoga_core/job.py b/core/src/datayoga_core/job.py index 6fac1132..9df8c267 100644 --- a/core/src/datayoga_core/job.py +++ b/core/src/datayoga_core/job.py @@ -237,6 +237,7 @@ def get_json_schema(whitelisted_blocks: Optional[List[str]] = None) -> Dict[str, # Now build the sorted lists block_types = [] block_schemas = [] + # Lazy import: schema_utils -> utils -> block creates a circular import at module load. from datayoga_core.schema_utils import resolve_inherits for block_type, schema_path in block_info: block_types.append(block_type) diff --git a/core/src/datayoga_core/schema_utils.py b/core/src/datayoga_core/schema_utils.py index 77bdee45..8f6657f7 100644 --- a/core/src/datayoga_core/schema_utils.py +++ b/core/src/datayoga_core/schema_utils.py @@ -10,12 +10,12 @@ import copy from os import path -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from datayoga_core import utils -def resolve_inherits(schema: Dict[str, Any], schemas_dir: str = None) -> Dict[str, Any]: +def resolve_inherits(schema: Dict[str, Any], schemas_dir: Optional[str] = None) -> Dict[str, Any]: """Merge any fragments listed in $inherit into the schema's properties. Args: @@ -26,9 +26,13 @@ def resolve_inherits(schema: Dict[str, Any], schemas_dir: str = None) -> Dict[st Returns: The mutated schema with $inherit removed and fragment properties merged. 
""" - inherits: List[str] = schema.get("$inherit") or [] - if not inherits: + inherits = schema.get("$inherit") + if inherits is None or inherits == []: return schema + if not isinstance(inherits, list) or not all(isinstance(name, str) for name in inherits): + raise TypeError( + f"$inherit must be a list of fragment names (strings), got {inherits!r}" + ) if schemas_dir is None: schemas_dir = utils.get_resource_path("schemas") @@ -41,6 +45,12 @@ def resolve_inherits(schema: Dict[str, Any], schemas_dir: str = None) -> Dict[st f"Schema fragment '{fragment_name}' not found at {fragment_path}" ) fragment = utils.read_json(fragment_path) + if fragment.get("$inherit"): + raise ValueError( + f"Schema fragment '{fragment_name}' itself contains $inherit; " + "nested inheritance is not supported. Inline the parent fragment's " + "properties or restructure the hierarchy." + ) merged_properties.update(copy.deepcopy(fragment.get("properties", {}))) # Local properties take precedence over inherited ones. 
diff --git a/core/src/datayoga_core/tests/test_schema_inherit.py b/core/src/datayoga_core/tests/test_schema_inherit.py index 77178c37..c22ea2c8 100644 --- a/core/src/datayoga_core/tests/test_schema_inherit.py +++ b/core/src/datayoga_core/tests/test_schema_inherit.py @@ -1,4 +1,3 @@ -import json from pathlib import Path import pytest @@ -60,3 +59,35 @@ def test_unknown_fragment_raises(): schema = {"type": "object", "$inherit": ["nope"], "properties": {}} with pytest.raises(FileNotFoundError): resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) + + +def test_inherit_string_value_raises_type_error(): + schema = {"type": "object", "$inherit": "batchable", "properties": {}} + with pytest.raises(TypeError): + resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) + + +def test_inherit_non_string_items_raises_type_error(): + schema = {"type": "object", "$inherit": ["batchable", 123], "properties": {}} + with pytest.raises(TypeError): + resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) + + +def test_inherit_empty_list_returns_unchanged(): + schema = {"type": "object", "$inherit": [], "properties": {"foo": {}}} + resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) + # Early-return path: schema is returned as-is (no mutation, no key removal). + assert resolved is schema + + +def test_nested_inherit_raises_value_error(tmp_path): + # Build a fragment dir with a fragment that has its own $inherit. 
+ (tmp_path / "parent.schema.json").write_text( + '{"properties": {"x": {"type": "string"}}}' + ) + (tmp_path / "child.schema.json").write_text( + '{"$inherit": ["parent"], "properties": {"y": {"type": "string"}}}' + ) + schema = {"$inherit": ["child"], "type": "object", "properties": {}} + with pytest.raises(ValueError, match="nested inheritance is not supported"): + resolve_inherits(schema, schemas_dir=str(tmp_path)) From 09319184076f6683018ee26fa10be9099722c1f8 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 16:45:59 +0300 Subject: [PATCH 12/38] Remove unused List import in schema_utils (#400) Co-Authored-By: Claude Opus 4.7 (1M context) --- core/src/datayoga_core/schema_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/datayoga_core/schema_utils.py b/core/src/datayoga_core/schema_utils.py index 8f6657f7..e009a984 100644 --- a/core/src/datayoga_core/schema_utils.py +++ b/core/src/datayoga_core/schema_utils.py @@ -10,7 +10,7 @@ import copy from os import path -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional from datayoga_core import utils From c9dbe921bf51738030f6fbef4c35dbb631c215f5 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 16:49:03 +0300 Subject: [PATCH 13/38] Producer base class re-chunks via produce_chunks (#400) --- core/src/datayoga_core/producer.py | 88 ++++++++++-- .../tests/test_producer_batching.py | 126 ++++++++++++++++++ 2 files changed, 203 insertions(+), 11 deletions(-) create mode 100644 core/src/datayoga_core/tests/test_producer_batching.py diff --git a/core/src/datayoga_core/producer.py b/core/src/datayoga_core/producer.py index e32b2e01..2b61390d 100644 --- a/core/src/datayoga_core/producer.py +++ b/core/src/datayoga_core/producer.py @@ -1,8 +1,12 @@ -from abc import abstractmethod +import asyncio +import logging +from contextlib import suppress from typing import Any, AsyncGenerator, Dict, List from .block import Block +logger = 
logging.getLogger("dy") + class Message: def __init__(self, msg_id: str, value: Dict[str, Any]): @@ -11,20 +15,82 @@ def __init__(self, msg_id: str, value: Dict[str, Any]): class Producer(Block): + """Base class for producer (read) blocks. + + Subclasses override `produce_chunks()` to yield chunks of any size from + the source. The default `produce()` re-chunks them to exactly `batch_size` + records per batch (smaller on flush_ms timeout or end-of-stream). + + Legacy subclasses may still override `produce()` directly. They bypass + the base-class batching and `produce_chunks` is not called. + """ - @abstractmethod - async def produce(self) -> AsyncGenerator[List[Message], None]: - """Produces data + DEFAULT_BATCH_SIZE = 1000 + DEFAULT_FLUSH_MS = None # streaming subclasses override to enable timeout flush - Returns: - AsyncGenerator[List[Message], None]: A generator of message batches. + async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + """Yield natural-size chunks from the source. + + Subclasses should override this method. The base-class `produce()` + will re-chunk the output to exact `batch_size` slices. """ - raise NotImplementedError + raise NotImplementedError( + f"{type(self).__name__} must override produce_chunks() or produce()" + ) + # Make this an async generator for type-checking purposes. + yield # pragma: no cover - def ack(self, msg_ids: List[str]): - """Sends acknowledge for the message IDs of the records that have been processed + async def produce(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + """Re-chunks `produce_chunks()` output to exact batch_size batches. - Args: - msg_ids (List[str]): Message IDs + Reads `batch_size` and `flush_ms` from properties lazily so subclasses + don't need to remember to call `super().init()`. 
""" + batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)) + flush_ms = self.properties.get("flush_ms", self.DEFAULT_FLUSH_MS) + timeout = (flush_ms / 1000) if flush_ms else None + + queue: asyncio.Queue = asyncio.Queue() + EOS = object() + + async def pump(): + try: + async for chunk in self.produce_chunks(): + if chunk: + await queue.put(chunk) + except asyncio.CancelledError: + raise + except Exception as exc: + logger.exception("produce_chunks raised; ending stream: %s", exc) + finally: + await queue.put(EOS) + + pump_task = asyncio.create_task(pump()) + buffer: List[Dict[str, Any]] = [] + try: + while True: + try: + item = await asyncio.wait_for(queue.get(), timeout=timeout) + except asyncio.TimeoutError: + if buffer: + yield buffer + buffer = [] + continue + + if item is EOS: + if buffer: + yield buffer + return + + buffer.extend(item) + while len(buffer) >= batch_size: + yield buffer[:batch_size] + buffer = buffer[batch_size:] + finally: + pump_task.cancel() + with suppress(asyncio.CancelledError, Exception): + await pump_task + + def ack(self, msg_ids: List[str]): + """Sends acknowledge for the message IDs of records that have been processed.""" pass diff --git a/core/src/datayoga_core/tests/test_producer_batching.py b/core/src/datayoga_core/tests/test_producer_batching.py new file mode 100644 index 00000000..59601786 --- /dev/null +++ b/core/src/datayoga_core/tests/test_producer_batching.py @@ -0,0 +1,126 @@ +import asyncio +from typing import AsyncGenerator, List, Optional + +import pytest + +from datayoga_core.context import Context +from datayoga_core.producer import Message, Producer + + +def _msg(i: int) -> dict: + return {Producer.MSG_ID_FIELD: str(i), "v": i} + + +class FakeProducer(Producer): + """Producer driven by a scripted list of chunks plus optional sleeps.""" + + def __init__(self, properties=None, *, chunks=None, sleep_before=None): + # schema for a FakeProducer; declare batch_size/flush_ms so validation passes + 
self._test_schema = { + "type": "object", + "properties": { + "batch_size": {"type": "integer", "minimum": 1}, + "flush_ms": {"type": ["integer", "null"], "minimum": 1}, + }, + } + self._chunks = chunks or [] + self._sleep_before = sleep_before or [] + super().__init__(properties or {}) + + def get_json_schema(self): + return self._test_schema + + def init(self, context: Optional[Context] = None): + pass + + async def produce_chunks(self) -> AsyncGenerator[List[Message], None]: + for i, chunk in enumerate(self._chunks): + if i < len(self._sleep_before) and self._sleep_before[i]: + await asyncio.sleep(self._sleep_before[i]) + yield chunk + + +async def _drain(producer: Producer): + out = [] + async for batch in producer.produce(): + out.append(batch) + return out + + +@pytest.mark.asyncio +async def test_rechunks_one_large_chunk(): + chunks = [[_msg(i) for i in range(5000)]] + p = FakeProducer({"batch_size": 1000}, chunks=chunks) + batches = await _drain(p) + assert [len(b) for b in batches] == [1000, 1000, 1000, 1000, 1000] + + +@pytest.mark.asyncio +async def test_accumulates_small_chunks_and_flushes_on_eos(): + chunks = [[_msg(i) for i in range(200)], + [_msg(i) for i in range(200, 500)], + [_msg(i) for i in range(500, 900)]] + p = FakeProducer({"batch_size": 1000}, chunks=chunks) + batches = await _drain(p) + assert [len(b) for b in batches] == [900] + + +@pytest.mark.asyncio +async def test_partial_final_batch_on_eos(): + chunks = [[_msg(i) for i in range(1500)]] + p = FakeProducer({"batch_size": 1000}, chunks=chunks) + batches = await _drain(p) + assert [len(b) for b in batches] == [1000, 500] + + +@pytest.mark.asyncio +async def test_empty_chunks_are_ignored(): + chunks = [[], [_msg(1), _msg(2)], [], [_msg(3)]] + p = FakeProducer({"batch_size": 10}, chunks=chunks) + batches = await _drain(p) + assert [len(b) for b in batches] == [3] + + +@pytest.mark.asyncio +async def test_flush_ms_emits_partial_on_inactivity(): + # one chunk of 2 records, then a 300ms wait 
before EOS; flush_ms=100 should + # flush the partial batch of 2 well before EOS. + chunks = [[_msg(1), _msg(2)], [_msg(3)]] + sleeps = [0, 0.3] + p = FakeProducer({"batch_size": 100, "flush_ms": 100}, + chunks=chunks, sleep_before=sleeps) + + received = [] + started = asyncio.get_event_loop().time() + timings = [] + async for batch in p.produce(): + timings.append(asyncio.get_event_loop().time() - started) + received.append(batch) + + assert [len(b) for b in received] == [2, 1] + # first flush happens because of inactivity (~100ms), not waiting for chunk 2 + assert timings[0] < 0.25, f"expected first flush before 250ms, got {timings[0]}" + + +@pytest.mark.asyncio +async def test_no_flush_ms_holds_records_until_eos(): + chunks = [[_msg(1)], [_msg(2)]] + sleeps = [0, 0.1] + p = FakeProducer({"batch_size": 100}, chunks=chunks, sleep_before=sleeps) + batches = await _drain(p) + assert [len(b) for b in batches] == [2] # combined on EOS, never flushed mid-stream + + +@pytest.mark.asyncio +async def test_consumer_cancellation_cleans_up_pump(): + chunks = [[_msg(i)] for i in range(1000)] + p = FakeProducer({"batch_size": 10, "flush_ms": 50}, chunks=chunks, + sleep_before=[0.05] * 1000) + + gen = p.produce() + first = await gen.__anext__() + assert len(first) >= 1 + await gen.aclose() + # If pump task wasn't cleaned up we'd see a "Task was destroyed but it is + # pending!" warning here. Sleep briefly so the loop has a chance to surface it. 
+ await asyncio.sleep(0.1) From f1311d88309e08a2165784795a36a1c065b4a10b Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 16:52:22 +0300 Subject: [PATCH 14/38] Migrate std/read to produce_chunks (#400, #296) --- .../datayoga_core/blocks/std/read/block.py | 44 ++++--------------- .../blocks/std/read/block.schema.json | 10 ++--- .../blocks/std/read/tests/__init__.py | 0 .../blocks/std/read/tests/test_std_read.py | 32 ++++++++++++++ 4 files changed, 44 insertions(+), 42 deletions(-) create mode 100644 core/src/datayoga_core/blocks/std/read/tests/__init__.py create mode 100644 core/src/datayoga_core/blocks/std/read/tests/test_std_read.py diff --git a/core/src/datayoga_core/blocks/std/read/block.py b/core/src/datayoga_core/blocks/std/read/block.py index e0b60b13..1c51839d 100644 --- a/core/src/datayoga_core/blocks/std/read/block.py +++ b/core/src/datayoga_core/blocks/std/read/block.py @@ -6,59 +6,33 @@ import orjson from datayoga_core.context import Context -from datayoga_core.producer import Message from datayoga_core.producer import Producer as DyProducer logger = logging.getLogger("dy") class Block(DyProducer): - def init(self, context: Optional[Context] = None): logger.debug(f"Initializing {self.get_block_name()}") - self.batch_size = int(self.properties.get("batch_size", 1000)) - logger.info(f"Using batch size: {self.batch_size}") - - async def process_batch(self, records: List[Dict[str, Any]]) -> AsyncGenerator[List[Message], None]: - """Process records and yield batches according to batch_size""" - batch = [] - for record in records: - batch.append(self.get_message(record)) - - # When batch is full, yield it - if len(batch) >= self.batch_size: - logger.info(f"Yielding batch of {len(batch)} records") - yield batch - batch = [] - # Yield any remaining records - if batch: - logger.info(f"Yielding final batch of {len(batch)} records") - yield batch - - async def produce(self) -> AsyncGenerator[List[Message], None]: - if select.select([sys.stdin, ], 
[], [], 0.0)[0]: - # piped data exists - all_records = [] - for data in sys.stdin: - all_records.extend(self.get_records(data)) + async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + if select.select([sys.stdin], [], [], 0.0)[0]: + all_records: List[Dict[str, Any]] = [] + for line in sys.stdin: + all_records.extend(self.get_records(line)) else: - # interactive mode print("Enter data to process:") - data = input() - all_records = self.get_records(data) + all_records = self.get_records(input()) - async for batch in self.process_batch(all_records): - yield batch + if all_records: + yield [self.get_message(record) for record in all_records] @staticmethod def get_records(data: str) -> List[Dict[str, Any]]: records = orjson.loads(data) - if isinstance(records, dict): records = [records] - return records - def get_message(self, record: Dict[str, Any]) -> Message: + def get_message(self, record: Dict[str, Any]) -> Dict[str, Any]: return {self.MSG_ID_FIELD: str(uuid.uuid4()), **record} diff --git a/core/src/datayoga_core/blocks/std/read/block.schema.json b/core/src/datayoga_core/blocks/std/read/block.schema.json index 38ad05af..2214ac05 100644 --- a/core/src/datayoga_core/blocks/std/read/block.schema.json +++ b/core/src/datayoga_core/blocks/std/read/block.schema.json @@ -2,11 +2,7 @@ "title": "std.read", "description": "Read from the standard input", "type": "object", - "properties": { - "batch_size": { - "type": "integer", - "description": "Number of records to process in a single batch", - "default": 1000 - } - } + "$inherit": ["batchable"], + "properties": {}, + "additionalProperties": false } diff --git a/core/src/datayoga_core/blocks/std/read/tests/__init__.py b/core/src/datayoga_core/blocks/std/read/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py new file mode 100644 index 
00000000..609f0915 --- /dev/null +++ b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py @@ -0,0 +1,32 @@ +from unittest.mock import patch + +import orjson +import pytest + +from datayoga_core.blocks.std.read.block import Block + + +async def _drain(producer): + out = [] + async for batch in producer.produce(): + out.append(batch) + return out + + +@pytest.mark.asyncio +async def test_std_read_batches_to_batch_size(): + payload = [{"i": i} for i in range(2500)] + fake_stdin = [orjson.dumps(payload).decode()] + + block = Block({"batch_size": 1000}) + block.init() + + with patch("datayoga_core.blocks.std.read.block.select.select", + return_value=([object()], [], [])), \ + patch("datayoga_core.blocks.std.read.block.sys.stdin", fake_stdin): + batches = await _drain(block) + + assert [len(b) for b in batches] == [1000, 1000, 500] + flat = [r for b in batches for r in b] + assert flat[0]["i"] == 0 + assert all(Block.MSG_ID_FIELD in r for r in flat) From 12c13fb14e4097d9b1187660117ac675e6de5076 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 16:56:25 +0300 Subject: [PATCH 15/38] Migrate files/read_csv to produce_chunks (#400) --- .../blocks/files/read_csv/block.py | 32 ++++++---------- .../blocks/files/read_csv/block.schema.json | 7 +--- .../blocks/files/read_csv/tests/__init__.py | 0 .../files/read_csv/tests/test_read_csv.py | 38 +++++++++++++++++++ 4 files changed, 51 insertions(+), 26 deletions(-) create mode 100644 core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py create mode 100644 core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py diff --git a/core/src/datayoga_core/blocks/files/read_csv/block.py b/core/src/datayoga_core/blocks/files/read_csv/block.py index c4bca6f6..336450dc 100644 --- a/core/src/datayoga_core/blocks/files/read_csv/block.py +++ b/core/src/datayoga_core/blocks/files/read_csv/block.py @@ -4,10 +4,9 @@ from contextlib import suppress from csv import DictReader from itertools import count, 
islice -from typing import AsyncGenerator, List, Optional +from typing import Any, AsyncGenerator, Dict, List, Optional from datayoga_core.context import Context -from datayoga_core.producer import Message from datayoga_core.producer import Producer as DyProducer logger = logging.getLogger("dy") @@ -18,40 +17,33 @@ class Block(DyProducer, metaclass=ABCMeta): def init(self, context: Optional[Context] = None): logger.debug(f"Initializing {self.get_block_name()}") csv_file = self.properties["file"] - if os.path.isabs(csv_file) or context is None: self.file = csv_file else: self.file = os.path.join(context.properties.get("data_path"), csv_file) - logger.debug(f"file: {self.file}") - self.encoding = self.properties.get("encoding", "utf-8") - self.batch_size = self.properties.get("batch_size", 1000) self.fields = self.properties.get("fields") self.skip = self.properties.get("skip", 0) self.delimiter = self.properties.get("delimiter", ",") self.quotechar = self.properties.get("quotechar", "\"") - async def produce(self) -> AsyncGenerator[List[Message], None]: + async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: logger.debug("Reading CSV") + batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)) with open(self.file, "r", encoding=self.encoding) as read_obj: - reader = DictReader(read_obj, fieldnames=self.fields, delimiter=self.delimiter, quotechar=self.quotechar) - counter = iter(count()) - + reader = DictReader(read_obj, fieldnames=self.fields, + delimiter=self.delimiter, quotechar=self.quotechar) for _ in range(self.skip): with suppress(StopIteration): next(reader) - + counter = iter(count()) while True: - sliced = islice(reader, self.batch_size) - records = [{self.MSG_ID_FIELD: f"{next(counter)}", **record} for record in sliced] - - if not records: - logger.debug(f"Done reading {self.file}") + chunk = [ + {self.MSG_ID_FIELD: f"{next(counter)}", **record} + for record in islice(reader, batch_size) + ] + if not chunk: 
return - - logger.debug(f"Producing {len(records)} records") - - yield records + yield chunk diff --git a/core/src/datayoga_core/blocks/files/read_csv/block.schema.json b/core/src/datayoga_core/blocks/files/read_csv/block.schema.json index 39e7118a..ca7d638b 100644 --- a/core/src/datayoga_core/blocks/files/read_csv/block.schema.json +++ b/core/src/datayoga_core/blocks/files/read_csv/block.schema.json @@ -2,6 +2,7 @@ "title": "files.read_csv", "description": "Read data from CSV", "type": "object", + "$inherit": ["batchable"], "properties": { "file": { "description": "Filename. Can contain a regexp or glob expression", @@ -39,12 +40,6 @@ "maxLength": 1, "default": "," }, - "batch_size": { - "description": "Number of records to read per batch", - "type": "number", - "minimum": 1, - "default": 1000 - }, "quotechar": { "description": "A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '", "type": "string", diff --git a/core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py b/core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py new file mode 100644 index 00000000..16cb9b17 --- /dev/null +++ b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py @@ -0,0 +1,38 @@ +from pathlib import Path + +import pytest + +from datayoga_core.blocks.files.read_csv.block import Block + + +async def _drain(producer): + out = [] + async for batch in producer.produce(): + out.append(batch) + return out + + +@pytest.fixture +def csv_path(tmp_path) -> Path: + p = tmp_path / "data.csv" + rows = ["fname,lname"] + [f"first{i},last{i}" for i in range(2500)] + p.write_text("\n".join(rows) + "\n", encoding="utf-8") + return p + + +@pytest.mark.asyncio +async def 
test_csv_batches_to_batch_size(csv_path): + block = Block({"file": str(csv_path), "batch_size": 1000}) + block.init() + batches = await _drain(block) + assert [len(b) for b in batches] == [1000, 1000, 500] + assert all(Block.MSG_ID_FIELD in r for b in batches for r in b) + assert batches[0][0]["fname"] == "first0" + + +@pytest.mark.asyncio +async def test_csv_default_batch_size(csv_path): + block = Block({"file": str(csv_path)}) + block.init() + batches = await _drain(block) + assert [len(b) for b in batches] == [1000, 1000, 500] From 1af0c66a21f4c48eaab751a6c1b857fc861a1608 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 16:59:34 +0300 Subject: [PATCH 16/38] Migrate parquet/read to produce_chunks, fix one-by-one yield (#400, #293) --- .../blocks/parquet/read/block.py | 19 ++++---- .../blocks/parquet/read/block.schema.json | 1 + .../blocks/parquet/read/tests/__init__.py | 0 .../parquet/read/tests/test_parquet_read.py | 43 +++++++++++++++++++ 4 files changed, 52 insertions(+), 11 deletions(-) create mode 100644 core/src/datayoga_core/blocks/parquet/read/tests/__init__.py create mode 100644 core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py diff --git a/core/src/datayoga_core/blocks/parquet/read/block.py b/core/src/datayoga_core/blocks/parquet/read/block.py index f72e6490..1c7128c6 100644 --- a/core/src/datayoga_core/blocks/parquet/read/block.py +++ b/core/src/datayoga_core/blocks/parquet/read/block.py @@ -1,10 +1,10 @@ import logging import os from abc import ABCMeta -from typing import AsyncGenerator, List, Optional +from itertools import count +from typing import Any, AsyncGenerator, Dict, List, Optional from datayoga_core.context import Context -from datayoga_core.producer import Message from datayoga_core.producer import Producer as DyProducer from fastparquet import ParquetFile @@ -16,21 +16,18 @@ class Block(DyProducer, metaclass=ABCMeta): def init(self, context: Optional[Context] = None): logger.debug(f"Initializing 
{self.get_block_name()}") parquet_file = self.properties["file"] - if os.path.isabs(parquet_file) or context is None: self.file = parquet_file else: self.file = os.path.join(context.properties.get("data_path"), parquet_file) - logger.debug(f"file: {self.file}") - async def produce(self) -> AsyncGenerator[List[Message], None]: + async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: logger.debug("Reading parquet") - pf = ParquetFile(self.file) - - count = 0 + counter = iter(count()) for df in pf.iter_row_groups(): - for _, data in df.iterrows(): - yield [{self.MSG_ID_FIELD: str(count), **data.to_dict()}] - count += 1 + yield [ + {self.MSG_ID_FIELD: str(next(counter)), **row.to_dict()} + for _, row in df.iterrows() + ] diff --git a/core/src/datayoga_core/blocks/parquet/read/block.schema.json b/core/src/datayoga_core/blocks/parquet/read/block.schema.json index 13bcec76..395b3edd 100644 --- a/core/src/datayoga_core/blocks/parquet/read/block.schema.json +++ b/core/src/datayoga_core/blocks/parquet/read/block.schema.json @@ -2,6 +2,7 @@ "title": "parquet.read", "description": "Read data from parquet", "type": "object", + "$inherit": ["batchable"], "properties": { "file": { "description": "Filename. 
Can contain a regexp or glob expression", diff --git a/core/src/datayoga_core/blocks/parquet/read/tests/__init__.py b/core/src/datayoga_core/blocks/parquet/read/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py new file mode 100644 index 00000000..ab6d8517 --- /dev/null +++ b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py @@ -0,0 +1,43 @@ +from pathlib import Path + +import pandas as pd +import pytest + +from datayoga_core.blocks.parquet.read.block import Block + + +async def _drain(producer): + out = [] + async for batch in producer.produce(): + out.append(batch) + return out + + +@pytest.fixture +def parquet_path(tmp_path) -> Path: + p = tmp_path / "data.parquet" + df = pd.DataFrame({"i": list(range(2500))}) + from fastparquet import write as fp_write + fp_write(str(p), df, row_group_offsets=1000) + return p + + +@pytest.mark.asyncio +async def test_parquet_batches_to_batch_size(parquet_path): + block = Block({"file": str(parquet_path), "batch_size": 1000}) + block.init() + batches = await _drain(block) + assert [len(b) for b in batches] == [1000, 1000, 500] + flat = [r for b in batches for r in b] + assert flat[0]["i"] == 0 + assert all(Block.MSG_ID_FIELD in r for r in flat) + + +@pytest.mark.asyncio +async def test_parquet_rechunks_across_row_groups(parquet_path): + # row groups are [1000, 1000, 500]; batch_size=750 should give batches of + # [750, 750, 750, 250] regardless of row group boundaries. 
+ block = Block({"file": str(parquet_path), "batch_size": 750}) + block.init() + batches = await _drain(block) + assert [len(b) for b in batches] == [750, 750, 750, 250] From 85ac26a9642c895b34a9d9d8107203c81554670f Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 17:01:52 +0300 Subject: [PATCH 17/38] Migrate relational/read to produce_chunks, add fetch_size (#400, #295) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../blocks/relational/read/block.py | 19 +++-- .../blocks/relational/read/block.schema.json | 7 ++ .../blocks/relational/read/tests/__init__.py | 0 .../read/tests/test_relational_read.py | 79 +++++++++++++++++++ 4 files changed, 95 insertions(+), 10 deletions(-) create mode 100644 core/src/datayoga_core/blocks/relational/read/tests/__init__.py create mode 100644 core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py diff --git a/core/src/datayoga_core/blocks/relational/read/block.py b/core/src/datayoga_core/blocks/relational/read/block.py index 97d8dcdd..2b04f3c3 100644 --- a/core/src/datayoga_core/blocks/relational/read/block.py +++ b/core/src/datayoga_core/blocks/relational/read/block.py @@ -1,23 +1,23 @@ import logging -from typing import AsyncGenerator, List, Optional +from typing import Any, AsyncGenerator, Dict, List, Optional import sqlalchemy as sa from datayoga_core import utils from datayoga_core.blocks.relational import utils as relational_utils from datayoga_core.context import Context -from datayoga_core.producer import Message from datayoga_core.producer import Producer as DyProducer logger = logging.getLogger("dy") class Block(DyProducer): + DEFAULT_FETCH_SIZE = 10000 def init(self, context: Optional[Context] = None): self.engine, self.db_type = relational_utils.get_engine( self.properties["connection"], context, - autocommit=False + autocommit=False, ) self.schema = self.properties.get("schema") @@ -32,15 +32,14 @@ def init(self, context: Optional[Context] = None): logger.debug(f"Connecting to 
{self.db_type}") self.connection = self.engine.connect() - async def produce(self) -> AsyncGenerator[List[Message], None]: + async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + fetch_size = int(self.properties.get("fetch_size", self.DEFAULT_FETCH_SIZE)) result = self.connection.execution_options(stream_results=True).execute(self.tbl.select()) - while True: - chunk = result.fetchmany(10000) - if not chunk: - break - for row in chunk: - yield [utils.add_uid(dict(row._asdict()))] + rows = result.fetchmany(fetch_size) + if not rows: + return + yield [utils.add_uid(dict(row._asdict())) for row in rows] def stop(self): self.connection.close() diff --git a/core/src/datayoga_core/blocks/relational/read/block.schema.json b/core/src/datayoga_core/blocks/relational/read/block.schema.json index 4a65a8fc..df5bc8b2 100644 --- a/core/src/datayoga_core/blocks/relational/read/block.schema.json +++ b/core/src/datayoga_core/blocks/relational/read/block.schema.json @@ -2,6 +2,7 @@ "title": "relational.read", "description": "Read a table from an SQL-compatible data store", "type": "object", + "$inherit": ["batchable"], "additionalProperties": false, "examples": [ { @@ -41,6 +42,12 @@ "title": "name of column" }, "examples": [["fname", { "lname": "last_name" }]] + }, + "fetch_size": { + "type": "integer", + "minimum": 1, + "description": "Driver-level rows fetched per round-trip. 
Defaults to 10000.", + "default": 10000 } }, "required": ["connection", "table"] diff --git a/core/src/datayoga_core/blocks/relational/read/tests/__init__.py b/core/src/datayoga_core/blocks/relational/read/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py new file mode 100644 index 00000000..0fba4629 --- /dev/null +++ b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py @@ -0,0 +1,79 @@ +from unittest.mock import MagicMock + +import pytest + +from datayoga_core.blocks.relational.read.block import Block + + +async def _drain(producer): + out = [] + async for batch in producer.produce(): + out.append(batch) + return out + + +def _fake_result(rows): + """Build a fake SQLAlchemy result that returns rows in fetchmany chunks.""" + state = {"i": 0} + + def fetchmany(n): + i = state["i"] + chunk = rows[i:i + n] + state["i"] += len(chunk) + return chunk + + res = MagicMock() + res.fetchmany.side_effect = fetchmany + res.execution_options.return_value = res + return res + + +class _Row: + def __init__(self, d): + self._d = d + + def _asdict(self): + return self._d + + +def _mk_block(properties, fake_result): + block = Block.__new__(Block) + block.properties = properties + block.connection = MagicMock() + block.tbl = MagicMock() + block.tbl.select.return_value = "SELECT *" + block.connection.execution_options.return_value.execute.return_value = fake_result + return block + + +@pytest.mark.asyncio +async def test_relational_read_yields_batches_not_rows(): + rows = [_Row({"i": i}) for i in range(2500)] + fake_result = _fake_result(rows) + block = _mk_block({"batch_size": 1000}, fake_result) + batches = await _drain(block) + assert [len(b) for b in batches] == [1000, 1000, 500] + + +@pytest.mark.asyncio +async def test_relational_read_fetch_size_independent_of_batch_size(): + 
rows = [_Row({"i": i}) for i in range(5000)] + fake_result = _fake_result(rows) + block = _mk_block({"batch_size": 1000, "fetch_size": 2500}, fake_result) + batches = await _drain(block) + # Downstream batches are still batch_size=1000 + assert [len(b) for b in batches] == [1000, 1000, 1000, 1000, 1000] + # Driver fetched in fetch_size=2500 chunks (2500 + 2500 + 0) + fetch_sizes = [c.args[0] for c in fake_result.fetchmany.call_args_list] + assert fetch_sizes[0] == 2500 + assert fetch_sizes[1] == 2500 + + +@pytest.mark.asyncio +async def test_relational_read_default_fetch_size_is_10000(): + rows = [_Row({"i": i}) for i in range(500)] + fake_result = _fake_result(rows) + block = _mk_block({}, fake_result) + await _drain(block) + fetch_sizes = [c.args[0] for c in fake_result.fetchmany.call_args_list] + assert fetch_sizes[0] == 10000 From 3b72998380518ac85fe333d7ac03531dd75957e4 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 17:04:16 +0300 Subject: [PATCH 18/38] Migrate http/receiver to produce_chunks (#400) --- .../blocks/http/receiver/block.py | 20 +++++---- .../blocks/http/receiver/block.schema.json | 1 + .../blocks/http/receiver/tests/__init__.py | 0 .../http/receiver/tests/test_http_receiver.py | 44 +++++++++++++++++++ 4 files changed, 56 insertions(+), 9 deletions(-) create mode 100644 core/src/datayoga_core/blocks/http/receiver/tests/__init__.py create mode 100644 core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py diff --git a/core/src/datayoga_core/blocks/http/receiver/block.py b/core/src/datayoga_core/blocks/http/receiver/block.py index f325e56b..3f5b1833 100644 --- a/core/src/datayoga_core/blocks/http/receiver/block.py +++ b/core/src/datayoga_core/blocks/http/receiver/block.py @@ -3,13 +3,12 @@ from asyncio import Queue from contextlib import suppress from itertools import count -from typing import AsyncGenerator, List, Optional +from typing import Any, AsyncGenerator, Dict, List, Optional import orjson from aiohttp.web 
import (BaseRequest, HTTPInternalServerError, HTTPOk, Response, Server, ServerRunner, TCPSite) from datayoga_core.context import Context -from datayoga_core.producer import Message from datayoga_core.producer import Producer as DyProducer logger = logging.getLogger("dy") @@ -18,20 +17,21 @@ class Block(DyProducer, metaclass=ABCMeta): port: int host: str + DEFAULT_FLUSH_MS = 1000 def init(self, context: Optional[Context] = None): logger.debug(f"Initializing {self.get_block_name()}") self.port = int(self.properties.get("port", 8080)) self.host = self.properties.get("host", "0.0.0.0") - async def produce(self) -> AsyncGenerator[List[Message], None]: - queue = Queue(maxsize=1000) + async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + queue: Queue = Queue(maxsize=1000) async def handler(request: BaseRequest) -> Response: try: queue.put_nowait(orjson.loads(await request.read())) return HTTPOk() - except Exception: # noqa + except Exception: logger.exception("Got exception while parsing request:") return HTTPInternalServerError() @@ -43,11 +43,13 @@ async def handler(request: BaseRequest) -> Response: try: counter = iter(count()) - while True: - data = await queue.get() - yield [{self.MSG_ID_FIELD: f"{next(counter)}", **data}] - + first = await queue.get() + chunk = [{self.MSG_ID_FIELD: f"{next(counter)}", **first}] + while not queue.empty(): + record = queue.get_nowait() + chunk.append({self.MSG_ID_FIELD: f"{next(counter)}", **record}) + yield chunk finally: with suppress(Exception): await srv.stop() diff --git a/core/src/datayoga_core/blocks/http/receiver/block.schema.json b/core/src/datayoga_core/blocks/http/receiver/block.schema.json index c5189b5f..a52edcc5 100644 --- a/core/src/datayoga_core/blocks/http/receiver/block.schema.json +++ b/core/src/datayoga_core/blocks/http/receiver/block.schema.json @@ -2,6 +2,7 @@ "title": "http.receiver", "description": "Receives HTTP requests and process the data.", "type": "object", + "$inherit": 
["streamable"], "properties": { "host": { "description": "Host to listen", diff --git a/core/src/datayoga_core/blocks/http/receiver/tests/__init__.py b/core/src/datayoga_core/blocks/http/receiver/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py new file mode 100644 index 00000000..613d91d7 --- /dev/null +++ b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py @@ -0,0 +1,44 @@ +import asyncio + +import aiohttp +import pytest + +from datayoga_core.blocks.http.receiver.block import Block + + +def _free_port(): + import socket + with socket.socket() as s: + s.bind(("127.0.0.1", 0)) + return s.getsockname()[1] + + +@pytest.mark.asyncio +async def test_http_receiver_batches_incoming_requests(): + port = _free_port() + block = Block({"host": "127.0.0.1", "port": port, + "batch_size": 50, "flush_ms": 200}) + block.init() + + received = [] + + async def consumer(): + async for batch in block.produce(): + received.append(batch) + if sum(len(b) for b in received) >= 60: + return + + consumer_task = asyncio.create_task(consumer()) + await asyncio.sleep(0.2) # let server start + + async with aiohttp.ClientSession() as session: + for i in range(60): + async with session.post(f"http://127.0.0.1:{port}", json={"i": i}) as r: + assert r.status == 200 + + await asyncio.wait_for(consumer_task, timeout=5) + + flat = [r for b in received for r in b] + assert len(flat) == 60 + assert any(len(b) == 50 for b in received) + assert all(Block.MSG_ID_FIELD in r for r in flat) From 0b774ac3ce082327c6a36b35e83509162984048a Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 17:07:04 +0300 Subject: [PATCH 19/38] Migrate redis/read_stream to batched xreadgroup (#400, #377) --- .../blocks/redis/read_stream/block.py | 35 +++++++----- .../redis/read_stream/block.schema.json | 1 + 
.../redis/read_stream/tests/__init__.py | 0 .../tests/test_redis_read_stream.py | 54 +++++++++++++++++++ 4 files changed, 76 insertions(+), 14 deletions(-) create mode 100644 core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py create mode 100644 core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py diff --git a/core/src/datayoga_core/blocks/redis/read_stream/block.py b/core/src/datayoga_core/blocks/redis/read_stream/block.py index 667ed02d..136d0963 100644 --- a/core/src/datayoga_core/blocks/redis/read_stream/block.py +++ b/core/src/datayoga_core/blocks/redis/read_stream/block.py @@ -1,23 +1,22 @@ import logging -from typing import AsyncGenerator, List, Optional +from typing import Any, AsyncGenerator, Dict, List, Optional import datayoga_core.blocks.redis.utils as redis_utils import orjson from datayoga_core.connection import Connection from datayoga_core.context import Context -from datayoga_core.producer import Message from datayoga_core.producer import Producer as DyProducer logger = logging.getLogger("dy") class Block(DyProducer): + DEFAULT_FLUSH_MS = 1000 + def init(self, context: Optional[Context] = None): logger.debug(f"Initializing {self.get_block_name()}") - connection_details = Connection.get_connection_details(self.properties["connection"], context) self.redis_client = redis_utils.get_client(connection_details) - self.stream = self.properties["stream_name"] self.snapshot = self.properties.get("snapshot", False) self.consumer_group = f'datayoga_job_{context.properties.get("job_name", "") if context else ""}' @@ -27,25 +26,33 @@ def init(self, context: Optional[Context] = None): logger.info(f"Creating a new {self.consumer_group} consumer group associated with the {self.stream}") self.redis_client.xgroup_create(self.stream, self.consumer_group, 0) - async def produce(self) -> AsyncGenerator[List[Message], None]: + async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: logger.debug(f"Running 
{self.get_block_name()}") - + batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)) read_pending = True - while True: - # Read pending messages (fetched by us before but not acknowledged) in the first time, then consume new messages - streams = self.redis_client.xreadgroup(self.consumer_group, self.requesting_consumer, { - self.stream: "0" if read_pending else ">"}, None, 100 if self.snapshot else 0) + while True: + streams = self.redis_client.xreadgroup( + self.consumer_group, self.requesting_consumer, + {self.stream: "0" if read_pending else ">"}, + count=batch_size, + block=100 if self.snapshot else 0, + ) + + yielded_any = False for stream in streams: logger.debug(f"Messages in {self.stream} stream (pending: {read_pending}):\n\t{stream}") + chunk: List[Dict[str, Any]] = [] for key, value in stream[1]: payload = orjson.loads(value[next(iter(value))]) payload[self.MSG_ID_FIELD] = key - yield [payload] + chunk.append(payload) + if chunk: + yielded_any = True + yield chunk - # Quit after consuming pending current messages in case of snapshot - if self.snapshot and not read_pending: - break + if self.snapshot and not read_pending and not yielded_any: + return read_pending = False diff --git a/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json b/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json index bc2d148c..f7e0a948 100644 --- a/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json +++ b/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json @@ -2,6 +2,7 @@ "title": "redis.read_stream", "description": "Read from Redis stream", "type": "object", + "$inherit": ["streamable"], "properties": { "connection": { "description": "Connection name", "type": "string" }, "stream_name": { diff --git a/core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py b/core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py new file mode 100644 index 00000000..f45b8d67 --- /dev/null +++ b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py @@ -0,0 +1,54 @@ +from unittest.mock import MagicMock + +import pytest + +from datayoga_core.blocks.redis.read_stream.block import Block + + +def _mk_block(properties, redis_client): + block = Block.__new__(Block) + block.properties = properties + block.redis_client = redis_client + block.stream = "mystream" + block.snapshot = properties.get("_snapshot", True) + block.consumer_group = "g" + block.requesting_consumer = "c" + return block + + +@pytest.mark.asyncio +async def test_redis_uses_count_equal_to_batch_size(): + redis = MagicMock() + payload_a = (b"1-0", {b"data": b'{"i": 1}'}) + payload_b = (b"2-0", {b"data": b'{"i": 2}'}) + redis.xreadgroup.side_effect = [ + [(b"mystream", [payload_a, payload_b])], # pending + [(b"mystream", [])], # nothing new -> exit + ] + + block = _mk_block({"batch_size": 250, "_snapshot": True}, redis) + batches = [] + async for b in block.produce(): + batches.append(b) + + assert all(c.kwargs.get("count") == 250 or (len(c.args) >= 4 and c.args[3] == 250) + for c in redis.xreadgroup.call_args_list), \ + "xreadgroup should be called with count=batch_size" + + +@pytest.mark.asyncio +async def test_redis_yields_records_as_a_batch_not_one_by_one(): + redis = MagicMock() + pages = [(f"{i}-0".encode(), {b"data": f'{{"i": {i}}}'.encode()}) for i in range(5)] + redis.xreadgroup.side_effect = [ + [(b"mystream", pages)], + [(b"mystream", [])], + ] + + block = _mk_block({"batch_size": 100, "_snapshot": True}, redis) + batches = [] + async for b in block.produce(): + batches.append(b) + + assert [len(b) for b in batches] == [5] + assert batches[0][0]["i"] == 0 From 38cf4ec7edf55bfdc741179e40239ae215e6d3cc Mon Sep 17 00:00:00 2001 From: 
spicy-sauce Date: Thu, 28 May 2026 17:10:25 +0300 Subject: [PATCH 20/38] Migrate azure/read_event_hub; rename batch_size -> max_batch_size (#400, BREAKING) --- .../blocks/azure/read_event_hub/block.py | 66 +++++-------------- .../azure/read_event_hub/block.schema.json | 7 +- .../azure/read_event_hub/tests/__init__.py | 0 .../read_event_hub/tests/test_event_hub.py | 56 ++++++++++++++++ 4 files changed, 76 insertions(+), 53 deletions(-) create mode 100644 core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py create mode 100644 core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/block.py b/core/src/datayoga_core/blocks/azure/read_event_hub/block.py index d91497ed..77f76d7d 100644 --- a/core/src/datayoga_core/blocks/azure/read_event_hub/block.py +++ b/core/src/datayoga_core/blocks/azure/read_event_hub/block.py @@ -1,6 +1,6 @@ import asyncio import logging -from typing import AsyncGenerator, List, Optional +from typing import Any, AsyncGenerator, Dict, List, Optional import orjson from azure.eventhub import EventData, PartitionContext @@ -8,7 +8,6 @@ from azure.eventhub.extensions.checkpointstoreblobaio import \ BlobCheckpointStore from datayoga_core.context import Context -from datayoga_core.producer import Message from datayoga_core.producer import Producer as DyProducer logger = logging.getLogger("dy") @@ -17,67 +16,43 @@ class Block(DyProducer): """Azure Event Hub block for reading events.""" - def init(self, context: Optional[Context] = None): - """Initializes the block. + DEFAULT_FLUSH_MS = 1000 - Args: - context (Context, optional): The block context. Defaults to None. 
- """ + def init(self, context: Optional[Context] = None): logger.debug(f"Initializing {self.get_block_name()}") - - self.batch_size = self.properties.get("batch_size", 300) - + self.max_batch_size = int(self.properties.get("max_batch_size", 300)) self.consumer_client = EventHubConsumerClient.from_connection_string( conn_str=self.properties["event_hub_connection_string"], consumer_group=self.properties["event_hub_consumer_group_name"], eventhub_name=self.properties["event_hub_name"], checkpoint_store=BlobCheckpointStore.from_connection_string( self.properties["checkpoint_store_connection_string"], - self.properties["checkpoint_store_container_name"]) + self.properties["checkpoint_store_container_name"]), ) + self.events: Dict[Any, Any] = {} + self.messages: asyncio.Queue = asyncio.Queue() - self.events = {} # Retrieved events by sequence number, used for acknowledging them once processed - self.messages = asyncio.Queue() - - async def produce(self) -> AsyncGenerator[List[Message], None]: - """Starts the event receiving process and yield batches of messages. - - Yields: - AsyncGenerator[List[Message], None]: A generator of message batches. - """ + async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: logger.debug(f"Running {self.get_block_name()}") - logger.debug("Starting event receiving process") asyncio.create_task(self.receive_batch()) while True: - if not self.messages.empty(): - batch = [] - while not self.messages.empty(): - message = await self.messages.get() - batch.append(message) - - yield batch - - await asyncio.sleep(0.1) + first = await self.messages.get() + chunk = [first] + while not self.messages.empty(): + chunk.append(self.messages.get_nowait()) + yield chunk async def receive_batch(self): - """Receives events in batches from the Event Hub.""" await self.consumer_client.receive_batch( on_event_batch=self.on_event_batch, - max_batch_size=self.batch_size, - starting_position="-1", # read from the beginning of the partition. 
+ max_batch_size=self.max_batch_size, + starting_position="-1", ) async def on_event_batch(self, partition_context: PartitionContext, events: List[EventData]): - """Processes each batch of events received from the Event Hub. - - Args: - partition_context (PartitionContext): The partition context. - events (List[EventData]): The list of events in the batch. - """ logger.debug(f"Received batch of events from partition: {partition_context.partition_id}") - for event in events: try: payload = orjson.loads(event.body_as_str(encoding="UTF-8")) @@ -89,24 +64,13 @@ async def on_event_batch(self, partition_context: PartitionContext, events: List logger.error(e) async def complete_events(self, msg_ids: List[str]): - """Completes the events and update the checkpoint. - - Args: - msg_ids (List[str]): The list of message IDs to complete. - """ for msg_id in msg_ids: logger.debug(f"Acking {msg_id} event") event, partition_context = self.events.pop(msg_id, (None, None)) - if event is not None: await partition_context.update_checkpoint(event) else: logger.warning(f"Couldn't find event {msg_id} for acknowledging") def ack(self, msg_ids: List[str]): - """Acknowledges the completion of events. - - Args: - msg_ids (List[str]): The list of message IDs to acknowledge. 
- """ asyncio.create_task(self.complete_events(msg_ids)) diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json b/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json index 908c211c..f663d383 100644 --- a/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json +++ b/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json @@ -2,6 +2,7 @@ "title": "azure.read_event_hub", "description": "Read from Azure Event Hub", "type": "object", + "$inherit": ["streamable"], "properties": { "event_hub_connection_string": { "type": "string", @@ -23,12 +24,14 @@ "type": "string", "description": "The name of the container within the checkpoint store to store the checkpoints." }, - "batch_size": { + "max_batch_size": { "type": "integer", - "description": "The maximum number of events to receive in each batch.", + "minimum": 1, + "description": "Maximum number of events to receive in each SDK callback. Renamed from the previous batch_size which used to mean this. 
Defaults to 300.", "default": 300 } }, + "additionalProperties": false, "required": [ "event_hub_connection_string", "event_hub_consumer_group_name", diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py new file mode 100644 index 00000000..074b7c36 --- /dev/null +++ b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py @@ -0,0 +1,56 @@ +import pytest +from jsonschema import ValidationError + +from datayoga_core.blocks.azure.read_event_hub.block import Block + + +def _minimal_props(extra=None): + base = { + "event_hub_connection_string": "Endpoint=sb://x/;SharedAccessKeyName=k;SharedAccessKey=v;EntityPath=eh", + "event_hub_consumer_group_name": "$Default", + "event_hub_name": "eh", + "checkpoint_store_connection_string": "DefaultEndpointsProtocol=https;AccountName=a;AccountKey=k==", + "checkpoint_store_container_name": "chk", + } + if extra: + base.update(extra) + return base + + +def test_unknown_property_rejected_by_validation(): + """additionalProperties: false catches typos like 'batch_sz'.""" + with pytest.raises(ValidationError): + Block(_minimal_props({"batch_sz": 300})) + + +def test_max_batch_size_accepted(): + """The renamed SDK-level property is now max_batch_size.""" + block = Block(_minimal_props({"max_batch_size": 500, "batch_size": 100})) + assert block.properties["max_batch_size"] == 500 + assert block.properties["batch_size"] == 100 + + +def test_max_batch_size_defaults_to_300_when_omitted(): + """The block's init() reads max_batch_size with a default of 300.""" + block = Block(_minimal_props()) + assert int(block.properties.get("max_batch_size", 300)) == 300 + + +def test_renamed_schema_has_additional_properties_false(): 
+ """Schema after rename: max_batch_size + streamable's batch_size/flush_ms, + no unknown properties allowed.""" + block = Block(_minimal_props()) + schema = block.get_json_schema() + assert schema.get("additionalProperties") is False + assert "max_batch_size" in schema["properties"] + assert "batch_size" in schema["properties"] + assert "flush_ms" in schema["properties"] + + +def test_batch_size_300_is_silently_repurposed(): + """A user upgrading from a pre-rename version with batch_size: 300 (which + used to mean SDK callback size) will see their YAML still validate, but + batch_size now means pipeline batch size. Documented as breaking change.""" + block = Block(_minimal_props({"batch_size": 300})) + assert block.properties["batch_size"] == 300 + assert "max_batch_size" not in block.properties From b67bc4a8e345e202610f27af0271d41e841af686 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 17:15:46 +0300 Subject: [PATCH 21/38] Regenerate JSON schemas and reference docs after producer batching (#400) --- docs/reference/batchable.md | 24 +++++++ docs/reference/blocks/azure_read_event_hub.md | 5 +- docs/reference/blocks/files_read_csv.md | 1 - docs/reference/blocks/relational_read.md | 1 + docs/reference/blocks/std_read.md | 14 +--- docs/reference/connections.md | 2 +- docs/reference/job.md | 2 +- docs/reference/streamable.md | 26 +++++++ schemas/job.schema.json | 68 +++++++++++++++++-- 9 files changed, 121 insertions(+), 22 deletions(-) create mode 100644 docs/reference/batchable.md create mode 100644 docs/reference/streamable.md diff --git a/docs/reference/batchable.md b/docs/reference/batchable.md new file mode 100644 index 00000000..4c344fa8 --- /dev/null +++ b/docs/reference/batchable.md @@ -0,0 +1,24 @@ +--- +parent: Reference +nav_order: 1 +--- + +# batchable + +Producer batching mixin: declares batch_size for producers that yield records in batches. 
+ + +**Properties** + +|Name|Type|Description|Required| +|----|----|-----------|--------| +|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.
Default: `1000`
Minimum: `1`
|| + +**Example** + +```yaml +batch_size: 1000 + +``` + + diff --git a/docs/reference/blocks/azure_read_event_hub.md b/docs/reference/blocks/azure_read_event_hub.md index b247fb30..578b968c 100644 --- a/docs/reference/blocks/azure_read_event_hub.md +++ b/docs/reference/blocks/azure_read_event_hub.md @@ -17,12 +17,13 @@ Read from Azure Event Hub |**event\_hub\_name**|`string`|The name of the Azure Event Hub.
|yes| |**checkpoint\_store\_connection\_string**|`string`|The connection string for the Azure Storage account used as the checkpoint store.
|yes| |**checkpoint\_store\_container\_name**|`string`|The name of the container within the checkpoint store to store the checkpoints.
|yes| -|**batch\_size**|`integer`|The maximum number of events to receive in each batch.
Default: `300`
|no| +|**max\_batch\_size**|`integer`|Maximum number of events to receive in each SDK callback. Renamed from the previous batch_size which used to mean this. Defaults to 300.
Default: `300`
Minimum: `1`
|no| +**Additional Properties:** not allowed **Example** ```yaml -batch_size: 300 +max_batch_size: 300 ``` diff --git a/docs/reference/blocks/files_read_csv.md b/docs/reference/blocks/files_read_csv.md index 3f47237f..8948865a 100644 --- a/docs/reference/blocks/files_read_csv.md +++ b/docs/reference/blocks/files_read_csv.md @@ -17,7 +17,6 @@ Read data from CSV |[**fields**](#fields)
(List of columns to use)|`string[]`|List of columns to use for extract
Minimal Length: `1`
|no| |**skip**|`number`|Number of lines to skip
Default: `0`
Minimum: `0`
|no| |**delimiter**|`string`|Delimiter to use for splitting the csv records
Default: `","`
Minimal Length: `1`
Maximal Length: `1`
|no| -|**batch\_size**|`number`|Number of records to read per batch
Default: `1000`
Minimum: `1`
|no| |**quotechar**|`string`|A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '
Default: `"\""`
Minimal Length: `1`
Maximal Length: `1`
|no| **Additional Properties:** not allowed diff --git a/docs/reference/blocks/relational_read.md b/docs/reference/blocks/relational_read.md index 1b11df44..4bb5248c 100644 --- a/docs/reference/blocks/relational_read.md +++ b/docs/reference/blocks/relational_read.md @@ -16,6 +16,7 @@ Read a table from an SQL-compatible data store |**schema**
(The table schema of the table)|`string`|If left blank, the default schema of this connection will be used as defined in the connections.dy.yaml
|no| |**table**
(The table name)|`string`|Table name
|yes| |[**columns**](#columns)
(Optional subset of columns to load)|`array`||no| +|**fetch\_size**|`integer`|Driver-level rows fetched per round-trip. Defaults to 10000.
Default: `10000`
Minimum: `1`
|no| **Additional Properties:** not allowed **Example** diff --git a/docs/reference/blocks/std_read.md b/docs/reference/blocks/std_read.md index aca1c24a..bee360f2 100644 --- a/docs/reference/blocks/std_read.md +++ b/docs/reference/blocks/std_read.md @@ -8,17 +8,7 @@ grand_parent: Reference Read from the standard input -**Properties** - -|Name|Type|Description|Required| -|----|----|-----------|--------| -|**batch\_size**|`integer`|Number of records to process in a single batch
Default: `1000`
|| - -**Example** - -```yaml -batch_size: 1000 - -``` +**No properties.** +**Additional Properties:** not allowed diff --git a/docs/reference/connections.md b/docs/reference/connections.md index bfc2b8d0..580fbb39 100644 --- a/docs/reference/connections.md +++ b/docs/reference/connections.md @@ -1,6 +1,6 @@ --- parent: Reference -nav_order: 1 +nav_order: 2 --- # Connections diff --git a/docs/reference/job.md b/docs/reference/job.md index ed88211d..615a6da8 100644 --- a/docs/reference/job.md +++ b/docs/reference/job.md @@ -1,6 +1,6 @@ --- parent: Reference -nav_order: 2 +nav_order: 3 --- # Job diff --git a/docs/reference/streamable.md b/docs/reference/streamable.md new file mode 100644 index 00000000..49f499cd --- /dev/null +++ b/docs/reference/streamable.md @@ -0,0 +1,26 @@ +--- +parent: Reference +nav_order: 4 +--- + +# streamable + +Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources. + + +**Properties** + +|Name|Type|Description|Required| +|----|----|-----------|--------| +|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.
Default: `1000`
Minimum: `1`
|| +|**flush\_ms**|`integer`, `null`|If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.
Default: `1000`
Minimum: `1`
|| + +**Example** + +```yaml +batch_size: 1000 +flush_ms: 1000 + +``` + + diff --git a/schemas/job.schema.json b/schemas/job.schema.json index 1b2a2533..ad0f20b9 100644 --- a/schemas/job.schema.json +++ b/schemas/job.schema.json @@ -111,11 +111,13 @@ "then": { "properties": { "with": { + "additionalProperties": false, "description": "Read from Azure Event Hub", "properties": { "batch_size": { - "default": 300, - "description": "The maximum number of events to receive in each batch.", + "default": 1000, + "description": "Maximum number of records yielded per downstream batch.", + "minimum": 1, "type": "integer" }, "checkpoint_store_connection_string": { @@ -137,6 +139,18 @@ "event_hub_name": { "description": "The name of the Azure Event Hub.", "type": "string" + }, + "flush_ms": { + "default": 1000, + "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.", + "minimum": 1, + "type": ["integer", "null"] + }, + "max_batch_size": { + "default": 300, + "description": "Maximum number of events to receive in each SDK callback. Renamed from the previous batch_size which used to mean this. Defaults to 300.", + "minimum": 1, + "type": "integer" } }, "required": [ @@ -258,9 +272,9 @@ "properties": { "batch_size": { "default": 1000, - "description": "Number of records to read per batch", + "description": "Maximum number of records yielded per downstream batch.", "minimum": 1, - "type": "number" + "type": "integer" }, "delimiter": { "default": ",", @@ -366,6 +380,18 @@ "description": "Receives HTTP requests and process the data.", "examples": [{ "host": "localhost", "port": 8080 }], "properties": { + "batch_size": { + "default": 1000, + "description": "Maximum number of records yielded per downstream batch.", + "minimum": 1, + "type": "integer" + }, + "flush_ms": { + "default": 1000, + "description": "If set, flush a partial batch after this many ms of inactivity. 
null or omitted = wait until batch_size or end-of-stream.", + "minimum": 1, + "type": ["integer", "null"] + }, "host": { "default": "0.0.0.0", "description": "Host to listen", @@ -696,6 +722,12 @@ "description": "Read data from parquet", "examples": [{ "file": "data.parquet" }], "properties": { + "batch_size": { + "default": 1000, + "description": "Maximum number of records yielded per downstream batch.", + "minimum": 1, + "type": "integer" + }, "file": { "description": "Filename. Can contain a regexp or glob expression", "type": "string" @@ -825,10 +857,22 @@ "additionalProperties": false, "description": "Read from Redis stream", "properties": { + "batch_size": { + "default": 1000, + "description": "Maximum number of records yielded per downstream batch.", + "minimum": 1, + "type": "integer" + }, "connection": { "description": "Connection name", "type": "string" }, + "flush_ms": { + "default": 1000, + "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.", + "minimum": 1, + "type": ["integer", "null"] + }, "snapshot": { "default": false, "description": "Snapshot current entries and quit", @@ -1022,6 +1066,12 @@ } ], "properties": { + "batch_size": { + "default": 1000, + "description": "Maximum number of records yielded per downstream batch.", + "minimum": 1, + "type": "integer" + }, "columns": { "examples": [["fname", { "lname": "last_name" }]], "items": { @@ -1037,6 +1087,12 @@ "title": "The connection to use for loading", "type": "string" }, + "fetch_size": { + "default": 10000, + "description": "Driver-level rows fetched per round-trip. 
Defaults to 10000.", + "minimum": 1, + "type": "integer" + }, "schema": { "description": "If left blank, the default schema of this connection will be used as defined in the connections.dy.yaml", "examples": ["dbo"], @@ -1370,11 +1426,13 @@ "then": { "properties": { "with": { + "additionalProperties": false, "description": "Read from the standard input", "properties": { "batch_size": { "default": 1000, - "description": "Number of records to process in a single batch", + "description": "Maximum number of records yielded per downstream batch.", + "minimum": 1, "type": "integer" } }, From 606946568d2375076bd4253b7bf291fd0bfd7385 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 17:19:12 +0300 Subject: [PATCH 22/38] Resolve $inherit before jsonschema2mk so block docs include batch_size (#400) --- docs/reference/blocks/azure_read_event_hub.md | 4 +++ docs/reference/blocks/files_read_csv.md | 1 + docs/reference/blocks/http_receiver.md | 2 ++ docs/reference/blocks/parquet_read.md | 1 + docs/reference/blocks/redis_read_stream.md | 4 +++ docs/reference/blocks/relational_read.md | 1 + docs/reference/blocks/std_read.md | 13 ++++++- scripts/generate-docs.sh | 34 ++++++++++++++++++- 8 files changed, 58 insertions(+), 2 deletions(-) diff --git a/docs/reference/blocks/azure_read_event_hub.md b/docs/reference/blocks/azure_read_event_hub.md index 578b968c..fc3f8e5b 100644 --- a/docs/reference/blocks/azure_read_event_hub.md +++ b/docs/reference/blocks/azure_read_event_hub.md @@ -12,6 +12,8 @@ Read from Azure Event Hub |Name|Type|Description|Required| |----|----|-----------|--------| +|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.
Default: `1000`
Minimum: `1`
|no| +|**flush\_ms**|`integer`, `null`|If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.
Default: `1000`
Minimum: `1`
|no| |**event\_hub\_connection\_string**|`string`|The connection string for the Azure Event Hub namespace.
|yes| |**event\_hub\_consumer\_group\_name**|`string`|The name of the consumer group to read events from.
|yes| |**event\_hub\_name**|`string`|The name of the Azure Event Hub.
|yes| @@ -23,6 +25,8 @@ Read from Azure Event Hub **Example** ```yaml +batch_size: 1000 +flush_ms: 1000 max_batch_size: 300 ``` diff --git a/docs/reference/blocks/files_read_csv.md b/docs/reference/blocks/files_read_csv.md index 8948865a..44833e34 100644 --- a/docs/reference/blocks/files_read_csv.md +++ b/docs/reference/blocks/files_read_csv.md @@ -12,6 +12,7 @@ Read data from CSV |Name|Type|Description|Required| |----|----|-----------|--------| +|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.
Default: `1000`
Minimum: `1`
|no| |**file**|`string`|Filename. Can contain a regexp or glob expression
|yes| |**encoding**|`string`|Encoding to use for reading the file
Default: `"utf-8"`
|no| |[**fields**](#fields)
(List of columns to use)|`string[]`|List of columns to use for extract
Minimal Length: `1`
|no| diff --git a/docs/reference/blocks/http_receiver.md b/docs/reference/blocks/http_receiver.md index 749cadb4..fa2c4cf2 100644 --- a/docs/reference/blocks/http_receiver.md +++ b/docs/reference/blocks/http_receiver.md @@ -12,6 +12,8 @@ Receives HTTP requests and process the data. |Name|Type|Description|Required| |----|----|-----------|--------| +|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.
Default: `1000`
Minimum: `1`
|| +|**flush\_ms**|`integer`, `null`|If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.
Default: `1000`
Minimum: `1`
|| |**host**|`string`|Host to listen
Default: `"0.0.0.0"`
|| |**port**|`integer`|Port to listen
Default: `8080`
|| diff --git a/docs/reference/blocks/parquet_read.md b/docs/reference/blocks/parquet_read.md index ba08da29..10f9f2b6 100644 --- a/docs/reference/blocks/parquet_read.md +++ b/docs/reference/blocks/parquet_read.md @@ -12,6 +12,7 @@ Read data from parquet |Name|Type|Description|Required| |----|----|-----------|--------| +|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.
Default: `1000`
Minimum: `1`
|no| |**file**|`string`|Filename. Can contain a regexp or glob expression
|yes| **Additional Properties:** not allowed diff --git a/docs/reference/blocks/redis_read_stream.md b/docs/reference/blocks/redis_read_stream.md index 3c3b6043..31c0b265 100644 --- a/docs/reference/blocks/redis_read_stream.md +++ b/docs/reference/blocks/redis_read_stream.md @@ -12,6 +12,8 @@ Read from Redis stream |Name|Type|Description|Required| |----|----|-----------|--------| +|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.
Default: `1000`
Minimum: `1`
|no| +|**flush\_ms**|`integer`, `null`|If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.
Default: `1000`
Minimum: `1`
|no| |**connection**|`string`|Connection name
|yes| |**stream\_name**
(Source stream name)|`string`|Source stream name
|yes| |**snapshot**
(Snapshot current entries and quit)|`boolean`|Snapshot current entries and quit
Default: `false`
|no| @@ -20,6 +22,8 @@ Read from Redis stream **Example** ```yaml +batch_size: 1000 +flush_ms: 1000 snapshot: false ``` diff --git a/docs/reference/blocks/relational_read.md b/docs/reference/blocks/relational_read.md index 4bb5248c..b439eb1b 100644 --- a/docs/reference/blocks/relational_read.md +++ b/docs/reference/blocks/relational_read.md @@ -12,6 +12,7 @@ Read a table from an SQL-compatible data store |Name|Type|Description|Required| |----|----|-----------|--------| +|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.
Default: `1000`
Minimum: `1`
|no| |**connection**
(The connection to use for loading)|`string`|Logical connection name as defined in the connections.dy.yaml
|yes| |**schema**
(The table schema of the table)|`string`|If left blank, the default schema of this connection will be used as defined in the connections.dy.yaml
|no| |**table**
(The table name)|`string`|Table name
|yes| diff --git a/docs/reference/blocks/std_read.md b/docs/reference/blocks/std_read.md index bee360f2..e2d9481c 100644 --- a/docs/reference/blocks/std_read.md +++ b/docs/reference/blocks/std_read.md @@ -8,7 +8,18 @@ grand_parent: Reference Read from the standard input -**No properties.** +**Properties** + +|Name|Type|Description|Required| +|----|----|-----------|--------| +|**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.
Default: `1000`
Minimum: `1`
|| **Additional Properties:** not allowed +**Example** + +```yaml +batch_size: 1000 + +``` + diff --git a/scripts/generate-docs.sh b/scripts/generate-docs.sh index 631bd1ed..7fabd705 100755 --- a/scripts/generate-docs.sh +++ b/scripts/generate-docs.sh @@ -37,6 +37,24 @@ done rm -rf ./docs/reference/blocks mkdir ./docs/reference/blocks +# Pick a Python that can import datayoga_core via PYTHONPATH=core/src. +if [ -x "./core/.venv/bin/python" ]; then + DOC_PYTHON="./core/.venv/bin/python" +elif [ -x "./venv/bin/python" ]; then + DOC_PYTHON="./venv/bin/python" +else + DOC_PYTHON="python3" +fi + +# Track temp files so we can clean them up on exit. +RESOLVED_TMP_FILES=() +cleanup_resolved_tmps() { + for tmp in "${RESOLVED_TMP_FILES[@]}"; do + [ -f "${tmp}" ] && rm -f "${tmp}" + done +} +trap cleanup_resolved_tmps EXIT + blocks_dir="./core/src/datayoga_core/blocks" for schema in $(find ${blocks_dir} -name '*.schema.json' | sort) do @@ -46,7 +64,21 @@ do block_package="$(echo ${block_package} | cut -c2- | sed 's/\//_/g')" [ ! -z "${block_package}" ] && block_package="${block_package}_" - npx jsonschema2mk --schema ${schema} --extension yaml-examples \ + # Resolve $inherit fragments so jsonschema2mk sees the inherited properties + # (batch_size, flush_ms, etc.). jsonschema2mk does not understand our custom + # $inherit extension, so we materialize a resolved copy first. 
+ resolved_tmp="$(mktemp --suffix=.schema.json)" + RESOLVED_TMP_FILES+=("${resolved_tmp}") + PYTHONPATH=core/src "${DOC_PYTHON}" -c " +import json, sys +from datayoga_core.schema_utils import resolve_inherits +from datayoga_core import utils +schema = utils.read_json('${schema}') +resolved = resolve_inherits(schema) +sys.stdout.write(json.dumps(resolved)) +" > "${resolved_tmp}" + + npx jsonschema2mk --schema "${resolved_tmp}" --extension yaml-examples \ --extension front-matter --fm.parent "Blocks" --fm.grand_parent "Reference" > \ "./docs/reference/blocks/${block_package}${doc_name}" done From 9dbc5d37b9d9fddfc51feb129a8d914fdd2ec4fc Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 17:19:58 +0300 Subject: [PATCH 23/38] Document producer batching model in processing-strategies (#400) Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/processing-strategies.md | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/docs/processing-strategies.md b/docs/processing-strategies.md index 8e9e83be..692d82c1 100644 --- a/docs/processing-strategies.md +++ b/docs/processing-strategies.md @@ -64,6 +64,44 @@ Rate limit allows to set guards for the frequency of processing in a given time The Rate limit strategy defines the number of requests per given time interval. For example, 5 requests a minute. When the limit is reached, processing for this Step will pause until the time period elapses to allow additional calls. +## Producer Batching + +Every producer block (any block that reads from a source — `std/read`, `files/read_csv`, `parquet/read`, `relational/read`, `redis/read_stream`, `azure/read_event_hub`, `http/receiver`) accepts a `batch_size` property. 
The producer base class re-chunks the source's output into batches of up to `batch_size` records, regardless of how the source delivers them (per row, per row group, per `fetchmany`, per network message); only the final batch, or a partial batch flushed early by `flush_ms`, can be smaller.
+ +```yaml +input: + uses: files.read_csv + with: + file: people.csv + batch_size: 500 # downstream steps process up to 500 records per call +``` + +Default: `1000`. + +### Streaming producers and `flush_ms` + +Streaming producers (`redis/read_stream`, `azure/read_event_hub`, `http/receiver`) also accept `flush_ms`. If no new records arrive within that many milliseconds, any partial batch is flushed downstream instead of being held until `batch_size` is reached. + +```yaml +input: + uses: redis.read_stream + with: + connection: my_redis + stream_name: events + batch_size: 1000 + flush_ms: 500 # emit a partial batch after 500ms of inactivity +``` + +Default: `1000` ms. Set to `null` to disable time-based flushing (records are held until `batch_size` or end-of-stream). + +### `relational/read` and `fetch_size` + +`relational/read` exposes an extra `fetch_size` property that controls how many rows are pulled from the database driver per round-trip, independent of the pipeline `batch_size`. Default: `10000`. Tune lower for memory pressure with wide rows; tune higher if you want fewer DB round-trips and downstream processing is the bottleneck. + +### `azure/read_event_hub` migration note + +In earlier versions, `batch_size` on `azure/read_event_hub` controlled the SDK callback batch size, not the pipeline batch size. As of #400 it has been renamed to `max_batch_size` to match the SDK semantics, and `batch_size` now consistently means pipeline batch size as it does for every other producer. Existing configurations that still set `batch_size` continue to validate but now control the pipeline batch size; rename the property to `max_batch_size` to keep the previous SDK-level behavior. + ## Mix and Match The processing strategies can be mixed to fit the specific use case.
For example, reading records from a Stream one by one, pushing into a parallel processor to perform a transformation, batched and fanned out to multiple processes to load into a relational database in bulk From 5c2eab4d8a13fb1bff20e426739617198437c6da Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 17:25:55 +0300 Subject: [PATCH 24/38] Clean up http/receiver test teardown (#400) Explicitly aclose() the producer async generator on consumer exit so the underlying pump task and aiohttp server are torn down cleanly. Removes a "Task was destroyed but it is pending!" warning at test teardown. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../blocks/http/receiver/tests/test_http_receiver.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py index 613d91d7..85a40435 100644 --- a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py +++ b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py @@ -21,12 +21,16 @@ async def test_http_receiver_batches_incoming_requests(): block.init() received = [] + gen = block.produce() async def consumer(): - async for batch in block.produce(): - received.append(batch) - if sum(len(b) for b in received) >= 60: - return + try: + async for batch in gen: + received.append(batch) + if sum(len(b) for b in received) >= 60: + return + finally: + await gen.aclose() consumer_task = asyncio.create_task(consumer()) await asyncio.sleep(0.2) # let server start From 05f4b015e73bf359f34b83f00df7ee3ab48ce043 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 17:39:38 +0300 Subject: [PATCH 25/38] Add docstrings to all methods touched in this PR (#400) One-line docstrings on every method, helper, and inner function added or modified by this PR: Producer base class, all 7 migrated producers, and the per-block test 
helpers. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../datayoga_core/blocks/azure/read_event_hub/block.py | 6 ++++++ .../blocks/azure/read_event_hub/tests/test_event_hub.py | 1 + core/src/datayoga_core/blocks/files/read_csv/block.py | 3 +++ .../blocks/files/read_csv/tests/test_read_csv.py | 2 ++ core/src/datayoga_core/blocks/http/receiver/block.py | 5 +++++ .../blocks/http/receiver/tests/test_http_receiver.py | 2 ++ core/src/datayoga_core/blocks/parquet/read/block.py | 3 +++ .../blocks/parquet/read/tests/test_parquet_read.py | 2 ++ core/src/datayoga_core/blocks/redis/read_stream/block.py | 5 +++++ .../redis/read_stream/tests/test_redis_read_stream.py | 1 + core/src/datayoga_core/blocks/relational/read/block.py | 5 +++++ .../blocks/relational/read/tests/test_relational_read.py | 8 +++++++- core/src/datayoga_core/blocks/std/read/block.py | 9 +++++++++ .../datayoga_core/blocks/std/read/tests/test_std_read.py | 1 + core/src/datayoga_core/producer.py | 4 ++++ core/src/datayoga_core/tests/test_producer_batching.py | 6 ++++++ 16 files changed, 62 insertions(+), 1 deletion(-) diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/block.py b/core/src/datayoga_core/blocks/azure/read_event_hub/block.py index 77f76d7d..ba4173ba 100644 --- a/core/src/datayoga_core/blocks/azure/read_event_hub/block.py +++ b/core/src/datayoga_core/blocks/azure/read_event_hub/block.py @@ -19,6 +19,7 @@ class Block(DyProducer): DEFAULT_FLUSH_MS = 1000 def init(self, context: Optional[Context] = None): + """Constructs the Event Hub consumer client and the internal message queue.""" logger.debug(f"Initializing {self.get_block_name()}") self.max_batch_size = int(self.properties.get("max_batch_size", 300)) self.consumer_client = EventHubConsumerClient.from_connection_string( @@ -33,6 +34,7 @@ def init(self, context: Optional[Context] = None): self.messages: asyncio.Queue = asyncio.Queue() async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + """Starts the 
receive loop and yields one chunk per drained-queue snapshot.""" logger.debug(f"Running {self.get_block_name()}") logger.debug("Starting event receiving process") asyncio.create_task(self.receive_batch()) @@ -45,6 +47,7 @@ async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: yield chunk async def receive_batch(self): + """Runs the Azure SDK receive loop, dispatching each batch to `on_event_batch`.""" await self.consumer_client.receive_batch( on_event_batch=self.on_event_batch, max_batch_size=self.max_batch_size, @@ -52,6 +55,7 @@ async def receive_batch(self): ) async def on_event_batch(self, partition_context: PartitionContext, events: List[EventData]): + """SDK callback: parses each event body as JSON and enqueues it for delivery.""" logger.debug(f"Received batch of events from partition: {partition_context.partition_id}") for event in events: try: @@ -64,6 +68,7 @@ async def on_event_batch(self, partition_context: PartitionContext, events: List logger.error(e) async def complete_events(self, msg_ids: List[str]): + """Updates the partition checkpoint for each previously-delivered message id.""" for msg_id in msg_ids: logger.debug(f"Acking {msg_id} event") event, partition_context = self.events.pop(msg_id, (None, None)) @@ -73,4 +78,5 @@ async def complete_events(self, msg_ids: List[str]): logger.warning(f"Couldn't find event {msg_id} for acknowledging") def ack(self, msg_ids: List[str]): + """Schedules checkpoint updates for the given message ids.""" asyncio.create_task(self.complete_events(msg_ids)) diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py index 074b7c36..f0f06cd0 100644 --- a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py +++ b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py @@ -5,6 +5,7 @@ def _minimal_props(extra=None): + """Returns a minimal set of 
properties accepted by the Event Hub block schema.""" base = { "event_hub_connection_string": "Endpoint=sb://x/;SharedAccessKeyName=k;SharedAccessKey=v;EntityPath=eh", "event_hub_consumer_group_name": "$Default", diff --git a/core/src/datayoga_core/blocks/files/read_csv/block.py b/core/src/datayoga_core/blocks/files/read_csv/block.py index 336450dc..8e94f1f7 100644 --- a/core/src/datayoga_core/blocks/files/read_csv/block.py +++ b/core/src/datayoga_core/blocks/files/read_csv/block.py @@ -13,8 +13,10 @@ class Block(DyProducer, metaclass=ABCMeta): + """Producer block that reads records from a CSV file.""" def init(self, context: Optional[Context] = None): + """Initializes the block: resolves the CSV file path and reader options.""" logger.debug(f"Initializing {self.get_block_name()}") csv_file = self.properties["file"] if os.path.isabs(csv_file) or context is None: @@ -29,6 +31,7 @@ def init(self, context: Optional[Context] = None): self.quotechar = self.properties.get("quotechar", "\"") async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + """Yields successive `batch_size`-sized chunks of CSV rows.""" logger.debug("Reading CSV") batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)) diff --git a/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py index 16cb9b17..22651bb1 100644 --- a/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py +++ b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py @@ -6,6 +6,7 @@ async def _drain(producer): + """Collects all batches emitted by a producer until end-of-stream.""" out = [] async for batch in producer.produce(): out.append(batch) @@ -14,6 +15,7 @@ async def _drain(producer): @pytest.fixture def csv_path(tmp_path) -> Path: + """Writes a 2500-row CSV with a single header row to a temp path.""" p = tmp_path / "data.csv" rows = ["fname,lname"] + [f"first{i},last{i}" 
for i in range(2500)] p.write_text("\n".join(rows) + "\n", encoding="utf-8") diff --git a/core/src/datayoga_core/blocks/http/receiver/block.py b/core/src/datayoga_core/blocks/http/receiver/block.py index 3f5b1833..ab0fa60a 100644 --- a/core/src/datayoga_core/blocks/http/receiver/block.py +++ b/core/src/datayoga_core/blocks/http/receiver/block.py @@ -15,19 +15,24 @@ class Block(DyProducer, metaclass=ABCMeta): + """Producer block that exposes an HTTP endpoint and emits POSTed JSON bodies.""" + port: int host: str DEFAULT_FLUSH_MS = 1000 def init(self, context: Optional[Context] = None): + """Reads host/port from properties; the HTTP server is started in produce_chunks.""" logger.debug(f"Initializing {self.get_block_name()}") self.port = int(self.properties.get("port", 8080)) self.host = self.properties.get("host", "0.0.0.0") async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + """Starts the HTTP server, then yields one chunk per drained queue snapshot.""" queue: Queue = Queue(maxsize=1000) async def handler(request: BaseRequest) -> Response: + """Parses the incoming HTTP body as JSON and enqueues it for delivery.""" try: queue.put_nowait(orjson.loads(await request.read())) return HTTPOk() diff --git a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py index 85a40435..9f93360e 100644 --- a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py +++ b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py @@ -7,6 +7,7 @@ def _free_port(): + """Returns an unused TCP port on localhost.""" import socket with socket.socket() as s: s.bind(("127.0.0.1", 0)) @@ -24,6 +25,7 @@ async def test_http_receiver_batches_incoming_requests(): gen = block.produce() async def consumer(): + """Drains the producer until 60 records have arrived, then closes the generator.""" try: async for batch in gen: received.append(batch) diff --git 
a/core/src/datayoga_core/blocks/parquet/read/block.py b/core/src/datayoga_core/blocks/parquet/read/block.py index 1c7128c6..f82604ee 100644 --- a/core/src/datayoga_core/blocks/parquet/read/block.py +++ b/core/src/datayoga_core/blocks/parquet/read/block.py @@ -12,8 +12,10 @@ class Block(DyProducer, metaclass=ABCMeta): + """Producer block that reads records from a Parquet file.""" def init(self, context: Optional[Context] = None): + """Initializes the block: resolves the Parquet file path.""" logger.debug(f"Initializing {self.get_block_name()}") parquet_file = self.properties["file"] if os.path.isabs(parquet_file) or context is None: @@ -23,6 +25,7 @@ def init(self, context: Optional[Context] = None): logger.debug(f"file: {self.file}") async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + """Yields one chunk per Parquet row group; the base class re-chunks to `batch_size`.""" logger.debug("Reading parquet") pf = ParquetFile(self.file) counter = iter(count()) diff --git a/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py index ab6d8517..a04bc3fe 100644 --- a/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py +++ b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py @@ -7,6 +7,7 @@ async def _drain(producer): + """Collects all batches emitted by a producer until end-of-stream.""" out = [] async for batch in producer.produce(): out.append(batch) @@ -15,6 +16,7 @@ async def _drain(producer): @pytest.fixture def parquet_path(tmp_path) -> Path: + """Writes a 2500-row Parquet file with three row groups (1000, 1000, 500).""" p = tmp_path / "data.parquet" df = pd.DataFrame({"i": list(range(2500))}) from fastparquet import write as fp_write diff --git a/core/src/datayoga_core/blocks/redis/read_stream/block.py b/core/src/datayoga_core/blocks/redis/read_stream/block.py index 136d0963..aa464743 100644 --- 
a/core/src/datayoga_core/blocks/redis/read_stream/block.py +++ b/core/src/datayoga_core/blocks/redis/read_stream/block.py @@ -11,9 +11,12 @@ class Block(DyProducer): + """Producer block that reads messages from a Redis stream consumer group.""" + DEFAULT_FLUSH_MS = 1000 def init(self, context: Optional[Context] = None): + """Connects to Redis and ensures the consumer group exists on the target stream.""" logger.debug(f"Initializing {self.get_block_name()}") connection_details = Connection.get_connection_details(self.properties["connection"], context) self.redis_client = redis_utils.get_client(connection_details) @@ -27,6 +30,7 @@ def init(self, context: Optional[Context] = None): self.redis_client.xgroup_create(self.stream, self.consumer_group, 0) async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + """Reads pending then new stream messages via XREADGROUP, yielding each response as a chunk.""" logger.debug(f"Running {self.get_block_name()}") batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)) read_pending = True @@ -57,6 +61,7 @@ async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: read_pending = False def ack(self, msg_ids: List[str]): + """Acknowledges the given message ids with XACK on the stream consumer group.""" for msg_id in msg_ids: logger.info(f"Acking {msg_id} message in {self.stream} stream of {self.consumer_group} consumer group") self.redis_client.xack(self.stream, self.consumer_group, msg_id) diff --git a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py index f45b8d67..f06936d4 100644 --- a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py +++ b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py @@ -6,6 +6,7 @@ def _mk_block(properties, redis_client): + """Builds a redis/read_stream Block bypassing its real 
init() (mocks the Redis client).""" block = Block.__new__(Block) block.properties = properties block.redis_client = redis_client diff --git a/core/src/datayoga_core/blocks/relational/read/block.py b/core/src/datayoga_core/blocks/relational/read/block.py index 2b04f3c3..4dd8f026 100644 --- a/core/src/datayoga_core/blocks/relational/read/block.py +++ b/core/src/datayoga_core/blocks/relational/read/block.py @@ -11,9 +11,12 @@ class Block(DyProducer): + """Producer block that reads rows from a SQL-compatible relational database.""" + DEFAULT_FETCH_SIZE = 10000 def init(self, context: Optional[Context] = None): + """Initializes the engine, autoloads the target table, and opens a connection.""" self.engine, self.db_type = relational_utils.get_engine( self.properties["connection"], context, @@ -33,6 +36,7 @@ def init(self, context: Optional[Context] = None): self.connection = self.engine.connect() async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + """Yields each `fetchmany(fetch_size)` result as a chunk; the base class re-chunks to `batch_size`.""" fetch_size = int(self.properties.get("fetch_size", self.DEFAULT_FETCH_SIZE)) result = self.connection.execution_options(stream_results=True).execute(self.tbl.select()) while True: @@ -42,5 +46,6 @@ async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: yield [utils.add_uid(dict(row._asdict())) for row in rows] def stop(self): + """Closes the database connection and disposes of the engine.""" self.connection.close() self.engine.dispose() diff --git a/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py index 0fba4629..3e59315b 100644 --- a/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py +++ b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py @@ -6,6 +6,7 @@ async def _drain(producer): + """Collects all batches emitted by a 
producer until end-of-stream.""" out = [] async for batch in producer.produce(): out.append(batch) @@ -13,7 +14,7 @@ async def _drain(producer): def _fake_result(rows): - """Build a fake SQLAlchemy result that returns rows in fetchmany chunks.""" + """Builds a fake SQLAlchemy result that returns rows in fetchmany chunks.""" state = {"i": 0} def fetchmany(n): @@ -29,14 +30,19 @@ def fetchmany(n): class _Row: + """Stand-in for a SQLAlchemy Row exposing only `_asdict()`.""" + def __init__(self, d): + """Stores the underlying dict that `_asdict()` will return.""" self._d = d def _asdict(self): + """Returns the stored dict, matching SQLAlchemy Row's API.""" return self._d def _mk_block(properties, fake_result): + """Builds a relational/read Block without running its real init() (mocks engine/connection).""" block = Block.__new__(Block) block.properties = properties block.connection = MagicMock() diff --git a/core/src/datayoga_core/blocks/std/read/block.py b/core/src/datayoga_core/blocks/std/read/block.py index 1c51839d..8ff15811 100644 --- a/core/src/datayoga_core/blocks/std/read/block.py +++ b/core/src/datayoga_core/blocks/std/read/block.py @@ -12,10 +12,17 @@ class Block(DyProducer): + """Producer block that reads JSON records from standard input.""" + def init(self, context: Optional[Context] = None): + """Initializes the block.""" logger.debug(f"Initializing {self.get_block_name()}") async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: + """Reads all stdin records and yields them as a single chunk. + + The base class re-chunks the output to `batch_size` records per batch. 
+ """ if select.select([sys.stdin], [], [], 0.0)[0]: all_records: List[Dict[str, Any]] = [] for line in sys.stdin: @@ -29,10 +36,12 @@ async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: @staticmethod def get_records(data: str) -> List[Dict[str, Any]]: + """Parses a JSON string into a list of records (wraps single objects in a list).""" records = orjson.loads(data) if isinstance(records, dict): records = [records] return records def get_message(self, record: Dict[str, Any]) -> Dict[str, Any]: + """Returns the record with a generated message id field added.""" return {self.MSG_ID_FIELD: str(uuid.uuid4()), **record} diff --git a/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py index 609f0915..b588af69 100644 --- a/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py +++ b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py @@ -7,6 +7,7 @@ async def _drain(producer): + """Collects all batches emitted by a producer until end-of-stream.""" out = [] async for batch in producer.produce(): out.append(batch) diff --git a/core/src/datayoga_core/producer.py b/core/src/datayoga_core/producer.py index 2b61390d..8b672433 100644 --- a/core/src/datayoga_core/producer.py +++ b/core/src/datayoga_core/producer.py @@ -9,7 +9,10 @@ class Message: + """A message produced by a producer block.""" + def __init__(self, msg_id: str, value: Dict[str, Any]): + """Initializes a message with an id and a payload value.""" self.msg_id = msg_id self.value = value @@ -54,6 +57,7 @@ async def produce(self) -> AsyncGenerator[List[Dict[str, Any]], None]: EOS = object() async def pump(): + """Drains produce_chunks() into the queue; signals EOS on exit.""" try: async for chunk in self.produce_chunks(): if chunk: diff --git a/core/src/datayoga_core/tests/test_producer_batching.py b/core/src/datayoga_core/tests/test_producer_batching.py index 59601786..695d7e1c 100644 --- 
a/core/src/datayoga_core/tests/test_producer_batching.py +++ b/core/src/datayoga_core/tests/test_producer_batching.py @@ -8,6 +8,7 @@ def _msg(i: int) -> dict: + """Builds a record carrying the producer MSG_ID_FIELD and a numeric value.""" return {Producer.MSG_ID_FIELD: str(i), "v": i} @@ -15,6 +16,7 @@ class FakeProducer(Producer): """Producer driven by a scripted list of chunks plus optional sleeps.""" def __init__(self, properties=None, *, chunks=None, sleep_before=None): + """Configures the scripted chunks and optional per-chunk sleep delays.""" # schema for a FakeProducer; declare batch_size/flush_ms so validation passes self._test_schema = { "type": "object", @@ -28,12 +30,15 @@ def __init__(self, properties=None, *, chunks=None, sleep_before=None): super().__init__(properties or {}) def get_json_schema(self): + """Returns the in-memory test schema (avoids reading from disk).""" return self._test_schema def init(self, context: Optional[Context] = None): + """No-op init; FakeProducer doesn't need any setup.""" pass async def produce_chunks(self) -> AsyncGenerator[List[Message], None]: + """Yields the scripted chunks, optionally sleeping before each one.""" for i, chunk in enumerate(self._chunks): if i < len(self._sleep_before) and self._sleep_before[i]: await asyncio.sleep(self._sleep_before[i]) @@ -41,6 +46,7 @@ async def produce_chunks(self) -> AsyncGenerator[List[Message], None]: async def _drain(producer: Producer): + """Collects all batches emitted by a producer until end-of-stream.""" out = [] async for batch in producer.produce(): out.append(batch) From 0b8d8f77638d17813b11ea60e8841e4f542060b9 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 17:43:10 +0300 Subject: [PATCH 26/38] Fix CI: isort formatting + stdlib-only $inherit resolver in docs script (#400) - Run isort/autopep8 on test files; collapse the blank line between third-party imports (pytest, etc.) and datayoga_core imports that isort flagged. 
- Rewrite the $inherit resolution in scripts/generate-docs.sh to use only the Python standard library, so the docs CI job (which installs only node) no longer hits ModuleNotFoundError on prometheus_client when importing datayoga_core. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../read_event_hub/tests/test_event_hub.py | 3 +- .../files/read_csv/tests/test_read_csv.py | 1 - .../http/receiver/tests/test_http_receiver.py | 1 - .../parquet/read/tests/test_parquet_read.py | 1 - .../tests/test_redis_read_stream.py | 1 - .../read/tests/test_relational_read.py | 1 - .../blocks/std/read/tests/test_std_read.py | 3 +- .../tests/test_producer_batching.py | 1 - .../tests/test_schema_inherit.py | 2 - scripts/generate-docs.sh | 45 ++++++++++++------- 10 files changed, 30 insertions(+), 29 deletions(-) diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py index f0f06cd0..17cff570 100644 --- a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py +++ b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py @@ -1,7 +1,6 @@ import pytest -from jsonschema import ValidationError - from datayoga_core.blocks.azure.read_event_hub.block import Block +from jsonschema import ValidationError def _minimal_props(extra=None): diff --git a/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py index 22651bb1..55fd548e 100644 --- a/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py +++ b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py @@ -1,7 +1,6 @@ from pathlib import Path import pytest - from datayoga_core.blocks.files.read_csv.block import Block diff --git a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py index 
9f93360e..4673801d 100644 --- a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py +++ b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py @@ -2,7 +2,6 @@ import aiohttp import pytest - from datayoga_core.blocks.http.receiver.block import Block diff --git a/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py index a04bc3fe..b33a3d03 100644 --- a/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py +++ b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py @@ -2,7 +2,6 @@ import pandas as pd import pytest - from datayoga_core.blocks.parquet.read.block import Block diff --git a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py index f06936d4..5d46e99e 100644 --- a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py +++ b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py @@ -1,7 +1,6 @@ from unittest.mock import MagicMock import pytest - from datayoga_core.blocks.redis.read_stream.block import Block diff --git a/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py index 3e59315b..6dafd72e 100644 --- a/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py +++ b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py @@ -1,7 +1,6 @@ from unittest.mock import MagicMock import pytest - from datayoga_core.blocks.relational.read.block import Block diff --git a/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py index b588af69..d9698d16 100644 --- a/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py +++ 
b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py @@ -2,7 +2,6 @@ import orjson import pytest - from datayoga_core.blocks.std.read.block import Block @@ -24,7 +23,7 @@ async def test_std_read_batches_to_batch_size(): with patch("datayoga_core.blocks.std.read.block.select.select", return_value=([object()], [], [])), \ - patch("datayoga_core.blocks.std.read.block.sys.stdin", fake_stdin): + patch("datayoga_core.blocks.std.read.block.sys.stdin", fake_stdin): batches = await _drain(block) assert [len(b) for b in batches] == [1000, 1000, 500] diff --git a/core/src/datayoga_core/tests/test_producer_batching.py b/core/src/datayoga_core/tests/test_producer_batching.py index 695d7e1c..34413037 100644 --- a/core/src/datayoga_core/tests/test_producer_batching.py +++ b/core/src/datayoga_core/tests/test_producer_batching.py @@ -2,7 +2,6 @@ from typing import AsyncGenerator, List, Optional import pytest - from datayoga_core.context import Context from datayoga_core.producer import Message, Producer diff --git a/core/src/datayoga_core/tests/test_schema_inherit.py b/core/src/datayoga_core/tests/test_schema_inherit.py index c22ea2c8..46f448cd 100644 --- a/core/src/datayoga_core/tests/test_schema_inherit.py +++ b/core/src/datayoga_core/tests/test_schema_inherit.py @@ -1,10 +1,8 @@ from pathlib import Path import pytest - from datayoga_core.schema_utils import resolve_inherits - SCHEMAS_DIR = ( Path(__file__).resolve().parent.parent / "resources" / "schemas" ) diff --git a/scripts/generate-docs.sh b/scripts/generate-docs.sh index 7fabd705..03aa51ae 100755 --- a/scripts/generate-docs.sh +++ b/scripts/generate-docs.sh @@ -37,15 +37,6 @@ done rm -rf ./docs/reference/blocks mkdir ./docs/reference/blocks -# Pick a Python that can import datayoga_core via PYTHONPATH=core/src. 
-if [ -x "./core/.venv/bin/python" ]; then - DOC_PYTHON="./core/.venv/bin/python" -elif [ -x "./venv/bin/python" ]; then - DOC_PYTHON="./venv/bin/python" -else - DOC_PYTHON="python3" -fi - # Track temp files so we can clean them up on exit. RESOLVED_TMP_FILES=() cleanup_resolved_tmps() { @@ -56,6 +47,7 @@ cleanup_resolved_tmps() { trap cleanup_resolved_tmps EXIT blocks_dir="./core/src/datayoga_core/blocks" +schemas_dir="./core/src/datayoga_core/resources/schemas" for schema in $(find ${blocks_dir} -name '*.schema.json' | sort) do doc_name="$(awk -F/ '{ print $(NF-1) }' <<<${schema}).md" @@ -67,16 +59,35 @@ do # Resolve $inherit fragments so jsonschema2mk sees the inherited properties # (batch_size, flush_ms, etc.). jsonschema2mk does not understand our custom # $inherit extension, so we materialize a resolved copy first. + # Self-contained Python (stdlib only) so this works in CI without installing + # datayoga_core's runtime dependencies. resolved_tmp="$(mktemp --suffix=.schema.json)" RESOLVED_TMP_FILES+=("${resolved_tmp}") - PYTHONPATH=core/src "${DOC_PYTHON}" -c " -import json, sys -from datayoga_core.schema_utils import resolve_inherits -from datayoga_core import utils -schema = utils.read_json('${schema}') -resolved = resolve_inherits(schema) -sys.stdout.write(json.dumps(resolved)) -" > "${resolved_tmp}" + python3 - "${schema}" "${schemas_dir}" > "${resolved_tmp}" <<'PYEOF' +import json +import os +import sys + +schema_path, schemas_dir = sys.argv[1], sys.argv[2] +with open(schema_path) as f: + schema = json.load(f) +inherits = schema.get("$inherit") or [] +if inherits: + if not isinstance(inherits, list) or not all(isinstance(n, str) for n in inherits): + raise SystemExit(f"$inherit must be a list of strings, got {inherits!r}") + merged = {} + for name in inherits: + fragment_path = os.path.join(schemas_dir, f"{name}.schema.json") + with open(fragment_path) as f: + fragment = json.load(f) + if fragment.get("$inherit"): + raise SystemExit(f"Nested $inherit in 
fragment '{name}' is not supported") + merged.update(fragment.get("properties", {})) + merged.update(schema.get("properties", {})) + schema["properties"] = merged + schema.pop("$inherit", None) +json.dump(schema, sys.stdout) +PYEOF npx jsonschema2mk --schema "${resolved_tmp}" --extension yaml-examples \ --extension front-matter --fm.parent "Blocks" --fm.grand_parent "Reference" > \ From 4e3b3fc974441348da2b0c00314b897725aed8c9 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 17:51:25 +0300 Subject: [PATCH 27/38] Add azure-eventhub deps to test extras for CI (#400) unit-tests CI does `pip install .[test]`. The azure/read_event_hub test module imports the block, which imports azure.eventhub at module load. Without azure-eventhub in the test extras, pytest's collection fails on ModuleNotFoundError. Other producer test modules (parquet, redis, http, relational) already work because their backing deps are in [test]. Co-Authored-By: Claude Opus 4.7 (1M context) --- core/pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/pyproject.toml b/core/pyproject.toml index 9939fd53..2a55ee25 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -66,6 +66,8 @@ sqlserver = ["pymssql", "SQLAlchemy"] test = [ "aiohttp", + "azure-eventhub", + "azure-eventhub-checkpointstoreblob-aio", "cassandra-driver", "fastparquet", "ibm_db_sa", From a3d927595d981e1484e396633122ae9aae0d368d Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 18:38:50 +0300 Subject: [PATCH 28/38] Format superpowers spec and plan with prettier (#400) formatting-check runs prettier --check on all .md including these. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- ...026-05-28-producer-batching-unification.md | 30 ++++++++++++-- ...28-producer-batching-unification-design.md | 40 ++++++++++--------- 2 files changed, 47 insertions(+), 23 deletions(-) diff --git a/docs/superpowers/plans/2026-05-28-producer-batching-unification.md b/docs/superpowers/plans/2026-05-28-producer-batching-unification.md index a53f0e0f..d4fa4415 100644 --- a/docs/superpowers/plans/2026-05-28-producer-batching-unification.md +++ b/docs/superpowers/plans/2026-05-28-producer-batching-unification.md @@ -16,6 +16,7 @@ ## File Structure **Created:** + - `core/src/datayoga_core/resources/schemas/batchable.schema.json` — fragment exposing `batch_size` - `core/src/datayoga_core/resources/schemas/streamable.schema.json` — fragment exposing `flush_ms` (combined with batchable) - `core/src/datayoga_core/schema_utils.py` — `$inherit` resolver used by Block + Job @@ -36,6 +37,7 @@ - `core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py` **Modified:** + - `core/src/datayoga_core/producer.py` — adds `produce_chunks` and a default `produce()` that re-chunks - `core/src/datayoga_core/block.py` — `get_json_schema()` runs through `$inherit` resolver - `core/src/datayoga_core/job.py` — `get_json_schema()` loop runs each loaded schema through the resolver @@ -64,6 +66,7 @@ Adds the `$inherit` convention and the two shared fragments. After this task, schemas referencing `batchable` / `streamable` get the fragments' properties merged in at load time. 
**Files:** + - Create: `core/src/datayoga_core/resources/schemas/batchable.schema.json` - Create: `core/src/datayoga_core/resources/schemas/streamable.schema.json` - Create: `core/src/datayoga_core/schema_utils.py` @@ -198,6 +201,7 @@ def test_unknown_fragment_raises(): - [ ] **Step 1.5: Run test to verify it fails** Run: + ```bash cd core && python -m pytest src/datayoga_core/tests/test_schema_inherit.py -v ``` @@ -266,6 +270,7 @@ def resolve_inherits(schema: Dict[str, Any], schemas_dir: str = None) -> Dict[st - [ ] **Step 1.7: Run test to verify it passes** Run: + ```bash cd core && python -m pytest src/datayoga_core/tests/test_schema_inherit.py -v ``` @@ -305,6 +310,7 @@ Note: the `from datayoga_core.schema_utils import resolve_inherits` line is insi Modify `core/src/datayoga_core/job.py`. Inside the `for block_type, schema_path in block_info:` loop (around line 240–243), apply the resolver to each loaded schema. Find this block: + ```python for block_type, schema_path in block_info: block_types.append(block_type) @@ -314,6 +320,7 @@ Find this block: ``` Replace with: + ```python from datayoga_core.schema_utils import resolve_inherits for block_type, schema_path in block_info: @@ -353,6 +360,7 @@ git commit -m "Add \$inherit schema fragment resolver (#400)" Add `produce_chunks()` and a default `produce()` that re-chunks. Existing subclasses override `produce()` directly and are unaffected until migrated in later tasks. **Files:** + - Create: `core/src/datayoga_core/tests/test_producer_batching.py` - Modify: `core/src/datayoga_core/producer.py` @@ -492,6 +500,7 @@ async def test_consumer_cancellation_cleans_up_pump(): - [ ] **Step 2.2: Run tests to verify they fail** Run: + ```bash cd core && python -m pytest src/datayoga_core/tests/test_producer_batching.py -v ``` @@ -602,6 +611,7 @@ class Producer(Block): ``` Key differences from the current file: + - `produce()` is no longer `@abstractmethod` — it has a default implementation. 
- `produce_chunks()` is the new override hook (not formally `@abstractmethod` so legacy subclasses still validate). - `Message` class unchanged. @@ -609,6 +619,7 @@ Key differences from the current file: - [ ] **Step 2.4: Run tests to verify they pass** Run: + ```bash cd core && python -m pytest src/datayoga_core/tests/test_producer_batching.py -v ``` @@ -640,6 +651,7 @@ git commit -m "Producer base class re-chunks via produce_chunks (#400)" `std/read` already has `batch_size` and a custom `process_batch` accumulator. Replace it with a `produce_chunks` that yields one chunk; the base class re-chunks. **Files:** + - Modify: `core/src/datayoga_core/blocks/std/read/block.py` - Modify: `core/src/datayoga_core/blocks/std/read/block.schema.json` @@ -689,6 +701,7 @@ async def test_std_read_batches_to_batch_size(): - [ ] **Step 3.2: Run test to verify it fails** Run: + ```bash cd core && python -m pytest src/datayoga_core/blocks/std/read/tests/test_std_read.py -v ``` @@ -792,6 +805,7 @@ git commit -m "Migrate std/read to produce_chunks (#400, #296)" Replace the `produce()` override and `islice` loop with a `produce_chunks` that yields one chunk per `batch_size` rows. The base class re-chunks to the configured `batch_size`. **Files:** + - Modify: `core/src/datayoga_core/blocks/files/read_csv/block.py` - Modify: `core/src/datayoga_core/blocks/files/read_csv/block.schema.json` @@ -849,7 +863,7 @@ async def test_csv_default_batch_size(csv_path): cd core && python -m pytest src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py -v ``` -Expected: FAIL — current `produce()` works but the tests may pass coincidentally because `files/read_csv` already batches. That's fine; the test exists to *protect* the contract. Proceed to the migration anyway and confirm the test still passes afterward. +Expected: FAIL — current `produce()` works but the tests may pass coincidentally because `files/read_csv` already batches. That's fine; the test exists to _protect_ the contract. 
Proceed to the migration anyway and confirm the test still passes afterward. - [ ] **Step 4.3: Migrate `files/read_csv` to `produce_chunks`** @@ -1010,6 +1024,7 @@ git commit -m "Migrate files/read_csv to produce_chunks (#400)" Today `parquet/read` iterates each row of each row group and yields a single-record list per iteration. Migrate it to yield each row group as a single chunk; the base class re-chunks to `batch_size`. **Files:** + - Modify: `core/src/datayoga_core/blocks/parquet/read/block.py` - Modify: `core/src/datayoga_core/blocks/parquet/read/block.schema.json` @@ -1171,6 +1186,7 @@ git commit -m "Migrate parquet/read to produce_chunks, fix one-by-one yield (#40 Today `relational/read` does `fetchmany(10000)` then yields one row at a time. Migrate to `produce_chunks` that yields each `fetchmany` result. Add an optional `fetch_size` property; default to 10000 to preserve today's DB round-trip count. **Files:** + - Modify: `core/src/datayoga_core/blocks/relational/read/block.py` - Modify: `core/src/datayoga_core/blocks/relational/read/block.schema.json` @@ -1426,6 +1442,7 @@ git commit -m "Migrate relational/read to produce_chunks, add fetch_size (#400, The receiver currently yields one record per HTTP request. Migrate to drain the queue per chunk; `flush_ms` ensures partial batches flush during low-traffic periods. **Files:** + - Modify: `core/src/datayoga_core/blocks/http/receiver/block.py` - Modify: `core/src/datayoga_core/blocks/http/receiver/block.schema.json` @@ -1617,6 +1634,7 @@ git commit -m "Migrate http/receiver to produce_chunks (#400)" The redis stream producer yields one record at a time today. Migrate so it requests `count=batch_size` from `xreadgroup` and yields each response as a chunk; `flush_ms` flushes partial batches during low-volume periods. 
**Files:** + - Modify: `core/src/datayoga_core/blocks/redis/read_stream/block.py` - Modify: `core/src/datayoga_core/blocks/redis/read_stream/block.schema.json` @@ -1823,9 +1841,10 @@ git commit -m "Migrate redis/read_stream to batched xreadgroup (#400, #377)" ## Task 9: Migrate `azure/read_event_hub` (rename `batch_size` → `max_batch_size`) -Today `batch_size` controls the SDK callback size, not the pipeline batch size. Rename to `max_batch_size`, add `additionalProperties: false`, and use the streamable fragment so the *new* `batch_size` means pipeline batch size. +Today `batch_size` controls the SDK callback size, not the pipeline batch size. Rename to `max_batch_size`, add `additionalProperties: false`, and use the streamable fragment so the _new_ `batch_size` means pipeline batch size. **Files:** + - Modify: `core/src/datayoga_core/blocks/azure/read_event_hub/block.py` - Modify: `core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json` @@ -2070,6 +2089,7 @@ git commit -m "Migrate azure/read_event_hub; rename batch_size -> max_batch_size The aggregated `schemas/job.schema.json` and the per-block markdown in `docs/reference/blocks/` are generated by scripts. After the per-block schema changes, regenerate them. 
**Files:** + - Modify: `schemas/job.schema.json` - Modify: `docs/reference/blocks/std_read.md`, `files_read_csv.md`, `parquet_read.md`, `relational_read.md`, `redis_read_stream.md`, `http_receiver.md`, `azure_read_event_hub.md` (autogenerated) @@ -2109,6 +2129,7 @@ git commit -m "Regenerate JSON schemas and reference docs after producer batchin ## Task 11: Document the producer batching model in processing-strategies **Files:** + - Modify: `docs/processing-strategies.md` - [ ] **Step 11.1: Add a section on producer batching** @@ -2125,7 +2146,7 @@ input: uses: files.read_csv with: file: people.csv - batch_size: 500 # downstream steps process 500 records per call + batch_size: 500 # downstream steps process 500 records per call ``` Default: `1000`. @@ -2141,7 +2162,7 @@ input: connection: my_redis stream_name: events batch_size: 1000 - flush_ms: 500 # emit a partial batch after 500ms of inactivity + flush_ms: 500 # emit a partial batch after 500ms of inactivity ``` Default: `1000` ms. Set to `null` to disable time-based flushing (records are held until `batch_size` or end-of-stream). @@ -2173,6 +2194,7 @@ cd core && python -m pytest src/datayoga_core/ -v ``` Expected: all tests pass. 
Notably: + - `test_producer_batching.py` (7 tests) - `test_schema_inherit.py` (5 tests) - `test_std_read.py`, `test_read_csv.py`, `test_parquet_read.py`, `test_relational_read.py`, `test_http_receiver.py`, `test_redis_read_stream.py`, `test_event_hub.py` (12 tests total) diff --git a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md index 81692cdc..7ef27825 100644 --- a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md +++ b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md @@ -9,15 +9,15 @@ Seven producer blocks each handle (or fail to handle) batching differently: -| Producer | Bounded/Streaming | `batch_size` today | Behavior | -|---|---|---|---| -| `std/read` | bounded | yes, default 1000 *(on `batch_size_in_std_read_block` branch)* | custom `process_batch` accumulator | -| `files/read_csv` | bounded | yes, default 1000 | own `islice(reader, batch_size)` loop | -| `relational/read` | bounded | **no** — hardcoded `fetchmany(10000)` | yields one row at a time downstream (bug) | -| `parquet/read` | bounded | **no** | yields one row at a time (bug) | -| `redis/read_stream` | streaming | **no** | yields one record at a time (bug #377) | -| `azure/read_event_hub` | streaming | yes, default 300, **but** controls *SDK callback batch size*, not pipeline batch size | drains internal queue in unbounded batches | -| `http/receiver` | streaming | **no** | yields one record per HTTP request (bug) | +| Producer | Bounded/Streaming | `batch_size` today | Behavior | +| ---------------------- | ----------------- | ------------------------------------------------------------------------------------- | ------------------------------------------ | +| `std/read` | bounded | yes, default 1000 _(on `batch_size_in_std_read_block` branch)_ | custom `process_batch` accumulator | +| `files/read_csv` | bounded | yes, default 1000 | own 
`islice(reader, batch_size)` loop | +| `relational/read` | bounded | **no** — hardcoded `fetchmany(10000)` | yields one row at a time downstream (bug) | +| `parquet/read` | bounded | **no** | yields one row at a time (bug) | +| `redis/read_stream` | streaming | **no** | yields one record at a time (bug #377) | +| `azure/read_event_hub` | streaming | yes, default 300, **but** controls _SDK callback batch size_, not pipeline batch size | drains internal queue in unbounded batches | +| `http/receiver` | streaming | **no** | yields one record per HTTP request (bug) | Four are actively buggy (yielding single records into the pipeline when batches are intended). One uses `batch_size` with a different semantic. Each producer that has implemented batching has done it differently. @@ -115,7 +115,7 @@ async def produce(self) -> AsyncGenerator[List[Message], None]: await pump_task ``` -Why a queue and not `asyncio.wait_for(anext(gen), timeout)`: cancelling `__anext__` on an async generator with side effects (open connections, partial reads) can leave it in a broken state. Cancelling the *pump task* boundary is safe; the generator finishes its current chunk before the pump's `try/finally` runs. +Why a queue and not `asyncio.wait_for(anext(gen), timeout)`: cancelling `__anext__` on an async generator with side effects (open connections, partial reads) can leave it in a broken state. Cancelling the _pump task_ boundary is safe; the generator finishes its current chunk before the pump's `try/finally` runs. `flush_ms = None` ⇒ `timeout = None` ⇒ `queue.get()` waits forever ⇒ no time-based flush. Bounded sources don't set `flush_ms` and aren't affected. 
@@ -124,6 +124,7 @@ Why a queue and not `asyncio.wait_for(anext(gen), timeout)`: cancelling `__anext Two shared fragments in `core/src/datayoga_core/resources/schemas/`: `batchable.schema.json`: + ```json { "type": "object", @@ -139,6 +140,7 @@ Two shared fragments in `core/src/datayoga_core/resources/schemas/`: ``` `streamable.schema.json`: + ```json { "type": "object", @@ -324,15 +326,15 @@ class Block(DyProducer): ### Defaults summary -| Producer | `batch_size` | `flush_ms` | Other | -|---|---|---|---| -| `std/read` | 1000 | — | — | -| `files/read_csv` | 1000 | — | — | -| `relational/read` | 1000 | — | optional `fetch_size`, defaults to 10000 | -| `parquet/read` | 1000 | — | — | -| `redis/read_stream` | 1000 | 1000 | — | -| `azure/read_event_hub` | 1000 | 1000 | `max_batch_size` 300 (renamed from old `batch_size`) | -| `http/receiver` | 1000 | 1000 | — | +| Producer | `batch_size` | `flush_ms` | Other | +| ---------------------- | ------------ | ---------- | ---------------------------------------------------- | +| `std/read` | 1000 | — | — | +| `files/read_csv` | 1000 | — | — | +| `relational/read` | 1000 | — | optional `fetch_size`, defaults to 10000 | +| `parquet/read` | 1000 | — | — | +| `redis/read_stream` | 1000 | 1000 | — | +| `azure/read_event_hub` | 1000 | 1000 | `max_batch_size` 300 (renamed from old `batch_size`) | +| `http/receiver` | 1000 | 1000 | — | ## Tests From 5734e033637fca4461527f0538fc1c798a92e10b Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 19:04:39 +0300 Subject: [PATCH 29/38] Address Copilot review: propagate source errors + bounded backpressure (#400) Two correctness fixes flagged by the Copilot PR reviewer: 1. Source errors no longer become silent EOS. The pump now captures non-cancellation exceptions and re-raises them on the consumer side after flushing the partial buffer, so a Redis disconnect, broken CSV, or DB read error fails the job loudly instead of being treated as end-of-stream against truncated input. 2. 
The internal queue is now bounded (maxsize=1), restoring the backpressure the old yield-driven model had. Without this, large bounded sources (parquet, relational, csv) could pre-load the entire table/file into memory while downstream was processing batch 1. The pump's `finally: put(EOS)` is skipped on cancellation to avoid deadlocking against a full queue. Also: corrected processing-strategies docs to say "up to batch_size" instead of "exactly batch_size", since partial batches fire on EOS and flush_ms timeout. Three new tests: - test_source_errors_propagate_instead_of_silent_eos - test_source_error_flushes_buffer_before_raising - test_pump_does_not_outrun_consumer_unboundedly Co-Authored-By: Claude Opus 4.7 (1M context) --- core/src/datayoga_core/producer.py | 33 ++++- .../tests/test_producer_batching.py | 122 ++++++++++++++++++ docs/processing-strategies.md | 2 +- ...28-producer-batching-unification-design.md | 5 +- 4 files changed, 153 insertions(+), 9 deletions(-) diff --git a/core/src/datayoga_core/producer.py b/core/src/datayoga_core/producer.py index 8b672433..dc5b05d5 100644 --- a/core/src/datayoga_core/producer.py +++ b/core/src/datayoga_core/producer.py @@ -44,30 +44,47 @@ async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: yield # pragma: no cover async def produce(self) -> AsyncGenerator[List[Dict[str, Any]], None]: - """Re-chunks `produce_chunks()` output to exact batch_size batches. + """Re-chunks `produce_chunks()` output into batches of up to `batch_size`. + + Each batch is exactly `batch_size` except for the last batch on + end-of-stream and any partial batch flushed by `flush_ms` inactivity. Reads `batch_size` and `flush_ms` from properties lazily so subclasses don't need to remember to call `super().init()`. + + Source errors raised by `produce_chunks()` propagate to the caller (the + job aborts) rather than being treated as a silent end-of-stream. 
The + background pump uses a bounded queue so source reads cannot outpace + downstream consumption — the existing backpressure is preserved. """ batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)) flush_ms = self.properties.get("flush_ms", self.DEFAULT_FLUSH_MS) timeout = (flush_ms / 1000) if flush_ms else None - queue: asyncio.Queue = asyncio.Queue() + # maxsize=1 keeps the pump exactly one chunk ahead of the consumer, + # which restores the natural backpressure the old yield-driven model had. + queue: asyncio.Queue = asyncio.Queue(maxsize=1) EOS = object() + pump_error: List[BaseException] = [] # length 0 or 1 async def pump(): - """Drains produce_chunks() into the queue; signals EOS on exit.""" + """Drains produce_chunks() into the queue; signals EOS on exit and captures errors.""" + cancelled = False try: async for chunk in self.produce_chunks(): if chunk: await queue.put(chunk) except asyncio.CancelledError: + cancelled = True raise - except Exception as exc: - logger.exception("produce_chunks raised; ending stream: %s", exc) + except BaseException as exc: + pump_error.append(exc) finally: - await queue.put(EOS) + # Skip the EOS put when cancelled — the consumer's finally is + # awaiting us, the queue may be full (maxsize=1), and putting + # would deadlock. The consumer won't read EOS anyway. + if not cancelled: + await queue.put(EOS) pump_task = asyncio.create_task(pump()) buffer: List[Dict[str, Any]] = [] @@ -84,6 +101,10 @@ async def pump(): if item is EOS: if buffer: yield buffer + if pump_error: + # Re-raise the source error so the job fails loudly + # instead of treating a truncated read as success. 
+ raise pump_error[0] return buffer.extend(item) diff --git a/core/src/datayoga_core/tests/test_producer_batching.py b/core/src/datayoga_core/tests/test_producer_batching.py index 34413037..337cc3c6 100644 --- a/core/src/datayoga_core/tests/test_producer_batching.py +++ b/core/src/datayoga_core/tests/test_producer_batching.py @@ -129,3 +129,125 @@ async def test_consumer_cancellation_cleans_up_pump(): # If pump task wasn't cleaned up we'd see a "Task was destroyed but it is # pending!" warning here. Sleep briefly so the loop has a chance to surface it. await asyncio.sleep(0.1) + + +class _BoomProducer(Producer): + """Producer whose produce_chunks() raises after emitting some chunks.""" + + def __init__(self, properties, *, before_error, error): + """Configures how many chunks to emit before raising.""" + self._test_schema = { + "type": "object", + "properties": {"batch_size": {"type": "integer", "minimum": 1}}, + } + self._before_error = before_error + self._error = error + super().__init__(properties) + + def get_json_schema(self): + """Returns the in-memory test schema (avoids reading from disk).""" + return self._test_schema + + def init(self, context: Optional[Context] = None): + """No-op init; _BoomProducer doesn't need any setup.""" + pass + + async def produce_chunks(self) -> AsyncGenerator[List[Message], None]: + """Emits the scripted lead-in chunks, then raises the configured exception.""" + for chunk in self._before_error: + yield chunk + raise self._error + + +@pytest.mark.asyncio +async def test_source_errors_propagate_instead_of_silent_eos(): + """A failing source must abort the consumer, not look like clean EOS.""" + p = _BoomProducer( + {"batch_size": 100}, + before_error=[[_msg(1), _msg(2)]], + error=RuntimeError("source connection lost"), + ) + with pytest.raises(RuntimeError, match="source connection lost"): + async for _ in p.produce(): + pass + + +@pytest.mark.asyncio +async def test_source_error_flushes_buffer_before_raising(): + """Partial 
buffer is yielded before the error propagates, so already-read + records aren't dropped on top of the error.""" + p = _BoomProducer( + {"batch_size": 1000}, + before_error=[[_msg(1), _msg(2), _msg(3)]], + error=RuntimeError("disk read failed"), + ) + received = [] + with pytest.raises(RuntimeError, match="disk read failed"): + async for batch in p.produce(): + received.append(batch) + assert [len(b) for b in received] == [3] + + +class _CountingProducer(Producer): + """Producer that records how many chunks it has been allowed to emit. + + Used to prove the base class applies backpressure (the pump stays no more + than one chunk ahead of the consumer when maxsize=1). + """ + + def __init__(self, properties, *, num_chunks, chunk_size, on_emit): + """Configures how many fixed-size chunks to emit and a per-emit hook.""" + self._test_schema = { + "type": "object", + "properties": {"batch_size": {"type": "integer", "minimum": 1}}, + } + self._num_chunks = num_chunks + self._chunk_size = chunk_size + self._on_emit = on_emit + super().__init__(properties) + + def get_json_schema(self): + """Returns the in-memory test schema (avoids reading from disk).""" + return self._test_schema + + def init(self, context: Optional[Context] = None): + """No-op init; _CountingProducer doesn't need any setup.""" + pass + + async def produce_chunks(self) -> AsyncGenerator[List[Message], None]: + """Yields num_chunks fixed-size chunks, calling on_emit after each yield.""" + for i in range(self._num_chunks): + yield [_msg(i * self._chunk_size + j) for j in range(self._chunk_size)] + self._on_emit(i + 1) + + +@pytest.mark.asyncio +async def test_pump_does_not_outrun_consumer_unboundedly(): + """With the default bounded queue, the pump stays close to the consumer. + + Without backpressure, the pump would emit all 1000 chunks before the + consumer reads any. With maxsize=1 the pump can be at most ~2 chunks + ahead at any moment (one being put, one queued). 
+ """ + emitted_count = [0] + + def record_emit(n): + emitted_count[0] = n + + p = _CountingProducer( + {"batch_size": 100}, + num_chunks=1000, + chunk_size=100, + on_emit=record_emit, + ) + + gen = p.produce() + # Pull one batch and observe how far ahead the pump got. + await gen.__anext__() + # Yield once so the pump gets a chance to advance after the consumer + # took one chunk off the queue. + await asyncio.sleep(0) + ahead = emitted_count[0] + await gen.aclose() + # Pump should be at most a handful of chunks ahead, not all 1000. + assert ahead <= 5, f"pump emitted {ahead} chunks while consumer pulled 1" diff --git a/docs/processing-strategies.md b/docs/processing-strategies.md index 692d82c1..2cf186b3 100644 --- a/docs/processing-strategies.md +++ b/docs/processing-strategies.md @@ -66,7 +66,7 @@ The Rate limit strategy defines the number of requests per given time interval. ## Producer Batching -Every producer block (any block that reads from a source — `std/read`, `files/read_csv`, `parquet/read`, `relational/read`, `redis/read_stream`, `azure/read_event_hub`, `http/receiver`) accepts a `batch_size` property. The producer base class re-chunks the source's output into batches of exactly `batch_size` records, regardless of how the source delivers them (per row, per row group, per `fetchmany`, per network message). +Every producer block (any block that reads from a source — `std/read`, `files/read_csv`, `parquet/read`, `relational/read`, `redis/read_stream`, `azure/read_event_hub`, `http/receiver`) accepts a `batch_size` property. The producer base class re-chunks the source's output into batches of up to `batch_size` records, regardless of how the source delivers them (per row, per row group, per `fetchmany`, per network message). The last batch on end-of-stream and any partial batch flushed by `flush_ms` may be smaller. 
```yaml input: diff --git a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md index 7ef27825..2b96ce05 100644 --- a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md +++ b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md @@ -53,13 +53,14 @@ class Producer(Block): async def produce(self) -> AsyncGenerator[List[Message], None]: """Public entry point. Reads chunks from produce_chunks() and re-emits - in exact batch_size slices, with optional time-based flush.""" + in batches of up to batch_size (smaller on EOS or flush_ms), with + bounded backpressure and source-error propagation.""" ... ``` Subclasses override `produce_chunks` instead of `produce`. They emit chunks of any size — whatever's natural to the source (a Parquet row group, a `fetchmany` result, an `xreadgroup` response, an Event Hub callback batch, a single record). -The base class accumulates chunks and re-emits them in exact `batch_size` slices, flushing whatever's left on end-of-stream. +The base class accumulates chunks and re-emits them in batches of up to `batch_size`, flushing whatever's left on end-of-stream and (for streaming sources) on `flush_ms` inactivity. ### `batch_size` and `flush_ms` are read lazily From 0dda422520b89c8d8ad017a9f77b8616aab6c988 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 19:10:18 +0300 Subject: [PATCH 30/38] Add docstrings to all test functions added in this PR (#400) One-liner docstrings on every test_* function: producer batching, schema inherit, and per-block read tests. Matches the docstring coverage applied to production code earlier in this PR. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../blocks/files/read_csv/tests/test_read_csv.py | 2 ++ .../blocks/http/receiver/tests/test_http_receiver.py | 1 + .../blocks/parquet/read/tests/test_parquet_read.py | 2 ++ .../redis/read_stream/tests/test_redis_read_stream.py | 2 ++ .../blocks/relational/read/tests/test_relational_read.py | 3 +++ .../datayoga_core/blocks/std/read/tests/test_std_read.py | 1 + core/src/datayoga_core/tests/test_producer_batching.py | 7 +++++++ core/src/datayoga_core/tests/test_schema_inherit.py | 9 +++++++++ 8 files changed, 27 insertions(+) diff --git a/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py index 55fd548e..a479910a 100644 --- a/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py +++ b/core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py @@ -23,6 +23,7 @@ def csv_path(tmp_path) -> Path: @pytest.mark.asyncio async def test_csv_batches_to_batch_size(csv_path): + """2500 CSV rows with batch_size=1000 yields batches of [1000, 1000, 500].""" block = Block({"file": str(csv_path), "batch_size": 1000}) block.init() batches = await _drain(block) @@ -33,6 +34,7 @@ async def test_csv_batches_to_batch_size(csv_path): @pytest.mark.asyncio async def test_csv_default_batch_size(csv_path): + """Without batch_size in properties, the default 1000 is applied.""" block = Block({"file": str(csv_path)}) block.init() batches = await _drain(block) diff --git a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py index 4673801d..ee187f71 100644 --- a/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py +++ b/core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py @@ -15,6 +15,7 @@ def _free_port(): @pytest.mark.asyncio async def test_http_receiver_batches_incoming_requests(): + """60 POSTs with 
batch_size=50 + flush_ms=200 yield at least one full batch of 50.""" port = _free_port() block = Block({"host": "127.0.0.1", "port": port, "batch_size": 50, "flush_ms": 200}) diff --git a/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py index b33a3d03..546f77d9 100644 --- a/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py +++ b/core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py @@ -25,6 +25,7 @@ def parquet_path(tmp_path) -> Path: @pytest.mark.asyncio async def test_parquet_batches_to_batch_size(parquet_path): + """2500 rows across three row groups, batch_size=1000 -> [1000, 1000, 500].""" block = Block({"file": str(parquet_path), "batch_size": 1000}) block.init() batches = await _drain(block) @@ -36,6 +37,7 @@ async def test_parquet_batches_to_batch_size(parquet_path): @pytest.mark.asyncio async def test_parquet_rechunks_across_row_groups(parquet_path): + """Batches honor batch_size regardless of underlying row-group boundaries.""" # row groups are [1000, 1000, 500]; batch_size=750 should give batches of # [750, 750, 750, 250] regardless of row group boundaries. 
block = Block({"file": str(parquet_path), "batch_size": 750}) diff --git a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py index 5d46e99e..5c4a43f7 100644 --- a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py +++ b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py @@ -18,6 +18,7 @@ def _mk_block(properties, redis_client): @pytest.mark.asyncio async def test_redis_uses_count_equal_to_batch_size(): + """xreadgroup is called with count=batch_size (closes #377).""" redis = MagicMock() payload_a = (b"1-0", {b"data": b'{"i": 1}'}) payload_b = (b"2-0", {b"data": b'{"i": 2}'}) @@ -38,6 +39,7 @@ async def test_redis_uses_count_equal_to_batch_size(): @pytest.mark.asyncio async def test_redis_yields_records_as_a_batch_not_one_by_one(): + """A 5-record xreadgroup response yields one batch of 5, not five batches of 1.""" redis = MagicMock() pages = [(f"{i}-0".encode(), {b"data": f'{{"i": {i}}}'.encode()}) for i in range(5)] redis.xreadgroup.side_effect = [ diff --git a/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py index 6dafd72e..47528712 100644 --- a/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py +++ b/core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py @@ -53,6 +53,7 @@ def _mk_block(properties, fake_result): @pytest.mark.asyncio async def test_relational_read_yields_batches_not_rows(): + """2500 rows with batch_size=1000 yield [1000, 1000, 500], not 2500 single-row batches.""" rows = [_Row({"i": i}) for i in range(2500)] fake_result = _fake_result(rows) block = _mk_block({"batch_size": 1000}, fake_result) @@ -62,6 +63,7 @@ async def test_relational_read_yields_batches_not_rows(): @pytest.mark.asyncio async def 
test_relational_read_fetch_size_independent_of_batch_size(): + """fetch_size controls driver round-trips; batch_size controls downstream batches; both are decoupled.""" rows = [_Row({"i": i}) for i in range(5000)] fake_result = _fake_result(rows) block = _mk_block({"batch_size": 1000, "fetch_size": 2500}, fake_result) @@ -76,6 +78,7 @@ async def test_relational_read_fetch_size_independent_of_batch_size(): @pytest.mark.asyncio async def test_relational_read_default_fetch_size_is_10000(): + """When fetch_size is omitted, the driver-level fetchmany is called with 10000.""" rows = [_Row({"i": i}) for i in range(500)] fake_result = _fake_result(rows) block = _mk_block({}, fake_result) diff --git a/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py index d9698d16..6ec3d933 100644 --- a/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py +++ b/core/src/datayoga_core/blocks/std/read/tests/test_std_read.py @@ -15,6 +15,7 @@ async def _drain(producer): @pytest.mark.asyncio async def test_std_read_batches_to_batch_size(): + """2500 stdin records with batch_size=1000 yield batches of [1000, 1000, 500].""" payload = [{"i": i} for i in range(2500)] fake_stdin = [orjson.dumps(payload).decode()] diff --git a/core/src/datayoga_core/tests/test_producer_batching.py b/core/src/datayoga_core/tests/test_producer_batching.py index 337cc3c6..5ab4f98a 100644 --- a/core/src/datayoga_core/tests/test_producer_batching.py +++ b/core/src/datayoga_core/tests/test_producer_batching.py @@ -54,6 +54,7 @@ async def _drain(producer: Producer): @pytest.mark.asyncio async def test_rechunks_one_large_chunk(): + """One 5000-record chunk + batch_size=1000 yields five batches of 1000.""" chunks = [[_msg(i) for i in range(5000)]] p = FakeProducer({"batch_size": 1000}, chunks=chunks) batches = await _drain(p) @@ -62,6 +63,7 @@ async def test_rechunks_one_large_chunk(): @pytest.mark.asyncio async def 
test_accumulates_small_chunks_and_flushes_on_eos(): + """Small chunks (200+300+400=900) are accumulated; the partial batch flushes on EOS.""" chunks = [[_msg(i) for i in range(200)], [_msg(i) for i in range(200, 500)], [_msg(i) for i in range(500, 900)]] @@ -72,6 +74,7 @@ async def test_accumulates_small_chunks_and_flushes_on_eos(): @pytest.mark.asyncio async def test_partial_final_batch_on_eos(): + """1500 records + batch_size=1000 yields [1000, 500] — the trailing partial fires on EOS.""" chunks = [[_msg(i) for i in range(1500)]] p = FakeProducer({"batch_size": 1000}, chunks=chunks) batches = await _drain(p) @@ -80,6 +83,7 @@ async def test_partial_final_batch_on_eos(): @pytest.mark.asyncio async def test_empty_chunks_are_ignored(): + """Empty chunks from produce_chunks() don't produce empty batches.""" chunks = [[], [_msg(1), _msg(2)], [], [_msg(3)]] p = FakeProducer({"batch_size": 10}, chunks=chunks) batches = await _drain(p) @@ -88,6 +92,7 @@ async def test_empty_chunks_are_ignored(): @pytest.mark.asyncio async def test_flush_ms_emits_partial_on_inactivity(): + """With flush_ms set, a partial batch is emitted on source inactivity, not held to EOS.""" # one chunk of 2 records, then a 300ms wait before EOS; flush_ms=100 should # flush the partial batch of 2 well before EOS. 
chunks = [[_msg(1), _msg(2)], [_msg(3)]] @@ -109,6 +114,7 @@ async def test_flush_ms_emits_partial_on_inactivity(): @pytest.mark.asyncio async def test_no_flush_ms_holds_records_until_eos(): + """Without flush_ms, accumulated records stay buffered until batch_size or EOS.""" chunks = [[_msg(1)], [_msg(2)]] sleeps = [0, 0.1] p = FakeProducer({"batch_size": 100}, chunks=chunks, sleep_before=sleeps) @@ -118,6 +124,7 @@ async def test_no_flush_ms_holds_records_until_eos(): @pytest.mark.asyncio async def test_consumer_cancellation_cleans_up_pump(): + """Closing the producer generator cancels the pump cleanly (no orphaned task warnings).""" chunks = [[_msg(i)] for i in range(1000)] p = FakeProducer({"batch_size": 10, "flush_ms": 50}, chunks=chunks, sleep_before=[0.05] * 1000) diff --git a/core/src/datayoga_core/tests/test_schema_inherit.py b/core/src/datayoga_core/tests/test_schema_inherit.py index 46f448cd..f01a1dfe 100644 --- a/core/src/datayoga_core/tests/test_schema_inherit.py +++ b/core/src/datayoga_core/tests/test_schema_inherit.py @@ -9,6 +9,7 @@ def test_inherit_merges_fragment_properties(): + """A schema with $inherit:[batchable] picks up batch_size from the fragment.""" schema = { "title": "demo", "type": "object", @@ -25,6 +26,7 @@ def test_inherit_merges_fragment_properties(): def test_inherit_local_property_wins_over_fragment(): + """When local schema redefines an inherited property, the local version takes precedence.""" schema = { "type": "object", "$inherit": ["batchable"], @@ -37,6 +39,7 @@ def test_inherit_local_property_wins_over_fragment(): def test_inherit_streamable_brings_both_props(): + """$inherit:[streamable] exposes both batch_size and flush_ms on the schema.""" schema = {"type": "object", "$inherit": ["streamable"], "properties": {}} resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) assert "batch_size" in resolved["properties"] @@ -44,6 +47,7 @@ def test_inherit_streamable_brings_both_props(): def 
test_schema_without_inherit_unchanged(): + """Schemas without $inherit pass through resolve_inherits unmodified.""" schema = { "type": "object", "properties": {"foo": {"type": "string"}}, @@ -54,24 +58,28 @@ def test_schema_without_inherit_unchanged(): def test_unknown_fragment_raises(): + """$inherit referencing a missing fragment file raises FileNotFoundError.""" schema = {"type": "object", "$inherit": ["nope"], "properties": {}} with pytest.raises(FileNotFoundError): resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) def test_inherit_string_value_raises_type_error(): + """$inherit must be a list; passing a string raises TypeError loudly.""" schema = {"type": "object", "$inherit": "batchable", "properties": {}} with pytest.raises(TypeError): resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) def test_inherit_non_string_items_raises_type_error(): + """Non-string items in the $inherit list raise TypeError.""" schema = {"type": "object", "$inherit": ["batchable", 123], "properties": {}} with pytest.raises(TypeError): resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) def test_inherit_empty_list_returns_unchanged(): + """An empty $inherit list is a no-op; the schema is returned as-is.""" schema = {"type": "object", "$inherit": [], "properties": {"foo": {}}} resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) # Early-return path: schema is returned as-is (no mutation, no key removal). @@ -79,6 +87,7 @@ def test_inherit_empty_list_returns_unchanged(): def test_nested_inherit_raises_value_error(tmp_path): + """A fragment that itself contains $inherit raises ValueError (no nested inheritance).""" # Build a fragment dir with a fragment that has its own $inherit. 
(tmp_path / "parent.schema.json").write_text( '{"properties": {"x": {"type": "string"}}}' From c1e2e7130fd90f8779ceef5481975c2dca91a845 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 19:27:21 +0300 Subject: [PATCH 31/38] Drop meaningless test_max_batch_size_defaults_to_300_when_omitted (#400) Audit pass: the test literally asserted block.properties.get("max_batch_size", 300) == 300 which is testing the standard library, not Block code. Removed. Remaining 4 azure tests cover validation, schema shape, accepted properties, and the documented breaking-change behavior. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../blocks/azure/read_event_hub/tests/test_event_hub.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py index 17cff570..0506ee7b 100644 --- a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py +++ b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py @@ -30,12 +30,6 @@ def test_max_batch_size_accepted(): assert block.properties["batch_size"] == 100 -def test_max_batch_size_defaults_to_300_when_omitted(): - """The block's init() reads max_batch_size with a default of 300.""" - block = Block(_minimal_props()) - assert int(block.properties.get("max_batch_size", 300)) == 300 - - def test_renamed_schema_has_additional_properties_false(): """Schema after rename: max_batch_size + streamable's batch_size/flush_ms, no unknown properties allowed.""" From 056f8cf5df32657bb9cf1285de52ed0e4ce75d31 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 19:31:17 +0300 Subject: [PATCH 32/38] Drop retrospective implementation plan; keep design spec (#400) The 2225-line plan was execution scaffolding for the work that now ships. The spec remains as the architectural record. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- ...026-05-28-producer-batching-unification.md | 2225 ----------------- 1 file changed, 2225 deletions(-) delete mode 100644 docs/superpowers/plans/2026-05-28-producer-batching-unification.md diff --git a/docs/superpowers/plans/2026-05-28-producer-batching-unification.md b/docs/superpowers/plans/2026-05-28-producer-batching-unification.md deleted file mode 100644 index d4fa4415..00000000 --- a/docs/superpowers/plans/2026-05-28-producer-batching-unification.md +++ /dev/null @@ -1,2225 +0,0 @@ -# Producer Batching Unification Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Move batching out of individual producer blocks into the `Producer` base class so every read block has consistent `batch_size` behavior, and three buggy producers stop yielding single records. - -**Architecture:** The `Producer` base class gets a new abstract-by-convention hook `produce_chunks()` that yields lists of any size. Its `produce()` method becomes a re-chunker that emits exact `batch_size` batches, with an optional `flush_ms` timeout-flush for streaming sources. Schema fragments (`batchable.schema.json`, `streamable.schema.json`) provide the shared `batch_size`/`flush_ms` definitions, resolved at load time via a `$inherit` convention. Each of the 7 producer blocks migrates to override `produce_chunks` instead of `produce`. - -**Tech Stack:** Python 3.7+, asyncio, jsonschema, pytest (asyncio mode), SQLAlchemy, redis-py, aiohttp, azure-eventhub. 
- -**Spec:** `docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md` -**Issue:** #400 - ---- - -## File Structure - -**Created:** - -- `core/src/datayoga_core/resources/schemas/batchable.schema.json` — fragment exposing `batch_size` -- `core/src/datayoga_core/resources/schemas/streamable.schema.json` — fragment exposing `flush_ms` (combined with batchable) -- `core/src/datayoga_core/schema_utils.py` — `$inherit` resolver used by Block + Job -- `core/src/datayoga_core/tests/__init__.py` — empty, makes the tests package importable -- `core/src/datayoga_core/tests/test_schema_inherit.py` — tests for the `$inherit` resolver -- `core/src/datayoga_core/tests/test_producer_batching.py` — base-class batching tests -- `core/src/datayoga_core/blocks/parquet/read/tests/__init__.py` (if package missing) -- `core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py` -- `core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py` -- `core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py` -- `core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py` -- `core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py` -- `core/src/datayoga_core/blocks/http/receiver/tests/__init__.py` -- `core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py` -- `core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py` -- `core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py` -- `core/src/datayoga_core/blocks/relational/read/tests/__init__.py` -- `core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py` - -**Modified:** - -- `core/src/datayoga_core/producer.py` — adds `produce_chunks` and a default `produce()` that re-chunks -- `core/src/datayoga_core/block.py` — `get_json_schema()` runs through `$inherit` resolver -- `core/src/datayoga_core/job.py` — `get_json_schema()` loop runs each loaded schema through the resolver -- 
`core/src/datayoga_core/blocks/std/read/block.py` — replace `process_batch` with `produce_chunks` -- `core/src/datayoga_core/blocks/std/read/block.schema.json` — use `$inherit: ["batchable"]` -- `core/src/datayoga_core/blocks/files/read_csv/block.py` — `produce_chunks` (drop `islice` loop in `produce`) -- `core/src/datayoga_core/blocks/files/read_csv/block.schema.json` — drop inline `batch_size`, add `$inherit` -- `core/src/datayoga_core/blocks/parquet/read/block.py` — `produce_chunks` per row group -- `core/src/datayoga_core/blocks/parquet/read/block.schema.json` — add `$inherit` -- `core/src/datayoga_core/blocks/relational/read/block.py` — `produce_chunks` with `fetch_size` -- `core/src/datayoga_core/blocks/relational/read/block.schema.json` — add `$inherit` + `fetch_size` property -- `core/src/datayoga_core/blocks/redis/read_stream/block.py` — `produce_chunks` with `count=batch_size` -- `core/src/datayoga_core/blocks/redis/read_stream/block.schema.json` — `$inherit: ["streamable"]` -- `core/src/datayoga_core/blocks/http/receiver/block.py` — `produce_chunks` drains queue -- `core/src/datayoga_core/blocks/http/receiver/block.schema.json` — `$inherit: ["streamable"]` -- `core/src/datayoga_core/blocks/azure/read_event_hub/block.py` — `produce_chunks`, rename `batch_size` → `max_batch_size` -- `core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json` — rename property, add `additionalProperties: false`, `$inherit: ["streamable"]` -- `schemas/job.schema.json` — regenerated at the end -- `docs/reference/blocks/*.md` — regenerated at the end -- `docs/processing-strategies.md` — new section on producer batching - ---- - -## Task 1: Schema fragment loader - -Adds the `$inherit` convention and the two shared fragments. After this task, schemas referencing `batchable` / `streamable` get the fragments' properties merged in at load time. 
- -**Files:** - -- Create: `core/src/datayoga_core/resources/schemas/batchable.schema.json` -- Create: `core/src/datayoga_core/resources/schemas/streamable.schema.json` -- Create: `core/src/datayoga_core/schema_utils.py` -- Create: `core/src/datayoga_core/tests/__init__.py` -- Create: `core/src/datayoga_core/tests/test_schema_inherit.py` -- Modify: `core/src/datayoga_core/block.py` (lines 44–59) -- Modify: `core/src/datayoga_core/job.py` (lines 223–244) - -- [ ] **Step 1.1: Create the `batchable` fragment** - -Create `core/src/datayoga_core/resources/schemas/batchable.schema.json`: - -```json -{ - "title": "batchable", - "description": "Producer batching mixin: declares batch_size for producers that yield records in batches.", - "type": "object", - "properties": { - "batch_size": { - "type": "integer", - "minimum": 1, - "description": "Maximum number of records yielded per downstream batch.", - "default": 1000 - } - } -} -``` - -- [ ] **Step 1.2: Create the `streamable` fragment** - -Create `core/src/datayoga_core/resources/schemas/streamable.schema.json`: - -```json -{ - "title": "streamable", - "description": "Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources.", - "type": "object", - "properties": { - "batch_size": { - "type": "integer", - "minimum": 1, - "description": "Maximum number of records yielded per downstream batch.", - "default": 1000 - }, - "flush_ms": { - "type": ["integer", "null"], - "minimum": 1, - "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.", - "default": 1000 - } - } -} -``` - -- [ ] **Step 1.3: Create empty tests package** - -If `core/src/datayoga_core/tests/__init__.py` does not exist, create it as an empty file. (Several test modules in this plan live in `core/src/datayoga_core/tests/`; the directory must be importable.) 
- -```bash -test -f core/src/datayoga_core/tests/__init__.py || touch core/src/datayoga_core/tests/__init__.py -``` - -- [ ] **Step 1.4: Write the failing test for `$inherit` resolution** - -Create `core/src/datayoga_core/tests/test_schema_inherit.py`: - -```python -import json -from pathlib import Path - -import pytest - -from datayoga_core.schema_utils import resolve_inherits - - -SCHEMAS_DIR = ( - Path(__file__).resolve().parent.parent / "resources" / "schemas" -) - - -def test_inherit_merges_fragment_properties(): - schema = { - "title": "demo", - "type": "object", - "$inherit": ["batchable"], - "properties": {"foo": {"type": "string"}}, - "additionalProperties": False, - } - resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) - assert "$inherit" not in resolved - assert "batch_size" in resolved["properties"] - assert resolved["properties"]["batch_size"]["default"] == 1000 - assert resolved["properties"]["foo"] == {"type": "string"} - assert resolved["additionalProperties"] is False - - -def test_inherit_local_property_wins_over_fragment(): - schema = { - "type": "object", - "$inherit": ["batchable"], - "properties": { - "batch_size": {"type": "integer", "minimum": 1, "default": 50} - }, - } - resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) - assert resolved["properties"]["batch_size"]["default"] == 50 - - -def test_inherit_streamable_brings_both_props(): - schema = {"type": "object", "$inherit": ["streamable"], "properties": {}} - resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) - assert "batch_size" in resolved["properties"] - assert "flush_ms" in resolved["properties"] - - -def test_schema_without_inherit_unchanged(): - schema = { - "type": "object", - "properties": {"foo": {"type": "string"}}, - "additionalProperties": False, - } - resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) - assert resolved == schema - - -def test_unknown_fragment_raises(): - schema = {"type": "object", "$inherit": 
["nope"], "properties": {}} - with pytest.raises(FileNotFoundError): - resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) -``` - -- [ ] **Step 1.5: Run test to verify it fails** - -Run: - -```bash -cd core && python -m pytest src/datayoga_core/tests/test_schema_inherit.py -v -``` - -Expected: FAIL with `ModuleNotFoundError: No module named 'datayoga_core.schema_utils'`. - -- [ ] **Step 1.6: Implement the resolver** - -Create `core/src/datayoga_core/schema_utils.py`: - -```python -"""Schema composition helpers. - -Producers and other blocks can declare `"$inherit": ["batchable"]` at the -top of their block.schema.json to pull in shared property definitions from -the fragments in resources/schemas/. `resolve_inherits` merges the -fragments' `properties` into the local schema (local properties win), then -removes the `$inherit` key. Schemas without `$inherit` are returned as-is. -""" -from __future__ import annotations - -import copy -from os import path -from typing import Any, Dict, List - -from datayoga_core import utils - - -def resolve_inherits(schema: Dict[str, Any], schemas_dir: str = None) -> Dict[str, Any]: - """Merge any fragments listed in $inherit into the schema's properties. - - Args: - schema: The schema to resolve. Mutated in place and also returned. - schemas_dir: Directory containing the fragment files. Defaults to - the bundled/non-bundled resources/schemas directory. - - Returns: - The mutated schema with $inherit removed and fragment properties merged. 
- """ - inherits: List[str] = schema.get("$inherit") or [] - if not inherits: - return schema - - if schemas_dir is None: - schemas_dir = utils.get_resource_path("schemas") - - merged_properties: Dict[str, Any] = {} - for fragment_name in inherits: - fragment_path = path.join(schemas_dir, f"{fragment_name}.schema.json") - if not path.isfile(fragment_path): - raise FileNotFoundError( - f"Schema fragment '{fragment_name}' not found at {fragment_path}" - ) - fragment = utils.read_json(fragment_path) - merged_properties.update(copy.deepcopy(fragment.get("properties", {}))) - - # Local properties take precedence over inherited ones. - local_properties = schema.get("properties", {}) - merged_properties.update(local_properties) - - schema["properties"] = merged_properties - schema.pop("$inherit", None) - return schema -``` - -- [ ] **Step 1.7: Run test to verify it passes** - -Run: - -```bash -cd core && python -m pytest src/datayoga_core/tests/test_schema_inherit.py -v -``` - -Expected: 5 passed. - -- [ ] **Step 1.8: Wire resolver into `Block.get_json_schema`** - -Modify `core/src/datayoga_core/block.py`. After loading the schema (currently `return utils.read_json(json_schema_file)` on line 59), pass it through the resolver. - -Replace lines 44–59 with: - -```python - def get_json_schema(self) -> Dict[str, Any]: - """Returns the JSON Schema for this block. - - Returns: - Dict[str, Any]: JSON Schema. 
- """ - json_schema_file = path.join( - utils.get_bundled_dir(), - os.path.relpath( - os.path.dirname(sys.modules[self.__module__].__file__), - start=os.path.dirname(__file__)), - "block.schema.json") if utils.is_bundled() else path.join( - os.path.dirname(os.path.realpath(sys.modules[self.__module__].__file__)), - "block.schema.json") - logger.debug(f"loading schema from {json_schema_file}") - from datayoga_core.schema_utils import resolve_inherits - return resolve_inherits(utils.read_json(json_schema_file)) -``` - -Note: the `from datayoga_core.schema_utils import resolve_inherits` line is inside the function to avoid a circular import (schema_utils imports from utils, utils imports from block). - -- [ ] **Step 1.9: Wire resolver into `Job.get_json_schema`** - -Modify `core/src/datayoga_core/job.py`. Inside the `for block_type, schema_path in block_info:` loop (around line 240–243), apply the resolver to each loaded schema. - -Find this block: - -```python - for block_type, schema_path in block_info: - block_types.append(block_type) - # load schema file - schema = utils.read_json(f"{schema_path}") - # append to the array of allOf for the full schema -``` - -Replace with: - -```python - from datayoga_core.schema_utils import resolve_inherits - for block_type, schema_path in block_info: - block_types.append(block_type) - # load schema file - schema = resolve_inherits(utils.read_json(f"{schema_path}")) - # append to the array of allOf for the full schema -``` - -- [ ] **Step 1.10: Verify existing block validation still passes** - -Run the full core test suite to make sure nothing regressed (no producer is using `$inherit` yet, so behavior should be unchanged): - -```bash -cd core && python -m pytest src/datayoga_core/ -x -q -``` - -Expected: all existing tests pass; the 5 new `test_schema_inherit.py` tests also pass. 
- -- [ ] **Step 1.11: Commit** - -```bash -git add core/src/datayoga_core/resources/schemas/batchable.schema.json \ - core/src/datayoga_core/resources/schemas/streamable.schema.json \ - core/src/datayoga_core/schema_utils.py \ - core/src/datayoga_core/tests/__init__.py \ - core/src/datayoga_core/tests/test_schema_inherit.py \ - core/src/datayoga_core/block.py \ - core/src/datayoga_core/job.py -git commit -m "Add \$inherit schema fragment resolver (#400)" -``` - ---- - -## Task 2: Producer base class with batching - -Add `produce_chunks()` and a default `produce()` that re-chunks. Existing subclasses override `produce()` directly and are unaffected until migrated in later tasks. - -**Files:** - -- Create: `core/src/datayoga_core/tests/test_producer_batching.py` -- Modify: `core/src/datayoga_core/producer.py` - -- [ ] **Step 2.1: Write the failing tests** - -Create `core/src/datayoga_core/tests/test_producer_batching.py`: - -```python -import asyncio -from typing import AsyncGenerator, List, Optional - -import pytest - -from datayoga_core.context import Context -from datayoga_core.producer import Message, Producer - - -def _msg(i: int) -> dict: - return {Producer.MSG_ID_FIELD: str(i), "v": i} - - -class FakeProducer(Producer): - """Producer driven by a scripted list of chunks plus optional sleeps.""" - - def __init__(self, properties=None, *, chunks=None, sleep_before=None): - # schema for a FakeProducer; declare batch_size/flush_ms so validation passes - self._test_schema = { - "type": "object", - "properties": { - "batch_size": {"type": "integer", "minimum": 1}, - "flush_ms": {"type": ["integer", "null"], "minimum": 1}, - }, - } - self._chunks = chunks or [] - self._sleep_before = sleep_before or [] - super().__init__(properties or {}) - - def get_json_schema(self): - return self._test_schema - - def init(self, context: Optional[Context] = None): - pass - - async def produce_chunks(self) -> AsyncGenerator[List[Message], None]: - for i, chunk in 
enumerate(self._chunks): - if i < len(self._sleep_before) and self._sleep_before[i]: - await asyncio.sleep(self._sleep_before[i]) - yield chunk - - -async def _drain(producer: Producer): - out = [] - async for batch in producer.produce(): - out.append(batch) - return out - - -@pytest.mark.asyncio -async def test_rechunks_one_large_chunk(): - chunks = [[_msg(i) for i in range(5000)]] - p = FakeProducer({"batch_size": 1000}, chunks=chunks) - batches = await _drain(p) - assert [len(b) for b in batches] == [1000, 1000, 1000, 1000, 1000] - - -@pytest.mark.asyncio -async def test_accumulates_small_chunks_and_flushes_on_eos(): - chunks = [[_msg(i) for i in range(200)], - [_msg(i) for i in range(200, 500)], - [_msg(i) for i in range(500, 900)]] - p = FakeProducer({"batch_size": 1000}, chunks=chunks) - batches = await _drain(p) - assert [len(b) for b in batches] == [900] - - -@pytest.mark.asyncio -async def test_partial_final_batch_on_eos(): - chunks = [[_msg(i) for i in range(1500)]] - p = FakeProducer({"batch_size": 1000}, chunks=chunks) - batches = await _drain(p) - assert [len(b) for b in batches] == [1000, 500] - - -@pytest.mark.asyncio -async def test_empty_chunks_are_ignored(): - chunks = [[], [_msg(1), _msg(2)], [], [_msg(3)]] - p = FakeProducer({"batch_size": 10}, chunks=chunks) - batches = await _drain(p) - assert [len(b) for b in batches] == [3] - - -@pytest.mark.asyncio -async def test_flush_ms_emits_partial_on_inactivity(): - # one chunk of 2 records, then a 300ms wait before EOS; flush_ms=100 should - # flush the partial batch of 2 well before EOS. 
- chunks = [[_msg(1), _msg(2)], [_msg(3)]] - sleeps = [0, 0.3] - p = FakeProducer({"batch_size": 100, "flush_ms": 100}, - chunks=chunks, sleep_before=sleeps) - - received = [] - started = asyncio.get_event_loop().time() - timings = [] - async for batch in p.produce(): - timings.append(asyncio.get_event_loop().time() - started) - received.append(batch) - - assert [len(b) for b in received] == [2, 1] - # first flush happens because of inactivity (~100ms), not waiting for chunk 2 - assert timings[0] < 0.25, f"expected first flush before 250ms, got {timings[0]}" - - -@pytest.mark.asyncio -async def test_no_flush_ms_holds_records_until_eos(): - chunks = [[_msg(1)], [_msg(2)]] - sleeps = [0, 0.1] - p = FakeProducer({"batch_size": 100}, chunks=chunks, sleep_before=sleeps) - batches = await _drain(p) - assert [len(b) for b in batches] == [2] # combined on EOS, never flushed mid-stream - - -@pytest.mark.asyncio -async def test_consumer_cancellation_cleans_up_pump(): - chunks = [[_msg(i)] for i in range(1000)] - p = FakeProducer({"batch_size": 10, "flush_ms": 50}, chunks=chunks, - sleep_before=[0.05] * 1000) - - gen = p.produce() - first = await gen.__anext__() - assert len(first) >= 1 - await gen.aclose() - # If pump task wasn't cleaned up we'd see a "Task was destroyed but it is - # pending!" warning here. Sleep briefly so the loop has a chance to surface it. - await asyncio.sleep(0.1) -``` - -- [ ] **Step 2.2: Run tests to verify they fail** - -Run: - -```bash -cd core && python -m pytest src/datayoga_core/tests/test_producer_batching.py -v -``` - -Expected: All 7 tests FAIL with `TypeError: Can't instantiate abstract class FakeProducer with abstract methods produce` (because `produce` is currently abstract and `FakeProducer` doesn't override it; it overrides `produce_chunks` which doesn't exist yet). 
- -- [ ] **Step 2.3: Implement the new `Producer` base class** - -Replace the contents of `core/src/datayoga_core/producer.py` with: - -```python -import asyncio -import logging -from contextlib import suppress -from typing import Any, AsyncGenerator, Dict, List - -from .block import Block - -logger = logging.getLogger("dy") - - -class Message: - def __init__(self, msg_id: str, value: Dict[str, Any]): - self.msg_id = msg_id - self.value = value - - -class Producer(Block): - """Base class for producer (read) blocks. - - Subclasses override `produce_chunks()` to yield chunks of any size from - the source. The default `produce()` re-chunks them to exactly `batch_size` - records per batch (smaller on flush_ms timeout or end-of-stream). - - Legacy subclasses may still override `produce()` directly. They bypass - the base-class batching and `produce_chunks` is not called. - """ - - DEFAULT_BATCH_SIZE = 1000 - DEFAULT_FLUSH_MS = None # streaming subclasses override to enable timeout flush - - async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: - """Yield natural-size chunks from the source. - - Subclasses should override this method. The base-class `produce()` - will re-chunk the output to exact `batch_size` slices. - """ - raise NotImplementedError( - f"{type(self).__name__} must override produce_chunks() or produce()" - ) - # Make this an async generator for type-checking purposes. - yield # pragma: no cover - - async def produce(self) -> AsyncGenerator[List[Dict[str, Any]], None]: - """Re-chunks `produce_chunks()` output to exact batch_size batches. - - Reads `batch_size` and `flush_ms` from properties lazily so subclasses - don't need to remember to call `super().init()`. 
- """ - batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)) - flush_ms = self.properties.get("flush_ms", self.DEFAULT_FLUSH_MS) - timeout = (flush_ms / 1000) if flush_ms else None - - queue: asyncio.Queue = asyncio.Queue() - EOS = object() - - async def pump(): - try: - async for chunk in self.produce_chunks(): - if chunk: - await queue.put(chunk) - except asyncio.CancelledError: - raise - except Exception as exc: - logger.exception("produce_chunks raised; ending stream: %s", exc) - finally: - await queue.put(EOS) - - pump_task = asyncio.create_task(pump()) - buffer: List[Dict[str, Any]] = [] - try: - while True: - try: - item = await asyncio.wait_for(queue.get(), timeout=timeout) - except asyncio.TimeoutError: - if buffer: - yield buffer - buffer = [] - continue - - if item is EOS: - if buffer: - yield buffer - return - - buffer.extend(item) - while len(buffer) >= batch_size: - yield buffer[:batch_size] - buffer = buffer[batch_size:] - finally: - pump_task.cancel() - with suppress(asyncio.CancelledError, Exception): - await pump_task - - def ack(self, msg_ids: List[str]): - """Sends acknowledge for the message IDs of records that have been processed.""" - pass -``` - -Key differences from the current file: - -- `produce()` is no longer `@abstractmethod` — it has a default implementation. -- `produce_chunks()` is the new override hook (not formally `@abstractmethod` so legacy subclasses still validate). -- `Message` class unchanged. - -- [ ] **Step 2.4: Run tests to verify they pass** - -Run: - -```bash -cd core && python -m pytest src/datayoga_core/tests/test_producer_batching.py -v -``` - -Expected: 7 passed. - -- [ ] **Step 2.5: Run the full core test suite to confirm no regressions** - -Existing producers all still override `produce()`, so their behavior is unchanged. - -```bash -cd core && python -m pytest src/datayoga_core/ -x -q -``` - -Expected: all tests pass (including the new `test_producer_batching` and `test_schema_inherit`). 
- -- [ ] **Step 2.6: Commit** - -```bash -git add core/src/datayoga_core/producer.py \ - core/src/datayoga_core/tests/test_producer_batching.py -git commit -m "Producer base class re-chunks via produce_chunks (#400)" -``` - ---- - -## Task 3: Migrate `std/read` - -`std/read` already has `batch_size` and a custom `process_batch` accumulator. Replace it with a `produce_chunks` that yields one chunk; the base class re-chunks. - -**Files:** - -- Modify: `core/src/datayoga_core/blocks/std/read/block.py` -- Modify: `core/src/datayoga_core/blocks/std/read/block.schema.json` - -- [ ] **Step 3.1: Write the failing test** - -There is no existing `tests/` directory under `std/read`. The std/read producer is exercised indirectly by integration tests, but we add a unit test for batching here. - -Create `core/src/datayoga_core/blocks/std/read/tests/__init__.py` (empty file) and `core/src/datayoga_core/blocks/std/read/tests/test_std_read.py`: - -```python -import asyncio -from unittest.mock import patch - -import orjson -import pytest - -from datayoga_core.blocks.std.read.block import Block - - -async def _drain(producer): - out = [] - async for batch in producer.produce(): - out.append(batch) - return out - - -@pytest.mark.asyncio -async def test_std_read_batches_to_batch_size(): - payload = [{"i": i} for i in range(2500)] - fake_stdin = [orjson.dumps(payload).decode()] - - block = Block({"batch_size": 1000}) - block.init() - - with patch("datayoga_core.blocks.std.read.block.select.select", - return_value=([object()], [], [])), \ - patch("datayoga_core.blocks.std.read.block.sys.stdin", fake_stdin): - batches = await _drain(block) - - assert [len(b) for b in batches] == [1000, 1000, 500] - # records carry their MSG_ID_FIELD and original payload values - flat = [r for b in batches for r in b] - assert flat[0]["i"] == 0 - assert all(Block.MSG_ID_FIELD in r for r in flat) -``` - -- [ ] **Step 3.2: Run test to verify it fails** - -Run: - -```bash -cd core && python -m pytest 
src/datayoga_core/blocks/std/read/tests/test_std_read.py -v -``` - -Expected: FAIL — the current implementation overrides `produce()` directly and batches through its own `process_batch` helper, so it never goes through the new base-class `produce()` machinery. The test may also fail because the batch logic from the `batch_size_in_std_read_block` branch may not interact cleanly with the test mocks. (The point of this step is to drive the migration; the failure shape is secondary.) - -- [ ] **Step 3.3: Migrate `std/read` to `produce_chunks`** - -Replace the contents of `core/src/datayoga_core/blocks/std/read/block.py` with: - -```python -import logging -import select -import sys -import uuid -from typing import Any, AsyncGenerator, Dict, List, Optional - -import orjson -from datayoga_core.context import Context -from datayoga_core.producer import Producer as DyProducer - -logger = logging.getLogger("dy") - - -class Block(DyProducer): - def init(self, context: Optional[Context] = None): - logger.debug(f"Initializing {self.get_block_name()}") - - async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: - if select.select([sys.stdin], [], [], 0.0)[0]: - all_records: List[Dict[str, Any]] = [] - for line in sys.stdin: - all_records.extend(self.get_records(line)) - else: - print("Enter data to process:") - all_records = self.get_records(input()) - - if all_records: - yield [self.get_message(record) for record in all_records] - - @staticmethod - def get_records(data: str) -> List[Dict[str, Any]]: - records = orjson.loads(data) - if isinstance(records, dict): - records = [records] - return records - - def get_message(self, record: Dict[str, Any]) -> Dict[str, Any]: - return {self.MSG_ID_FIELD: str(uuid.uuid4()), **record} -``` - -The `process_batch`, `batch_size` init read, and `produce` override are all gone. The base class handles batching. 
- -- [ ] **Step 3.4: Update the schema to use the fragment** - -Replace the contents of `core/src/datayoga_core/blocks/std/read/block.schema.json` with: - -```json -{ - "title": "std.read", - "description": "Read from the standard input", - "type": "object", - "$inherit": ["batchable"], - "properties": {}, - "additionalProperties": false -} -``` - -The `batch_size` declaration now comes from the fragment. - -- [ ] **Step 3.5: Run test to verify it passes** - -```bash -cd core && python -m pytest src/datayoga_core/blocks/std/read/tests/test_std_read.py -v -``` - -Expected: PASS. - -- [ ] **Step 3.6: Run the full core suite** - -```bash -cd core && python -m pytest src/datayoga_core/ -x -q -``` - -Expected: all tests pass. - -- [ ] **Step 3.7: Commit** - -```bash -git add core/src/datayoga_core/blocks/std/read/block.py \ - core/src/datayoga_core/blocks/std/read/block.schema.json \ - core/src/datayoga_core/blocks/std/read/tests/__init__.py \ - core/src/datayoga_core/blocks/std/read/tests/test_std_read.py -git commit -m "Migrate std/read to produce_chunks (#400, #296)" -``` - ---- - -## Task 4: Migrate `files/read_csv` - -Replace the `produce()` override and `islice` loop with a `produce_chunks` that yields one chunk per `batch_size` rows. The base class re-chunks to the configured `batch_size`. 
- -**Files:** - -- Modify: `core/src/datayoga_core/blocks/files/read_csv/block.py` -- Modify: `core/src/datayoga_core/blocks/files/read_csv/block.schema.json` - -- [ ] **Step 4.1: Write the failing test** - -Create `core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py`: - -```python -from pathlib import Path - -import pytest - -from datayoga_core.blocks.files.read_csv.block import Block - - -async def _drain(producer): - out = [] - async for batch in producer.produce(): - out.append(batch) - return out - - -@pytest.fixture -def csv_path(tmp_path) -> Path: - p = tmp_path / "data.csv" - rows = ["fname,lname"] + [f"first{i},last{i}" for i in range(2500)] - p.write_text("\n".join(rows) + "\n", encoding="utf-8") - return p - - -@pytest.mark.asyncio -async def test_csv_batches_to_batch_size(csv_path): - block = Block({"file": str(csv_path), "batch_size": 1000, "skip": 1}) - block.init() - batches = await _drain(block) - assert [len(b) for b in batches] == [1000, 1000, 500] - # message ids are populated - assert all(Block.MSG_ID_FIELD in r for b in batches for r in b) - # first row content - assert batches[0][0]["fname"] == "first0" - - -@pytest.mark.asyncio -async def test_csv_default_batch_size(csv_path): - block = Block({"file": str(csv_path), "skip": 1}) - block.init() - batches = await _drain(block) - # default batch_size is 1000 - assert [len(b) for b in batches] == [1000, 1000, 500] -``` - -- [ ] **Step 4.2: Run test to verify it fails** - -```bash -cd core && python -m pytest src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py -v -``` - -Expected: FAIL is not guaranteed — the current `produce()` in `files/read_csv` already batches, so the batch-size assertions may pass coincidentally. That's fine; the test exists to _protect_ the contract. Proceed to the migration anyway and confirm the test still passes afterward. Reviewer note (confirm before relying on this test): both tests pass `"skip": 1` without `fields`. In the Step 4.3 implementation `DictReader` is built with `fieldnames=None`, so it consumes the header row as field names on its own and each `skip` iteration then discards a data row; that would leave 2499 records starting at `first1` and contradict the asserted `[1000, 1000, 500]` and `first0`. If so, drop `"skip": 1` from the tests. 
- -- [ ] **Step 4.3: Migrate `files/read_csv` to `produce_chunks`** - -Replace the contents of `core/src/datayoga_core/blocks/files/read_csv/block.py` with: - -```python -import logging -import os -from abc import ABCMeta -from contextlib import suppress -from csv import DictReader -from itertools import count, islice -from typing import Any, AsyncGenerator, Dict, List, Optional - -from datayoga_core.context import Context -from datayoga_core.producer import Producer as DyProducer - -logger = logging.getLogger("dy") - - -class Block(DyProducer, metaclass=ABCMeta): - - def init(self, context: Optional[Context] = None): - logger.debug(f"Initializing {self.get_block_name()}") - csv_file = self.properties["file"] - if os.path.isabs(csv_file) or context is None: - self.file = csv_file - else: - self.file = os.path.join(context.properties.get("data_path"), csv_file) - logger.debug(f"file: {self.file}") - self.encoding = self.properties.get("encoding", "utf-8") - self.fields = self.properties.get("fields") - self.skip = self.properties.get("skip", 0) - self.delimiter = self.properties.get("delimiter", ",") - self.quotechar = self.properties.get("quotechar", "\"") - - async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: - logger.debug("Reading CSV") - batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)) - - with open(self.file, "r", encoding=self.encoding) as read_obj: - reader = DictReader(read_obj, fieldnames=self.fields, - delimiter=self.delimiter, quotechar=self.quotechar) - for _ in range(self.skip): - with suppress(StopIteration): - next(reader) - counter = iter(count()) - while True: - chunk = [ - {self.MSG_ID_FIELD: f"{next(counter)}", **record} - for record in islice(reader, batch_size) - ] - if not chunk: - return - yield chunk -``` - -The init no longer reads `self.batch_size` (read lazily in `produce_chunks`). 
- -- [ ] **Step 4.4: Update the schema** - -Replace `core/src/datayoga_core/blocks/files/read_csv/block.schema.json` with: - -```json -{ - "title": "files.read_csv", - "description": "Read data from CSV", - "type": "object", - "$inherit": ["batchable"], - "properties": { - "file": { - "description": "Filename. Can contain a regexp or glob expression", - "type": "string" - }, - "encoding": { - "description": "Encoding to use for reading the file", - "type": "string", - "default": "utf-8" - }, - "fields": { - "type": "array", - "title": "List of columns to use", - "description": "List of columns to use for extract", - "default": null, - "examples": [["fname", "lname"]], - "minLength": 1, - "additionalItems": true, - "items": { - "type": "string", - "description": "field name", - "examples": ["fname"] - } - }, - "skip": { - "description": "Number of lines to skip", - "type": "number", - "minimum": 0, - "default": 0 - }, - "delimiter": { - "description": "Delimiter to use for splitting the csv records", - "type": "string", - "minLength": 1, - "maxLength": 1, - "default": "," - }, - "quotechar": { - "description": "A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '", - "type": "string", - "minLength": 1, - "maxLength": 1, - "default": "\"" - } - }, - "additionalProperties": false, - "required": ["file"], - "examples": [ - { - "file": "archive.csv", - "delimiter": ";" - } - ] -} -``` - -The `batch_size` inline property is removed; it comes from the `batchable` fragment. - -- [ ] **Step 4.5: Run test to verify it passes** - -```bash -cd core && python -m pytest src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py -v -``` - -Expected: 2 passed. - -- [ ] **Step 4.6: Run the full core suite** - -```bash -cd core && python -m pytest src/datayoga_core/ -x -q -``` - -Expected: all tests pass. 
- -- [ ] **Step 4.7: Commit** - -```bash -git add core/src/datayoga_core/blocks/files/read_csv/block.py \ - core/src/datayoga_core/blocks/files/read_csv/block.schema.json \ - core/src/datayoga_core/blocks/files/read_csv/tests/__init__.py \ - core/src/datayoga_core/blocks/files/read_csv/tests/test_read_csv.py -git commit -m "Migrate files/read_csv to produce_chunks (#400)" -``` - ---- - -## Task 5: Migrate `parquet/read` (fixes one-by-one bug) - -Today `parquet/read` iterates each row of each row group and yields a single-record list per iteration. Migrate it to yield each row group as a single chunk; the base class re-chunks to `batch_size`. - -**Files:** - -- Modify: `core/src/datayoga_core/blocks/parquet/read/block.py` -- Modify: `core/src/datayoga_core/blocks/parquet/read/block.schema.json` - -- [ ] **Step 5.1: Write the failing test** - -Create `core/src/datayoga_core/blocks/parquet/read/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py`: - -```python -from pathlib import Path - -import pandas as pd -import pytest - -from datayoga_core.blocks.parquet.read.block import Block - - -async def _drain(producer): - out = [] - async for batch in producer.produce(): - out.append(batch) - return out - - -@pytest.fixture -def parquet_path(tmp_path) -> Path: - p = tmp_path / "data.parquet" - df = pd.DataFrame({"i": list(range(2500))}) - # row_group_offsets=1000 creates 3 row groups (1000, 1000, 500) - from fastparquet import write as fp_write - fp_write(str(p), df, row_group_offsets=1000) - return p - - -@pytest.mark.asyncio -async def test_parquet_batches_to_batch_size(parquet_path): - block = Block({"file": str(parquet_path), "batch_size": 1000}) - block.init() - batches = await _drain(block) - assert [len(b) for b in batches] == [1000, 1000, 500] - flat = [r for b in batches for r in b] - assert flat[0]["i"] == 0 - assert all(Block.MSG_ID_FIELD in r for r in flat) - - -@pytest.mark.asyncio -async def 
test_parquet_rechunks_across_row_groups(parquet_path): - # row groups are [1000, 1000, 500]; batch_size=750 should give batches of - # [750, 750, 750, 250] regardless of row group boundaries. - block = Block({"file": str(parquet_path), "batch_size": 750}) - block.init() - batches = await _drain(block) - assert [len(b) for b in batches] == [750, 750, 750, 250] -``` - -- [ ] **Step 5.2: Run test to verify it fails** - -```bash -cd core && python -m pytest src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py -v -``` - -Expected: FAIL — current implementation yields batches of size 1, so the assertions fail. - -- [ ] **Step 5.3: Migrate `parquet/read`** - -Replace the contents of `core/src/datayoga_core/blocks/parquet/read/block.py` with: - -```python -import logging -import os -from abc import ABCMeta -from itertools import count -from typing import Any, AsyncGenerator, Dict, List, Optional - -from datayoga_core.context import Context -from datayoga_core.producer import Producer as DyProducer -from fastparquet import ParquetFile - -logger = logging.getLogger("dy") - - -class Block(DyProducer, metaclass=ABCMeta): - - def init(self, context: Optional[Context] = None): - logger.debug(f"Initializing {self.get_block_name()}") - parquet_file = self.properties["file"] - if os.path.isabs(parquet_file) or context is None: - self.file = parquet_file - else: - self.file = os.path.join(context.properties.get("data_path"), parquet_file) - logger.debug(f"file: {self.file}") - - async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: - logger.debug("Reading parquet") - pf = ParquetFile(self.file) - counter = iter(count()) - for df in pf.iter_row_groups(): - yield [ - {self.MSG_ID_FIELD: str(next(counter)), **row.to_dict()} - for _, row in df.iterrows() - ] -``` - -- [ ] **Step 5.4: Update the schema** - -Replace `core/src/datayoga_core/blocks/parquet/read/block.schema.json` with: - -```json -{ - "title": "parquet.read", - "description": "Read data 
from parquet", - "type": "object", - "$inherit": ["batchable"], - "properties": { - "file": { - "description": "Filename. Can contain a regexp or glob expression", - "type": "string" - } - }, - "additionalProperties": false, - "required": ["file"], - "examples": [ - { - "file": "data.parquet" - } - ] -} -``` - -- [ ] **Step 5.5: Run test to verify it passes** - -```bash -cd core && python -m pytest src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py -v -``` - -Expected: 2 passed. - -- [ ] **Step 5.6: Run the full core suite** - -```bash -cd core && python -m pytest src/datayoga_core/ -x -q -``` - -Expected: all tests pass. - -- [ ] **Step 5.7: Commit** - -```bash -git add core/src/datayoga_core/blocks/parquet/read/block.py \ - core/src/datayoga_core/blocks/parquet/read/block.schema.json \ - core/src/datayoga_core/blocks/parquet/read/tests/__init__.py \ - core/src/datayoga_core/blocks/parquet/read/tests/test_parquet_read.py -git commit -m "Migrate parquet/read to produce_chunks, fix one-by-one yield (#400, #293)" -``` - ---- - -## Task 6: Migrate `relational/read` (fix bug + add `fetch_size`) - -Today `relational/read` does `fetchmany(10000)` then yields one row at a time. Migrate to `produce_chunks` that yields each `fetchmany` result. Add an optional `fetch_size` property; default to 10000 to preserve today's DB round-trip count. 
- -**Files:** - -- Modify: `core/src/datayoga_core/blocks/relational/read/block.py` -- Modify: `core/src/datayoga_core/blocks/relational/read/block.schema.json` - -- [ ] **Step 6.1: Write the failing test** - -Create `core/src/datayoga_core/blocks/relational/read/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py`: - -```python -from unittest.mock import MagicMock, patch - -import pytest - -from datayoga_core.blocks.relational.read.block import Block - - -async def _drain(producer): - out = [] - async for batch in producer.produce(): - out.append(batch) - return out - - -def _fake_result(rows): - """Build a fake SQLAlchemy result that returns rows in fetchmany chunks.""" - state = {"i": 0} - - def fetchmany(n): - i = state["i"] - chunk = rows[i:i + n] - state["i"] += len(chunk) - return chunk - - res = MagicMock() - res.fetchmany.side_effect = fetchmany - res.execution_options.return_value = res - return res - - -class _Row: - def __init__(self, d): - self._d = d - - def _asdict(self): - return self._d - - -@pytest.mark.asyncio -async def test_relational_read_yields_batches_not_rows(): - rows = [_Row({"i": i}) for i in range(2500)] - fake_result = _fake_result(rows) - - block = Block.__new__(Block) - block.properties = {"batch_size": 1000} - block.connection = MagicMock() - block.tbl = MagicMock() - block.tbl.select.return_value = "SELECT *" - block.connection.execution_options.return_value.execute.return_value = fake_result - - batches = await _drain(block) - assert [len(b) for b in batches] == [1000, 1000, 500] - - -@pytest.mark.asyncio -async def test_relational_read_fetch_size_independent_of_batch_size(): - rows = [_Row({"i": i}) for i in range(5000)] - fake_result = _fake_result(rows) - - block = Block.__new__(Block) - block.properties = {"batch_size": 1000, "fetch_size": 2500} - block.connection = MagicMock() - block.tbl = MagicMock() - block.tbl.select.return_value = "SELECT *" - 
block.connection.execution_options.return_value.execute.return_value = fake_result - - batches = await _drain(block) - # Downstream batches are still batch_size=1000 - assert [len(b) for b in batches] == [1000, 1000, 1000, 1000, 1000] - # Driver fetched in fetch_size=2500 chunks: 2500 + 2500 + 0 = 3 calls - fetch_sizes = [c.args[0] for c in fake_result.fetchmany.call_args_list] - assert fetch_sizes[0] == 2500 - assert fetch_sizes[1] == 2500 - - -@pytest.mark.asyncio -async def test_relational_read_default_fetch_size_is_10000(): - rows = [_Row({"i": i}) for i in range(500)] - fake_result = _fake_result(rows) - - block = Block.__new__(Block) - block.properties = {} - block.connection = MagicMock() - block.tbl = MagicMock() - block.tbl.select.return_value = "SELECT *" - block.connection.execution_options.return_value.execute.return_value = fake_result - - await _drain(block) - fetch_sizes = [c.args[0] for c in fake_result.fetchmany.call_args_list] - assert fetch_sizes[0] == 10000 -``` - -- [ ] **Step 6.2: Run test to verify it fails** - -```bash -cd core && python -m pytest src/datayoga_core/blocks/relational/read/tests/test_relational_read.py -v -``` - -Expected: FAIL — the current `produce()` yields one row at a time, so `[len(b) for b in batches]` is `[1] * 2500`. 
- -- [ ] **Step 6.3: Migrate `relational/read`** - -Replace the contents of `core/src/datayoga_core/blocks/relational/read/block.py` with: - -```python -import logging -from typing import Any, AsyncGenerator, Dict, List, Optional - -import sqlalchemy as sa -from datayoga_core import utils -from datayoga_core.blocks.relational import utils as relational_utils -from datayoga_core.context import Context -from datayoga_core.producer import Producer as DyProducer - -logger = logging.getLogger("dy") - - -class Block(DyProducer): - DEFAULT_FETCH_SIZE = 10000 - - def init(self, context: Optional[Context] = None): - self.engine, self.db_type = relational_utils.get_engine( - self.properties["connection"], - context, - autocommit=False, - ) - - self.schema = self.properties.get("schema") - self.table = self.properties.get("table") - self.opcode_field = self.properties.get("opcode_field") - self.load_strategy = self.properties.get("load_strategy") - self.keys = self.properties.get("keys") - self.mapping = self.properties.get("mapping") - - self.tbl = sa.Table(self.table, sa.MetaData(schema=self.schema), autoload_with=self.engine) - - logger.debug(f"Connecting to {self.db_type}") - self.connection = self.engine.connect() - - async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: - fetch_size = int(self.properties.get("fetch_size", self.DEFAULT_FETCH_SIZE)) - result = self.connection.execution_options(stream_results=True).execute(self.tbl.select()) - while True: - rows = result.fetchmany(fetch_size) - if not rows: - return - yield [utils.add_uid(dict(row._asdict())) for row in rows] - - def stop(self): - self.connection.close() - self.engine.dispose() -``` - -- [ ] **Step 6.4: Update the schema** - -Replace `core/src/datayoga_core/blocks/relational/read/block.schema.json` with: - -```json -{ - "title": "relational.read", - "description": "Read a table from an SQL-compatible data store", - "type": "object", - "$inherit": ["batchable"], - 
"additionalProperties": false, - "examples": [ - { - "id": "read_snowflake", - "type": "relational.read", - "properties": { - "connection": "eu_datalake", - "table": "employees", - "schema": "dbo" - } - } - ], - "properties": { - "connection": { - "type": "string", - "title": "The connection to use for loading", - "description": "Logical connection name as defined in the connections.dy.yaml", - "examples": ["europe_db", "target", "eu_dwh"] - }, - "schema": { - "type": "string", - "title": "The table schema of the table", - "description": "If left blank, the default schema of this connection will be used as defined in the connections.dy.yaml", - "examples": ["dbo"] - }, - "table": { - "type": "string", - "title": "The table name", - "description": "Table name", - "examples": ["employees"] - }, - "columns": { - "type": "array", - "title": "Optional subset of columns to load", - "items": { - "type": ["string", "object"], - "title": "name of column" - }, - "examples": [["fname", { "lname": "last_name" }]] - }, - "fetch_size": { - "type": "integer", - "minimum": 1, - "description": "Driver-level rows fetched per round-trip. Defaults to 10000.", - "default": 10000 - } - }, - "required": ["connection", "table"] -} -``` - -- [ ] **Step 6.5: Run test to verify it passes** - -```bash -cd core && python -m pytest src/datayoga_core/blocks/relational/read/tests/test_relational_read.py -v -``` - -Expected: 3 passed. - -- [ ] **Step 6.6: Run the full core suite** - -```bash -cd core && python -m pytest src/datayoga_core/ -x -q -``` - -Expected: all tests pass. 
- -- [ ] **Step 6.7: Commit** - -```bash -git add core/src/datayoga_core/blocks/relational/read/block.py \ - core/src/datayoga_core/blocks/relational/read/block.schema.json \ - core/src/datayoga_core/blocks/relational/read/tests/__init__.py \ - core/src/datayoga_core/blocks/relational/read/tests/test_relational_read.py -git commit -m "Migrate relational/read to produce_chunks, add fetch_size (#400, #295)" -``` - ---- - -## Task 7: Migrate `http/receiver` (fix one-by-one) - -The receiver currently yields one record per HTTP request. Migrate to drain the queue per chunk; `flush_ms` ensures partial batches flush during low-traffic periods. - -**Files:** - -- Modify: `core/src/datayoga_core/blocks/http/receiver/block.py` -- Modify: `core/src/datayoga_core/blocks/http/receiver/block.schema.json` - -- [ ] **Step 7.1: Write the failing test** - -Create `core/src/datayoga_core/blocks/http/receiver/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py`: - -```python -import asyncio - -import aiohttp -import pytest - -from datayoga_core.blocks.http.receiver.block import Block - - -def _free_port(): - import socket - with socket.socket() as s: - s.bind(("127.0.0.1", 0)) - return s.getsockname()[1] - - -@pytest.mark.asyncio -async def test_http_receiver_batches_incoming_requests(): - port = _free_port() - block = Block({"host": "127.0.0.1", "port": port, - "batch_size": 50, "flush_ms": 200}) - block.init() - - received = [] - - async def consumer(): - async for batch in block.produce(): - received.append(batch) - if sum(len(b) for b in received) >= 60: - return - - consumer_task = asyncio.create_task(consumer()) - await asyncio.sleep(0.2) # let server start - - async with aiohttp.ClientSession() as session: - for i in range(60): - async with session.post(f"http://127.0.0.1:{port}", json={"i": i}) as r: - assert r.status == 200 - - await asyncio.wait_for(consumer_task, timeout=5) - - flat = [r for b in received for r in b] - 
assert len(flat) == 60 - # Most records arrive in a full batch_size=50 batch; the rest arrive as a - # partial batch flushed by flush_ms. - assert any(len(b) == 50 for b in received) - assert all(Block.MSG_ID_FIELD in r for r in flat) -``` - -- [ ] **Step 7.2: Run test to verify it fails** - -```bash -cd core && python -m pytest src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py -v -``` - -Expected: FAIL — current implementation yields one record per batch; `assert any(len(b) == 50 ...)` is false. - -- [ ] **Step 7.3: Migrate `http/receiver`** - -Replace the contents of `core/src/datayoga_core/blocks/http/receiver/block.py` with: - -```python -import logging -from abc import ABCMeta -from asyncio import Queue -from contextlib import suppress -from itertools import count -from typing import Any, AsyncGenerator, Dict, List, Optional - -import orjson -from aiohttp.web import (BaseRequest, HTTPInternalServerError, HTTPOk, - Response, Server, ServerRunner, TCPSite) -from datayoga_core.context import Context -from datayoga_core.producer import Producer as DyProducer - -logger = logging.getLogger("dy") - - -class Block(DyProducer, metaclass=ABCMeta): - port: int - host: str - DEFAULT_FLUSH_MS = 1000 - - def init(self, context: Optional[Context] = None): - logger.debug(f"Initializing {self.get_block_name()}") - self.port = int(self.properties.get("port", 8080)) - self.host = self.properties.get("host", "0.0.0.0") - - async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: - queue: Queue = Queue(maxsize=1000) - - async def handler(request: BaseRequest) -> Response: - try: - queue.put_nowait(orjson.loads(await request.read())) - return HTTPOk() - except Exception: - logger.exception("Got exception while parsing request:") - return HTTPInternalServerError() - - runner = ServerRunner(Server(handler)) - await runner.setup() - srv = TCPSite(runner, self.host, self.port) - await srv.start() - logger.info(f"Listening on 
{self.host}:{self.port}...") - - try: - counter = iter(count()) - while True: - first = await queue.get() - chunk = [{self.MSG_ID_FIELD: f"{next(counter)}", **first}] - while not queue.empty(): - record = queue.get_nowait() - chunk.append({self.MSG_ID_FIELD: f"{next(counter)}", **record}) - yield chunk - finally: - with suppress(Exception): - await srv.stop() -``` - -- [ ] **Step 7.4: Update the schema** - -Replace `core/src/datayoga_core/blocks/http/receiver/block.schema.json` with: - -```json -{ - "title": "http.receiver", - "description": "Receives HTTP requests and process the data.", - "type": "object", - "$inherit": ["streamable"], - "properties": { - "host": { - "description": "Host to listen", - "type": "string", - "default": "0.0.0.0" - }, - "port": { - "description": "Port to listen", - "type": "integer", - "default": 8080 - } - }, - "additionalProperties": false, - "examples": [ - { - "host": "localhost", - "port": 8080 - } - ] -} -``` - -- [ ] **Step 7.5: Run test to verify it passes** - -```bash -cd core && python -m pytest src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py -v -``` - -Expected: 1 passed. - -- [ ] **Step 7.6: Run the full core suite** - -```bash -cd core && python -m pytest src/datayoga_core/ -x -q -``` - -Expected: all tests pass. - -- [ ] **Step 7.7: Commit** - -```bash -git add core/src/datayoga_core/blocks/http/receiver/block.py \ - core/src/datayoga_core/blocks/http/receiver/block.schema.json \ - core/src/datayoga_core/blocks/http/receiver/tests/__init__.py \ - core/src/datayoga_core/blocks/http/receiver/tests/test_http_receiver.py -git commit -m "Migrate http/receiver to produce_chunks (#400)" -``` - ---- - -## Task 8: Migrate `redis/read_stream` (closes #377) - -The redis stream producer yields one record at a time today. Migrate so it requests `count=batch_size` from `xreadgroup` and yields each response as a chunk; `flush_ms` flushes partial batches during low-volume periods. 
- -**Files:** - -- Modify: `core/src/datayoga_core/blocks/redis/read_stream/block.py` -- Modify: `core/src/datayoga_core/blocks/redis/read_stream/block.schema.json` - -- [ ] **Step 8.1: Write the failing test** - -Create `core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py`: - -```python -from unittest.mock import MagicMock - -import pytest - -from datayoga_core.blocks.redis.read_stream.block import Block - - -def _mk_block(properties, redis_client): - block = Block.__new__(Block) - block.properties = properties - block.redis_client = redis_client - block.stream = "mystream" - block.snapshot = properties.get("_snapshot", True) - block.consumer_group = "g" - block.requesting_consumer = "c" - return block - - -@pytest.mark.asyncio -async def test_redis_uses_count_equal_to_batch_size(): - redis = MagicMock() - # First call returns pending messages, second call returns "no new", which - # ends snapshot mode. 
- payload_a = (b"1-0", {b"data": b'{"i": 1}'}) - payload_b = (b"2-0", {b"data": b'{"i": 2}'}) - redis.xreadgroup.side_effect = [ - [(b"mystream", [payload_a, payload_b])], # pending - [(b"mystream", [])], # nothing new -> exit - ] - - block = _mk_block({"batch_size": 250, "_snapshot": True}, redis) - batches = [] - async for b in block.produce(): - batches.append(b) - - assert all(c.kwargs.get("count") == 250 or (len(c.args) >= 4 and c.args[3] == 250) - for c in redis.xreadgroup.call_args_list), \ - "xreadgroup should be called with count=batch_size" - - -@pytest.mark.asyncio -async def test_redis_yields_records_as_a_batch_not_one_by_one(): - redis = MagicMock() - pages = [(f"{i}-0".encode(), {b"data": f'{{"i": {i}}}'.encode()}) for i in range(5)] - redis.xreadgroup.side_effect = [ - [(b"mystream", pages)], - [(b"mystream", [])], - ] - - block = _mk_block({"batch_size": 100, "_snapshot": True}, redis) - batches = [] - async for b in block.produce(): - batches.append(b) - - # 5 records arrive as one chunk; base class re-emits as one batch of 5. - assert [len(b) for b in batches] == [5] - assert batches[0][0]["i"] == 0 -``` - -- [ ] **Step 8.2: Run test to verify it fails** - -```bash -cd core && python -m pytest src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py -v -``` - -Expected: FAIL — current `xreadgroup` call passes `count=None`, and the producer yields one record at a time. 
- -- [ ] **Step 8.3: Migrate `redis/read_stream`** - -Replace the contents of `core/src/datayoga_core/blocks/redis/read_stream/block.py` with: - -```python -import logging -from typing import Any, AsyncGenerator, Dict, List, Optional - -import datayoga_core.blocks.redis.utils as redis_utils -import orjson -from datayoga_core.connection import Connection -from datayoga_core.context import Context -from datayoga_core.producer import Producer as DyProducer - -logger = logging.getLogger("dy") - - -class Block(DyProducer): - DEFAULT_FLUSH_MS = 1000 - - def init(self, context: Optional[Context] = None): - logger.debug(f"Initializing {self.get_block_name()}") - connection_details = Connection.get_connection_details(self.properties["connection"], context) - self.redis_client = redis_utils.get_client(connection_details) - self.stream = self.properties["stream_name"] - self.snapshot = self.properties.get("snapshot", False) - self.consumer_group = f'datayoga_job_{context.properties.get("job_name", "") if context else ""}' - self.requesting_consumer = "dy_consumer_a" - stream_groups = self.redis_client.xinfo_groups(self.stream) - if next(filter(lambda x: x["name"] == self.consumer_group, stream_groups), None) is None: - logger.info(f"Creating a new {self.consumer_group} consumer group associated with the {self.stream}") - self.redis_client.xgroup_create(self.stream, self.consumer_group, 0) - - async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: - logger.debug(f"Running {self.get_block_name()}") - batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)) - read_pending = True - - while True: - streams = self.redis_client.xreadgroup( - self.consumer_group, self.requesting_consumer, - {self.stream: "0" if read_pending else ">"}, - count=batch_size, - block=100 if self.snapshot else 0, - ) - - yielded_any = False - for stream in streams: - logger.debug(f"Messages in {self.stream} stream (pending: {read_pending}):\n\t{stream}") - chunk: 
List[Dict[str, Any]] = [] - for key, value in stream[1]: - payload = orjson.loads(value[next(iter(value))]) - payload[self.MSG_ID_FIELD] = key - chunk.append(payload) - if chunk: - yielded_any = True - yield chunk - - # Snapshot ends after a pending-read followed by a "no new" read. - if self.snapshot and not read_pending and not yielded_any: - return - - read_pending = False - - def ack(self, msg_ids: List[str]): - for msg_id in msg_ids: - logger.info(f"Acking {msg_id} message in {self.stream} stream of {self.consumer_group} consumer group") - self.redis_client.xack(self.stream, self.consumer_group, msg_id) -``` - -Note: snapshot termination is slightly tightened: the loop exits when a non-pending read returns no messages, matching the spec's intent. This is more robust than the original `if self.snapshot and not read_pending: break`. - -- [ ] **Step 8.4: Update the schema** - -Replace `core/src/datayoga_core/blocks/redis/read_stream/block.schema.json` with: - -```json -{ - "title": "redis.read_stream", - "description": "Read from Redis stream", - "type": "object", - "$inherit": ["streamable"], - "properties": { - "connection": { "description": "Connection name", "type": "string" }, - "stream_name": { - "type": "string", - "title": "Source stream name", - "description": "Source stream name" - }, - "snapshot": { - "type": "boolean", - "title": "Snapshot current entries and quit", - "description": "Snapshot current entries and quit", - "default": false - } - }, - "additionalProperties": false, - "required": ["connection", "stream_name"] -} -``` - -- [ ] **Step 8.5: Run test to verify it passes** - -```bash -cd core && python -m pytest src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py -v -``` - -Expected: 2 passed. - -- [ ] **Step 8.6: Run the full core suite** - -```bash -cd core && python -m pytest src/datayoga_core/ -x -q -``` - -Expected: all tests pass. 
- -- [ ] **Step 8.7: Commit** - -```bash -git add core/src/datayoga_core/blocks/redis/read_stream/block.py \ - core/src/datayoga_core/blocks/redis/read_stream/block.schema.json \ - core/src/datayoga_core/blocks/redis/read_stream/tests/__init__.py \ - core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py -git commit -m "Migrate redis/read_stream to batched xreadgroup (#400, #377)" -``` - ---- - -## Task 9: Migrate `azure/read_event_hub` (rename `batch_size` → `max_batch_size`) - -Today `batch_size` controls the SDK callback size, not the pipeline batch size. Rename to `max_batch_size`, add `additionalProperties: false`, and use the streamable fragment so the _new_ `batch_size` means pipeline batch size. - -**Files:** - -- Modify: `core/src/datayoga_core/blocks/azure/read_event_hub/block.py` -- Modify: `core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json` - -- [ ] **Step 9.1: Write the failing test** - -Create `core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py` (empty) and `core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py`: - -```python -import pytest -from jsonschema import ValidationError - -from datayoga_core.blocks.azure.read_event_hub.block import Block - - -def _minimal_props(extra=None): - base = { - "event_hub_connection_string": "Endpoint=sb://x/;SharedAccessKeyName=k;SharedAccessKey=v;EntityPath=eh", - "event_hub_consumer_group_name": "$Default", - "event_hub_name": "eh", - "checkpoint_store_connection_string": "DefaultEndpointsProtocol=https;AccountName=a;AccountKey=k==", - "checkpoint_store_container_name": "chk", - } - if extra: - base.update(extra) - return base - - -def test_unknown_property_rejected_by_validation(): - """additionalProperties: false catches typos like the legacy 'batch_sz'.""" - with pytest.raises(ValidationError): - Block(_minimal_props({"batch_sz": 300})) - - -def test_max_batch_size_accepted(): - """The renamed SDK-level property is now 
max_batch_size.""" - block = Block(_minimal_props({"max_batch_size": 500, "batch_size": 100})) - assert block.properties["max_batch_size"] == 500 - assert block.properties["batch_size"] == 100 - - -def test_max_batch_size_defaults_to_300_when_omitted(): - """init() reads max_batch_size with a default of 300 if not present.""" - # We can't safely call init() in unit tests (it instantiates the Azure - # SDK client); read the property via the same path init() does. - block = Block(_minimal_props()) - assert int(block.properties.get("max_batch_size", 300)) == 300 - - -def test_renamed_schema_has_additional_properties_false(): - """Schema after rename: max_batch_size + streamable's batch_size/flush_ms, - no unknown properties allowed.""" - block = Block(_minimal_props()) - schema = block.get_json_schema() - assert schema.get("additionalProperties") is False - assert "max_batch_size" in schema["properties"] - assert "batch_size" in schema["properties"] # from streamable fragment - assert "flush_ms" in schema["properties"] # from streamable fragment - - -def test_batch_size_300_is_silently_repurposed(): - """A user upgrading from a pre-rename version with batch_size: 300 (which - used to mean SDK callback size) will see their YAML still validate, but - batch_size now means pipeline batch size. This is documented in the PR - description and processing-strategies.md as a breaking change.""" - block = Block(_minimal_props({"batch_size": 300})) - # Schema validation passes — batch_size is a known property (now pipeline-meaning). - # The user must rename to max_batch_size: 300 to preserve old behavior. 
- assert block.properties["batch_size"] == 300 - assert "max_batch_size" not in block.properties -``` - -- [ ] **Step 9.2: Run test to verify it fails** - -```bash -cd core && python -m pytest src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py -v -``` - -Expected: most of the 5 tests FAIL — current schema has no `additionalProperties: false`, no `max_batch_size`, no `$inherit`. - -- [ ] **Step 9.3: Update the schema** - -Replace `core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json` with: - -```json -{ - "title": "azure.read_event_hub", - "description": "Read from Azure Event Hub", - "type": "object", - "$inherit": ["streamable"], - "properties": { - "event_hub_connection_string": { - "type": "string", - "description": "The connection string for the Azure Event Hub namespace." - }, - "event_hub_consumer_group_name": { - "type": "string", - "description": "The name of the consumer group to read events from." - }, - "event_hub_name": { - "type": "string", - "description": "The name of the Azure Event Hub." - }, - "checkpoint_store_connection_string": { - "type": "string", - "description": "The connection string for the Azure Storage account used as the checkpoint store." - }, - "checkpoint_store_container_name": { - "type": "string", - "description": "The name of the container within the checkpoint store to store the checkpoints." - }, - "max_batch_size": { - "type": "integer", - "minimum": 1, - "description": "Maximum number of events to receive in each SDK callback. Renamed from the previous batch_size which used to mean this. 
Defaults to 300.", - "default": 300 - } - }, - "additionalProperties": false, - "required": [ - "event_hub_connection_string", - "event_hub_consumer_group_name", - "event_hub_name", - "checkpoint_store_connection_string", - "checkpoint_store_container_name" - ] -} -``` - -- [ ] **Step 9.4: Migrate the producer** - -Replace the contents of `core/src/datayoga_core/blocks/azure/read_event_hub/block.py` with: - -```python -import asyncio -import logging -from typing import Any, AsyncGenerator, Dict, List, Optional - -import orjson -from azure.eventhub import EventData, PartitionContext -from azure.eventhub.aio import EventHubConsumerClient -from azure.eventhub.extensions.checkpointstoreblobaio import \ - BlobCheckpointStore -from datayoga_core.context import Context -from datayoga_core.producer import Producer as DyProducer - -logger = logging.getLogger("dy") - - -class Block(DyProducer): - """Azure Event Hub block for reading events.""" - - DEFAULT_FLUSH_MS = 1000 - - def init(self, context: Optional[Context] = None): - logger.debug(f"Initializing {self.get_block_name()}") - self.max_batch_size = int(self.properties.get("max_batch_size", 300)) - self.consumer_client = EventHubConsumerClient.from_connection_string( - conn_str=self.properties["event_hub_connection_string"], - consumer_group=self.properties["event_hub_consumer_group_name"], - eventhub_name=self.properties["event_hub_name"], - checkpoint_store=BlobCheckpointStore.from_connection_string( - self.properties["checkpoint_store_connection_string"], - self.properties["checkpoint_store_container_name"]), - ) - self.events: Dict[Any, Any] = {} - self.messages: asyncio.Queue = asyncio.Queue() - - async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: - logger.debug(f"Running {self.get_block_name()}") - logger.debug("Starting event receiving process") - asyncio.create_task(self.receive_batch()) - - while True: - first = await self.messages.get() - chunk = [first] - while not 
self.messages.empty(): - chunk.append(self.messages.get_nowait()) - yield chunk - - async def receive_batch(self): - await self.consumer_client.receive_batch( - on_event_batch=self.on_event_batch, - max_batch_size=self.max_batch_size, - starting_position="-1", - ) - - async def on_event_batch(self, partition_context: PartitionContext, events: List[EventData]): - logger.debug(f"Received batch of events from partition: {partition_context.partition_id}") - for event in events: - try: - payload = orjson.loads(event.body_as_str(encoding="UTF-8")) - msg_id = event.system_properties[b"x-opt-sequence-number"] - self.events[msg_id] = (event, partition_context) - payload[self.MSG_ID_FIELD] = msg_id - await self.messages.put(payload) - except Exception as e: - logger.error(e) - - async def complete_events(self, msg_ids: List[str]): - for msg_id in msg_ids: - logger.debug(f"Acking {msg_id} event") - event, partition_context = self.events.pop(msg_id, (None, None)) - if event is not None: - await partition_context.update_checkpoint(event) - else: - logger.warning(f"Couldn't find event {msg_id} for acknowledging") - - def ack(self, msg_ids: List[str]): - asyncio.create_task(self.complete_events(msg_ids)) -``` - -- [ ] **Step 9.5: Run test to verify it passes** - -```bash -cd core && python -m pytest src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py -v -``` - -Expected: 5 passed. - -- [ ] **Step 9.6: Run the full core suite** - -```bash -cd core && python -m pytest src/datayoga_core/ -x -q -``` - -Expected: all tests pass. 
- -- [ ] **Step 9.7: Commit** - -```bash -git add core/src/datayoga_core/blocks/azure/read_event_hub/block.py \ - core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json \ - core/src/datayoga_core/blocks/azure/read_event_hub/tests/__init__.py \ - core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py -git commit -m "Migrate azure/read_event_hub; rename batch_size -> max_batch_size (#400, BREAKING)" -``` - ---- - -## Task 10: Regenerate autogenerated schemas and docs - -The aggregated `schemas/job.schema.json` and the per-block markdown in `docs/reference/blocks/` are generated by scripts. After the per-block schema changes, regenerate them. - -**Files:** - -- Modify: `schemas/job.schema.json` -- Modify: `docs/reference/blocks/std_read.md`, `files_read_csv.md`, `parquet_read.md`, `relational_read.md`, `redis_read_stream.md`, `http_receiver.md`, `azure_read_event_hub.md` (autogenerated) - -- [ ] **Step 10.1: Regenerate the JSON schemas** - -```bash -bash scripts/generate-jsonschemas.sh -``` - -Expected output: `JSON schemas generated successfully`. - -- [ ] **Step 10.2: Regenerate the reference docs** - -```bash -bash scripts/generate-docs.sh -``` - -Expected: completes without error. - -- [ ] **Step 10.3: Inspect the diff** - -```bash -git diff schemas/ docs/reference/blocks/ | head -200 -``` - -Expected: `batch_size` (and `flush_ms` for streaming producers, `fetch_size` for relational/read, `max_batch_size` for event_hub) appear in the appropriate schema entries and docs. 
- -- [ ] **Step 10.4: Commit** - -```bash -git add schemas/job.schema.json docs/reference/blocks/ -git commit -m "Regenerate JSON schemas and reference docs after producer batching (#400)" -``` - ---- - -## Task 11: Document the producer batching model in processing-strategies - -**Files:** - -- Modify: `docs/processing-strategies.md` - -- [ ] **Step 11.1: Add a section on producer batching** - -Append the following section to `docs/processing-strategies.md` (or replace an existing section if one already covers it): - -````markdown -## Producer Batching - -Every producer block (any block that reads from a source — `std/read`, `files/read_csv`, `parquet/read`, `relational/read`, `redis/read_stream`, `azure/read_event_hub`, `http/receiver`) accepts a `batch_size` property. The producer base class re-chunks the source's output into batches of `batch_size` records (only the final batch, or a batch flushed early by `flush_ms`, may be smaller), regardless of how the source delivers them (per row, per row group, per `fetchmany`, per network message). - -```yaml -input: - uses: files.read_csv - with: - file: people.csv - batch_size: 500 # downstream steps process 500 records per call -``` - -Default: `1000`. - -### Streaming producers and `flush_ms` - -Streaming producers (`redis/read_stream`, `azure/read_event_hub`, `http/receiver`) also accept `flush_ms`. If no new records arrive within that many milliseconds, any partial batch is flushed downstream instead of being held until `batch_size` is reached. - -```yaml -input: - uses: redis.read_stream - with: - connection: my_redis - stream_name: events - batch_size: 1000 - flush_ms: 500 # emit a partial batch after 500ms of inactivity -``` - -Default: `1000` ms. Set to `null` to disable time-based flushing (records are held until `batch_size` or end-of-stream). - -### `relational/read` and `fetch_size` - -`relational/read` exposes an extra `fetch_size` property that controls how many rows are pulled from the database driver per round-trip, independent of the pipeline `batch_size`. 
Default: `10000`. Tune lower for memory pressure with wide rows; tune higher if you want fewer DB round-trips and downstream processing is the bottleneck. - -### `azure/read_event_hub` migration note - -In earlier versions, `batch_size` on `azure/read_event_hub` controlled the SDK callback batch size, not the pipeline batch size. As of #400 it has been renamed to `max_batch_size` to match the SDK semantic, and `batch_size` now consistently means pipeline batch size as it does for every other producer. -```` - -- [ ] **Step 11.2: Commit** - -```bash -git add docs/processing-strategies.md -git commit -m "Document producer batching model in processing-strategies (#400)" -``` - ---- - -## Task 12: Full verification and push branch - -- [ ] **Step 12.1: Run full core test suite** - -```bash -cd core && python -m pytest src/datayoga_core/ -v -``` - -Expected: all tests pass. Notably: - -- `test_producer_batching.py` (7 tests) -- `test_schema_inherit.py` (5 tests) -- `test_std_read.py`, `test_read_csv.py`, `test_parquet_read.py`, `test_relational_read.py`, `test_http_receiver.py`, `test_redis_read_stream.py`, `test_event_hub.py` (12 tests total) -- All pre-existing tests still pass. - -- [ ] **Step 12.2: Inspect the branch's commit history** - -```bash -git log --oneline 400-producer-batching-unification ^main -``` - -Expected: a clean sequence of commits — one per task — each referencing #400. - -- [ ] **Step 12.3: Push the branch** - -```bash -git push -u origin 400-producer-batching-unification -``` - -Expected: branch pushed to remote. - -- [ ] **Step 12.4: Open a draft PR (deferred — confirm with user first)** - -Before opening the PR, ask the user whether to open it as draft or ready-for-review, and confirm the body content. Do not run `gh pr create` autonomously. 
- -The PR description should call out the breaking change explicitly (no CHANGELOG file exists in this repo, so the PR description is the canonical place): - -> **Breaking change:** `azure/read_event_hub.batch_size` has been renamed to `max_batch_size`. The name `batch_size` now means pipeline batch size on this block, consistent with every other producer. Users with `batch_size: <N>` in their YAML for `azure/read_event_hub` must rename it to `max_batch_size: <N>` to preserve the previous SDK callback size semantic; the literal `batch_size: <N>` will validate but with the new pipeline-level meaning. From 78ef675c3858409109be8c28c5fdc47ff8440ddb Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 19:45:50 +0300 Subject: [PATCH 33/38] Switch from custom \$inherit to standard JSON Schema composition (#400) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review discussion: drop the custom \$inherit extension and use JSON Schema's standard allOf + \$ref composition instead. The on-disk schemas are now idiomatic JSON Schema, understood by any standards-compliant tool. Changes: - Each producer block schema gains "\$schema": draft/2019-09 and uses allOf: [{"\$ref": "../../../resources/schemas/<fragment>.schema.json"}] to inherit batch_size (and flush_ms for streaming producers). - additionalProperties: false -> unevaluatedProperties: false, which is composition-aware (the additionalProperties + allOf interaction is a known JSON Schema gotcha that rejects allOf-contributed properties). - schema_utils.resolve_inherits -> resolve_refs: walks the schema, inlines local-file \$refs recursively, detects cycles. The validation code path (Block.validate, Job.get_json_schema) stays unchanged — resolved schemas are flat. - Tests in test_schema_inherit.py rewritten for the new mechanics: inlining, transitive resolution, cycle detection, non-local refs passthrough, default base-dir fallback.
- generate-docs.sh: walks standard \$ref instead of \$inherit, and also flattens allOf properties for jsonschema2mk's benefit (docs-only). - Aggregate schemas/job.schema.json regenerated. External \$ref-aware tools (IDE plugins, OpenAPI exporters) can now follow the schemas without our custom resolver. jsonschema2mk is the one tool that doesn't grok \$ref, so the docs generator keeps its pre-resolution step. Co-Authored-By: Claude Opus 4.7 (1M context) --- core/src/datayoga_core/block.py | 4 +- .../azure/read_event_hub/block.schema.json | 5 +- .../read_event_hub/tests/test_event_hub.py | 17 +- .../blocks/files/read_csv/block.schema.json | 5 +- .../blocks/http/receiver/block.schema.json | 5 +- .../blocks/parquet/read/block.schema.json | 5 +- .../redis/read_stream/block.schema.json | 5 +- .../blocks/relational/read/block.schema.json | 5 +- .../blocks/std/read/block.schema.json | 5 +- core/src/datayoga_core/job.py | 4 +- .../resources/schemas/batchable.schema.json | 1 + .../resources/schemas/streamable.schema.json | 1 + core/src/datayoga_core/schema_utils.py | 116 +++++---- .../tests/test_schema_inherit.py | 166 +++++++------ docs/reference/blocks/azure_read_event_hub.md | 1 - docs/reference/blocks/files_read_csv.md | 1 - docs/reference/blocks/http_receiver.md | 1 - docs/reference/blocks/parquet_read.md | 1 - docs/reference/blocks/redis_read_stream.md | 1 - docs/reference/blocks/relational_read.md | 1 - docs/reference/blocks/relational_write.md | 9 - docs/reference/blocks/std_read.md | 1 - ...28-producer-batching-unification-design.md | 41 ++-- schemas/job.schema.json | 228 ++++++++++++------ scripts/generate-docs.sh | 68 ++++-- 25 files changed, 413 insertions(+), 284 deletions(-) diff --git a/core/src/datayoga_core/block.py b/core/src/datayoga_core/block.py index a0b65e06..2dd6300b 100644 --- a/core/src/datayoga_core/block.py +++ b/core/src/datayoga_core/block.py @@ -57,8 +57,8 @@ def get_json_schema(self) -> Dict[str, Any]: "block.schema.json") 
logger.debug(f"loading schema from {json_schema_file}") # Lazy import: schema_utils -> utils -> block creates a circular import at module load. - from datayoga_core.schema_utils import resolve_inherits - return resolve_inherits(utils.read_json(json_schema_file)) + from datayoga_core.schema_utils import resolve_refs + return resolve_refs(utils.read_json(json_schema_file), schema_path=json_schema_file) @abstractmethod def init(self, context: Optional[Context] = None): diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json b/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json index f663d383..f014b63f 100644 --- a/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json +++ b/core/src/datayoga_core/blocks/azure/read_event_hub/block.schema.json @@ -1,8 +1,9 @@ { + "$schema": "https://json-schema.org/draft/2019-09/schema", "title": "azure.read_event_hub", "description": "Read from Azure Event Hub", "type": "object", - "$inherit": ["streamable"], + "allOf": [{ "$ref": "../../../resources/schemas/streamable.schema.json" }], "properties": { "event_hub_connection_string": { "type": "string", @@ -31,7 +32,7 @@ "default": 300 } }, - "additionalProperties": false, + "unevaluatedProperties": false, "required": [ "event_hub_connection_string", "event_hub_consumer_group_name", diff --git a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py index 0506ee7b..b18fca3b 100644 --- a/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py +++ b/core/src/datayoga_core/blocks/azure/read_event_hub/tests/test_event_hub.py @@ -18,7 +18,7 @@ def _minimal_props(extra=None): def test_unknown_property_rejected_by_validation(): - """additionalProperties: false catches typos like 'batch_sz'.""" + """unevaluatedProperties: false catches typos like 'batch_sz'.""" with pytest.raises(ValidationError): 
Block(_minimal_props({"batch_sz": 300})) @@ -30,15 +30,18 @@ def test_max_batch_size_accepted(): assert block.properties["batch_size"] == 100 -def test_renamed_schema_has_additional_properties_false(): - """Schema after rename: max_batch_size + streamable's batch_size/flush_ms, - no unknown properties allowed.""" +def test_renamed_schema_uses_unevaluated_properties_with_streamable(): + """Schema after rename: max_batch_size locally, streamable contributes + batch_size + flush_ms via allOf $ref, and unevaluatedProperties=false + rejects anything else.""" block = Block(_minimal_props()) schema = block.get_json_schema() - assert schema.get("additionalProperties") is False + assert schema.get("unevaluatedProperties") is False assert "max_batch_size" in schema["properties"] - assert "batch_size" in schema["properties"] - assert "flush_ms" in schema["properties"] + # batch_size and flush_ms come from the inlined streamable fragment via allOf + fragment_props = schema["allOf"][0]["properties"] + assert "batch_size" in fragment_props + assert "flush_ms" in fragment_props def test_batch_size_300_is_silently_repurposed(): diff --git a/core/src/datayoga_core/blocks/files/read_csv/block.schema.json b/core/src/datayoga_core/blocks/files/read_csv/block.schema.json index ca7d638b..dc837561 100644 --- a/core/src/datayoga_core/blocks/files/read_csv/block.schema.json +++ b/core/src/datayoga_core/blocks/files/read_csv/block.schema.json @@ -1,8 +1,9 @@ { + "$schema": "https://json-schema.org/draft/2019-09/schema", "title": "files.read_csv", "description": "Read data from CSV", "type": "object", - "$inherit": ["batchable"], + "allOf": [{ "$ref": "../../../resources/schemas/batchable.schema.json" }], "properties": { "file": { "description": "Filename. 
Can contain a regexp or glob expression", @@ -48,7 +49,7 @@ "default": "\"" } }, - "additionalProperties": false, + "unevaluatedProperties": false, "required": ["file"], "examples": [ { diff --git a/core/src/datayoga_core/blocks/http/receiver/block.schema.json b/core/src/datayoga_core/blocks/http/receiver/block.schema.json index a52edcc5..1f93ccd5 100644 --- a/core/src/datayoga_core/blocks/http/receiver/block.schema.json +++ b/core/src/datayoga_core/blocks/http/receiver/block.schema.json @@ -1,8 +1,9 @@ { + "$schema": "https://json-schema.org/draft/2019-09/schema", "title": "http.receiver", "description": "Receives HTTP requests and process the data.", "type": "object", - "$inherit": ["streamable"], + "allOf": [{ "$ref": "../../../resources/schemas/streamable.schema.json" }], "properties": { "host": { "description": "Host to listen", @@ -15,7 +16,7 @@ "default": 8080 } }, - "additionalProperties": false, + "unevaluatedProperties": false, "examples": [ { "host": "localhost", diff --git a/core/src/datayoga_core/blocks/parquet/read/block.schema.json b/core/src/datayoga_core/blocks/parquet/read/block.schema.json index 395b3edd..777c23c4 100644 --- a/core/src/datayoga_core/blocks/parquet/read/block.schema.json +++ b/core/src/datayoga_core/blocks/parquet/read/block.schema.json @@ -1,15 +1,16 @@ { + "$schema": "https://json-schema.org/draft/2019-09/schema", "title": "parquet.read", "description": "Read data from parquet", "type": "object", - "$inherit": ["batchable"], + "allOf": [{ "$ref": "../../../resources/schemas/batchable.schema.json" }], "properties": { "file": { "description": "Filename. 
Can contain a regexp or glob expression", "type": "string" } }, - "additionalProperties": false, + "unevaluatedProperties": false, "required": ["file"], "examples": [ { diff --git a/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json b/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json index f7e0a948..4411149f 100644 --- a/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json +++ b/core/src/datayoga_core/blocks/redis/read_stream/block.schema.json @@ -1,8 +1,9 @@ { + "$schema": "https://json-schema.org/draft/2019-09/schema", "title": "redis.read_stream", "description": "Read from Redis stream", "type": "object", - "$inherit": ["streamable"], + "allOf": [{ "$ref": "../../../resources/schemas/streamable.schema.json" }], "properties": { "connection": { "description": "Connection name", "type": "string" }, "stream_name": { @@ -17,6 +18,6 @@ "default": false } }, - "additionalProperties": false, + "unevaluatedProperties": false, "required": ["connection", "stream_name"] } diff --git a/core/src/datayoga_core/blocks/relational/read/block.schema.json b/core/src/datayoga_core/blocks/relational/read/block.schema.json index df5bc8b2..29f5715a 100644 --- a/core/src/datayoga_core/blocks/relational/read/block.schema.json +++ b/core/src/datayoga_core/blocks/relational/read/block.schema.json @@ -1,9 +1,10 @@ { + "$schema": "https://json-schema.org/draft/2019-09/schema", "title": "relational.read", "description": "Read a table from an SQL-compatible data store", "type": "object", - "$inherit": ["batchable"], - "additionalProperties": false, + "allOf": [{ "$ref": "../../../resources/schemas/batchable.schema.json" }], + "unevaluatedProperties": false, "examples": [ { "id": "read_snowflake", diff --git a/core/src/datayoga_core/blocks/std/read/block.schema.json b/core/src/datayoga_core/blocks/std/read/block.schema.json index 2214ac05..5d825898 100644 --- a/core/src/datayoga_core/blocks/std/read/block.schema.json +++ 
b/core/src/datayoga_core/blocks/std/read/block.schema.json @@ -1,8 +1,9 @@ { + "$schema": "https://json-schema.org/draft/2019-09/schema", "title": "std.read", "description": "Read from the standard input", "type": "object", - "$inherit": ["batchable"], + "allOf": [{ "$ref": "../../../resources/schemas/batchable.schema.json" }], "properties": {}, - "additionalProperties": false + "unevaluatedProperties": false } diff --git a/core/src/datayoga_core/job.py b/core/src/datayoga_core/job.py index 9df8c267..710d84e6 100644 --- a/core/src/datayoga_core/job.py +++ b/core/src/datayoga_core/job.py @@ -238,11 +238,11 @@ def get_json_schema(whitelisted_blocks: Optional[List[str]] = None) -> Dict[str, block_types = [] block_schemas = [] # Lazy import: schema_utils -> utils -> block creates a circular import at module load. - from datayoga_core.schema_utils import resolve_inherits + from datayoga_core.schema_utils import resolve_refs for block_type, schema_path in block_info: block_types.append(block_type) # load schema file - schema = resolve_inherits(utils.read_json(f"{schema_path}")) + schema = resolve_refs(utils.read_json(f"{schema_path}"), schema_path=f"{schema_path}") # append to the array of allOf for the full schema # we use allOf for better error reporting block_schemas.append({ diff --git a/core/src/datayoga_core/resources/schemas/batchable.schema.json b/core/src/datayoga_core/resources/schemas/batchable.schema.json index f158d4fb..c04fb8fa 100644 --- a/core/src/datayoga_core/resources/schemas/batchable.schema.json +++ b/core/src/datayoga_core/resources/schemas/batchable.schema.json @@ -1,4 +1,5 @@ { + "$schema": "https://json-schema.org/draft/2019-09/schema", "title": "batchable", "description": "Producer batching mixin: declares batch_size for producers that yield records in batches.", "type": "object", diff --git a/core/src/datayoga_core/resources/schemas/streamable.schema.json b/core/src/datayoga_core/resources/schemas/streamable.schema.json index 761c6d65..0bdba461 
100644 --- a/core/src/datayoga_core/resources/schemas/streamable.schema.json +++ b/core/src/datayoga_core/resources/schemas/streamable.schema.json @@ -1,4 +1,5 @@ { + "$schema": "https://json-schema.org/draft/2019-09/schema", "title": "streamable", "description": "Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources.", "type": "object", diff --git a/core/src/datayoga_core/schema_utils.py b/core/src/datayoga_core/schema_utils.py index e009a984..c170b733 100644 --- a/core/src/datayoga_core/schema_utils.py +++ b/core/src/datayoga_core/schema_utils.py @@ -1,62 +1,88 @@ """Schema composition helpers. -Producers and other blocks can declare `"$inherit": ["batchable"]` at the -top of their block.schema.json to pull in shared property definitions from -the fragments in resources/schemas/. `resolve_inherits` merges the -fragments' `properties` into the local schema (local properties win), then -removes the `$inherit` key. Schemas without `$inherit` are returned as-is. +Producer block schemas use standard JSON Schema composition via `$ref` + +`allOf` (with `unevaluatedProperties: false` to allow inherited properties). +At validation time we want to keep the simple `jsonschema.validate(instance, +schema)` code path, so we resolve any local-file `$ref`s into the schema +ahead of time. The on-disk schemas remain standard JSON Schema; only the +in-memory form is flattened. + +Example: a block schema like + + {"allOf": [{"$ref": "../../../resources/schemas/batchable.schema.json"}], + "properties": {...}, + "unevaluatedProperties": false} + +becomes + + {"allOf": [{...contents of batchable.schema.json...}], + "properties": {...}, + "unevaluatedProperties": false} + +after `resolve_refs(schema, schema_path)`.
""" from __future__ import annotations import copy from os import path -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Set from datayoga_core import utils -def resolve_inherits(schema: Dict[str, Any], schemas_dir: Optional[str] = None) -> Dict[str, Any]: - """Merge any fragments listed in $inherit into the schema's properties. +def resolve_refs(schema: Dict[str, Any], schema_path: Optional[str] = None) -> Dict[str, Any]: + """Return a copy of `schema` with local-file `$ref`s inlined recursively. Args: - schema: The schema to resolve. Mutated in place and also returned. - schemas_dir: Directory containing the fragment files. Defaults to + schema: The schema to resolve. + schema_path: Filesystem path the schema was loaded from. Used to + resolve relative `$ref` paths. If None, refs are resolved against the bundled/non-bundled resources/schemas directory. Returns: - The mutated schema with $inherit removed and fragment properties merged. + A new schema with all local-file $refs replaced by the referenced + document's contents. Non-local refs (http://, #fragments) pass + through unchanged; a local-file ref whose target is missing raises. + + Raises: + FileNotFoundError: A local-file $ref points at a file that doesn't exist. + ValueError: A circular $ref chain is detected.
""" - inherits = schema.get("$inherit") - if inherits is None or inherits == []: - return schema - if not isinstance(inherits, list) or not all(isinstance(name, str) for name in inherits): - raise TypeError( - f"$inherit must be a list of fragment names (strings), got {inherits!r}" - ) - - if schemas_dir is None: - schemas_dir = utils.get_resource_path("schemas") - - merged_properties: Dict[str, Any] = {} - for fragment_name in inherits: - fragment_path = path.join(schemas_dir, f"{fragment_name}.schema.json") - if not path.isfile(fragment_path): - raise FileNotFoundError( - f"Schema fragment '{fragment_name}' not found at {fragment_path}" - ) - fragment = utils.read_json(fragment_path) - if fragment.get("$inherit"): - raise ValueError( - f"Schema fragment '{fragment_name}' itself contains $inherit; " - "nested inheritance is not supported. Inline the parent fragment's " - "properties or restructure the hierarchy." - ) - merged_properties.update(copy.deepcopy(fragment.get("properties", {}))) - - # Local properties take precedence over inherited ones. 
- local_properties = schema.get("properties", {}) - merged_properties.update(local_properties) - - schema["properties"] = merged_properties - schema.pop("$inherit", None) - return schema + if schema_path is not None: + base_dir = path.dirname(path.abspath(schema_path)) + else: + base_dir = utils.get_resource_path("schemas") + + return _resolve_node(schema, base_dir, visited=set()) + + +def _resolve_node(node: Any, base_dir: str, visited: Set[str]) -> Any: + if isinstance(node, dict): + ref = node.get("$ref") + if isinstance(ref, str) and _is_local_file_ref(ref): + target = path.normpath(path.join(base_dir, ref)) + if target in visited: + raise ValueError(f"Circular $ref detected resolving '{ref}' at {target}") + if not path.isfile(target): + raise FileNotFoundError( + f"$ref target not found: '{ref}' resolved to {target}" + ) + fragment = utils.read_json(target) + visited.add(target) + try: + resolved = _resolve_node(fragment, path.dirname(target), visited) + finally: + visited.discard(target) + return resolved + return {k: _resolve_node(v, base_dir, visited) for k, v in node.items()} + if isinstance(node, list): + return [_resolve_node(item, base_dir, visited) for item in node] + return copy.copy(node) + + +def _is_local_file_ref(ref: str) -> bool: + """A $ref is a local file ref if it looks like a path to a .json/.schema.json + file with no URI scheme and no in-document fragment.""" + if ref.startswith("#") or "://" in ref: + return False + return ref.endswith(".json") diff --git a/core/src/datayoga_core/tests/test_schema_inherit.py b/core/src/datayoga_core/tests/test_schema_inherit.py index f01a1dfe..dce5024d 100644 --- a/core/src/datayoga_core/tests/test_schema_inherit.py +++ b/core/src/datayoga_core/tests/test_schema_inherit.py @@ -1,100 +1,118 @@ +"""Tests for the $ref pre-resolver in `schema_utils.resolve_refs`. + +Block schemas use standard JSON Schema composition (`allOf` + `$ref` to +local fragment files). 
We pre-resolve those refs at load time so the +in-memory schema is self-contained. +""" +import json from pathlib import Path import pytest -from datayoga_core.schema_utils import resolve_inherits +from datayoga_core.schema_utils import resolve_refs + +SCHEMAS_DIR = Path(__file__).resolve().parent.parent / "resources" / "schemas" +BATCHABLE = SCHEMAS_DIR / "batchable.schema.json" -SCHEMAS_DIR = ( - Path(__file__).resolve().parent.parent / "resources" / "schemas" -) +def test_resolve_refs_inlines_local_ref(tmp_path): + """A {'$ref': 'localfile.json'} node is replaced inline with the file's contents.""" + fragment = {"type": "object", "properties": {"x": {"type": "integer"}}} + frag_path = tmp_path / "frag.schema.json" + frag_path.write_text(json.dumps(fragment)) -def test_inherit_merges_fragment_properties(): - """A schema with $inherit:[batchable] picks up batch_size from the fragment.""" schema = { - "title": "demo", "type": "object", - "$inherit": ["batchable"], - "properties": {"foo": {"type": "string"}}, - "additionalProperties": False, + "allOf": [{"$ref": "frag.schema.json"}], + "properties": {"y": {"type": "string"}}, } - resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) - assert "$inherit" not in resolved - assert "batch_size" in resolved["properties"] - assert resolved["properties"]["batch_size"]["default"] == 1000 - assert resolved["properties"]["foo"] == {"type": "string"} - assert resolved["additionalProperties"] is False + schema_path = tmp_path / "host.schema.json" + resolved = resolve_refs(schema, schema_path=str(schema_path)) + assert resolved["allOf"][0] == fragment + assert "$ref" not in json.dumps(resolved) -def test_inherit_local_property_wins_over_fragment(): - """When local schema redefines an inherited property, the local version takes precedence.""" - schema = { - "type": "object", - "$inherit": ["batchable"], - "properties": { - "batch_size": {"type": "integer", "minimum": 1, "default": 50} - }, - } - resolved = 
resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) - assert resolved["properties"]["batch_size"]["default"] == 50 + +def test_resolve_refs_no_ref_passthrough(tmp_path): + """Schemas with no `$ref` come out structurally equal.""" + schema = {"type": "object", "properties": {"x": {"type": "string"}}} + resolved = resolve_refs(schema, schema_path=str(tmp_path / "host.schema.json")) + assert resolved == schema -def test_inherit_streamable_brings_both_props(): - """$inherit:[streamable] exposes both batch_size and flush_ms on the schema.""" - schema = {"type": "object", "$inherit": ["streamable"], "properties": {}} - resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) - assert "batch_size" in resolved["properties"] - assert "flush_ms" in resolved["properties"] +def test_resolve_refs_resolves_transitively(tmp_path): + """A fragment that itself contains `$ref` is resolved all the way.""" + leaf = {"type": "object", "properties": {"leaf_prop": {"type": "integer"}}} + (tmp_path / "leaf.schema.json").write_text(json.dumps(leaf)) + middle = {"allOf": [{"$ref": "leaf.schema.json"}]} + (tmp_path / "middle.schema.json").write_text(json.dumps(middle)) -def test_schema_without_inherit_unchanged(): - """Schemas without $inherit pass through resolve_inherits unmodified.""" - schema = { - "type": "object", - "properties": {"foo": {"type": "string"}}, - "additionalProperties": False, - } - resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) - assert resolved == schema + schema = {"allOf": [{"$ref": "middle.schema.json"}]} + resolved = resolve_refs(schema, schema_path=str(tmp_path / "host.schema.json")) + # middle's $ref to leaf was resolved as part of the resolution of host's $ref to middle + assert resolved == {"allOf": [{"allOf": [leaf]}]} -def test_unknown_fragment_raises(): - """$inherit referencing a missing fragment file raises FileNotFoundError.""" - schema = {"type": "object", "$inherit": ["nope"], "properties": {}} - with 
pytest.raises(FileNotFoundError): - resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) +def test_resolve_refs_missing_file_raises(tmp_path): + """A `$ref` pointing at a missing local file raises FileNotFoundError.""" + schema = {"allOf": [{"$ref": "does_not_exist.schema.json"}]} + with pytest.raises(FileNotFoundError, match="does_not_exist.schema.json"): + resolve_refs(schema, schema_path=str(tmp_path / "host.schema.json")) -def test_inherit_string_value_raises_type_error(): - """$inherit must be a list; passing a string raises TypeError loudly.""" - schema = {"type": "object", "$inherit": "batchable", "properties": {}} - with pytest.raises(TypeError): - resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) +def test_resolve_refs_detects_circular(tmp_path): + """A → B → A cycle raises ValueError, not infinite recursion.""" + (tmp_path / "a.schema.json").write_text('{"allOf": [{"$ref": "b.schema.json"}]}') + (tmp_path / "b.schema.json").write_text('{"allOf": [{"$ref": "a.schema.json"}]}') -def test_inherit_non_string_items_raises_type_error(): - """Non-string items in the $inherit list raise TypeError.""" - schema = {"type": "object", "$inherit": ["batchable", 123], "properties": {}} - with pytest.raises(TypeError): - resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) + schema = {"allOf": [{"$ref": "a.schema.json"}]} + with pytest.raises(ValueError, match="Circular"): + resolve_refs(schema, schema_path=str(tmp_path / "host.schema.json")) -def test_inherit_empty_list_returns_unchanged(): - """An empty $inherit list is a no-op; the schema is returned as-is.""" - schema = {"type": "object", "$inherit": [], "properties": {"foo": {}}} - resolved = resolve_inherits(schema, schemas_dir=str(SCHEMAS_DIR)) - # Early-return path: schema is returned as-is (no mutation, no key removal). - assert resolved is schema +def test_resolve_refs_ignores_non_local_refs(tmp_path): + """`$ref` values like '#/$defs/x' or 'http://...' 
are left untouched.""" + schema = { + "allOf": [ + {"$ref": "#/$defs/internal"}, + {"$ref": "https://json-schema.org/draft/2019-09/schema"}, + ], + "$defs": {"internal": {"type": "integer"}}, + } + resolved = resolve_refs(schema, schema_path=str(tmp_path / "host.schema.json")) + assert resolved == schema -def test_nested_inherit_raises_value_error(tmp_path): - """A fragment that itself contains $inherit raises ValueError (no nested inheritance).""" - # Build a fragment dir with a fragment that has its own $inherit. - (tmp_path / "parent.schema.json").write_text( - '{"properties": {"x": {"type": "string"}}}' - ) - (tmp_path / "child.schema.json").write_text( - '{"$inherit": ["parent"], "properties": {"y": {"type": "string"}}}' +def test_resolve_refs_against_real_fragment(): + """resolve_refs against the actual batchable fragment in the repo works.""" + # Simulate loading a block schema whose path is at depth blocks/X/Y/. + schema = { + "$schema": "https://json-schema.org/draft/2019-09/schema", + "type": "object", + "allOf": [{"$ref": "../../../resources/schemas/batchable.schema.json"}], + "properties": {"connection": {"type": "string"}}, + "unevaluatedProperties": False, + } + # Pick any real block path so the relative $ref resolves. 
+ block_path = ( + Path(__file__).resolve().parent.parent + / "blocks" / "std" / "read" / "block.schema.json" ) - schema = {"$inherit": ["child"], "type": "object", "properties": {}} - with pytest.raises(ValueError, match="nested inheritance is not supported"): - resolve_inherits(schema, schemas_dir=str(tmp_path)) + resolved = resolve_refs(schema, schema_path=str(block_path)) + # The batchable fragment is inlined inside allOf + assert resolved["allOf"][0]["properties"]["batch_size"]["default"] == 1000 + + +def test_resolve_refs_default_base_dir(): + """When schema_path is None, refs resolve against resources/schemas/.""" + schema = {"allOf": [{"$ref": "batchable.schema.json"}]} + resolved = resolve_refs(schema) + assert resolved["allOf"][0]["properties"]["batch_size"]["default"] == 1000 + + +def test_resolve_refs_default_base_dir_with_missing_file(): + """Without schema_path, refs pointing at unknown files in the resources dir raise.""" + schema = {"allOf": [{"$ref": "nope.schema.json"}]} + with pytest.raises(FileNotFoundError): + resolve_refs(schema) diff --git a/docs/reference/blocks/azure_read_event_hub.md b/docs/reference/blocks/azure_read_event_hub.md index fc3f8e5b..72bb4ef6 100644 --- a/docs/reference/blocks/azure_read_event_hub.md +++ b/docs/reference/blocks/azure_read_event_hub.md @@ -21,7 +21,6 @@ Read from Azure Event Hub |**checkpoint\_store\_container\_name**|`string`|The name of the container within the checkpoint store to store the checkpoints.
|yes| |**max\_batch\_size**|`integer`|Maximum number of events to receive in each SDK callback. Renamed from the previous batch_size which used to mean this. Defaults to 300.
Default: `300`
Minimum: `1`
|no| -**Additional Properties:** not allowed **Example** ```yaml diff --git a/docs/reference/blocks/files_read_csv.md b/docs/reference/blocks/files_read_csv.md index 44833e34..4a03458e 100644 --- a/docs/reference/blocks/files_read_csv.md +++ b/docs/reference/blocks/files_read_csv.md @@ -20,7 +20,6 @@ Read data from CSV |**delimiter**|`string`|Delimiter to use for splitting the csv records
Default: `","`
Minimal Length: `1`
Maximal Length: `1`
|no| |**quotechar**|`string`|A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '
Default: `"\""`
Minimal Length: `1`
Maximal Length: `1`
|no| -**Additional Properties:** not allowed **Example** ```yaml diff --git a/docs/reference/blocks/http_receiver.md b/docs/reference/blocks/http_receiver.md index fa2c4cf2..1cad6824 100644 --- a/docs/reference/blocks/http_receiver.md +++ b/docs/reference/blocks/http_receiver.md @@ -17,7 +17,6 @@ Receives HTTP requests and process the data. |**host**|`string`|Host to listen
Default: `"0.0.0.0"`
|| |**port**|`integer`|Port to listen
Default: `8080`
|| -**Additional Properties:** not allowed **Example** ```yaml diff --git a/docs/reference/blocks/parquet_read.md b/docs/reference/blocks/parquet_read.md index 10f9f2b6..19a1c1b3 100644 --- a/docs/reference/blocks/parquet_read.md +++ b/docs/reference/blocks/parquet_read.md @@ -15,7 +15,6 @@ Read data from parquet |**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.
Default: `1000`
Minimum: `1`
|no| |**file**|`string`|Filename. Can contain a regexp or glob expression
|yes| -**Additional Properties:** not allowed **Example** ```yaml diff --git a/docs/reference/blocks/redis_read_stream.md b/docs/reference/blocks/redis_read_stream.md index 31c0b265..317e4497 100644 --- a/docs/reference/blocks/redis_read_stream.md +++ b/docs/reference/blocks/redis_read_stream.md @@ -18,7 +18,6 @@ Read from Redis stream |**stream\_name**
(Source stream name)|`string`|Source stream name
|yes| |**snapshot**
(Snapshot current entries and quit)|`boolean`|Snapshot current entries and quit
Default: `false`
|no| -**Additional Properties:** not allowed **Example** ```yaml diff --git a/docs/reference/blocks/relational_read.md b/docs/reference/blocks/relational_read.md index b439eb1b..409d6adb 100644 --- a/docs/reference/blocks/relational_read.md +++ b/docs/reference/blocks/relational_read.md @@ -19,7 +19,6 @@ Read a table from an SQL-compatible data store |[**columns**](#columns)
(Optional subset of columns to load)|`array`||no| |**fetch\_size**|`integer`|Driver-level rows fetched per round-trip. Defaults to 10000.
Default: `10000`
Minimum: `1`
|no| -**Additional Properties:** not allowed **Example** ```yaml diff --git a/docs/reference/blocks/relational_write.md b/docs/reference/blocks/relational_write.md index 34e54fed..a8ebabfb 100644 --- a/docs/reference/blocks/relational_write.md +++ b/docs/reference/blocks/relational_write.md @@ -24,15 +24,6 @@ Write into a SQL-compatible data store |[**inactive\_record\_mapping**](#inactive_record_mapping)
(Used for \`TYPE2\` load\_strategy\. The columns mapping to use to close out an active record)|`array`|A list of columns to use. Use any valid SQL expression for the source. If 'target' is omitted, will default to the name of the source column
Default:
|no| **Additional Properties:** not allowed -  - -**No properties.** - -  -**Not [required1]:** -**No properties.** - - **Example** ```yaml diff --git a/docs/reference/blocks/std_read.md b/docs/reference/blocks/std_read.md index e2d9481c..9f858f42 100644 --- a/docs/reference/blocks/std_read.md +++ b/docs/reference/blocks/std_read.md @@ -14,7 +14,6 @@ Read from the standard input |----|----|-----------|--------| |**batch\_size**|`integer`|Maximum number of records yielded per downstream batch.
Default: `1000`
Minimum: `1`
|| -**Additional Properties:** not allowed **Example** ```yaml diff --git a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md index 2b96ce05..f2d20436 100644 --- a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md +++ b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md @@ -120,44 +120,31 @@ Why a queue and not `asyncio.wait_for(anext(gen), timeout)`: cancelling `__anext `flush_ms = None` ⇒ `timeout = None` ⇒ `queue.get()` waits forever ⇒ no time-based flush. Bounded sources don't set `flush_ms` and aren't affected. -### Schema fragments +### Schema composition (standard JSON Schema) -Two shared fragments in `core/src/datayoga_core/resources/schemas/`: +Two shared fragments in `core/src/datayoga_core/resources/schemas/` declare the common properties: -`batchable.schema.json`: +- `batchable.schema.json` declares `batch_size`. +- `streamable.schema.json` declares both `batch_size` and `flush_ms`. + +Each block schema uses standard JSON Schema composition: `allOf` + `$ref` to the fragment file, plus `unevaluatedProperties: false` (rather than `additionalProperties: false`) so the fragment-contributed properties are recognized as evaluated. Example: ```json { + "$schema": "https://json-schema.org/draft/2019-09/schema", + "title": "std.read", "type": "object", - "properties": { - "batch_size": { - "type": "integer", - "minimum": 1, - "description": "Maximum number of records yielded per downstream batch", - "default": 1000 - } - } + "allOf": [{ "$ref": "../../../resources/schemas/batchable.schema.json" }], + "properties": {}, + "unevaluatedProperties": false } ``` -`streamable.schema.json`: +At load time, `schema_utils.resolve_refs(schema, schema_path)` walks the schema, finds any local-file `$ref` (relative path, ends in `.json`, no URI scheme, no in-document fragment), and inlines the referenced file's contents in place. 
The resulting in-memory schema is self-contained — no remaining `$ref`s — so `Block.validate()` keeps using the simple `jsonschema.validate(instance, schema)` code path. The on-disk schemas remain standards-compliant; the resolution is purely a runtime detail to avoid threading a `RefResolver` through every validation site. -```json -{ - "type": "object", - "allOf": [{ "$ref": "batchable.schema.json" }], - "properties": { - "flush_ms": { - "type": ["integer", "null"], - "minimum": 1, - "description": "If set, flush a partial batch after this many ms of inactivity. null/omitted = wait until batch_size or end-of-stream.", - "default": 1000 - } - } -} -``` +`unevaluatedProperties: false` (introduced in draft 2019-09) is what makes composition + strict property validation work: with `additionalProperties: false`, a property contributed by an `allOf` member would be rejected as "additional" at the parent level. `unevaluatedProperties` is composition-aware. -Bounded producer schemas `$ref` `batchable`; streaming producer schemas `$ref` `streamable`. The fragments are the single source of truth for the description, validation, and default. +External tools that ARE `$ref`-aware (IDE schema validators, OpenAPI exporters) read the on-disk schemas correctly without our resolver. The `jsonschema2mk` docs generator is not `$ref`-aware, so `scripts/generate-docs.sh` pre-resolves `$ref` and flattens `allOf` properties for docs rendering only. 
### Per-producer changes diff --git a/schemas/job.schema.json b/schemas/job.schema.json index ad0f20b9..d23ccc8e 100644 --- a/schemas/job.schema.json +++ b/schemas/job.schema.json @@ -111,15 +111,31 @@ "then": { "properties": { "with": { - "additionalProperties": false, + "$schema": "https://json-schema.org/draft/2019-09/schema", + "allOf": [ + { + "$schema": "https://json-schema.org/draft/2019-09/schema", + "description": "Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources.", + "properties": { + "batch_size": { + "default": 1000, + "description": "Maximum number of records yielded per downstream batch.", + "minimum": 1, + "type": "integer" + }, + "flush_ms": { + "default": 1000, + "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.", + "minimum": 1, + "type": ["integer", "null"] + } + }, + "title": "streamable", + "type": "object" + } + ], "description": "Read from Azure Event Hub", "properties": { - "batch_size": { - "default": 1000, - "description": "Maximum number of records yielded per downstream batch.", - "minimum": 1, - "type": "integer" - }, "checkpoint_store_connection_string": { "description": "The connection string for the Azure Storage account used as the checkpoint store.", "type": "string" @@ -140,12 +156,6 @@ "description": "The name of the Azure Event Hub.", "type": "string" }, - "flush_ms": { - "default": 1000, - "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.", - "minimum": 1, - "type": ["integer", "null"] - }, "max_batch_size": { "default": 300, "description": "Maximum number of events to receive in each SDK callback. Renamed from the previous batch_size which used to mean this. 
Defaults to 300.", @@ -161,7 +171,8 @@ "checkpoint_store_container_name" ], "title": "azure.read_event_hub", - "type": "object" + "type": "object", + "unevaluatedProperties": false } } } @@ -266,16 +277,26 @@ "then": { "properties": { "with": { - "additionalProperties": false, + "$schema": "https://json-schema.org/draft/2019-09/schema", + "allOf": [ + { + "$schema": "https://json-schema.org/draft/2019-09/schema", + "description": "Producer batching mixin: declares batch_size for producers that yield records in batches.", + "properties": { + "batch_size": { + "default": 1000, + "description": "Maximum number of records yielded per downstream batch.", + "minimum": 1, + "type": "integer" + } + }, + "title": "batchable", + "type": "object" + } + ], "description": "Read data from CSV", "examples": [{ "delimiter": ";", "file": "archive.csv" }], "properties": { - "batch_size": { - "default": 1000, - "description": "Maximum number of records yielded per downstream batch.", - "minimum": 1, - "type": "integer" - }, "delimiter": { "default": ",", "description": "Delimiter to use for splitting the csv records", @@ -322,7 +343,8 @@ }, "required": ["file"], "title": "files.read_csv", - "type": "object" + "type": "object", + "unevaluatedProperties": false } } } @@ -376,22 +398,32 @@ "then": { "properties": { "with": { - "additionalProperties": false, + "$schema": "https://json-schema.org/draft/2019-09/schema", + "allOf": [ + { + "$schema": "https://json-schema.org/draft/2019-09/schema", + "description": "Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources.", + "properties": { + "batch_size": { + "default": 1000, + "description": "Maximum number of records yielded per downstream batch.", + "minimum": 1, + "type": "integer" + }, + "flush_ms": { + "default": 1000, + "description": "If set, flush a partial batch after this many ms of inactivity. 
null or omitted = wait until batch_size or end-of-stream.", + "minimum": 1, + "type": ["integer", "null"] + } + }, + "title": "streamable", + "type": "object" + } + ], "description": "Receives HTTP requests and process the data.", "examples": [{ "host": "localhost", "port": 8080 }], "properties": { - "batch_size": { - "default": 1000, - "description": "Maximum number of records yielded per downstream batch.", - "minimum": 1, - "type": "integer" - }, - "flush_ms": { - "default": 1000, - "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.", - "minimum": 1, - "type": ["integer", "null"] - }, "host": { "default": "0.0.0.0", "description": "Host to listen", @@ -404,7 +436,8 @@ } }, "title": "http.receiver", - "type": "object" + "type": "object", + "unevaluatedProperties": false } } } @@ -718,16 +751,26 @@ "then": { "properties": { "with": { - "additionalProperties": false, + "$schema": "https://json-schema.org/draft/2019-09/schema", + "allOf": [ + { + "$schema": "https://json-schema.org/draft/2019-09/schema", + "description": "Producer batching mixin: declares batch_size for producers that yield records in batches.", + "properties": { + "batch_size": { + "default": 1000, + "description": "Maximum number of records yielded per downstream batch.", + "minimum": 1, + "type": "integer" + } + }, + "title": "batchable", + "type": "object" + } + ], "description": "Read data from parquet", "examples": [{ "file": "data.parquet" }], "properties": { - "batch_size": { - "default": 1000, - "description": "Maximum number of records yielded per downstream batch.", - "minimum": 1, - "type": "integer" - }, "file": { "description": "Filename. 
Can contain a regexp or glob expression", "type": "string" @@ -735,7 +778,8 @@ }, "required": ["file"], "title": "parquet.read", - "type": "object" + "type": "object", + "unevaluatedProperties": false } } } @@ -854,25 +898,35 @@ "then": { "properties": { "with": { - "additionalProperties": false, + "$schema": "https://json-schema.org/draft/2019-09/schema", + "allOf": [ + { + "$schema": "https://json-schema.org/draft/2019-09/schema", + "description": "Streaming producer mixin: declares batch_size and flush_ms for producers reading from continuous sources.", + "properties": { + "batch_size": { + "default": 1000, + "description": "Maximum number of records yielded per downstream batch.", + "minimum": 1, + "type": "integer" + }, + "flush_ms": { + "default": 1000, + "description": "If set, flush a partial batch after this many ms of inactivity. null or omitted = wait until batch_size or end-of-stream.", + "minimum": 1, + "type": ["integer", "null"] + } + }, + "title": "streamable", + "type": "object" + } + ], "description": "Read from Redis stream", "properties": { - "batch_size": { - "default": 1000, - "description": "Maximum number of records yielded per downstream batch.", - "minimum": 1, - "type": "integer" - }, "connection": { "description": "Connection name", "type": "string" }, - "flush_ms": { - "default": 1000, - "description": "If set, flush a partial batch after this many ms of inactivity. 
null or omitted = wait until batch_size or end-of-stream.", - "minimum": 1, - "type": ["integer", "null"] - }, "snapshot": { "default": false, "description": "Snapshot current entries and quit", @@ -887,7 +941,8 @@ }, "required": ["connection", "stream_name"], "title": "redis.read_stream", - "type": "object" + "type": "object", + "unevaluatedProperties": false } } } @@ -1052,7 +1107,23 @@ "then": { "properties": { "with": { - "additionalProperties": false, + "$schema": "https://json-schema.org/draft/2019-09/schema", + "allOf": [ + { + "$schema": "https://json-schema.org/draft/2019-09/schema", + "description": "Producer batching mixin: declares batch_size for producers that yield records in batches.", + "properties": { + "batch_size": { + "default": 1000, + "description": "Maximum number of records yielded per downstream batch.", + "minimum": 1, + "type": "integer" + } + }, + "title": "batchable", + "type": "object" + } + ], "description": "Read a table from an SQL-compatible data store", "examples": [ { @@ -1066,12 +1137,6 @@ } ], "properties": { - "batch_size": { - "default": 1000, - "description": "Maximum number of records yielded per downstream batch.", - "minimum": 1, - "type": "integer" - }, "columns": { "examples": [["fname", { "lname": "last_name" }]], "items": { @@ -1108,7 +1173,8 @@ }, "required": ["connection", "table"], "title": "relational.read", - "type": "object" + "type": "object", + "unevaluatedProperties": false } } } @@ -1426,18 +1492,28 @@ "then": { "properties": { "with": { - "additionalProperties": false, - "description": "Read from the standard input", - "properties": { - "batch_size": { - "default": 1000, - "description": "Maximum number of records yielded per downstream batch.", - "minimum": 1, - "type": "integer" + "$schema": "https://json-schema.org/draft/2019-09/schema", + "allOf": [ + { + "$schema": "https://json-schema.org/draft/2019-09/schema", + "description": "Producer batching mixin: declares batch_size for producers that yield 
records in batches.", + "properties": { + "batch_size": { + "default": 1000, + "description": "Maximum number of records yielded per downstream batch.", + "minimum": 1, + "type": "integer" + } + }, + "title": "batchable", + "type": "object" } - }, + ], + "description": "Read from the standard input", + "properties": {}, "title": "std.read", - "type": "object" + "type": "object", + "unevaluatedProperties": false } } } diff --git a/scripts/generate-docs.sh b/scripts/generate-docs.sh index 03aa51ae..4d58f199 100755 --- a/scripts/generate-docs.sh +++ b/scripts/generate-docs.sh @@ -47,7 +47,6 @@ cleanup_resolved_tmps() { trap cleanup_resolved_tmps EXIT blocks_dir="./core/src/datayoga_core/blocks" -schemas_dir="./core/src/datayoga_core/resources/schemas" for schema in $(find ${blocks_dir} -name '*.schema.json' | sort) do doc_name="$(awk -F/ '{ print $(NF-1) }' <<<${schema}).md" @@ -56,36 +55,63 @@ do block_package="$(echo ${block_package} | cut -c2- | sed 's/\//_/g')" [ ! -z "${block_package}" ] && block_package="${block_package}_" - # Resolve $inherit fragments so jsonschema2mk sees the inherited properties - # (batch_size, flush_ms, etc.). jsonschema2mk does not understand our custom - # $inherit extension, so we materialize a resolved copy first. + # Materialize a docs-friendly copy of the schema: + # 1. Resolve local-file $ref nodes by inlining the referenced JSON. + # 2. Flatten allOf-contributed properties into the top-level `properties` + # so jsonschema2mk renders a single property table per block. # Self-contained Python (stdlib only) so this works in CI without installing - # datayoga_core's runtime dependencies. + # datayoga_core's runtime dependencies. Pre-resolve at doc-gen time only; + # the on-disk schemas remain standard JSON Schema. 
resolved_tmp="$(mktemp --suffix=.schema.json)" RESOLVED_TMP_FILES+=("${resolved_tmp}") - python3 - "${schema}" "${schemas_dir}" > "${resolved_tmp}" <<'PYEOF' + python3 - "${schema}" > "${resolved_tmp}" <<'PYEOF' import json import os import sys -schema_path, schemas_dir = sys.argv[1], sys.argv[2] -with open(schema_path) as f: - schema = json.load(f) -inherits = schema.get("$inherit") or [] -if inherits: - if not isinstance(inherits, list) or not all(isinstance(n, str) for n in inherits): - raise SystemExit(f"$inherit must be a list of strings, got {inherits!r}") + +def resolve_node(node, base_dir, visited): + if isinstance(node, dict): + ref = node.get("$ref") + if isinstance(ref, str) and not ref.startswith("#") and "://" not in ref and ref.endswith(".json"): + target = os.path.normpath(os.path.join(base_dir, ref)) + if target in visited: + raise SystemExit(f"Circular $ref at {target}") + if not os.path.isfile(target): + raise SystemExit(f"$ref target not found: {ref} -> {target}") + with open(target) as f: + fragment = json.load(f) + visited.add(target) + try: + return resolve_node(fragment, os.path.dirname(target), visited) + finally: + visited.discard(target) + return {k: resolve_node(v, base_dir, visited) for k, v in node.items()} + if isinstance(node, list): + return [resolve_node(item, base_dir, visited) for item in node] + return node + + +def flatten_allof_properties(schema): + """Inline `allOf[*].properties` into the top-level `properties`, removing + the allOf. 
Docs-only transformation so jsonschema2mk renders one table.""" + if not isinstance(schema, dict) or "allOf" not in schema: + return schema merged = {} - for name in inherits: - fragment_path = os.path.join(schemas_dir, f"{name}.schema.json") - with open(fragment_path) as f: - fragment = json.load(f) - if fragment.get("$inherit"): - raise SystemExit(f"Nested $inherit in fragment '{name}' is not supported") - merged.update(fragment.get("properties", {})) + for member in schema.get("allOf", []): + if isinstance(member, dict): + merged.update(member.get("properties", {})) merged.update(schema.get("properties", {})) schema["properties"] = merged - schema.pop("$inherit", None) + schema.pop("allOf", None) + return schema + + +schema_path = sys.argv[1] +with open(schema_path) as f: + schema = json.load(f) +schema = resolve_node(schema, os.path.dirname(os.path.abspath(schema_path)), set()) +schema = flatten_allof_properties(schema) json.dump(schema, sys.stdout) PYEOF From eae77e35a0522dbf3768a98174b8974f38ea958f Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 19:57:10 +0300 Subject: [PATCH 34/38] Update spec to reflect what shipped (#400) Five corrections after PR review and migration to standard JSON Schema: - Status: Design -> Implemented in PR #401. - flush_ms code snippet now shows the bounded queue (maxsize=1) and source-error propagation that Copilot review surfaced, plus the cancelled-flag dance that the bounded queue requires. - Producer ABC risk note: produce_chunks is the new override hook but not formally @abstractmethod, so legacy produce() overrides keep working (correcting an earlier overstatement). - Event Hub schema risk note: we use unevaluatedProperties: false, not additionalProperties: false (composition-aware). - Drop CHANGELOG mention (no CHANGELOG in this repo; PR description carries the breaking change note). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- ...28-producer-batching-unification-design.md | 42 +++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md index f2d20436..f5411371 100644 --- a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md +++ b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md @@ -1,9 +1,9 @@ # Producer batching unification -**Status:** Design — pending implementation +**Status:** Implemented in PR #401 **Date:** 2026-05-28 **Issue:** #400 -**Closes:** #294, #295, #296, #377 (as a side effect of the refactor) +**Closes:** #293, #294, #295, #296, #377 (as a side effect of the refactor) ## Problem @@ -70,27 +70,39 @@ The base class accumulates chunks and re-emits them in batches of up to `batch_s For streaming sources, partial batches must flush on inactivity, otherwise a low-traffic stream could hold records indefinitely. -Implementation uses an internal queue + background pump task, mirroring the pattern already in `azure/read_event_hub`: +Implementation uses an internal **bounded** queue + background pump task. 
The pump captures source errors and re-raises on the consumer side, so failures aren't silently treated as EOS: ```python -async def produce(self) -> AsyncGenerator[List[Message], None]: +async def produce(self) -> AsyncGenerator[List[Dict[str, Any]], None]: batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)) flush_ms = self.properties.get("flush_ms", self.DEFAULT_FLUSH_MS) - timeout = (flush_ms / 1000) if flush_ms is not None else None + timeout = (flush_ms / 1000) if flush_ms else None - queue: asyncio.Queue[Optional[List[Message]]] = asyncio.Queue() + # maxsize=1 preserves the natural backpressure the old yield-driven model + # had: the pump can be at most one chunk ahead of the consumer. + queue: asyncio.Queue = asyncio.Queue(maxsize=1) EOS = object() + pump_error: List[BaseException] = [] # captured non-cancellation errors async def pump(): + cancelled = False try: async for chunk in self.produce_chunks(): if chunk: await queue.put(chunk) + except asyncio.CancelledError: + cancelled = True + raise + except BaseException as exc: + pump_error.append(exc) finally: - await queue.put(EOS) + # Skip the EOS put on cancellation — the consumer's finally is + # awaiting us and the queue may be full; putting would deadlock. 
+ if not cancelled: + await queue.put(EOS) pump_task = asyncio.create_task(pump()) - buffer: List[Message] = [] + buffer: List[Dict[str, Any]] = [] try: while True: try: @@ -104,6 +116,8 @@ async def produce(self) -> AsyncGenerator[List[Message], None]: if item is EOS: if buffer: yield buffer + if pump_error: + raise pump_error[0] # propagate source error to caller return buffer.extend(item) @@ -112,12 +126,16 @@ async def produce(self) -> AsyncGenerator[List[Message], None]: buffer = buffer[batch_size:] finally: pump_task.cancel() - with suppress(asyncio.CancelledError): + with suppress(asyncio.CancelledError, Exception): await pump_task ``` Why a queue and not `asyncio.wait_for(anext(gen), timeout)`: cancelling `__anext__` on an async generator with side effects (open connections, partial reads) can leave it in a broken state. Cancelling the _pump task_ boundary is safe; the generator finishes its current chunk before the pump's `try/finally` runs. +Why `maxsize=1` and the `cancelled` flag: an unbounded queue removes backpressure — the pump could pre-load an entire parquet or relational table into memory while the consumer is processing batch 1 (flagged by Copilot review). Bounding at 1 keeps memory flat at the cost of a deadlock when the consumer is cancelled mid-flow (the pump's `finally: put(EOS)` blocks against a full queue). The `cancelled` flag skips the EOS put on cancellation, since the consumer is gone and EOS doesn't need to be delivered. + +Why `pump_error`: catching all exceptions in the pump and letting it terminate via EOS would silently truncate input on a source failure (Redis disconnect, broken CSV, DB error) — the consumer would see clean end-of-stream against partial data. Capturing the exception and re-raising on the consumer side makes the job fail loudly instead (also flagged by Copilot review). + `flush_ms = None` ⇒ `timeout = None` ⇒ `queue.get()` waits forever ⇒ no time-based flush. 
Bounded sources don't set `flush_ms` and aren't affected. ### Schema composition (standard JSON Schema) @@ -351,15 +369,15 @@ A `FakeProducer` whose `produce_chunks` yields scripted chunks. Cases: - Update `docs/reference/blocks/*_read.md` for each affected producer (`batch_size`, `flush_ms`, `fetch_size`, `max_batch_size` where applicable). - Add a section in `docs/processing-strategies.md` explaining the producer batching model: chunked subclass output, base-class re-chunking, `flush_ms` for streaming sources. -- CHANGELOG entry calling out: +- PR description carries the breaking-change note (no CHANGELOG file in this repo): - New `batch_size`/`flush_ms` on previously non-batching producers. - **Breaking:** `azure/read_event_hub.batch_size` renamed to `max_batch_size`; the name `batch_size` now means pipeline batch size. ## Risks and trade-offs -1. **`Producer` ABC change.** `produce_chunks` is now the abstract method. Any external/downstream custom producer subclassing `Producer` and overriding `produce()` directly will break. Acceptable given datayoga's surface area; called out in CHANGELOG. +1. **`Producer` ABC change.** `produce_chunks` is the new override hook (raises NotImplementedError by default; not formally `@abstractmethod` so legacy subclasses that still override `produce()` directly continue to validate). All 7 in-tree producers were migrated to override `produce_chunks`; external/downstream subclassers that override `produce()` directly continue to work but bypass the base-class batching. Called out in the PR description. -2. **Event Hub silent-semantic-change risk.** The breaking rename is intentional. Adding `additionalProperties: false` to the Event Hub schema (which it lacks today) is part of this change so that old `batch_size: 300` configs fail validation loudly, not get silently ignored. +2. **Event Hub silent-semantic-change risk.** The breaking rename is intentional. 
Adding `unevaluatedProperties: false` to the Event Hub schema (which lacked any `additionalProperties` declaration before) catches typos loudly. The literal `batch_size: 300` still validates after the rename but now means pipeline batch size, not SDK callback size — that semantic shift is documented in the PR description and the processing-strategies docs. 3. **`flush_ms` semantics on Job shutdown.** When the producer is being cancelled (`Job.shutdown` → `Step.stop`), the pump's `try/finally` ensures `EOS` is queued. The `produce()` loop sees `EOS` and flushes the final partial batch. Verified by the `test_producer_batching` shutdown case. From 2d2bcc4f29bc4a38caac3cb2255ce6b0694ef719 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Thu, 28 May 2026 19:57:44 +0300 Subject: [PATCH 35/38] Spec: correct Event Hub migration note (#400) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Section said schema gains additionalProperties: false; actually it gains unevaluatedProperties: false (composition-aware). Also reframed the "rejects batch_size: 300 loudly" claim, which was always wrong — that literal property name still validates after the rename, just with new semantics. Typos are what additionalProperties/unevaluatedProperties catches. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../specs/2026-05-28-producer-batching-unification-design.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md index f5411371..ffc1c7c9 100644 --- a/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md +++ b/docs/superpowers/specs/2026-05-28-producer-batching-unification-design.md @@ -292,9 +292,9 @@ class Block(DyProducer): yield chunk ``` -**Migration:** Users with `batch_size: 300` in YAML thinking it controls SDK callbacks must rename to `max_batch_size: 300`. No backward-compat shim. Called out in CHANGELOG. +**Migration:** Users with `batch_size: 300` in YAML thinking it controls SDK callbacks must rename to `max_batch_size: 300`. No backward-compat shim. The literal `batch_size: 300` still validates after the rename but now means pipeline batch size, not SDK callback size — that semantic shift is documented in the PR description. -The schema for `azure/read_event_hub` also gains `additionalProperties: false` (it doesn't have it today). Without this, an old `batch_size: 300` in YAML would silently be ignored as an unknown property after the rename. With it, validation fails loudly with a clear error. +The schema for `azure/read_event_hub` also gains `unevaluatedProperties: false` (it had no `additionalProperties` declaration before). Typos like `batch_sz: 300` now fail validation loudly with a clear error. **`http/receiver`** (streaming) From 26a271b3fd4ae64efd4cfede68f92631e561dbce Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Sun, 31 May 2026 13:29:20 +0300 Subject: [PATCH 36/38] Rename test_schema_inherit.py -> test_schema_refs.py (#400) The file now tests schema_utils.resolve_refs (standard JSON Schema $ref resolution), not the old custom $inherit extension. Filename was stale. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../tests/{test_schema_inherit.py => test_schema_refs.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename core/src/datayoga_core/tests/{test_schema_inherit.py => test_schema_refs.py} (100%) diff --git a/core/src/datayoga_core/tests/test_schema_inherit.py b/core/src/datayoga_core/tests/test_schema_refs.py similarity index 100% rename from core/src/datayoga_core/tests/test_schema_inherit.py rename to core/src/datayoga_core/tests/test_schema_refs.py From a58336229871691a8abfe14232d085655c435261 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Sun, 31 May 2026 14:05:00 +0300 Subject: [PATCH 37/38] Fix redis/read_stream PEL pagination regression (#400) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot re-review caught a real regression: my PR set count=batch_size on all xreadgroup calls, including the pending-entry-list reads (id="0"). Combined with the unconditional flip to read_pending=False after the first call, this meant: only the first batch_size pending entries got processed per job session; anything beyond was stranded until restart. The "stay in pending mode until empty" attempt I tried first didn't work: XREADGROUP id="0" always returns from the start of the PEL (since the producer doesn't ack inside produce_chunks), so a smaller count just makes us re-read the same first page forever. Fix: revert to the pre-PR semantic for the pending read — count=None drains the entire PEL in one call. Keep count=batch_size for the new-message read (id=">") so the #377 batching contract still applies to live streams. The producer's yield-as-a-chunk behavior (the actual fix for #377) is unchanged. 
Updated tests: - test_redis_new_message_read_uses_count_equal_to_batch_size: pending call uses count=None, new-message call uses count=batch_size - test_redis_drains_full_pel_in_one_call_even_when_larger_than_batch_size: 20 pending entries drain in a single call; base class re-chunks to batch_size=5 -> four batches of 5 Real-Redis smoke against Redis 7 with PEL=25, batch_size=5: produces batches=[5,5,5,5,5], all PEL delivered, no re-reads. The other Copilot comment (sync redis-py with block=0 freezes the asyncio event loop) is a real architectural concern but pre-existing — same behavior in pre-PR code. Deferred to a follow-up issue if needed. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../blocks/redis/read_stream/block.py | 15 +++- .../tests/test_redis_read_stream.py | 71 ++++++++++++++++--- 2 files changed, 75 insertions(+), 11 deletions(-) diff --git a/core/src/datayoga_core/blocks/redis/read_stream/block.py b/core/src/datayoga_core/blocks/redis/read_stream/block.py index aa464743..ad166a8c 100644 --- a/core/src/datayoga_core/blocks/redis/read_stream/block.py +++ b/core/src/datayoga_core/blocks/redis/read_stream/block.py @@ -30,7 +30,15 @@ def init(self, context: Optional[Context] = None): self.redis_client.xgroup_create(self.stream, self.consumer_group, 0) async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: - """Reads pending then new stream messages via XREADGROUP, yielding each response as a chunk.""" + """Reads pending then new stream messages via XREADGROUP, yielding each response as a chunk. + + Pending entries (id="0") are drained in a single unbounded XREADGROUP + call (count=None) — this matches pre-PR behavior. Paginating PEL via + count is not safe with a non-acking producer because XREADGROUP id="0" + always returns from the start of PEL, so a smaller count would just + re-read the same first page forever. New-message reads (id=">") use + count=batch_size to bound the Redis network response size. 
+ """ logger.debug(f"Running {self.get_block_name()}") batch_size = int(self.properties.get("batch_size", self.DEFAULT_BATCH_SIZE)) read_pending = True @@ -39,7 +47,7 @@ async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: streams = self.redis_client.xreadgroup( self.consumer_group, self.requesting_consumer, {self.stream: "0" if read_pending else ">"}, - count=batch_size, + count=None if read_pending else batch_size, block=100 if self.snapshot else 0, ) @@ -58,6 +66,9 @@ async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: if self.snapshot and not read_pending and not yielded_any: return + # Flip unconditionally after the first pending-read call: count=None + # drained the entire PEL in that single call, so there's no more + # pending work to do this session. read_pending = False def ack(self, msg_ids: List[str]): diff --git a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py index 5c4a43f7..de003da8 100644 --- a/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py +++ b/core/src/datayoga_core/blocks/redis/read_stream/tests/test_redis_read_stream.py @@ -16,15 +16,30 @@ def _mk_block(properties, redis_client): return block +def _xreadgroup_count(call): + """Extract the count arg from an xreadgroup call regardless of kw/positional.""" + if "count" in call.kwargs: + return call.kwargs["count"] + if len(call.args) >= 4: + return call.args[3] + return None + + +def _xreadgroup_id(call): + """Extract the stream-id dict value from an xreadgroup call.""" + streams = call.kwargs.get("streams") or (call.args[2] if len(call.args) >= 3 else {}) + return next(iter(streams.values())) if streams else None + + @pytest.mark.asyncio -async def test_redis_uses_count_equal_to_batch_size(): - """xreadgroup is called with count=batch_size (closes #377).""" +async def 
test_redis_new_message_read_uses_count_equal_to_batch_size(): + """xreadgroup for new messages (id='>') uses count=batch_size (closes #377).""" redis = MagicMock() payload_a = (b"1-0", {b"data": b'{"i": 1}'}) payload_b = (b"2-0", {b"data": b'{"i": 2}'}) redis.xreadgroup.side_effect = [ - [(b"mystream", [payload_a, payload_b])], # pending - [(b"mystream", [])], # nothing new -> exit + [(b"mystream", [payload_a, payload_b])], # pending (drained in one call, count=None) + [(b"mystream", [])], # new-read empty -> exit ] block = _mk_block({"batch_size": 250, "_snapshot": True}, redis) @@ -32,9 +47,18 @@ async def test_redis_uses_count_equal_to_batch_size(): async for b in block.produce(): batches.append(b) - assert all(c.kwargs.get("count") == 250 or (len(c.args) >= 4 and c.args[3] == 250) - for c in redis.xreadgroup.call_args_list), \ - "xreadgroup should be called with count=batch_size" + # First call is pending (id="0"); it uses count=None (drain). + pending_call = redis.xreadgroup.call_args_list[0] + assert _xreadgroup_id(pending_call) == "0" + assert _xreadgroup_count(pending_call) is None, \ + "pending read should use count=None to drain PEL in one call" + + # Subsequent new-message calls (id=">") use count=batch_size. 
+ new_calls = [c for c in redis.xreadgroup.call_args_list if _xreadgroup_id(c) == ">"] + assert new_calls, "expected at least one new-message read" + for c in new_calls: + assert _xreadgroup_count(c) == 250, \ + f"new-message read should use count=batch_size, got count={_xreadgroup_count(c)}" @pytest.mark.asyncio @@ -43,8 +67,8 @@ async def test_redis_yields_records_as_a_batch_not_one_by_one(): redis = MagicMock() pages = [(f"{i}-0".encode(), {b"data": f'{{"i": {i}}}'.encode()}) for i in range(5)] redis.xreadgroup.side_effect = [ - [(b"mystream", pages)], - [(b"mystream", [])], + [(b"mystream", pages)], # pending drained in one call + [(b"mystream", [])], # new-read empty -> exit ] block = _mk_block({"batch_size": 100, "_snapshot": True}, redis) @@ -54,3 +78,32 @@ async def test_redis_yields_records_as_a_batch_not_one_by_one(): assert [len(b) for b in batches] == [5] assert batches[0][0]["i"] == 0 + + +@pytest.mark.asyncio +async def test_redis_drains_full_pel_in_one_call_even_when_larger_than_batch_size(): + """Pending reads use count=None so the entire PEL drains in a single call. + The base class re-chunks the result to batch_size. This avoids the + Copilot-flagged pagination bug where count=batch_size + XREADGROUP id='0' + would re-read the same first page forever (since the producer doesn't ack + inside produce_chunks).""" + redis = MagicMock() + # Simulate a PEL of 20 entries returned in one xreadgroup call. + pel = [(f"{i}-0".encode(), {b"data": f'{{"i": {i}}}'.encode()}) for i in range(20)] + redis.xreadgroup.side_effect = [ + [(b"mystream", pel)], # entire PEL in one call (count=None) + [(b"mystream", [])], # new-read empty -> exit + ] + + block = _mk_block({"batch_size": 5, "_snapshot": True}, redis) + batches = [] + async for b in block.produce(): + batches.append(b) + + # All 20 pending entries are delivered; the base class re-chunks them + # to batch_size=5 → four batches of 5. 
+ assert [len(b) for b in batches] == [5, 5, 5, 5] + # Only ONE pending read was made (PEL drained in one shot). + pending_calls = [c for c in redis.xreadgroup.call_args_list if _xreadgroup_id(c) == "0"] + assert len(pending_calls) == 1, \ + f"expected exactly 1 pending read (count=None drains all), got {len(pending_calls)}" From 9c2c59b18525364d67854f6f6eeb8b0556afc155 Mon Sep 17 00:00:00 2001 From: spicy-sauce Date: Sun, 31 May 2026 14:40:47 +0300 Subject: [PATCH 38/38] Add property-based tests, external-cancel test, mypy fix (#400) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three final verification additions: 1. Property-based rechunker tests (hypothesis): probe ~1000 generated chunk-size/batch-size combinations to verify the rechunker's invariants — record conservation, order preservation, all batches well-formed, no empty emissions, partial-only-at-end. Adds hypothesis to test extras. Catches the class of bug Copilot flagged where my existing tests only covered specific inputs, not the contract. 2. test_external_task_cancellation_cleans_up_pump: simulates the Job.shutdown / Job.run cancellation path (cancelling the outer task that iterates produce()) and verifies no producer pump task is orphaned afterward. The spec claims this works; now there's a test. 3. mypy fix: Producer.DEFAULT_FLUSH_MS was inferred as None-only, making subclass overrides with int fail strict type-checking. Now typed as Optional[int]. mypy clean on all 9 changed source files. 90 tests pass (was 84: +5 property tests, +1 external-cancel test). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- core/pyproject.toml | 2 + core/src/datayoga_core/producer.py | 6 +- .../tests/test_producer_batching.py | 36 +++++ .../test_producer_batching_properties.py | 141 ++++++++++++++++++ 4 files changed, 182 insertions(+), 3 deletions(-) create mode 100644 core/src/datayoga_core/tests/test_producer_batching_properties.py diff --git a/core/pyproject.toml b/core/pyproject.toml index 2a55ee25..2aeca188 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -28,6 +28,7 @@ python = "^3.8" PyYAML = "^6.0" sqlglot = "^10.4.3" +hypothesis = { version = "^6.0", optional = true } mock = { version = "^4.0.3", optional = true } pytest = { version = "^7.1.2", optional = true } pytest-aioresponses = { version = "^0.2.0", optional = true } @@ -70,6 +71,7 @@ test = [ "azure-eventhub-checkpointstoreblob-aio", "cassandra-driver", "fastparquet", + "hypothesis", "ibm_db_sa", "mock", "oracledb", diff --git a/core/src/datayoga_core/producer.py b/core/src/datayoga_core/producer.py index dc5b05d5..a199d446 100644 --- a/core/src/datayoga_core/producer.py +++ b/core/src/datayoga_core/producer.py @@ -1,7 +1,7 @@ import asyncio import logging from contextlib import suppress -from typing import Any, AsyncGenerator, Dict, List +from typing import Any, AsyncGenerator, Dict, List, Optional from .block import Block @@ -28,8 +28,8 @@ class Producer(Block): the base-class batching and `produce_chunks` is not called. """ - DEFAULT_BATCH_SIZE = 1000 - DEFAULT_FLUSH_MS = None # streaming subclasses override to enable timeout flush + DEFAULT_BATCH_SIZE: int = 1000 + DEFAULT_FLUSH_MS: Optional[int] = None # streaming subclasses override to enable timeout flush async def produce_chunks(self) -> AsyncGenerator[List[Dict[str, Any]], None]: """Yield natural-size chunks from the source. 
diff --git a/core/src/datayoga_core/tests/test_producer_batching.py b/core/src/datayoga_core/tests/test_producer_batching.py index 5ab4f98a..c3fe94b0 100644 --- a/core/src/datayoga_core/tests/test_producer_batching.py +++ b/core/src/datayoga_core/tests/test_producer_batching.py @@ -138,6 +138,42 @@ async def test_consumer_cancellation_cleans_up_pump(): await asyncio.sleep(0.1) +@pytest.mark.asyncio +async def test_external_task_cancellation_cleans_up_pump(): + """When the task iterating produce() is cancelled (e.g., Job.run is cancelled + by the runtime), the producer's pump task must clean up. This is the + Job-shutdown scenario: an external cancellation propagates through the + async-for loop into the producer generator's finally.""" + chunks = [[_msg(i)] for i in range(10_000)] + p = FakeProducer({"batch_size": 5, "flush_ms": 50}, chunks=chunks, + sleep_before=[0.01] * 10_000) + + async def consume(): + # Mirrors Job.run's iteration pattern. + async for batch in p.produce(): + pass # downstream processing would happen here + + consumer_task = asyncio.create_task(consume()) + await asyncio.sleep(0.05) # let the producer ramp up — some batches arrive + consumer_task.cancel() + with pytest.raises(asyncio.CancelledError): + await consumer_task + # Give the loop a moment to settle any pending finalizers. + await asyncio.sleep(0.1) + + # No producer pump task should remain after cancellation. We identify the + # pump specifically by Producer.produce.<locals>.pump in its qualname, + # since the test's own name happens to contain "pump". 
+ remaining = [t for t in asyncio.all_tasks() if not t.done()] + pump_tasks = [ + t for t in remaining + if "Producer.produce" in (t.get_coro().__qualname__ or "") + ] + assert not pump_tasks, \ + f"orphaned producer pump tasks after cancellation: " \ + f"{[t.get_coro().__qualname__ for t in pump_tasks]}" + + class _BoomProducer(Producer): """Producer whose produce_chunks() raises after emitting some chunks.""" diff --git a/core/src/datayoga_core/tests/test_producer_batching_properties.py b/core/src/datayoga_core/tests/test_producer_batching_properties.py new file mode 100644 index 00000000..d288c3a1 --- /dev/null +++ b/core/src/datayoga_core/tests/test_producer_batching_properties.py @@ -0,0 +1,141 @@ +"""Property-based tests for the Producer base-class rechunker. + +Where `test_producer_batching.py` asserts specific outputs for specific inputs, +this file uses Hypothesis to generate arbitrary chunk-size sequences and probe +the rechunker's invariants. Catches the class of bug where the code works for +the inputs you tested but breaks somewhere in the wider input space. 
+""" +import asyncio +from typing import AsyncGenerator, Dict, List, Optional + +import pytest +from datayoga_core.context import Context +from datayoga_core.producer import Producer +from hypothesis import given, settings +from hypothesis import strategies as st + + +class _ScriptedProducer(Producer): + """Producer driven by a scripted list of chunk-sizes; each chunk has + sequential integer payloads.""" + + def __init__(self, properties, *, chunk_sizes): + """Wires the schema and chunk script.""" + self._test_schema = { + "type": "object", + "properties": {"batch_size": {"type": "integer", "minimum": 1}}, + } + self._chunk_sizes = chunk_sizes + super().__init__(properties) + + def get_json_schema(self): + """In-memory schema (no disk read).""" + return self._test_schema + + def init(self, context: Optional[Context] = None): + """No-op.""" + pass + + async def produce_chunks(self) -> AsyncGenerator[List[Dict], None]: + """Yield chunks of the scripted sizes, with sequential payload ids.""" + counter = 0 + for size in self._chunk_sizes: + chunk = [{Producer.MSG_ID_FIELD: str(counter + i), "v": counter + i} + for i in range(size)] + counter += size + yield chunk + + +async def _drain(producer: Producer): + out = [] + async for batch in producer.produce(): + out.append(batch) + return out + + +# Strategies +chunk_sizes_strategy = st.lists( + st.integers(min_value=0, max_value=200), + min_size=0, + max_size=20, +) +batch_size_strategy = st.integers(min_value=1, max_value=300) + + +@given(chunk_sizes=chunk_sizes_strategy, batch_size=batch_size_strategy) +@settings(max_examples=200, deadline=2000) +def test_property_record_conservation(chunk_sizes, batch_size): + """The total number of records yielded downstream equals the total number + yielded by produce_chunks. 
No records lost; none duplicated.""" + p = _ScriptedProducer({"batch_size": batch_size}, chunk_sizes=chunk_sizes) + batches = asyncio.run(_drain(p)) + expected_total = sum(chunk_sizes) + actual_total = sum(len(b) for b in batches) + assert actual_total == expected_total, \ + f"chunk_sizes={chunk_sizes}, batch_size={batch_size}: " \ + f"expected {expected_total} records, got {actual_total}" + + +@given(chunk_sizes=chunk_sizes_strategy, batch_size=batch_size_strategy) +@settings(max_examples=200, deadline=2000) +def test_property_record_order_preserved(chunk_sizes, batch_size): + """Records flow downstream in the same order produce_chunks emits them. + Re-chunking doesn't shuffle.""" + p = _ScriptedProducer({"batch_size": batch_size}, chunk_sizes=chunk_sizes) + batches = asyncio.run(_drain(p)) + flat = [r["v"] for b in batches for r in b] + expected = list(range(sum(chunk_sizes))) + assert flat == expected, \ + f"chunk_sizes={chunk_sizes}, batch_size={batch_size}: order mismatch" + + +@given(chunk_sizes=chunk_sizes_strategy, batch_size=batch_size_strategy) +@settings(max_examples=200, deadline=2000) +def test_property_batch_sizes_well_formed(chunk_sizes, batch_size): + """Every batch is non-empty AND has length ≤ batch_size. All batches except + possibly the last have length == batch_size (the last may be partial on EOS).""" + p = _ScriptedProducer({"batch_size": batch_size}, chunk_sizes=chunk_sizes) + batches = asyncio.run(_drain(p)) + for i, b in enumerate(batches): + assert len(b) > 0, f"batch {i} is empty: {batches}" + assert len(b) <= batch_size, f"batch {i} exceeds batch_size: {len(b)} > {batch_size}" + # All non-final batches should be exactly batch_size (no time-based flush + # here since flush_ms is not set). 
+ for i, b in enumerate(batches[:-1]): + assert len(b) == batch_size, \ + f"batch {i} is partial mid-stream: len={len(b)}, batch_size={batch_size}" + + +@given(chunk_sizes=chunk_sizes_strategy, batch_size=batch_size_strategy) +@settings(max_examples=200, deadline=2000) +def test_property_no_empty_emissions(chunk_sizes, batch_size): + """If produce_chunks emits empty chunks, the base class doesn't propagate + them downstream.""" + # Inject empty chunks throughout the sequence. + chunks_with_empties = [] + for size in chunk_sizes: + chunks_with_empties.append(0) # empty + chunks_with_empties.append(size) + p = _ScriptedProducer({"batch_size": batch_size}, chunk_sizes=chunks_with_empties) + batches = asyncio.run(_drain(p)) + for i, b in enumerate(batches): + assert len(b) > 0, f"empty batch emitted at index {i}" + + +@given(num_records=st.integers(min_value=0, max_value=500), + batch_size=st.integers(min_value=1, max_value=100)) +@settings(max_examples=100, deadline=2000) +def test_property_partial_final_batch_only(num_records, batch_size): + """When all records come in one big chunk, the output is N full batches plus + optionally one partial batch — never a partial in the middle.""" + p = _ScriptedProducer({"batch_size": batch_size}, chunk_sizes=[num_records]) + batches = asyncio.run(_drain(p)) + if num_records == 0: + assert batches == [], "expected no batches for empty source" + return + expected_full, remainder = divmod(num_records, batch_size) + sizes = [len(b) for b in batches] + if remainder == 0: + assert sizes == [batch_size] * expected_full + else: + assert sizes == [batch_size] * expected_full + [remainder]