Skip to content

Commit 94d0efc

Browse files
Paging iterator optimization: +1 lookahead to eliminate terminal out-of-range queries, env validation, setuptools bump
1 parent 83ef7a9 commit 94d0efc

8 files changed

Lines changed: 285 additions & 58 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,3 +226,4 @@ atp_pipeline-1.0.0-cp312-cp312-manylinux_2_34_x86_64.whl
226226
atp_pipeline-1.0.6-cp312-cp312-manylinux_2_35_x86_64.whl
227227
setup_env.sh.old
228228
atp_pipeline-1.0.7-cp312-cp312-manylinux_2_34_x86_64.whl
229+
TODOS_LISTS/AUDIT-KUZU-SEGFAULT-PAGING.md

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
[build-system]
2-
requires = ["setuptools>=61.0", "wheel"]
2+
requires = ["setuptools>=82.0", "wheel"]
33
build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "kuzualchemy"
7-
version = "0.3.22"
7+
version = "0.3.23"
88
description = "SQLAlchemy-like ORM for Kuzu graph database"
99
readme = "README.md"
1010
license = { file = "LICENSE" }

src/kuzualchemy/kuzu_query.py

Lines changed: 108 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,29 @@
3131

3232
logger = logging.getLogger(__name__)
3333

34+
_ENV_ATP_READONLY_POOL_MAX_SIZE = "ATP_READONLY_POOL_MAX_SIZE"
35+
36+
37+
def _read_required_positive_int_env(var_name: str) -> int:
    """Fetch *var_name* from the environment and validate it as a positive int.

    Raises:
        RuntimeError: if the variable is not set at all.
        ValueError: if the value is not an integer, or is not strictly positive.
    """
    raw_value = os.getenv(var_name)
    if raw_value is None:
        raise RuntimeError(
            f"Missing required environment variable '{var_name}'. "
            "Configure it before calling Query.iter()."
        )
    try:
        parsed = int(raw_value)
    except ValueError as exc:
        raise ValueError(
            f"Environment variable '{var_name}' must be an integer, got: {raw_value!r}"
        ) from exc
    if parsed <= 0:
        raise ValueError(
            f"Environment variable '{var_name}' must be > 0, got: {parsed}"
        )
    return parsed
56+
3457
if TYPE_CHECKING:
3558
from .kuzu_session import KuzuSession
3659

@@ -599,8 +622,8 @@ def iter(self, page_size: int = 10, prefetch_pages: int = 1) -> Iterator[Union[M
599622
)
600623

601624
# Check if parallel execution is available and beneficial
602-
pool_size = int(os.environ["ATP_READONLY_POOL_MAX_SIZE"])
603-
parallel_threshold = int(os.environ["ATP_READONLY_POOL_MAX_SIZE"])
625+
pool_size = _read_required_positive_int_env(_ENV_ATP_READONLY_POOL_MAX_SIZE)
626+
parallel_threshold = pool_size
604627
use_parallel = pool_size > 1 and parallel_threshold > 0
605628

606629
def fetch_page(offset: int) -> List[Union[ModelType, Dict[str, Any]]]:
@@ -629,6 +652,37 @@ def fetch_page(offset: int) -> List[Union[ModelType, Dict[str, Any]]]:
629652

630653
return mapped
631654

655+
def fetch_page_with_lookahead(offset: int) -> Tuple[List[Union[ModelType, Dict[str, Any]]], bool]:
    """Fetch one page plus a single lookahead row.

    Requesting ``ps + 1`` rows reveals whether another page exists without
    ever issuing a trailing SKIP/LIMIT query past the end of the result set.
    Returns ``(page_rows, has_more)``.
    """
    lookahead_query = self.offset(offset).limit(ps + 1)
    started = time.perf_counter()
    raw = lookahead_query._execute()
    executed = time.perf_counter()
    mapped = lookahead_query._map_results(raw)
    finished = time.perf_counter()

    has_more = len(mapped) > ps
    page_data = mapped[:ps] if has_more else mapped

    # Log every page when the session debug flag is set, or any slow page (>= 250 ms).
    if getattr(self._session, "_debug_timing", False) or ((finished - started) >= 0.25):
        raw_rows = len(raw) if isinstance(raw, list) else None
        mapped_rows = len(page_data) if isinstance(page_data, list) else None
        logger.info(
            "kuzu.query.page.lookahead rel=%s offset=%d page_size=%d raw_rows=%s mapped_rows=%s has_more=%s exec_seconds=%.6f map_seconds=%.6f total_seconds=%.6f pairs_subset=%s",
            model_name,
            int(offset),
            int(ps),
            raw_rows,
            mapped_rows,
            has_more,
            (executed - started),
            (finished - executed),
            (finished - started),
            pairs_subset_meta,
        )

    return page_data, has_more
685+
632686
def fetch_pages_parallel(offsets: List[int]) -> List[List[Union[ModelType, Dict[str, Any]]]]:
633687
"""Fetch multiple pages in parallel using Rust rayon via ATP pipeline."""
634688
if not offsets:
@@ -672,63 +726,81 @@ def fetch_pages_parallel(offsets: List[int]) -> List[List[Union[ModelType, Dict[
672726
mapped_pages.append(mapped)
673727
return mapped_pages
674728

675-
# Fetch first page to determine if more pages exist
676-
offset = 0
677-
page = fetch_page(offset)
678-
offset += ps
729+
# If parallel execution is enabled, preserve existing count-bounded parallel strategy.
730+
if use_parallel:
731+
offset = 0
732+
page = fetch_page(offset)
733+
offset += ps
734+
735+
# If first page is not full, result set fits in one page.
736+
if len(page) < ps:
737+
for item in page:
738+
yield item
739+
return
740+
741+
total_rows = self.count_results()
742+
remaining_rows = max(total_rows - ps, 0)
679743

680-
# If parallel execution is enabled and first page is full, try parallel fetching
681-
if use_parallel and len(page) == ps:
682744
# Yield first page items
683745
for item in page:
684746
yield item
685-
747+
748+
if remaining_rows == 0:
749+
return
750+
686751
# Parallel batch fetching
687752
batch_size = min(pool_size, parallel_threshold)
688-
while True:
689-
# Build batch of offsets
690-
batch_offsets = [offset + i * ps for i in range(batch_size)]
691-
753+
while remaining_rows > 0:
754+
pages_in_batch = min(batch_size, (remaining_rows + ps - 1) // ps)
755+
batch_offsets = [offset + i * ps for i in range(pages_in_batch)]
756+
692757
# Fetch batch in parallel
693758
batch_pages = fetch_pages_parallel(batch_offsets)
694-
695-
# Yield results and track if we got a partial page
696-
last_page_full = True
697-
for page_idx, page_data in enumerate(batch_pages):
759+
760+
# Yield results in requested page order
761+
for page_data in batch_pages:
698762
for item in page_data:
699763
yield item
700-
if len(page_data) < ps:
701-
last_page_full = False
702-
break
703-
704-
if not last_page_full:
705-
break
706-
707-
offset += batch_size * ps
708-
elif pf > 0:
764+
765+
advanced_rows = pages_in_batch * ps
766+
offset += advanced_rows
767+
remaining_rows = max(remaining_rows - advanced_rows, 0)
768+
769+
return
770+
771+
# Sequential modes: use +1 lookahead to avoid issuing a terminal out-of-range page.
772+
offset = 0
773+
page, has_more = fetch_page_with_lookahead(offset)
774+
offset += ps
775+
776+
if pf > 0:
709777
# Sequential with prefetch (original behavior)
710778
with ThreadPoolExecutor(max_workers=1) as executor:
711-
next_future = executor.submit(fetch_page, offset) if len(page) == ps else None
779+
next_future = executor.submit(fetch_page_with_lookahead, offset) if has_more else None
712780
while True:
713781
for item in page:
714782
yield item
715-
if len(page) < ps:
783+
if not has_more:
716784
break
717-
next_page = next_future.result() if next_future is not None else fetch_page(offset)
785+
if next_future is not None:
786+
next_page, next_has_more = next_future.result()
787+
else:
788+
next_page, next_has_more = fetch_page_with_lookahead(offset)
718789
offset += ps
719-
if len(next_page) == ps and pf > 0:
720-
next_future = executor.submit(fetch_page, offset)
790+
if next_has_more and pf > 0:
791+
next_future = executor.submit(fetch_page_with_lookahead, offset)
721792
else:
722793
next_future = None
723794
page = next_page
795+
has_more = next_has_more
724796
else:
725797
# Pure sequential (no prefetch)
726798
while True:
727799
for item in page:
728800
yield item
729-
if len(page) < ps:
801+
if not has_more:
730802
break
731-
page = fetch_page(offset)
803+
page, has_more = fetch_page_with_lookahead(offset)
732804
offset += ps
733805

734806
def all(self, as_iterator: bool = False, page_size: Optional[int] = None, prefetch_pages: int = 1) -> Union[List[ModelType], List[Dict[str, Any]], Iterator[Union[ModelType, Dict[str, Any]]]]:
@@ -784,7 +856,9 @@ def exists(self) -> bool:
784856

785857
def count_results(self) -> int:
786858
"""Count the number of results."""
787-
count_query = self.count()
859+
# ORDER BY columns are not valid after scalar COUNT aggregation in Kuzu.
860+
# Keep all filters/joins while stripping ORDER BY for the COUNT query only.
861+
count_query = self._copy_with_state(order_by=[]).count()
788862
result = count_query._execute()
789863
if type(result) is not list:
790864
logger.error("Count query returned non-list result type: %r", type(result))

src/kuzualchemy/kuzu_session.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -265,37 +265,44 @@ def iterate(
265265
if " skip " in f" {q_lower} " or " limit " in f" {q_lower} ":
266266
raise ValueError("Raw query already contains SKIP/LIMIT; cannot auto-paginate. Remove them and retry.")
267267

268-
def fetch_page(offset: int) -> List[Dict[str, Any]]:
269-
paged_q = f"{base} SKIP {offset} LIMIT {page_size}"
270-
return self.execute(paged_q, parameters)
268+
def fetch_page(offset: int) -> tuple[List[Dict[str, Any]], bool]:
    """Fetch one page plus a lookahead row; return ``(page_rows, has_more)``.

    The extra row lets the caller learn whether more data exists without a
    terminal out-of-range SKIP query after a full final page.
    """
    rows = self.execute(f"{base} SKIP {offset} LIMIT {page_size + 1}", parameters)
    has_more = len(rows) > page_size
    return (rows[:page_size], True) if has_more else (rows, False)
271274

272275
# First page
273276
offset = 0
274-
page = fetch_page(offset)
277+
page, has_more = fetch_page(offset)
275278
if prefetch_pages > 0:
276279
from concurrent.futures import ThreadPoolExecutor
277280
with ThreadPoolExecutor(max_workers=1) as executor:
278-
next_future = executor.submit(fetch_page, offset + page_size) if len(page) == page_size else None
281+
next_future = executor.submit(fetch_page, offset + page_size) if has_more else None
279282
while True:
280283
for row in page:
281284
yield row
282-
if len(page) < page_size:
285+
if not has_more:
283286
break
284-
next_page = next_future.result() if next_future is not None else fetch_page(offset + page_size)
287+
if next_future is not None:
288+
next_page, next_has_more = next_future.result()
289+
else:
290+
next_page, next_has_more = fetch_page(offset + page_size)
285291
offset += page_size
286-
if len(next_page) == page_size:
292+
if next_has_more:
287293
next_future = executor.submit(fetch_page, offset + page_size)
288294
else:
289295
next_future = None
290296
page = next_page
297+
has_more = next_has_more
291298
else:
292299
while True:
293300
for row in page:
294301
yield row
295-
if len(page) < page_size:
302+
if not has_more:
296303
break
297304
offset += page_size
298-
page = fetch_page(offset)
305+
page, has_more = fetch_page(offset)
299306

300307

301308
def _execute_with_connection_reuse(self, query: str, parameters: Optional[Dict[str, Any]] = None) -> Any:

tests/.env.test

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Kuzualchemy pytest-specific environment.
2+
# Keys are namespaced to avoid cross-project collisions when other repositories
3+
# define similarly named runtime variables.
4+
5+
KUZUALCHEMY_TEST_OMP_NUM_THREADS=1
6+
KUZUALCHEMY_TEST_MKL_NUM_THREADS=1
7+
KUZUALCHEMY_TEST_NUMBA_NUM_THREADS=1
8+
KUZUALCHEMY_TEST_ATP_PROFILE=0
9+
KUZUALCHEMY_TEST_ATP_READONLY_POOL_MAX_SIZE=1
10+
KUZUALCHEMY_TEST_ATP_READONLY_POOL_WARM_COUNT=1
11+
KUZUALCHEMY_TEST_ATP_READONLY_BUFFER_POOL_BYTES=0

tests/_env.py

Lines changed: 64 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,76 @@
11
"""Test environment loader for Kuzualchemy pytest suite.
22
3-
This module ensures that the repository-level `.env` file is loaded before any
4-
tests run. All parsing and environment application is implemented in Rust in
5-
the `atp_core::env` module; here we simply delegate to the public
6-
``atp_pipeline.load_workspace_dotenv`` helper so Rust-only and Python-driven
7-
entrypoints share identical semantics.
3+
This module loads a dedicated test environment file (`tests/.env.test`) before
4+
any tests run. Keys in the file are intentionally namespaced to avoid
5+
cross-project collisions when other repositories expose similarly named runtime
6+
variables.
87
"""
98

109
from __future__ import annotations
1110

1211
import os
12+
from pathlib import Path
1313

14-
from atp_pipeline import load_workspace_dotenv
1514

15+
_TEST_ENV_PATH = Path(__file__).with_name(".env.test")
1616

17-
load_workspace_dotenv(required=True)
17+
_TEST_TO_RUNTIME_ENV_KEY_MAP = {
18+
"KUZUALCHEMY_TEST_OMP_NUM_THREADS": "OMP_NUM_THREADS",
19+
"KUZUALCHEMY_TEST_MKL_NUM_THREADS": "MKL_NUM_THREADS",
20+
"KUZUALCHEMY_TEST_NUMBA_NUM_THREADS": "NUMBA_NUM_THREADS",
21+
"KUZUALCHEMY_TEST_ATP_PROFILE": "ATP_PROFILE",
22+
"KUZUALCHEMY_TEST_ATP_READONLY_POOL_MAX_SIZE": "ATP_READONLY_POOL_MAX_SIZE",
23+
"KUZUALCHEMY_TEST_ATP_READONLY_POOL_WARM_COUNT": "ATP_READONLY_POOL_WARM_COUNT",
24+
"KUZUALCHEMY_TEST_ATP_READONLY_BUFFER_POOL_BYTES": "ATP_READONLY_BUFFER_POOL_BYTES",
25+
}
1826

19-
os.environ["ATP_PROFILE"] = "0"
20-
os.environ.pop("ATP_PROFILE_FREQ", None)
21-
os.environ.pop("ATP_PROFILE_FLAMEGRAPH", None)
22-
os.environ.pop("ATP_PROFILE_SPEEDSCOPE", None)
27+
28+
def _parse_env_file(path: Path) -> dict[str, str]:
    """Parse a ``KEY=VALUE`` env file into a dict, rejecting malformed entries.

    Blank lines and ``#`` comments are skipped. A missing file raises
    ``RuntimeError``; a line without ``=``, an empty key, or a duplicate key
    raises ``ValueError``.
    """
    if not path.exists():
        raise RuntimeError(f"Missing required test env file: {path}")

    entries: dict[str, str] = {}
    for line_no, raw_line in enumerate(
        path.read_text(encoding="utf-8").splitlines(), start=1
    ):
        stripped = raw_line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        if "=" not in stripped:
            raise ValueError(
                f"Invalid env entry in {path} at line {line_no}: expected KEY=VALUE"
            )
        # Split on the first '=' only so values may themselves contain '='.
        key_part, _, value_part = stripped.partition("=")
        key = key_part.strip()
        if not key:
            raise ValueError(
                f"Invalid env entry in {path} at line {line_no}: empty KEY"
            )
        if key in entries:
            raise ValueError(
                f"Duplicate env key in {path} at line {line_no}: {key}"
            )
        entries[key] = value_part.strip()
    return entries
55+
56+
57+
def _apply_runtime_test_env(parsed_env: dict[str, str]) -> None:
    """Project namespaced test keys onto their runtime environment variables.

    Fails fast with ``RuntimeError`` if any mapped key is absent from
    *parsed_env*, then unconditionally clears profiling output knobs so they
    cannot leak into the test run.
    """
    absent = sorted(
        key for key in _TEST_TO_RUNTIME_ENV_KEY_MAP if key not in parsed_env
    )
    if absent:
        raise RuntimeError(
            f"Missing required keys in {_TEST_ENV_PATH}: {', '.join(absent)}"
        )

    for namespaced_key, runtime_key in _TEST_TO_RUNTIME_ENV_KEY_MAP.items():
        os.environ[runtime_key] = parsed_env[namespaced_key]

    for stale_key in ("ATP_PROFILE_FREQ", "ATP_PROFILE_FLAMEGRAPH", "ATP_PROFILE_SPEEDSCOPE"):
        os.environ.pop(stale_key, None)
73+
74+
75+
_apply_runtime_test_env(_parse_env_file(_TEST_ENV_PATH))
2376

0 commit comments

Comments (0)