From 3d3c03776df0f15f68542d365a1b9eee751973f7 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 16:35:22 +0200
Subject: [PATCH 01/16] feat(cubestore): port group-by-limit partial aggregate
 (renamed from topk), wip behind flag

Brings the worker-side partial hash-aggregate trim for
`GROUP BY <non-index-prefix> ORDER BY <subset of group-by> LIMIT k` onto this
branch, renamed from the misleading "TopK" naming (it trims by group-key order,
not by an aggregate top-N):
- GroupByLimitAggregateExec + group_by_limit_aggregate module
- group_by_limit_rewriter
- config group_by_limit_factor / CUBESTORE_GROUP_BY_LIMIT_FACTOR (default 2)

Reuses DataFusion's GroupValues building blocks; the only fork change is
pub new_group_values (df branch cubestore-hash-aggregate-limit).

WIP: the trim is planted during router planning but is NOT yet carried across
the ClusterSend boundary to the worker's independent physical re-plan, so it
does not engage at distributed execution yet. Next: cluster-boundary carry.

(cherry picked from commit d3aa25fad92724265b4adb710c1986bef354f0c9)
---
 rust/cubestore/Cargo.lock                     |  52 ++--
 .../cubestore-sql-tests/src/tests.rs          | 143 +++++++++
 rust/cubestore/cubestore/Cargo.toml           |   8 +-
 rust/cubestore/cubestore/src/config/mod.rs    |  17 +-
 .../group_by_limit_aggregate_stream.rs        | 284 ++++++++++++++++++
 .../group_by_limit_aggregate/mod.rs           | 204 +++++++++++++
 .../cubestore/src/queryplanner/mod.rs         |   1 +
 .../optimizations/group_by_limit_rewriter.rs  | 251 ++++++++++++++++
 .../src/queryplanner/optimizations/mod.rs     |  18 +-
 .../src/queryplanner/pretty_printers.rs       |  11 +
 .../src/queryplanner/query_executor.rs        |   9 +-
 11 files changed, 957 insertions(+), 41 deletions(-)
 create mode 100644 rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
 create mode 100644 rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
 create mode 100644 rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs
diff --git a/rust/cubestore/Cargo.lock b/rust/cubestore/Cargo.lock
index b10bf2e1f5427..889e7636aa66d 100644
--- a/rust/cubestore/Cargo.lock
+++ b/rust/cubestore/Cargo.lock
@@ -1758,7 +1758,7 @@ checksum = "c2e66c9d817f1720209181c316d28635c050fa304f9c79e47a520882661b7308"
 [[package]]
 name = "datafusion"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "arrow-ipc",
@@ -1811,7 +1811,7 @@ dependencies = [
 [[package]]
 name = "datafusion-catalog"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1830,7 +1830,7 @@ dependencies = [
 [[package]]
 name = "datafusion-catalog-listing"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1851,7 +1851,7 @@ dependencies = [
 [[package]]
 name = "datafusion-common"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "ahash 0.8.11",
  "arrow",
@@ -1874,7 +1874,7 @@ dependencies = [
 [[package]]
 name = "datafusion-common-runtime"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "log",
  "tokio",
@@ -1883,7 +1883,7 @@ dependencies = [
 [[package]]
 name = "datafusion-datasource"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "async-compression 0.4.17",
@@ -1916,12 +1916,12 @@ dependencies = [
 [[package]]
 name = "datafusion-doc"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 
 [[package]]
 name = "datafusion-execution"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "dashmap",
@@ -1941,7 +1941,7 @@ dependencies = [
 [[package]]
 name = "datafusion-expr"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "chrono",
@@ -1961,7 +1961,7 @@ dependencies = [
 [[package]]
 name = "datafusion-expr-common"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "datafusion-common",
@@ -1973,7 +1973,7 @@ dependencies = [
 [[package]]
 name = "datafusion-functions"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "arrow-buffer",
@@ -2001,7 +2001,7 @@ dependencies = [
 [[package]]
 name = "datafusion-functions-aggregate"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "ahash 0.8.11",
  "arrow",
@@ -2021,7 +2021,7 @@ dependencies = [
 [[package]]
 name = "datafusion-functions-aggregate-common"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "ahash 0.8.11",
  "arrow",
@@ -2033,7 +2033,7 @@ dependencies = [
 [[package]]
 name = "datafusion-functions-nested"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "arrow-ord",
@@ -2053,7 +2053,7 @@ dependencies = [
 [[package]]
 name = "datafusion-functions-table"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "async-trait",
@@ -2068,7 +2068,7 @@ dependencies = [
 [[package]]
 name = "datafusion-functions-window"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "datafusion-common",
  "datafusion-doc",
@@ -2084,7 +2084,7 @@ dependencies = [
 [[package]]
 name = "datafusion-functions-window-common"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "datafusion-common",
  "datafusion-physical-expr-common",
@@ -2093,7 +2093,7 @@ dependencies = [
 [[package]]
 name = "datafusion-macros"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "datafusion-expr",
  "quote",
@@ -2103,7 +2103,7 @@ dependencies = [
 [[package]]
 name = "datafusion-optimizer"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "chrono",
@@ -2121,7 +2121,7 @@ dependencies = [
 [[package]]
 name = "datafusion-physical-expr"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "ahash 0.8.11",
  "arrow",
@@ -2142,7 +2142,7 @@ dependencies = [
 [[package]]
 name = "datafusion-physical-expr-common"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "ahash 0.8.11",
  "arrow",
@@ -2155,7 +2155,7 @@ dependencies = [
 [[package]]
 name = "datafusion-physical-optimizer"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "datafusion-common",
@@ -2173,7 +2173,7 @@ dependencies = [
 [[package]]
 name = "datafusion-physical-plan"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "ahash 0.8.11",
  "arrow",
@@ -2205,7 +2205,7 @@ dependencies = [
 [[package]]
 name = "datafusion-proto"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "chrono",
@@ -2220,7 +2220,7 @@ dependencies = [
 [[package]]
 name = "datafusion-proto-common"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "datafusion-common",
@@ -2230,7 +2230,7 @@ dependencies = [
 [[package]]
 name = "datafusion-sql"
 version = "46.0.1"
-source = "git+https://github.com/cube-js/arrow-datafusion?branch=cube-46.0.1#dc9015e290adbeaff1da80c9c052219c50312f77"
+source = "git+https://github.com/cube-js/arrow-datafusion?branch=cubestore-hash-aggregate-limit#ef839b67f88734804bb2127c7e27b25122b55690"
 dependencies = [
  "arrow",
  "bigdecimal 0.4.8",
diff --git a/rust/cubestore/cubestore-sql-tests/src/tests.rs b/rust/cubestore/cubestore-sql-tests/src/tests.rs
index cb6a87b8339cd..71ed3ddd05f2a 100644
--- a/rust/cubestore/cubestore-sql-tests/src/tests.rs
+++ b/rust/cubestore/cubestore-sql-tests/src/tests.rs
@@ -154,6 +154,8 @@ pub fn sql_tests(prefix: &str) -> Vec<(&'static str, TestFn)> {
         t("planning_inplace_aggregate", planning_inplace_aggregate),
         t("planning_hints", planning_hints),
         t("planning_inplace_aggregate2", planning_inplace_aggregate2),
+        t("planning_topk_hash_aggregate", planning_topk_hash_aggregate),
+        t("topk_hash_aggregate_trim", topk_hash_aggregate_trim),
         t("topk_large_inputs", topk_large_inputs),
         t("partitioned_index", partitioned_index),
         t(
@@ -425,6 +427,7 @@ lazy_static::lazy_static! {
         "limit_pushdown_group_having",
         "limit_pushdown_group_nonprefix_order",
         "prefilter_chunks_shared_scan",
+        "planning_topk_hash_aggregate",
     ].into_iter().map(ToOwned::to_owned).collect();
 }
 
@@ -3201,6 +3204,146 @@ async fn planning_inplace_aggregate(service: Box<dyn SqlClient>) -> Result<(), C
     Ok(())
 }
 
+async fn planning_topk_hash_aggregate(service: Box<dyn SqlClient>) -> Result<(), CubeError> {
+    service.exec_query("CREATE SCHEMA s").await?;
+    service
+        .exec_query("CREATE TABLE s.Data(url text, day int, hits int)")
+        .await?;
+    service
+        .exec_query("CREATE TABLE s.D3(a int, b int, c int, h int)")
+        .await?;
+
+    // GROUP BY a non-indexed column -> hash (Linear) partial aggregate; ORDER BY the group
+    // column with a LIMIT -> the worker partial aggregate is replaced by GroupByLimitAggregate.
+    let p = service
+        .plan_query("SELECT day, SUM(hits) FROM s.Data GROUP BY 1 ORDER BY 1 LIMIT 10")
+        .await?;
+    let pp = pp_phys_plan_ext(p.worker.as_ref(), &PPOptions::none());
+    assert!(
+        pp.contains("GroupByLimitAggregate, k: 10, factor: 2,"),
+        "expected GroupByLimitAggregate on the worker, got:\n{}",
+        pp
+    );
+
+    // LIMIT + OFFSET -> k = limit + offset.
+    let p = service
+        .plan_query("SELECT day, SUM(hits) FROM s.Data GROUP BY 1 ORDER BY 1 LIMIT 10 OFFSET 5")
+        .await?;
+    let pp = pp_phys_plan_ext(p.worker.as_ref(), &PPOptions::none());
+    assert!(
+        pp.contains("GroupByLimitAggregate, k: 15, factor: 2,"),
+        "expected k=15 (limit+offset), got:\n{}",
+        pp
+    );
+
+    // ORDER BY an aggregate (not a group-by column) -> no trim.
+    let p = service
+        .plan_query("SELECT day, SUM(hits) FROM s.Data GROUP BY 1 ORDER BY 2 DESC LIMIT 10")
+        .await?;
+    let pp = pp_phys_plan_ext(p.worker.as_ref(), &PPOptions::none());
+    assert!(
+        !pp.contains("GroupByLimitAggregate"),
+        "did not expect GroupByLimitAggregate when ordering by an aggregate, got:\n{}",
+        pp
+    );
+
+    // No LIMIT -> no trim.
+    let p = service
+        .plan_query("SELECT day, SUM(hits) FROM s.Data GROUP BY 1 ORDER BY 1")
+        .await?;
+    let pp = pp_phys_plan_ext(p.worker.as_ref(), &PPOptions::none());
+    assert!(
+        !pp.contains("GroupByLimitAggregate"),
+        "did not expect GroupByLimitAggregate without a limit, got:\n{}",
+        pp
+    );
+
+    // ORDER BY a proper SUBSET of GROUP BY (b out of b, c). The worker cut and the router sort must
+    // both use the total order T = [b, c]: the worker trim order carries the tie-break column c, and
+    // the router's global Sort is extended with c so its top-k matches the global top-k by T.
+    let p = service
+        .plan_query("SELECT b, c, SUM(h) FROM s.D3 GROUP BY 1, 2 ORDER BY 1 LIMIT 3")
+        .await?;
+    let worker_pp = pp_phys_plan_ext(p.worker.as_ref(), &PPOptions::none());
+    assert!(
+        worker_pp.contains("GroupByLimitAggregate, k: 3, factor: 2,")
+            && worker_pp.contains("(0, SortOptions { descending: false, nulls_first: false })")
+            && worker_pp.contains("(1, SortOptions { descending: false, nulls_first: true })"),
+        "expected worker trim order [b, c] totalized, got:\n{}",
+        worker_pp
+    );
+    let router_pp = pp_phys_plan_ext(
+        p.router.as_ref(),
+        &PPOptions {
+            show_sort_by: true,
+            ..PPOptions::none()
+        },
+    );
+    assert!(
+        router_pp.contains("b@0") && router_pp.contains("c@1"),
+        "expected router Sort extended with the tie-break column c, got:\n{}",
+        router_pp
+    );
+
+    Ok(())
+}
+
+async fn topk_hash_aggregate_trim(service: Box<dyn SqlClient>) -> Result<(), CubeError> {
+    service.exec_query("CREATE SCHEMA s").await?;
+    service
+        .exec_query("CREATE TABLE s.Data(a int, b int, hits int)")
+        .await?;
+    // 12 distinct (a, b) groups, each with two rows so partial aggregation actually groups.
+    // With k=3 and factor=2 the trim activates (g=12 > 6) but the result must match a full
+    // top-k. ORDER BY a (a proper subset of GROUP BY a, b) exercises totalization: the worker
+    // breaks ties on a by b so the router still receives every needed partial state.
+    service
+        .exec_query(
+            "INSERT INTO s.Data(a, b, hits) VALUES \
+             (1,1,10),(1,1,5),(1,2,1),(1,2,2),\
+             (2,1,7),(2,1,3),(2,2,4),(2,2,6),\
+             (3,1,8),(3,1,2),(3,2,9),(3,2,1),\
+             (4,1,1),(4,1,1),(4,2,1),(4,2,1),\
+             (5,1,1),(5,1,1),(5,2,1),(5,2,1),\
+             (6,1,1),(6,1,1),(6,2,1),(6,2,1)",
+        )
+        .await?;
+
+    // ORDER BY a, b LIMIT 3 (ascending): smallest three groups by (a, b).
+    let r = service
+        .exec_query("SELECT a, b, SUM(hits) FROM s.Data GROUP BY 1, 2 ORDER BY 1, 2 LIMIT 3")
+        .await?;
+    assert_eq!(to_rows(&r), rows(&[(1, 1, 15), (1, 2, 3), (2, 1, 10)]));
+
+    // ORDER BY a, b DESC LIMIT 3: largest three groups by (a, b).
+    let r = service
+        .exec_query(
+            "SELECT a, b, SUM(hits) FROM s.Data GROUP BY 1, 2 ORDER BY 1 DESC, 2 DESC LIMIT 3",
+        )
+        .await?;
+    assert_eq!(to_rows(&r), rows(&[(6, 2, 2), (6, 1, 2), (5, 2, 2)]));
+
+    // ORDER BY a only (a proper subset of GROUP BY a, b), LIMIT 2. The selected group SET is
+    // deterministic (both groups of a=1), but the intra-tie row order is not, so assert as a set.
+    // Each returned group must carry its complete sum regardless of cross-worker tie-breaking,
+    // which is what totalization (append b to the cut order) guarantees.
+    let r = service
+        .exec_query("SELECT a, b, SUM(hits) FROM s.Data GROUP BY 1, 2 ORDER BY 1 LIMIT 2")
+        .await?;
+    let got = to_rows(&r);
+    assert_eq!(got.len(), 2, "expected 2 rows, got: {:?}", got);
+    for expected in rows(&[(1, 1, 15), (1, 2, 3)]) {
+        assert!(
+            got.contains(&expected),
+            "missing {:?} in {:?}",
+            expected,
+            got
+        );
+    }
+
+    Ok(())
+}
+
 async fn planning_hints(service: Box<dyn SqlClient>) -> Result<(), CubeError> {
     service.exec_query("CREATE SCHEMA s").await?;
     service
diff --git a/rust/cubestore/cubestore/Cargo.toml b/rust/cubestore/cubestore/Cargo.toml
index 66bc586d6ad4f..9ab37d88d6c35 100644
--- a/rust/cubestore/cubestore/Cargo.toml
+++ b/rust/cubestore/cubestore/Cargo.toml
@@ -28,10 +28,10 @@ cubezetasketch = { path = "../cubezetasketch" }
 cubedatasketches = { path = "../cubedatasketches" }
 cubeshared = { path = "../../cube/cubeshared" }
 cuberpc = { path = "../cuberpc" }
-datafusion = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cube-46.0.1", features = ["serde"] }
-datafusion-datasource = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cube-46.0.1" }
-datafusion-proto = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cube-46.0.1" }
-datafusion-proto-common = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cube-46.0.1" }
+datafusion = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cubestore-hash-aggregate-limit", features = ["serde"] }
+datafusion-datasource = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cubestore-hash-aggregate-limit" }
+datafusion-proto = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cubestore-hash-aggregate-limit" }
+datafusion-proto-common = { git = "https://github.com/cube-js/arrow-datafusion", branch = "cubestore-hash-aggregate-limit" }
 csv = "1.1.3"
 bytes = "1.6.0"
 serde_json = "1.0.56"
diff --git a/rust/cubestore/cubestore/src/config/mod.rs b/rust/cubestore/cubestore/src/config/mod.rs
index 5cba4c1222959..1409cab50a57c 100644
--- a/rust/cubestore/cubestore/src/config/mod.rs
+++ b/rust/cubestore/cubestore/src/config/mod.rs
@@ -585,6 +585,10 @@ pub trait ConfigObj: DIService {
     /// streaming split never drops rows (the first child is the low catch-all, the last
     /// the high one), and the legacy per-chunk path performed no such metadata check.
     fn repartition_check_overlapping_children(&self) -> bool;
+    /// Factor `f` controlling when the worker-side partial hash aggregate trims its output to the
+    /// top-k groups. Trimming happens only when the number of local groups exceeds `f * k`, where
+    /// `k = limit + offset`. `0` disables the optimization.
+    fn group_by_limit_factor(&self) -> usize;
 
     fn allow_decimal128(&self) -> bool;
 
@@ -745,6 +749,7 @@ pub struct ConfigObjImpl {
     pub repartition_merge_max_input_files: usize,
     pub repartition_merge_max_rows: u64,
     pub repartition_check_overlapping_children: bool,
+    pub group_by_limit_factor: usize,
     pub allow_decimal128: bool,
     pub enable_remove_orphaned_remote_files: bool,
     pub enable_startup_warmup: bool,
@@ -1087,6 +1092,10 @@ impl ConfigObj for ConfigObjImpl {
         self.repartition_check_overlapping_children
     }
 
+    fn group_by_limit_factor(&self) -> usize {
+        self.group_by_limit_factor
+    }
+
     fn allow_decimal128(&self) -> bool {
         self.allow_decimal128
     }
@@ -1783,6 +1792,7 @@ impl Config {
                     "CUBESTORE_REPARTITION_CHECK_OVERLAPPING_CHILDREN",
                     false,
                 ),
+                group_by_limit_factor: env_parse("CUBESTORE_GROUP_BY_LIMIT_FACTOR", 2),
                 allow_decimal128: env_bool("CUBESTORE_ALLOW_DECIMAL128", false),
                 enable_remove_orphaned_remote_files: env_bool(
                     "CUBESTORE_ENABLE_REMOVE_ORPHANED_REMOTE_FILES",
@@ -2039,6 +2049,7 @@ impl Config {
                 repartition_merge_max_input_files: 50,
                 repartition_merge_max_rows: 4_000_000,
                 repartition_check_overlapping_children: false,
+                group_by_limit_factor: 2,
                 allow_decimal128: false,
                 enable_remove_orphaned_remote_files: false,
                 enable_startup_warmup: true,
@@ -2734,10 +2745,6 @@ impl Config {
 
         self.injector
             .register_typed_with_default::<dyn QueryExecutor, _, _, _>(async move |i| {
-                let push_partial_aggregate_below_merge = i
-                    .get_service_typed::<dyn ConfigObj>()
-                    .await
-                    .push_partial_aggregate_below_merge_enabled();
                 QueryExecutorImpl::new(
                     i.get_service_typed::<dyn CubestoreMetadataCacheFactory>()
                         .await
@@ -2745,7 +2752,7 @@ impl Config {
                         .clone(),
                     i.get_service_typed().await,
                     i.get_service_typed().await,
-                    push_partial_aggregate_below_merge,
+                    i.get_service_typed::<dyn ConfigObj>().await,
                 )
             })
             .await;
diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
new file mode 100644
index 0000000000000..e1ee77c685ca9
--- /dev/null
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
@@ -0,0 +1,284 @@
+use datafusion::arrow::array::{ArrayRef, AsArray, RecordBatch};
+use datafusion::arrow::compute::{lexsort_to_indices, take, SortColumn, SortOptions};
+use datafusion::arrow::datatypes::SchemaRef;
+use datafusion::dfschema::internal_err;
+use datafusion::error::Result as DFResult;
+use datafusion::execution::{RecordBatchStream, TaskContext};
+use datafusion::logical_expr::{EmitTo, GroupsAccumulator};
+use datafusion::physical_expr::GroupsAccumulatorAdapter;
+use datafusion::physical_plan::aggregates::group_values::{new_group_values, GroupValues};
+use datafusion::physical_plan::aggregates::order::GroupOrdering;
+use datafusion::physical_plan::aggregates::PhysicalGroupBy;
+use datafusion::physical_plan::udaf::AggregateFunctionExpr;
+use datafusion::physical_plan::{ExecutionPlan, PhysicalExpr, SendableRecordBatchStream};
+use futures::ready;
+use futures::stream::{Stream, StreamExt};
+use std::sync::Arc;
+use std::task::{Context, Poll};
+
+use super::GroupByLimitAggregateExec;
+
+enum ExecutionState {
+    ReadingInput,
+    ProducingOutput(RecordBatch),
+    Done,
+}
+
+pub(crate) struct GroupByLimitAggregateStream {
+    schema: SchemaRef,
+    input: SendableRecordBatchStream,
+    aggregate_arguments: Vec<Vec<Arc<dyn PhysicalExpr>>>,
+    filter_expressions: Vec<Option<Arc<dyn PhysicalExpr>>>,
+    group_by: PhysicalGroupBy,
+    batch_size: usize,
+    exec_state: ExecutionState,
+    input_done: bool,
+    accumulators: Vec<Box<dyn GroupsAccumulator>>,
+    group_values: Box<dyn GroupValues>,
+    current_group_indices: Vec<usize>,
+    k: usize,
+    factor: usize,
+    order: Vec<(usize, SortOptions)>,
+}
+
+impl GroupByLimitAggregateStream {
+    pub fn new(
+        agg: &GroupByLimitAggregateExec,
+        context: Arc<TaskContext>,
+        partition: usize,
+    ) -> DFResult<Self> {
+        let agg_schema = Arc::clone(&agg.schema());
+        let agg_group_by = agg.group_expr().clone();
+        let agg_filter_expr = agg.filter_expr().to_vec();
+
+        let batch_size = context.session_config().batch_size();
+        let input = agg.input().execute(partition, Arc::clone(&context))?;
+
+        let aggregate_arguments =
+            aggregate_expressions(agg.aggr_expr(), agg_group_by.num_group_exprs())?;
+
+        let accumulators: Vec<_> = agg
+            .aggr_expr()
+            .iter()
+            .map(create_group_accumulator)
+            .collect::<DFResult<_>>()?;
+
+        let group_schema = agg_group_by.group_schema(&agg.input().schema())?;
+        let group_values = new_group_values(group_schema, &GroupOrdering::None)?;
+
+        Ok(GroupByLimitAggregateStream {
+            schema: agg_schema,
+            input,
+            aggregate_arguments,
+            filter_expressions: agg_filter_expr,
+            group_by: agg_group_by,
+            batch_size,
+            exec_state: ExecutionState::ReadingInput,
+            input_done: false,
+            accumulators,
+            group_values,
+            current_group_indices: Vec::with_capacity(batch_size),
+            k: agg.k(),
+            factor: agg.factor(),
+            order: agg.order().to_vec(),
+        })
+    }
+}
+
+impl Stream for GroupByLimitAggregateStream {
+    type Item = DFResult<RecordBatch>;
+
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Self::Item>> {
+        loop {
+            match &self.exec_state {
+                ExecutionState::ReadingInput => match ready!(self.input.poll_next_unpin(cx)) {
+                    Some(Ok(batch)) => {
+                        if let Err(e) = self.group_aggregate_batch(batch) {
+                            return Poll::Ready(Some(Err(e)));
+                        }
+                    }
+                    Some(Err(e)) => return Poll::Ready(Some(Err(e))),
+                    // Input exhausted: emit the whole group table at once, then trim to top-k.
+                    None => {
+                        self.input_done = true;
+                        match self.emit_all_trimmed() {
+                            Ok(Some(batch)) => {
+                                self.exec_state = ExecutionState::ProducingOutput(batch)
+                            }
+                            Ok(None) => self.exec_state = ExecutionState::Done,
+                            Err(e) => return Poll::Ready(Some(Err(e))),
+                        }
+                    }
+                },
+
+                ExecutionState::ProducingOutput(batch) => {
+                    let batch = batch.clone();
+                    let size = self.batch_size;
+                    let (next_state, output) = if batch.num_rows() <= size {
+                        (ExecutionState::Done, batch)
+                    } else {
+                        let remaining = batch.slice(size, batch.num_rows() - size);
+                        let output = batch.slice(0, size);
+                        (ExecutionState::ProducingOutput(remaining), output)
+                    };
+                    self.exec_state = next_state;
+                    return Poll::Ready(Some(Ok(output)));
+                }
+
+                ExecutionState::Done => return Poll::Ready(None),
+            }
+        }
+    }
+}
+
+impl RecordBatchStream for GroupByLimitAggregateStream {
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.schema)
+    }
+}
+
+impl GroupByLimitAggregateStream {
+    fn group_aggregate_batch(&mut self, batch: RecordBatch) -> DFResult<()> {
+        let group_by_values = evaluate_group_by(&self.group_by, &batch)?;
+        let input_values = evaluate_many(&self.aggregate_arguments, &batch)?;
+        let filter_values = evaluate_optional(&self.filter_expressions, &batch)?;
+
+        assert_eq!(group_by_values.len(), 1, "Exactly 1 group value required");
+        self.group_values
+            .intern(&group_by_values[0], &mut self.current_group_indices)?;
+        let group_indices = &self.current_group_indices;
+        let total_num_groups = self.group_values.len();
+
+        for ((acc, values), opt_filter) in self
+            .accumulators
+            .iter_mut()
+            .zip(input_values.iter())
+            .zip(filter_values.iter())
+        {
+            let opt_filter = opt_filter.as_ref().map(|filter| filter.as_boolean());
+            acc.update_batch(values, group_indices, opt_filter, total_num_groups)?;
+        }
+        Ok(())
+    }
+
+    /// Build the partial-state batch for all groups, then keep only the `k` smallest by the total
+    /// order when the number of groups exceeds `factor * k`.
+    fn emit_all_trimmed(&mut self) -> DFResult<Option<RecordBatch>> {
+        if self.group_values.is_empty() {
+            return Ok(None);
+        }
+        let mut columns = self.group_values.emit(EmitTo::All)?;
+        for acc in &mut self.accumulators {
+            columns.extend(acc.state(EmitTo::All)?);
+        }
+        let batch = RecordBatch::try_new(Arc::clone(&self.schema), columns)?;
+        Ok(Some(self.trim_top_k(batch)?))
+    }
+
+    fn trim_top_k(&self, batch: RecordBatch) -> DFResult<RecordBatch> {
+        let g = batch.num_rows();
+        if self.k == 0 || g <= self.factor.saturating_mul(self.k) {
+            return Ok(batch);
+        }
+        let sort_columns: Vec<SortColumn> = self
+            .order
+            .iter()
+            .map(|(idx, options)| SortColumn {
+                values: Arc::clone(batch.column(*idx)),
+                options: Some(*options),
+            })
+            .collect();
+        let indices = lexsort_to_indices(&sort_columns, Some(self.k))?;
+        let columns = batch
+            .columns()
+            .iter()
+            .map(|c| take(c.as_ref(), &indices, None))
+            .collect::<std::result::Result<Vec<_>, _>>()?;
+        Ok(RecordBatch::try_new(batch.schema(), columns)?)
+    }
+}
+
+/// Partial-aggregate argument expressions, one vec per aggregate. Mirrors DataFusion's private
+/// `aggregate_expressions` for `AggregateMode::Partial`.
+fn aggregate_expressions(
+    aggr_expr: &[Arc<AggregateFunctionExpr>],
+    _col_idx_base: usize,
+) -> DFResult<Vec<Vec<Arc<dyn PhysicalExpr>>>> {
+    Ok(aggr_expr
+        .iter()
+        .map(|agg| {
+            let mut result = agg.expressions();
+            if let Some(ordering_req) = agg.order_bys() {
+                result.extend(ordering_req.iter().map(|item| Arc::clone(&item.expr)));
+            }
+            result
+        })
+        .collect())
+}
+
+fn create_group_accumulator(
+    agg_expr: &Arc<AggregateFunctionExpr>,
+) -> DFResult<Box<dyn GroupsAccumulator>> {
+    if agg_expr.groups_accumulator_supported() {
+        agg_expr.create_groups_accumulator()
+    } else {
+        let agg_expr_captured = Arc::clone(agg_expr);
+        let factory = move || agg_expr_captured.create_accumulator();
+        Ok(Box::new(GroupsAccumulatorAdapter::new(factory)))
+    }
+}
+
+fn evaluate(expr: &[Arc<dyn PhysicalExpr>], batch: &RecordBatch) -> DFResult<Vec<ArrayRef>> {
+    expr.iter()
+        .map(|expr| {
+            expr.evaluate(batch)
+                .and_then(|v| v.into_array(batch.num_rows()))
+        })
+        .collect()
+}
+
+fn evaluate_many(
+    expr: &[Vec<Arc<dyn PhysicalExpr>>],
+    batch: &RecordBatch,
+) -> DFResult<Vec<Vec<ArrayRef>>> {
+    expr.iter().map(|expr| evaluate(expr, batch)).collect()
+}
+
+fn evaluate_optional(
+    expr: &[Option<Arc<dyn PhysicalExpr>>],
+    batch: &RecordBatch,
+) -> DFResult<Vec<Option<ArrayRef>>> {
+    expr.iter()
+        .map(|expr| {
+            expr.as_ref()
+                .map(|expr| {
+                    expr.evaluate(batch)
+                        .and_then(|v| v.into_array(batch.num_rows()))
+                })
+                .transpose()
+        })
+        .collect()
+}
+
+fn evaluate_group_by(
+    group_by: &PhysicalGroupBy,
+    batch: &RecordBatch,
+) -> DFResult<Vec<Vec<ArrayRef>>> {
+    let exprs: Vec<ArrayRef> = group_by
+        .expr()
+        .iter()
+        .map(|(expr, _)| {
+            let value = expr.evaluate(batch)?;
+            value.into_array(batch.num_rows())
+        })
+        .collect::<DFResult<Vec<_>>>()?;
+
+    if !group_by.is_single() {
+        return internal_err!("GroupByLimitAggregate does not support grouping sets");
+    }
+
+    Ok(vec![exprs])
+}
diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
new file mode 100644
index 0000000000000..4fe9ba5850cca
--- /dev/null
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
@@ -0,0 +1,204 @@
+mod group_by_limit_aggregate_stream;
+
+use datafusion::arrow::compute::SortOptions;
+use datafusion::arrow::datatypes::SchemaRef;
+use datafusion::common::stats::Precision;
+use datafusion::common::Statistics;
+use datafusion::error::Result as DFResult;
+use datafusion::execution::TaskContext;
+use datafusion::physical_expr::aggregate::AggregateFunctionExpr;
+use datafusion::physical_expr::{Distribution, LexRequirement};
+use datafusion::physical_plan::execution_plan::CardinalityEffect;
+use datafusion::physical_plan::metrics::MetricsSet;
+use datafusion::physical_plan::{aggregates::*, InputOrderMode};
+use datafusion::physical_plan::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, PhysicalExpr, PlanProperties,
+    SendableRecordBatchStream,
+};
+use std::any::Any;
+use std::fmt::Debug;
+use std::sync::Arc;
+
+/// Worker-side partial hash aggregate that trims its output to the top-k groups by a total order,
+/// so far fewer partial-state rows cross the network to the router's Final aggregate.
+///
+/// This is a custom copy of DataFusion's partial hash aggregate (it reuses DF's `GroupValues` and
+/// `GroupsAccumulator` building blocks but owns the consume/emit loop), so the only change required
+/// in the DataFusion fork is making `new_group_values` public. The aggregation builds the whole
+/// group table and, at the single final emit, keeps only the `k` smallest groups by `order` when
+/// the number of groups exceeds `factor * k`; otherwise it emits all groups unchanged.
+///
+/// `order` is a TOTAL order over groups (ORDER BY columns followed by the remaining group-by
+/// columns), expressed as `(partial-output column index, sort options)`. A total order is required
+/// for correctness: the same group key can live on multiple workers, and a consistent cut across
+/// workers guarantees every partial state the router selects reaches it.
+#[derive(Debug, Clone)]
+pub struct GroupByLimitAggregateExec {
+    group_by: PhysicalGroupBy,
+    aggr_expr: Vec<Arc<AggregateFunctionExpr>>,
+    filter_expr: Vec<Option<Arc<dyn PhysicalExpr>>>,
+    pub input: Arc<dyn ExecutionPlan>,
+    /// Partial-aggregate output schema (group columns followed by accumulator state columns).
+    schema: SchemaRef,
+    input_schema: SchemaRef,
+    cache: PlanProperties,
+    /// Fetch count, `k = limit + offset`.
+    k: usize,
+    /// Only trim when the number of local groups exceeds `factor * k`.
+    factor: usize,
+    /// Total order over the partial output columns.
+    order: Vec<(usize, SortOptions)>,
+}
+
+impl GroupByLimitAggregateExec {
+    /// Build a `GroupByLimitAggregateExec` from a partial hash `AggregateExec`, or `None` if it is not a
+    /// single-group-by partial aggregate (grouping sets and non-partial modes are not supported).
+    pub fn try_new_from_partial(
+        aggregate: &AggregateExec,
+        k: usize,
+        factor: usize,
+        order: Vec<(usize, SortOptions)>,
+    ) -> Option<Self> {
+        if *aggregate.mode() != AggregateMode::Partial {
+            return None;
+        }
+        // Sorted-prefix aggregates are handled by InlineAggregateExec; this targets the hash path.
+        if matches!(aggregate.input_order_mode(), InputOrderMode::Sorted) {
+            return None;
+        }
+        let group_by = aggregate.group_expr().clone();
+        if !group_by.is_single() {
+            return None;
+        }
+        Some(Self {
+            group_by,
+            aggr_expr: aggregate.aggr_expr().to_vec(),
+            filter_expr: aggregate.filter_expr().to_vec(),
+            input: aggregate.input().clone(),
+            schema: aggregate.schema().clone(),
+            input_schema: aggregate.input_schema().clone(),
+            cache: aggregate.cache().clone(),
+            k,
+            factor,
+            order,
+        })
+    }
+
+    pub fn k(&self) -> usize {
+        self.k
+    }
+
+    pub fn factor(&self) -> usize {
+        self.factor
+    }
+
+    pub fn order(&self) -> &[(usize, SortOptions)] {
+        &self.order
+    }
+
+    pub fn aggr_expr(&self) -> &[Arc<AggregateFunctionExpr>] {
+        &self.aggr_expr
+    }
+
+    pub fn filter_expr(&self) -> &[Option<Arc<dyn PhysicalExpr>>] {
+        &self.filter_expr
+    }
+
+    pub fn input(&self) -> &Arc<dyn ExecutionPlan> {
+        &self.input
+    }
+
+    pub fn group_expr(&self) -> &PhysicalGroupBy {
+        &self.group_by
+    }
+}
+
+impl DisplayAs for GroupByLimitAggregateExec {
+    fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match t {
+            DisplayFormatType::Default | DisplayFormatType::Verbose => {
+                write!(
+                    f,
+                    "GroupByLimitAggregateExec: k={}, factor={}, order={:?}",
+                    self.k, self.factor, self.order
+                )?;
+            }
+        }
+        Ok(())
+    }
+}
+
+impl ExecutionPlan for GroupByLimitAggregateExec {
+    fn name(&self) -> &'static str {
+        "GroupByLimitAggregateExec"
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn properties(&self) -> &PlanProperties {
+        &self.cache
+    }
+
+    fn required_input_distribution(&self) -> Vec<Distribution> {
+        vec![Distribution::UnspecifiedDistribution]
+    }
+
+    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
+        vec![None]
+    }
+
+    fn maintains_input_order(&self) -> Vec<bool> {
+        vec![false]
+    }
+
+    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
+        vec![&self.input]
+    }
+
+    fn with_new_children(
+        self: Arc<Self>,
+        children: Vec<Arc<dyn ExecutionPlan>>,
+    ) -> DFResult<Arc<dyn ExecutionPlan>> {
+        Ok(Arc::new(Self {
+            group_by: self.group_by.clone(),
+            aggr_expr: self.aggr_expr.clone(),
+            filter_expr: self.filter_expr.clone(),
+            input: children[0].clone(),
+            schema: self.schema.clone(),
+            input_schema: self.input_schema.clone(),
+            cache: self.cache.clone(),
+            k: self.k,
+            factor: self.factor,
+            order: self.order.clone(),
+        }))
+    }
+
+    fn execute(
+        &self,
+        partition: usize,
+        context: Arc<TaskContext>,
+    ) -> DFResult<SendableRecordBatchStream> {
+        let stream = group_by_limit_aggregate_stream::GroupByLimitAggregateStream::new(
+            self, context, partition,
+        )?;
+        Ok(Box::pin(stream))
+    }
+
+    fn metrics(&self) -> Option<MetricsSet> {
+        None
+    }
+
+    fn statistics(&self) -> DFResult<Statistics> {
+        Ok(Statistics {
+            num_rows: Precision::Absent,
+            column_statistics: Statistics::unknown_column(&self.schema),
+            total_byte_size: Precision::Absent,
+        })
+    }
+
+    fn cardinality_effect(&self) -> CardinalityEffect {
+        CardinalityEffect::LowerEqual
+    }
+}
diff --git a/rust/cubestore/cubestore/src/queryplanner/mod.rs b/rust/cubestore/cubestore/src/queryplanner/mod.rs
index c4d4742312e4d..c02929c32cf88 100644
--- a/rust/cubestore/cubestore/src/queryplanner/mod.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/mod.rs
@@ -10,6 +10,7 @@ use datafusion_datasource::memory::MemorySourceConfig;
 use datafusion_datasource::source::DataSourceExec;
 pub use planning::PlanningMeta;
 mod check_memory;
+mod group_by_limit_aggregate;
 pub mod physical_plan_flags;
 pub mod pretty_printers;
 mod projection_above_limit;
diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs
new file mode 100644
index 0000000000000..3bf40d08c1c65
--- /dev/null
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs
@@ -0,0 +1,251 @@
+use crate::queryplanner::group_by_limit_aggregate::GroupByLimitAggregateExec;
+use crate::queryplanner::planning::WorkerExec;
+use crate::queryplanner::query_executor::ClusterSendExec;
+use datafusion::arrow::compute::SortOptions;
+use datafusion::error::DataFusionError;
+use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr};
+use datafusion::physical_plan::aggregates::{AggregateExec, AggregateMode};
+use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
+use datafusion::physical_plan::expressions::Column;
+use datafusion::physical_plan::limit::GlobalLimitExec;
+use datafusion::physical_plan::sorts::sort::SortExec;
+use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
+use datafusion::physical_plan::{ExecutionPlan, InputOrderMode};
+use std::sync::Arc;
+
+/// Trim the worker-side partial hash aggregate to the top-k groups when the plan is
+/// `LIMIT k` over `ORDER BY <subset of group-by columns>` over a distributed hash aggregate.
+///
+/// Correctness requires a TOTAL order over groups (`T = ORDER BY ++ remaining group-by columns`,
+/// in group-by order) applied in TWO places that must agree:
+///   - the worker cut: each worker keeps its local top-k by `T`;
+///   - the router select: the global Sort + Limit must also order by `T`.
+/// Under `T` the router's top-k equals the global top-k by `T`, and every worker that holds a
+/// partial state for such a group keeps it (its local rank can only be smaller), so every needed
+/// partial state reaches the router. Ordering the router by `T` instead of the bare `ORDER BY` does
+/// not change the query contract: `ORDER BY` is a prefix of `T`, so the output stays validly
+/// ordered and the previously-unspecified tie order just becomes deterministic.
+///
+/// We only rewrite when the plan matches exactly `Sort(/Limit) -> [passthrough] -> Final aggregate
+/// -> [passthrough/cluster boundary] -> Partial hash aggregate`; anything else on the path (a
+/// HAVING filter, a nested aggregate, a computed projection) makes us bail, so we never trim a plan
+/// where the limit does not directly govern this aggregate.
+///
+/// `factor` gates trimming at runtime (only when local groups exceed `factor * k`); `0` disables.
+pub fn replace_with_group_by_limit_aggregate(
+    plan: Arc<dyn ExecutionPlan>,
+    factor: usize,
+) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
+    if factor == 0 {
+        return Ok(plan);
+    }
+    let Some(target) = analyze(&plan) else {
+        return Ok(plan);
+    };
+    apply(plan, &target, factor)
+}
+
+struct Target {
+    /// The router `SortExec` whose ordering must be extended to the total order.
+    sort: Arc<dyn ExecutionPlan>,
+    /// The worker-side partial hash `AggregateExec` to replace with a trimming exec.
+    partial: Arc<dyn ExecutionPlan>,
+    /// Tail of the total order to append to the router sort (over the sort's input schema).
+    router_tail: Vec<PhysicalSortExpr>,
+    /// Full total order over the partial output schema for the worker cut.
+    trim_order: Vec<(usize, SortOptions)>,
+    /// `k = limit + offset`.
+    k: usize,
+}
+
+fn analyze(root: &Arc<dyn ExecutionPlan>) -> Option<Target> {
+    // Peel an optional top GlobalLimit (carries the offset), then require a SortExec.
+    let (skip, extra_fetch, sort_node) =
+        if let Some(gl) = root.as_any().downcast_ref::<GlobalLimitExec>() {
+            (gl.skip(), gl.fetch(), child(root)?)
+        } else {
+            (0, None, root.clone())
+        };
+    let sort = sort_node.as_any().downcast_ref::<SortExec>()?;
+    let order: Vec<PhysicalSortExpr> = sort.expr().iter().cloned().collect();
+    if order.is_empty() {
+        return None;
+    }
+    // The worker must keep enough groups to cover `limit + offset`. When a top GlobalLimit carries
+    // the offset, DataFusion already folds `skip + limit` into the sort's fetch, so prefer it;
+    // otherwise fall back to the GlobalLimit's own `skip + fetch`.
+    let k = sort
+        .fetch()
+        .or_else(|| extra_fetch.map(|fetch| skip + fetch))?;
+
+    // Sort -> [passthrough] -> Final aggregate.
+    let final_agg_node = descend_to_final_aggregate(sort.input().clone())?;
+    let final_agg = final_agg_node.as_any().downcast_ref::<AggregateExec>()?;
+
+    // Final aggregate -> [passthrough/boundary] -> Partial hash aggregate.
+    let partial_node = descend_to_worker_partial(final_agg.input().clone())?;
+    let partial = partial_node.as_any().downcast_ref::<AggregateExec>()?;
+    if !partial.group_expr().is_single()
+        || matches!(partial.input_order_mode(), InputOrderMode::Sorted)
+    {
+        return None;
+    }
+
+    let num_group_cols = partial.group_expr().output_exprs().len();
+    if num_group_cols == 0 {
+        return None;
+    }
+    let partial_schema = partial.schema();
+    let group_names: Vec<String> = partial_schema
+        .fields()
+        .iter()
+        .take(num_group_cols)
+        .map(|f| f.name().clone())
+        .collect();
+
+    // Map ORDER BY columns onto group-by columns (by name; robust to projections).
+    let mut used = vec![false; num_group_cols];
+    let mut trim_order: Vec<(usize, SortOptions)> = Vec::with_capacity(num_group_cols);
+    for e in &order {
+        let column = e.expr.as_any().downcast_ref::<Column>()?;
+        let idx = group_names.iter().position(|n| n == column.name())?;
+        if used[idx] {
+            continue;
+        }
+        used[idx] = true;
+        trim_order.push((idx, e.options));
+    }
+    if trim_order.is_empty() {
+        return None;
+    }
+
+    // Totalize: append the remaining group-by columns in group-by order. Build the matching tail
+    // for the router sort over its own (Final-output) schema, resolved by name.
+    let sort_input_schema = sort.input().schema();
+    let mut router_tail: Vec<PhysicalSortExpr> = Vec::new();
+    for (idx, is_used) in used.into_iter().enumerate() {
+        if is_used {
+            continue;
+        }
+        let name = &group_names[idx];
+        let options = SortOptions::default();
+        let sort_col_idx = sort_input_schema.index_of(name).ok()?;
+        router_tail.push(PhysicalSortExpr {
+            expr: Arc::new(Column::new(name, sort_col_idx)),
+            options,
+        });
+        trim_order.push((idx, options));
+    }
+
+    Some(Target {
+        sort: sort_node,
+        partial: partial_node,
+        router_tail,
+        trim_order,
+        k,
+    })
+}
+
+fn apply(
+    node: Arc<dyn ExecutionPlan>,
+    target: &Target,
+    factor: usize,
+) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
+    let is_sort = Arc::ptr_eq(&node, &target.sort);
+    let is_partial = Arc::ptr_eq(&node, &target.partial);
+
+    let new_children = node
+        .children()
+        .into_iter()
+        .map(|c| apply(c.clone(), target, factor))
+        .collect::<Result<Vec<_>, _>>()?;
+    let node = node.with_new_children(new_children)?;
+
+    if is_partial {
+        if let Some(agg) = node.as_any().downcast_ref::<AggregateExec>() {
+            if let Some(exec) = GroupByLimitAggregateExec::try_new_from_partial(
+                agg,
+                target.k,
+                factor,
+                target.trim_order.clone(),
+            ) {
+                return Ok(Arc::new(exec));
+            }
+        }
+        // Leaving the full aggregate in place stays correct; the router still sorts by the total
+        // order, it just receives every group instead of the trimmed top-k.
+        return Ok(node);
+    }
+
+    if is_sort {
+        if let Some(sort) = node.as_any().downcast_ref::<SortExec>() {
+            let mut exprs: Vec<PhysicalSortExpr> = sort.expr().iter().cloned().collect();
+            exprs.extend(target.router_tail.iter().cloned());
+            let new_sort = SortExec::new(LexOrdering::new(exprs), sort.input().clone())
+                .with_preserve_partitioning(sort.preserve_partitioning())
+                .with_fetch(sort.fetch());
+            return Ok(Arc::new(new_sort));
+        }
+    }
+
+    Ok(node)
+}
+
+/// Walk down single-child passthrough nodes (which preserve rows and grouping) until the first
+/// `Final`/`FinalPartitioned` `AggregateExec`. Returns `None` if a non-passthrough node is hit
+/// first (e.g. a filter or a computed projection).
+fn descend_to_final_aggregate(mut node: Arc<dyn ExecutionPlan>) -> Option<Arc<dyn ExecutionPlan>> {
+    loop {
+        if let Some(agg) = node.as_any().downcast_ref::<AggregateExec>() {
+            return matches!(
+                agg.mode(),
+                AggregateMode::Final | AggregateMode::FinalPartitioned
+            )
+            .then_some(node.clone());
+        }
+        if is_row_passthrough(&node) {
+            node = child(&node)?;
+        } else {
+            return None;
+        }
+    }
+}
+
+/// Walk down passthrough nodes from a `Final` aggregate's input to the worker-side `Partial`
+/// aggregate, requiring that exactly one `ClusterSend`/`Worker` boundary is crossed. Returns `None`
+/// if anything unexpected (a second aggregate, a filter, ...) is on the path.
+fn descend_to_worker_partial(mut node: Arc<dyn ExecutionPlan>) -> Option<Arc<dyn ExecutionPlan>> {
+    let mut crossed_boundary = false;
+    loop {
+        if let Some(agg) = node.as_any().downcast_ref::<AggregateExec>() {
+            return (crossed_boundary && *agg.mode() == AggregateMode::Partial)
+                .then_some(node.clone());
+        }
+        if node.as_any().is::<ClusterSendExec>() || node.as_any().is::<WorkerExec>() {
+            crossed_boundary = true;
+            node = child(&node)?;
+        } else if is_row_passthrough(&node) {
+            node = child(&node)?;
+        } else {
+            return None;
+        }
+    }
+}
+
+/// Single-child nodes that pass rows through unchanged (preserving grouping), so a limit/sort above
+/// them governs the aggregate below them.
+fn is_row_passthrough(node: &Arc<dyn ExecutionPlan>) -> bool {
+    let any = node.as_any();
+    any.is::<CoalescePartitionsExec>()
+        || any.is::<SortPreservingMergeExec>()
+        || any.is::<ClusterSendExec>()
+        || any.is::<WorkerExec>()
+}
+
+fn child(node: &Arc<dyn ExecutionPlan>) -> Option<Arc<dyn ExecutionPlan>> {
+    let children = node.children();
+    if children.len() != 1 {
+        return None;
+    }
+    Some(children[0].clone())
+}
diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
index edf902b44d3e5..42b24d252d899 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
@@ -1,5 +1,6 @@
 mod check_memory;
 mod distributed_partial_aggregate;
+mod group_by_limit_rewriter;
 mod inline_aggregate_rewriter;
 pub mod is_not_distinct_from_join_keys;
 pub mod rewrite_plan;
@@ -13,6 +14,7 @@ use crate::queryplanner::optimizations::distributed_partial_aggregate::{
     push_aggregate_to_workers, push_sorted_partial_aggregate_below_merge,
     push_worker_sort_and_limit, replace_suboptimal_merge_sorts,
 };
+use crate::queryplanner::optimizations::group_by_limit_rewriter::replace_with_group_by_limit_aggregate;
 use crate::queryplanner::optimizations::inline_aggregate_rewriter::replace_with_inline_aggregate;
 use crate::queryplanner::planning::CubeExtensionPlanner;
 use crate::queryplanner::pretty_printers::{pp_phys_plan_ext, PPOptions};
@@ -112,12 +114,14 @@ impl QueryPlanner for CubeQueryPlanner {
 #[derive(Debug)]
 pub struct PreOptimizeRule {
     push_partial_aggregate_below_merge: bool,
+    group_by_limit_factor: usize,
 }
 
 impl PreOptimizeRule {
-    pub fn new(push_partial_aggregate_below_merge: bool) -> Self {
+    pub fn new(push_partial_aggregate_below_merge: bool, group_by_limit_factor: usize) -> Self {
         Self {
             push_partial_aggregate_below_merge,
+            group_by_limit_factor,
         }
     }
 }
@@ -128,7 +132,11 @@ impl PhysicalOptimizerRule for PreOptimizeRule {
         plan: Arc<dyn ExecutionPlan>,
         _config: &ConfigOptions,
     ) -> datafusion::common::Result<Arc<dyn ExecutionPlan>> {
-        pre_optimize_physical_plan(plan, self.push_partial_aggregate_below_merge)
+        pre_optimize_physical_plan(
+            plan,
+            self.push_partial_aggregate_below_merge,
+            self.group_by_limit_factor,
+        )
     }
 
     fn name(&self) -> &str {
@@ -143,6 +151,7 @@ impl PhysicalOptimizerRule for PreOptimizeRule {
 fn pre_optimize_physical_plan(
     p: Arc<dyn ExecutionPlan>,
     push_partial_aggregate_below_merge: bool,
+    group_by_limit_factor: usize,
 ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
     let p = rewrite_physical_plan(p, &mut |p| push_aggregate_to_workers(p))?;
 
@@ -164,6 +173,11 @@ fn pre_optimize_physical_plan(
     // Replace sorted AggregateExec with InlineAggregateExec for better performance
     let p = rewrite_physical_plan(p, &mut |p| replace_with_inline_aggregate(p))?;
 
+    // Trim the worker-side partial hash aggregate to the top-k groups when the query orders by a
+    // subset of group-by columns and has a limit. Runs after inline-aggregate replacement so it
+    // only sees the remaining (hash) partial aggregates.
+    let p = replace_with_group_by_limit_aggregate(p, group_by_limit_factor)?;
+
     Ok(p)
 }
 
diff --git a/rust/cubestore/cubestore/src/queryplanner/pretty_printers.rs b/rust/cubestore/cubestore/src/queryplanner/pretty_printers.rs
index 63d386add4951..3b50620742de7 100644
--- a/rust/cubestore/cubestore/src/queryplanner/pretty_printers.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/pretty_printers.rs
@@ -28,6 +28,7 @@ use std::sync::Arc;
 
 use crate::queryplanner::check_memory::CheckMemoryExec;
 use crate::queryplanner::filter_by_key_range::FilterByKeyRangeExec;
+use crate::queryplanner::group_by_limit_aggregate::GroupByLimitAggregateExec;
 use crate::queryplanner::inline_aggregate::{InlineAggregateExec, InlineAggregateMode};
 use crate::queryplanner::merge_sort::LastRowByUniqueKeyExec;
 use crate::queryplanner::panic::{PanicWorkerExec, PanicWorkerNode};
@@ -617,6 +618,16 @@ fn pp_phys_plan_indented(p: &dyn ExecutionPlan, indent: usize, o: &PPOptions, ou
             if let Some(limit) = agg.limit() {
                 *out += &format!(", limit: {}", limit)
             }
+        } else if let Some(agg) = a.downcast_ref::<GroupByLimitAggregateExec>() {
+            *out += &format!(
+                "GroupByLimitAggregate, k: {}, factor: {}, order: {:?}",
+                agg.k(),
+                agg.factor(),
+                agg.order()
+            );
+            if o.show_aggregations {
+                *out += &format!(", aggs: {:?}", agg.aggr_expr())
+            }
         } else if let Some(l) = a.downcast_ref::<LocalLimitExec>() {
             *out += &format!("LocalLimit, n: {}", l.fetch());
         } else if let Some(l) = a.downcast_ref::<GlobalLimitExec>() {
diff --git a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
index 528f268f7048f..44914e5217b6c 100644
--- a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
@@ -226,7 +226,7 @@ pub struct QueryExecutorImpl {
     metadata_cache_factory: Arc<dyn MetadataCacheFactory>,
     parquet_metadata_cache: Arc<dyn CubestoreParquetMetadataCache>,
     memory_handler: Arc<dyn MemoryHandler>,
-    push_partial_aggregate_below_merge: bool,
+    config: Arc<dyn ConfigObj>,
 }
 
 crate::di_service!(QueryExecutorImpl, [QueryExecutor]);
@@ -547,13 +547,13 @@ impl QueryExecutorImpl {
         metadata_cache_factory: Arc<dyn MetadataCacheFactory>,
         parquet_metadata_cache: Arc<dyn CubestoreParquetMetadataCache>,
         memory_handler: Arc<dyn MemoryHandler>,
-        push_partial_aggregate_below_merge: bool,
+        config: Arc<dyn ConfigObj>,
     ) -> Arc<Self> {
         Arc::new(QueryExecutorImpl {
             metadata_cache_factory,
             parquet_metadata_cache,
             memory_handler,
-            push_partial_aggregate_below_merge,
+            config,
         })
     }
 
@@ -603,7 +603,8 @@ impl QueryExecutorImpl {
         vec![
             // Cube rules
             Arc::new(PreOptimizeRule::new(
-                self.push_partial_aggregate_below_merge,
+                self.config.push_partial_aggregate_below_merge_enabled(),
+                self.config.group_by_limit_factor(),
             )),
             // DF rules without EnforceDistribution.  We do need to keep EnforceSorting.
             Arc::new(OutputRequirements::new_add_mode()),

From 661f8f779914ac5473a329f4e06db8e55719811b Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 18:06:52 +0200
Subject: [PATCH 02/16] feat(cubestore): engage group-by-limit trim at
 distributed execution via worker_sort_and_limit

The rewriter only plants GroupByLimitAggregate during ROUTER planning, but the
worker re-plans physical from logical independently, so the trim never reached
execution. Instead of a new proto, reuse the existing worker_sort_and_limit
carry (already serialized across ClusterSend): when group_by_limit_factor > 0,
resort_worker_subtree now wraps the worker partial hash aggregate in
GroupByLimitAggregateExec (trim during aggregation, bounded O(factor*k) memory)
instead of a SortExec over the full partial; the Sort above still orders the
<= k surviving groups so the router's sort-preserving merge stays correct.
group_by_limit_factor (CUBESTORE_GROUP_BY_LIMIT_FACTOR, default 2) is threaded
router->worker via CubeQueryPlanner; 0 keeps the previous sort-then-trim.

Verified on the production-dump stend: with factor>0 the worker EXPLAIN ANALYZE
shows GroupByLimitAggregate (was LinearPartialAggregate) and results are
identical. Value is bounded hash-table memory (OOM avoidance) on
high-cardinality group-bys, not speed on small inputs.

WIP: only fires when worker_sort_and_limit fires, i.e. single-table
`ORDER BY <group subset> LIMIT k`. UNION-of-tables (the prod query.sql shape)
does not populate worker_sort_and_limit yet (separate ctx-propagation gap).

(cherry picked from commit f6d09954a8af1afaf30fe041b9f2d1c8bc64a54a)
---
 .../distributed_partial_aggregate.rs          | 52 +++++++++++++++++--
 .../src/queryplanner/optimizations/mod.rs     | 11 +++-
 .../src/queryplanner/query_executor.rs        |  2 +
 3 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
index 5246b1878f132..cde5f026e8e2f 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
@@ -1,5 +1,6 @@
 use crate::cluster::WorkerPlanningParams;
 use crate::queryplanner::check_memory::CheckMemoryExec;
+use crate::queryplanner::group_by_limit_aggregate::GroupByLimitAggregateExec;
 use crate::queryplanner::inline_aggregate::{InlineAggregateExec, InlineAggregateMode};
 use crate::queryplanner::planning::WorkerExec;
 use crate::queryplanner::query_executor::ClusterSendExec;
@@ -286,13 +287,15 @@ pub fn push_sorted_partial_aggregate_below_merge(
 /// key) would drop partial states of tied groups.
 pub fn push_worker_sort_and_limit(
     p: Arc<dyn ExecutionPlan>,
+    group_by_limit_factor: usize,
 ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
     // Worker side: wrap the partial aggregate with a per-partition bounded sort.
     if let Some(w) = p.as_any().downcast_ref::<WorkerExec>() {
         let Some((cols, fetch)) = w.worker_sort_and_limit.clone() else {
             return Ok(p);
         };
-        let Some(new_input) = resort_worker_subtree(&w.input, &cols, fetch) else {
+        let Some(new_input) = resort_worker_subtree(&w.input, &cols, fetch, group_by_limit_factor)
+        else {
             return Ok(p);
         };
         return Ok(Arc::new(WorkerExec::new(
@@ -324,8 +327,12 @@ pub fn push_worker_sort_and_limit(
     let Some((cols, fetch)) = cs.worker_sort_and_limit.clone() else {
         return Ok(p);
     };
-    let Some(new_worker_subtree) = resort_worker_subtree(&cs.input_for_optimizations, &cols, fetch)
-    else {
+    let Some(new_worker_subtree) = resort_worker_subtree(
+        &cs.input_for_optimizations,
+        &cols,
+        fetch,
+        group_by_limit_factor,
+    ) else {
         return Ok(p);
     };
     let new_cs: Arc<dyn ExecutionPlan> =
@@ -377,6 +384,7 @@ fn resort_worker_subtree(
     worker_subtree: &Arc<dyn ExecutionPlan>,
     cols: &[(usize, bool, bool)],
     fetch: usize,
+    group_by_limit_factor: usize,
 ) -> Option<Arc<dyn ExecutionPlan>> {
     let partial = locate_partial_aggregate(worker_subtree)?;
     let schema = partial.schema();
@@ -392,8 +400,44 @@ fn resort_worker_subtree(
         });
     }
     let worker_order = LexOrdering::new(exprs);
+
+    // Trim during aggregation: replace the partial hash aggregate with a GroupByLimitAggregateExec
+    // so it keeps only the top-k groups by `cols` instead of materializing every group and letting
+    // the Sort below trim afterwards. The Sort above still sorts the (now <= k) groups so the
+    // router's sort-preserving merge stays correct. `group_by_limit_factor == 0` disables this and
+    // falls back to the plain sort-then-trim.
+    let trimmed: Arc<dyn ExecutionPlan> = if group_by_limit_factor > 0 {
+        if let Some(agg) = partial.as_any().downcast_ref::<AggregateExec>() {
+            let order: Vec<(usize, SortOptions)> = cols
+                .iter()
+                .map(|(idx, asc, nulls_first)| {
+                    (
+                        *idx,
+                        SortOptions {
+                            descending: !asc,
+                            nulls_first: *nulls_first,
+                        },
+                    )
+                })
+                .collect();
+            match GroupByLimitAggregateExec::try_new_from_partial(
+                agg,
+                fetch,
+                group_by_limit_factor,
+                order,
+            ) {
+                Some(e) => Arc::new(e) as Arc<dyn ExecutionPlan>,
+                None => partial,
+            }
+        } else {
+            partial
+        }
+    } else {
+        partial
+    };
+
     let per_partition_sort: Arc<dyn ExecutionPlan> = Arc::new(
-        SortExec::new(worker_order.clone(), partial)
+        SortExec::new(worker_order.clone(), trimmed)
             .with_fetch(Some(fetch))
             .with_preserve_partitioning(true),
     );
diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
index 42b24d252d899..2df406a1a62af 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
@@ -45,6 +45,7 @@ pub struct CubeQueryPlanner {
     serialized_plan: Arc<PreSerializedPlan>,
     memory_handler: Arc<dyn MemoryHandler>,
     data_loaded_size: Option<Arc<DataLoadedSize>>,
+    group_by_limit_factor: usize,
 }
 
 impl CubeQueryPlanner {
@@ -52,6 +53,7 @@ impl CubeQueryPlanner {
         cluster: Arc<dyn Cluster>,
         serialized_plan: Arc<PreSerializedPlan>,
         memory_handler: Arc<dyn MemoryHandler>,
+        group_by_limit_factor: usize,
     ) -> CubeQueryPlanner {
         CubeQueryPlanner {
             cluster: Some(cluster),
@@ -59,6 +61,7 @@ impl CubeQueryPlanner {
             serialized_plan,
             memory_handler,
             data_loaded_size: None,
+            group_by_limit_factor,
         }
     }
 
@@ -67,6 +70,7 @@ impl CubeQueryPlanner {
         worker_planning_params: WorkerPlanningParams,
         memory_handler: Arc<dyn MemoryHandler>,
         data_loaded_size: Option<Arc<DataLoadedSize>>,
+        group_by_limit_factor: usize,
     ) -> CubeQueryPlanner {
         CubeQueryPlanner {
             serialized_plan,
@@ -74,6 +78,7 @@ impl CubeQueryPlanner {
             worker_partition_count: Some(worker_planning_params),
             memory_handler,
             data_loaded_size,
+            group_by_limit_factor,
         }
     }
 }
@@ -106,6 +111,7 @@ impl QueryPlanner for CubeQueryPlanner {
             self.memory_handler.clone(),
             self.data_loaded_size.clone(),
             ctx_state.config().options(),
+            self.group_by_limit_factor,
         );
         result
     }
@@ -187,6 +193,7 @@ fn finalize_physical_plan(
     memory_handler: Arc<dyn MemoryHandler>,
     data_loaded_size: Option<Arc<DataLoadedSize>>,
     config: &ConfigOptions,
+    group_by_limit_factor: usize,
 ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
     let p = rewrite_physical_plan(p, &mut |p| add_check_memory_exec(p, memory_handler.clone()))?;
     log::trace!(
@@ -215,7 +222,9 @@ fn finalize_physical_plan(
     // Last: bound worker memory for ORDER BY <group cols> LIMIT that isn't an index prefix. Runs
     // after replace_suboptimal_merge_sorts so it doesn't push the query's row limit into the
     // worker merge we add (which would cut uncombined partial rows and undercount).
-    let p = rewrite_physical_plan(p, &mut |p| push_worker_sort_and_limit(p))?;
+    let p = rewrite_physical_plan(p, &mut |p| {
+        push_worker_sort_and_limit(p, group_by_limit_factor)
+    })?;
     log::trace!(
         "Rewrote physical plan by push_worker_sort_and_limit:\n{}",
         pp_phys_plan_ext(p.as_ref(), &PPOptions::show_nonmeta())
diff --git a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
index 44914e5217b6c..6025288706314 100644
--- a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
@@ -567,6 +567,7 @@ impl QueryExecutorImpl {
             cluster,
             serialized_plan,
             self.memory_handler.clone(),
+            self.config.group_by_limit_factor(),
         ))
     }
 
@@ -582,6 +583,7 @@ impl QueryExecutorImpl {
             worker_planning_params,
             self.memory_handler.clone(),
             data_loaded_size.clone(),
+            self.config.group_by_limit_factor(),
         ))
     }
 

From 67a54ca0b9cad16159e190858ce2531799adbe63 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 19:57:00 +0200
Subject: [PATCH 03/16] feat(cubestore): extend group-by-limit pushdown to bare
 LIMIT and through UNION

The worker group-by-limit trim previously required an ORDER BY and was dropped
when the per-branch ClusterSends of a UNION were merged. Now:

- compute_worker_sort_and_limit handles a bare LIMIT (no ORDER BY): the total
  order is the full group key in group-by order, so "any n" becomes "the n
  smallest by group key" -- a valid deterministic choice. An ORDER BY prefix,
  when present, still sorts first.
- pull_up_cluster_send preserves worker_sort_and_limit across a UNION (the same
  group-by/limit context descends to every branch, so the descriptors are
  positionally identical and stay valid over the union); kept only when all
  branches agree.

Adds planning coverage (bare LIMIT, UNION ALL + bare LIMIT) and a bare-LIMIT
execution assertion to topk_hash_aggregate_trim; excludes that test from the
migration harness (new test, no recorded fixture).

(cherry picked from commit 65420c6ed559bef8ce9458e49a87c7c8ae5d00e6)
---
 .../cubestore-sql-tests/src/tests.rs          | 49 +++++++++++++++++++
 .../cubestore/src/queryplanner/planning.rs    | 38 ++++++++++----
 2 files changed, 78 insertions(+), 9 deletions(-)

diff --git a/rust/cubestore/cubestore-sql-tests/src/tests.rs b/rust/cubestore/cubestore-sql-tests/src/tests.rs
index 71ed3ddd05f2a..6a5a8910528b3 100644
--- a/rust/cubestore/cubestore-sql-tests/src/tests.rs
+++ b/rust/cubestore/cubestore-sql-tests/src/tests.rs
@@ -428,6 +428,7 @@ lazy_static::lazy_static! {
         "limit_pushdown_group_nonprefix_order",
         "prefilter_chunks_shared_scan",
         "planning_topk_hash_aggregate",
+        "topk_hash_aggregate_trim",
     ].into_iter().map(ToOwned::to_owned).collect();
 }
 
@@ -3285,6 +3286,37 @@ async fn planning_topk_hash_aggregate(service: Box<dyn SqlClient>) -> Result<(),
         router_pp
     );
 
+    // Bare LIMIT (no ORDER BY) on a non-indexed group column: the limit can't ride the index, so the
+    // worker still trims to the smallest groups by the full group key -- "any k" made deterministic.
+    let p = service
+        .plan_query("SELECT day, SUM(hits) FROM s.Data GROUP BY 1 LIMIT 10")
+        .await?;
+    let pp = pp_phys_plan_ext(p.worker.as_ref(), &PPOptions::none());
+    assert!(
+        pp.contains("GroupByLimitAggregate, k: 10, factor: 2,")
+            && pp.contains("(0, SortOptions { descending: false, nulls_first: false })"),
+        "expected GroupByLimitAggregate on a bare LIMIT, got:\n{}",
+        pp
+    );
+
+    // UNION ALL + bare LIMIT: the per-branch trim descriptor must survive the cluster-send pull-up
+    // over the union so the worker still trims above the union.
+    service
+        .exec_query("CREATE TABLE s.Data2(url text, day int, hits int)")
+        .await?;
+    let p = service
+        .plan_query(
+            "SELECT day, SUM(hits) FROM \
+             (SELECT * FROM s.Data UNION ALL SELECT * FROM s.Data2) u GROUP BY 1 LIMIT 10",
+        )
+        .await?;
+    let pp = pp_phys_plan_ext(p.worker.as_ref(), &PPOptions::none());
+    assert!(
+        pp.contains("GroupByLimitAggregate, k: 10, factor: 2,") && pp.contains("Union"),
+        "expected GroupByLimitAggregate over the Union, got:\n{}",
+        pp
+    );
+
     Ok(())
 }
 
@@ -3341,6 +3373,23 @@ async fn topk_hash_aggregate_trim(service: Box<dyn SqlClient>) -> Result<(), Cub
         );
     }
 
+    // Bare LIMIT 3 (no ORDER BY): the trim orders by the full group key, so "any 3" resolves to the
+    // 3 smallest by (a, b). The result order is unspecified, but the group SET and each group's full
+    // sum must be exact -- the latter guards against undercounting a group split across workers.
+    let r = service
+        .exec_query("SELECT a, b, SUM(hits) FROM s.Data GROUP BY 1, 2 LIMIT 3")
+        .await?;
+    let got = to_rows(&r);
+    assert_eq!(got.len(), 3, "expected 3 rows, got: {:?}", got);
+    for expected in rows(&[(1, 1, 15), (1, 2, 3), (2, 1, 10)]) {
+        assert!(
+            got.contains(&expected),
+            "missing {:?} in {:?}",
+            expected,
+            got
+        );
+    }
+
     Ok(())
 }
 
diff --git a/rust/cubestore/cubestore/src/queryplanner/planning.rs b/rust/cubestore/cubestore/src/queryplanner/planning.rs
index 9c9cd7b352df6..a862554ea1725 100644
--- a/rust/cubestore/cubestore/src/queryplanner/planning.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/planning.rs
@@ -1143,19 +1143,22 @@ impl ChooseIndex<'_> {
             return None;
         }
         let limit = ctx.limit?;
-        let sort = ctx.sort.as_ref().filter(|s| !s.is_empty())?;
         let group_by = ctx.group_by.as_ref().filter(|g| !g.is_empty())?;
 
-        // Every ORDER BY column must be a group-by column; map it to its group-key position.
+        // Every ORDER BY column must be a group-by column; map it to its group-key position. A bare
+        // LIMIT (no ORDER BY) leaves the prefix empty, so the total order is the full group key in
+        // group-by order -- "any n groups" becomes "the n smallest by group key", still valid.
         let mut cols: Vec<(usize, bool, bool)> = Vec::with_capacity(group_by.len());
         let mut used = vec![false; group_by.len()];
-        for name in sort {
-            let idx = group_by.iter().position(|g| g == name)?;
-            if used[idx] {
-                continue;
+        if let Some(sort) = ctx.sort.as_ref().filter(|s| !s.is_empty()) {
+            for name in sort {
+                let idx = group_by.iter().position(|g| g == name)?;
+                if used[idx] {
+                    continue;
+                }
+                used[idx] = true;
+                cols.push((idx, ctx.sort_is_asc, !ctx.sort_is_asc));
             }
-            used[idx] = true;
-            cols.push((idx, ctx.sort_is_asc, !ctx.sort_is_asc));
         }
         // Extend with the remaining group keys to make it a total order on the full group key.
         for (idx, is_used) in used.iter().enumerate() {
@@ -1833,6 +1836,7 @@ fn pull_up_cluster_send(mut p: LogicalPlan) -> Result<LogicalPlan, DataFusionErr
             }
             let mut union_snapshots = Vec::new();
             let mut limits = Vec::new();
+            let mut worker_sort_and_limits = Vec::new();
             let mut id = 0;
             for i in inputs.into_iter() {
                 let send;
@@ -1848,6 +1852,7 @@ fn pull_up_cluster_send(mut p: LogicalPlan) -> Result<LogicalPlan, DataFusionErr
                 }
                 union_snapshots.extend(send.snapshots.concat());
                 limits.push(send.limit_and_reverse);
+                worker_sort_and_limits.push(send.worker_sort_and_limit.clone());
                 *i = send.input.clone();
             }
             let limit = if limits.is_empty() || limits.iter().any(|l| l.is_none()) {
@@ -1857,8 +1862,23 @@ fn pull_up_cluster_send(mut p: LogicalPlan) -> Result<LogicalPlan, DataFusionErr
             } else {
                 limits[0]
             };
+            // The same group-by/limit context descends to every UNION branch, so the per-branch
+            // descriptors are identical and stay valid over the union (same schema, group columns by
+            // position). Keep it only when all branches agree; otherwise drop the pushdown.
+            let worker_sort_and_limit = if worker_sort_and_limits.is_empty()
+                || worker_sort_and_limits.iter().any(|w| w.is_none())
+                || worker_sort_and_limits
+                    .iter()
+                    .any(|w| w != &worker_sort_and_limits[0])
+            {
+                None
+            } else {
+                worker_sort_and_limits[0].clone()
+            };
             snapshots = vec![union_snapshots];
-            return Ok(ClusterSendNode::new(id, Arc::new(p), snapshots, limit).into_plan());
+            let mut new_send = ClusterSendNode::new(id, Arc::new(p), snapshots, limit);
+            new_send.worker_sort_and_limit = worker_sort_and_limit;
+            return Ok(new_send.into_plan());
         }
         LogicalPlan::Join(Join { left, right, .. }) => {
             let lsend;

From d38add159d17177116a630a6a6537378b4e0f39f Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 22:13:46 +0200
Subject: [PATCH 04/16] feat(cubestore): parallel hash-final worker top-k
 (experimental, flag-gated)

Behind CUBESTORE_GROUP_BY_LIMIT_HASH_FINAL (default off, gated on
group_by_limit_factor > 0): route the worker top-k through a router-side hash
final aggregate instead of the sorted-merge framework.

- Worker subtree: CoalescePartitions <- GroupByLimitAggregate (drops the
  per-partition SortExec and SortPreservingMerge; emits the trimmed top-k
  unsorted). CoalescePartitions spawns a task per input partition, so the
  per-union-branch aggregates run in parallel -- the sorted merge drained them
  on a single task.
- Router: SortExec(T, fetch=k) <- AggregateExec(Final, hash) <-
  CoalescePartitions <- ClusterSend. The explicit top-k sort by the full total
  order T is required even for a bare LIMIT: it keeps exactly the groups every
  worker kept (fully combined here), where a plain limit could take an
  undercounted group only one worker retained.

On the production-dump UNION query (GROUP BY 6 cols, LIMIT 10): ~2.1x faster
(~3.1s -> ~1.5s) at neutral peak RSS, because the worker now uses ~2 cores
instead of 1. Same result as the sorted path (ORDER BY and bare LIMIT verified).

The flag must be set fleet-wide: worker and router re-plan independently, and a
worker-on/router-off split would feed unsorted streams into a sorted merge.

(cherry picked from commit 4c9d5a6462ad1284ca86de3b1fc1ff4049dab0f7)
---
 .../distributed_partial_aggregate.rs          | 78 +++++++++++++++----
 1 file changed, 62 insertions(+), 16 deletions(-)

diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
index cde5f026e8e2f..c810ac11558f0 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
@@ -338,6 +338,27 @@ pub fn push_worker_sort_and_limit(
     let new_cs: Arc<dyn ExecutionPlan> =
         Arc::new(cs.with_changed_schema(new_worker_subtree, cs.required_input_ordering.clone()));
     let worker_order = worker_ordering(&final_agg.group_expr, &cols)?;
+
+    // Hash-final variant: combine the worker top-k with a hash final aggregate over coalesced
+    // (unordered) streams, then re-apply the top-k sort by the total order. The Sort(fetch) is
+    // required even for a bare LIMIT: it picks the k smallest by the total order, which are exactly
+    // the groups every worker kept (and therefore fully combined here); a plain limit could take a
+    // group only one worker kept (undercounted).
+    if group_by_limit_factor > 0 && hash_final_enabled() {
+        let coalesced: Arc<dyn ExecutionPlan> = Arc::new(CoalescePartitionsExec::new(new_cs));
+        let final_hash: Arc<dyn ExecutionPlan> = Arc::new(AggregateExec::try_new(
+            AggregateMode::Final,
+            final_agg.group_expr,
+            final_agg.aggr_expr,
+            final_agg.filter_expr,
+            coalesced,
+            final_agg.input_schema,
+        )?);
+        return Ok(Arc::new(
+            SortExec::new(worker_order, final_hash).with_fetch(Some(fetch)),
+        ));
+    }
+
     let merged: Arc<dyn ExecutionPlan> =
         Arc::new(SortPreservingMergeExec::new(worker_order, new_cs));
     Ok(Arc::new(AggregateExec::try_new(
@@ -387,25 +408,10 @@ fn resort_worker_subtree(
     group_by_limit_factor: usize,
 ) -> Option<Arc<dyn ExecutionPlan>> {
     let partial = locate_partial_aggregate(worker_subtree)?;
-    let schema = partial.schema();
-    let mut exprs = Vec::with_capacity(cols.len());
-    for (idx, asc, nulls_first) in cols {
-        let field = schema.fields().get(*idx)?;
-        exprs.push(PhysicalSortExpr {
-            expr: Arc::new(Column::new(field.name(), *idx)),
-            options: SortOptions {
-                descending: !asc,
-                nulls_first: *nulls_first,
-            },
-        });
-    }
-    let worker_order = LexOrdering::new(exprs);
 
     // Trim during aggregation: replace the partial hash aggregate with a GroupByLimitAggregateExec
     // so it keeps only the top-k groups by `cols` instead of materializing every group and letting
-    // the Sort below trim afterwards. The Sort above still sorts the (now <= k) groups so the
-    // router's sort-preserving merge stays correct. `group_by_limit_factor == 0` disables this and
-    // falls back to the plain sort-then-trim.
+    // the Sort below trim afterwards. `group_by_limit_factor == 0` disables this.
     let trimmed: Arc<dyn ExecutionPlan> = if group_by_limit_factor > 0 {
         if let Some(agg) = partial.as_any().downcast_ref::<AggregateExec>() {
             let order: Vec<(usize, SortOptions)> = cols
@@ -436,6 +442,33 @@ fn resort_worker_subtree(
         partial
     };
 
+    // Hash-final variant (CUBESTORE_GROUP_BY_LIMIT_HASH_FINAL): emit the trimmed top-k unsorted,
+    // coalesced to one stream. The router hash-combines and re-applies the top-k sort, so no
+    // worker-side sort/merge holds the whole result, and the aggregates run in parallel
+    // (CoalescePartitions spawns a task per input). Gated on `group_by_limit_factor > 0`; when
+    // `worker_sort_and_limit` is set the partial is the hash `AggregateExec` so the trim engages and
+    // the coalesced output is <= partitions*k rows. (If the trim ever fell back to the untrimmed
+    // partial -- e.g. grouping sets -- the router hash final would size to all groups: a memory
+    // regression, not a correctness bug.)
+    if group_by_limit_factor > 0 && hash_final_enabled() {
+        return Some(Arc::new(CoalescePartitionsExec::new(trimmed)));
+    }
+
+    // Sorted path: per-partition bounded sort, then a sort-preserving merge, so the router's sorted
+    // final aggregate sees adjacent equal keys and its limit cuts on the total order.
+    let schema = trimmed.schema();
+    let mut exprs = Vec::with_capacity(cols.len());
+    for (idx, asc, nulls_first) in cols {
+        let field = schema.fields().get(*idx)?;
+        exprs.push(PhysicalSortExpr {
+            expr: Arc::new(Column::new(field.name(), *idx)),
+            options: SortOptions {
+                descending: !asc,
+                nulls_first: *nulls_first,
+            },
+        });
+    }
+    let worker_order = LexOrdering::new(exprs);
     let per_partition_sort: Arc<dyn ExecutionPlan> = Arc::new(
         SortExec::new(worker_order.clone(), trimmed)
             .with_fetch(Some(fetch))
@@ -447,6 +480,19 @@ fn resort_worker_subtree(
     )))
 }
 
+/// Experiment toggle: route the worker top-k through a router-side hash final aggregate instead of
+/// the sorted-merge framework.
+///
+/// Must be set consistently across the whole fleet: the worker and router re-plan in separate
+/// processes, each reading this flag. A worker emitting unsorted top-k (flag on) into a router that
+/// still expects the sorted-merge input (flag off) would silently undercount. Set it via deployment
+/// env so every node agrees.
+fn hash_final_enabled() -> bool {
+    std::env::var("CUBESTORE_GROUP_BY_LIMIT_HASH_FINAL")
+        .map(|v| v == "true" || v == "1")
+        .unwrap_or(false)
+}
+
 /// The group/aggregate state of either an `InlineAggregateExec` or a plain `AggregateExec`, when in
 /// Final mode.
 struct FinalAggregateInfo {

From 48551a3ee35560e854293f7084a276806377530c Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 22:44:32 +0200
Subject: [PATCH 05/16] feat(cubestore): per-partition N-way top-k aggregate
 (experimental, flag-gated)

Behind CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION (default off; only active with
CUBESTORE_GROUP_BY_LIMIT_HASH_FINAL and group_by_limit_factor > 0): strip the
CoalescePartitions below the trimmed worker aggregate so it runs over every raw
CubeTableExec partition instead of one stream per union branch. The hash-final
CoalescePartitions on top then parallelizes all partitions.

Per-partition top-k stays complete by the same total-order argument as the
per-worker cut: a group in the global top-k by T has fewer than k smaller-keyed
groups globally, hence in any single partition, so it survives every partition's
local top-k and reaches the router fully combined. The strip only removes
CoalescePartitions, never the SortPreservingMerge that feeds the single-partition
LastRowByUniqueKeyExec, so unique-key tables stay correct.

On the production-dump UNION query: ~799ms vs ~1504ms (hash-final 3-way) and
~2159ms (master), ~9 cores at peak, peak RSS still neutral because the
index-sorted partitions are key-local (small per-partition tables). Memory is
bounded by partition count, so a group key spread across partitions would cost
~N x; gated off by default.

(cherry picked from commit d43016768886f13b8ad036ceae37c7cb0578ecf8)
---
 .../distributed_partial_aggregate.rs          | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
index c810ac11558f0..9fa8df19dcd29 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
@@ -409,6 +409,22 @@ fn resort_worker_subtree(
 ) -> Option<Arc<dyn ExecutionPlan>> {
     let partial = locate_partial_aggregate(worker_subtree)?;
 
+    // N-way experiment: drop the per-partition merge below the aggregate so it runs per partition;
+    // the hash-final CoalescePartitions then parallelizes all partitions. Only with hash-final (the
+    // sorted path's SortPreservingMerge would not parallelize the extra partitions anyway).
+    let partial = if group_by_limit_factor > 0 && hash_final_enabled() && per_partition_enabled() {
+        partial
+            .as_any()
+            .downcast_ref::<AggregateExec>()
+            .and_then(|agg| {
+                let new_input = strip_coalesce_partitions(agg.input()).ok()?;
+                partial.clone().with_new_children(vec![new_input]).ok()
+            })
+            .unwrap_or(partial)
+    } else {
+        partial
+    };
+
     // Trim during aggregation: replace the partial hash aggregate with a GroupByLimitAggregateExec
     // so it keeps only the top-k groups by `cols` instead of materializing every group and letting
     // the Sort below trim afterwards. `group_by_limit_factor == 0` disables this.
@@ -493,6 +509,39 @@ fn hash_final_enabled() -> bool {
         .unwrap_or(false)
 }
 
+/// Experiment toggle (only meaningful with [hash_final_enabled]): drop the per-partition merge
+/// below the trimmed aggregate so it runs per partition (N-way) instead of per union branch. The
+/// hash-final `CoalescePartitions` then parallelizes all partitions.
+///
+/// Trades memory for parallelism: one group table per DF partition (= per parquet file and chunk),
+/// so peak memory is bounded by the partition count, not by k. It stays low only while partitions
+/// are key-local (group keys do not span partitions) -- true for index-sorted pre-aggregations. On
+/// a group key spread across many partitions each table approaches full cardinality (~N x memory).
+fn per_partition_enabled() -> bool {
+    std::env::var("CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION")
+        .map(|v| v == "true" || v == "1")
+        .unwrap_or(false)
+}
+
+/// Recursively drop `CoalescePartitionsExec` nodes, exposing the underlying multi-partition streams
+/// to the parent.
+fn strip_coalesce_partitions(
+    p: &Arc<dyn ExecutionPlan>,
+) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
+    if p.as_any().is::<CoalescePartitionsExec>() {
+        return strip_coalesce_partitions(&p.children()[0].clone());
+    }
+    let children = p.children();
+    if children.is_empty() {
+        return Ok(p.clone());
+    }
+    let new_children = children
+        .iter()
+        .map(|c| strip_coalesce_partitions(c))
+        .collect::<Result<Vec<_>, _>>()?;
+    p.clone().with_new_children(new_children)
+}
+
 /// The group/aggregate state of either an `InlineAggregateExec` or a plain `AggregateExec`, when in
 /// Final mode.
 struct FinalAggregateInfo {

From a74a4f2c96893da776caff10ffe3175863a0712a Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Fri, 19 Jun 2026 22:51:59 +0200
Subject: [PATCH 06/16] refactor(cubestore): drop sorted worker top-k,
 hash-final is the only path

The sorted-merge worker top-k (per-partition SortExec + SortPreservingMerge ->
router SortedFinalAggregate) drained the worker partitions on a single task, so
it ran on one core and was slower than master, which keeps a parallel
CoalescePartitions over the partial aggregate. The hash-final path (worker
CoalescePartitions -> router hash Final + Sort(T, fetch=k)) is strictly better:
same memory, ~2x faster because CoalescePartitions parallelizes the per-partition
aggregates. So remove the sorted variant and the CUBESTORE_GROUP_BY_LIMIT_HASH_FINAL
flag entirely; hash-final is now how the trimmed top-k is always combined.

Two knobs remain:
- group_by_limit_factor (env CUBESTORE_GROUP_BY_LIMIT_FACTOR, >0): whether to use
  the trimming aggregate at all. 0 leaves the plan untouched (master behavior).
- CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION: whether to push it below the merge
  (per-partition N-way) -- still experimental, default off.

push_worker_sort_and_limit now returns early when factor == 0. Existing
planning/execution/limit_pushdown/union tests pass against the hash-final plan.

(cherry picked from commit e52d56ab34bd50f2684ab4a6362ae3bf7023e01a)
---
 .../distributed_partial_aggregate.rs          | 134 +++++-------------
 1 file changed, 38 insertions(+), 96 deletions(-)

diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
index 9fa8df19dcd29..970de53ba5ecf 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
@@ -289,7 +289,13 @@ pub fn push_worker_sort_and_limit(
     p: Arc<dyn ExecutionPlan>,
     group_by_limit_factor: usize,
 ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
-    // Worker side: wrap the partial aggregate with a per-partition bounded sort.
+    // The worker top-k engages only when the trimming aggregate is enabled (factor > 0); otherwise
+    // leave the plan as planned (no trim).
+    if group_by_limit_factor == 0 {
+        return Ok(p);
+    }
+
+    // Worker side: replace the partial aggregate's merge with the trimmed top-k.
     if let Some(w) = p.as_any().downcast_ref::<WorkerExec>() {
         let Some((cols, fetch)) = w.worker_sort_and_limit.clone() else {
             return Ok(p);
@@ -310,10 +316,8 @@ pub fn push_worker_sort_and_limit(
         )));
     }
 
-    // Router side: rebuild the final aggregate over a sort-preserving merge in `worker_order`, and
-    // reorder the (optimization-only) worker subtree to match. Same keys are adjacent in
-    // `worker_order`, so the sorted final combines them; its output is `worker_order`-sorted (whose
-    // prefix is the query's ORDER BY), so the limit above stays correct.
+    // Router side: combine the workers' top-k with a hash final aggregate, then re-apply the top-k
+    // sort by `worker_order` (the total order T).
     let Some(final_agg) = FinalAggregateInfo::extract(&p) else {
         return Ok(p);
     };
@@ -339,36 +343,23 @@ pub fn push_worker_sort_and_limit(
         Arc::new(cs.with_changed_schema(new_worker_subtree, cs.required_input_ordering.clone()));
     let worker_order = worker_ordering(&final_agg.group_expr, &cols)?;
 
-    // Hash-final variant: combine the worker top-k with a hash final aggregate over coalesced
-    // (unordered) streams, then re-apply the top-k sort by the total order. The Sort(fetch) is
-    // required even for a bare LIMIT: it picks the k smallest by the total order, which are exactly
-    // the groups every worker kept (and therefore fully combined here); a plain limit could take a
-    // group only one worker kept (undercounted).
-    if group_by_limit_factor > 0 && hash_final_enabled() {
-        let coalesced: Arc<dyn ExecutionPlan> = Arc::new(CoalescePartitionsExec::new(new_cs));
-        let final_hash: Arc<dyn ExecutionPlan> = Arc::new(AggregateExec::try_new(
-            AggregateMode::Final,
-            final_agg.group_expr,
-            final_agg.aggr_expr,
-            final_agg.filter_expr,
-            coalesced,
-            final_agg.input_schema,
-        )?);
-        return Ok(Arc::new(
-            SortExec::new(worker_order, final_hash).with_fetch(Some(fetch)),
-        ));
-    }
-
-    let merged: Arc<dyn ExecutionPlan> =
-        Arc::new(SortPreservingMergeExec::new(worker_order, new_cs));
-    Ok(Arc::new(AggregateExec::try_new(
+    // Combine the worker top-k with a hash final aggregate over coalesced (unordered) streams, then
+    // re-apply the top-k sort by the total order T. The Sort(fetch) is required even for a bare
+    // LIMIT: it keeps the k smallest by T -- exactly the groups every worker kept and fully combined
+    // here -- where a plain limit could take a group only one worker kept (undercounted). The
+    // coalesce also drains the workers' streams in parallel.
+    let coalesced: Arc<dyn ExecutionPlan> = Arc::new(CoalescePartitionsExec::new(new_cs));
+    let final_hash: Arc<dyn ExecutionPlan> = Arc::new(AggregateExec::try_new(
         AggregateMode::Final,
         final_agg.group_expr,
         final_agg.aggr_expr,
         final_agg.filter_expr,
-        merged,
+        coalesced,
         final_agg.input_schema,
-    )?))
+    )?);
+    Ok(Arc::new(
+        SortExec::new(worker_order, final_hash).with_fetch(Some(fetch)),
+    ))
 }
 
 /// Builds the `worker_order` LexOrdering over an aggregate's group columns from the descriptor.
@@ -393,14 +384,9 @@ fn worker_ordering(
     Ok(LexOrdering::new(exprs))
 }
 
-/// Rebuilds a worker subtree as `SortPreservingMerge(worker_order) <- Sort(worker_order, fetch, per
-/// partition) <- partial`. Returns `None` for an unrecognized or already-rewritten subtree, which
-/// keeps [push_worker_sort_and_limit] idempotent.
-///
-/// The per-partition `Sort` does the bounding (a bounded heap, O(fetch) memory); the merge above it
-/// carries no fetch. Because this pass runs last, `replace_suboptimal_merge_sorts` has already run
-/// and won't push the query's row limit into the merge -- which would cut the merged stream of
-/// (still uncombined) partial rows by rows and undercount groups split across partitions.
+/// Rebuilds a worker subtree as `CoalescePartitions <- GroupByLimitAggregate <- ...`: the trimmed
+/// top-k is emitted unsorted and coalesced to one stream for the router's hash final aggregate.
+/// Returns `None` for an unrecognized subtree (no locatable partial aggregate).
 fn resort_worker_subtree(
     worker_subtree: &Arc<dyn ExecutionPlan>,
     cols: &[(usize, bool, bool)],
@@ -409,10 +395,10 @@ fn resort_worker_subtree(
 ) -> Option<Arc<dyn ExecutionPlan>> {
     let partial = locate_partial_aggregate(worker_subtree)?;
 
-    // N-way experiment: drop the per-partition merge below the aggregate so it runs per partition;
-    // the hash-final CoalescePartitions then parallelizes all partitions. Only with hash-final (the
-    // sorted path's SortPreservingMerge would not parallelize the extra partitions anyway).
-    let partial = if group_by_limit_factor > 0 && hash_final_enabled() && per_partition_enabled() {
+    // Per-partition (CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION): drop the merge below the aggregate so
+    // it runs over every raw partition; the CoalescePartitions below then parallelizes all of them
+    // (N-way) instead of one stream per union branch.
+    let partial = if per_partition_enabled() {
         partial
             .as_any()
             .downcast_ref::<AggregateExec>()
@@ -426,8 +412,8 @@ fn resort_worker_subtree(
     };
 
     // Trim during aggregation: replace the partial hash aggregate with a GroupByLimitAggregateExec
-    // so it keeps only the top-k groups by `cols` instead of materializing every group and letting
-    // the Sort below trim afterwards. `group_by_limit_factor == 0` disables this.
+    // keeping only the top-k groups by `cols`. `group_by_limit_factor` is the trim threshold; the
+    // caller only reaches here with it > 0.
     let trimmed: Arc<dyn ExecutionPlan> = if group_by_limit_factor > 0 {
         if let Some(agg) = partial.as_any().downcast_ref::<AggregateExec>() {
             let order: Vec<(usize, SortOptions)> = cols
@@ -458,60 +444,16 @@ fn resort_worker_subtree(
         partial
     };
 
-    // Hash-final variant (CUBESTORE_GROUP_BY_LIMIT_HASH_FINAL): emit the trimmed top-k unsorted,
-    // coalesced to one stream. The router hash-combines and re-applies the top-k sort, so no
-    // worker-side sort/merge holds the whole result, and the aggregates run in parallel
-    // (CoalescePartitions spawns a task per input). Gated on `group_by_limit_factor > 0`; when
-    // `worker_sort_and_limit` is set the partial is the hash `AggregateExec` so the trim engages and
-    // the coalesced output is <= partitions*k rows. (If the trim ever fell back to the untrimmed
-    // partial -- e.g. grouping sets -- the router hash final would size to all groups: a memory
-    // regression, not a correctness bug.)
-    if group_by_limit_factor > 0 && hash_final_enabled() {
-        return Some(Arc::new(CoalescePartitionsExec::new(trimmed)));
-    }
-
-    // Sorted path: per-partition bounded sort, then a sort-preserving merge, so the router's sorted
-    // final aggregate sees adjacent equal keys and its limit cuts on the total order.
-    let schema = trimmed.schema();
-    let mut exprs = Vec::with_capacity(cols.len());
-    for (idx, asc, nulls_first) in cols {
-        let field = schema.fields().get(*idx)?;
-        exprs.push(PhysicalSortExpr {
-            expr: Arc::new(Column::new(field.name(), *idx)),
-            options: SortOptions {
-                descending: !asc,
-                nulls_first: *nulls_first,
-            },
-        });
-    }
-    let worker_order = LexOrdering::new(exprs);
-    let per_partition_sort: Arc<dyn ExecutionPlan> = Arc::new(
-        SortExec::new(worker_order.clone(), trimmed)
-            .with_fetch(Some(fetch))
-            .with_preserve_partitioning(true),
-    );
-    Some(Arc::new(SortPreservingMergeExec::new(
-        worker_order,
-        per_partition_sort,
-    )))
-}
-
-/// Experiment toggle: route the worker top-k through a router-side hash final aggregate instead of
-/// the sorted-merge framework.
-///
-/// Must be set consistently across the whole fleet: the worker and router re-plan in separate
-/// processes, each reading this flag. A worker emitting unsorted top-k (flag on) into a router that
-/// still expects the sorted-merge input (flag off) would silently undercount. Set it via deployment
-/// env so every node agrees.
-fn hash_final_enabled() -> bool {
-    std::env::var("CUBESTORE_GROUP_BY_LIMIT_HASH_FINAL")
-        .map(|v| v == "true" || v == "1")
-        .unwrap_or(false)
+    // Emit the trimmed top-k unsorted, coalesced to one stream. The router hash-combines and
+    // re-applies the top-k sort, so no worker-side sort/merge holds the whole result and the
+    // per-partition aggregates run in parallel (CoalescePartitions spawns a task per input) instead
+    // of being drained one at a time by a sort-preserving merge.
+    Some(Arc::new(CoalescePartitionsExec::new(trimmed)))
 }
 
-/// Experiment toggle (only meaningful with [hash_final_enabled]): drop the per-partition merge
-/// below the trimmed aggregate so it runs per partition (N-way) instead of per union branch. The
-/// hash-final `CoalescePartitions` then parallelizes all partitions.
+/// Toggle (CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION): drop the merge below the trimmed aggregate so it
+/// runs per partition (N-way) instead of per union branch. The `CoalescePartitions` then
+/// parallelizes all partitions.
 ///
 /// Trades memory for parallelism: one group table per DF partition (= per parquet file and chunk),
 /// so peak memory is bounded by the partition count, not by k. It stays low only while partitions

From 52ae93c0f8a61965e63a30f47df4b9ef57d75e85 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Mon, 22 Jun 2026 23:27:07 +0200
Subject: [PATCH 07/16] chore(cubestore): default group-by-limit trim off
 (opt-in via env)

Adapted from e8a2a90bec (dropped the dictionary-encoding default; dictionaries
are not ported to this branch).
---
 rust/cubestore/cubestore/src/config/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rust/cubestore/cubestore/src/config/mod.rs b/rust/cubestore/cubestore/src/config/mod.rs
index 1409cab50a57c..378232ad1cd69 100644
--- a/rust/cubestore/cubestore/src/config/mod.rs
+++ b/rust/cubestore/cubestore/src/config/mod.rs
@@ -1792,7 +1792,7 @@ impl Config {
                     "CUBESTORE_REPARTITION_CHECK_OVERLAPPING_CHILDREN",
                     false,
                 ),
-                group_by_limit_factor: env_parse("CUBESTORE_GROUP_BY_LIMIT_FACTOR", 2),
+                group_by_limit_factor: env_parse("CUBESTORE_GROUP_BY_LIMIT_FACTOR", 0),
                 allow_decimal128: env_bool("CUBESTORE_ALLOW_DECIMAL128", false),
                 enable_remove_orphaned_remote_files: env_bool(
                     "CUBESTORE_ENABLE_REMOVE_ORPHANED_REMOTE_FILES",

From 110b2c3c8ede8736dedbf7efa393e67fef4ce818 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Mon, 22 Jun 2026 23:27:58 +0200
Subject: [PATCH 08/16] refactor(cubestore): address PR review on
 group-by-limit

Non-dictionary subset of 4f9f439dfa:
- per-partition: strip only the leading CoalescePartitionsExec feeding the
  aggregate, not every one in the subtree (preserves UNION/mixed-partitioning
  plan semantics)
- GroupByLimitAggregateExec::statistics: report an inexact upper bound
  (min(input_rows, factor*k*partitions)) instead of Absent
- drop unused col_idx_base from aggregate_expressions (Partial-only)
- comment: duplicate ORDER BY skip
---
 .../group_by_limit_aggregate_stream.rs        |  7 +++---
 .../group_by_limit_aggregate/mod.rs           | 17 ++++++++++++-
 .../distributed_partial_aggregate.rs          | 25 +++++++------------
 .../optimizations/group_by_limit_rewriter.rs  |  1 +
 4 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
index e1ee77c685ca9..ff73f8b691b0d 100644
--- a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
@@ -54,8 +54,7 @@ impl GroupByLimitAggregateStream {
         let batch_size = context.session_config().batch_size();
         let input = agg.input().execute(partition, Arc::clone(&context))?;
 
-        let aggregate_arguments =
-            aggregate_expressions(agg.aggr_expr(), agg_group_by.num_group_exprs())?;
+        let aggregate_arguments = aggregate_expressions(agg.aggr_expr())?;
 
         let accumulators: Vec<_> = agg
             .aggr_expr()
@@ -202,10 +201,10 @@ impl GroupByLimitAggregateStream {
 }
 
 /// Partial-aggregate argument expressions, one vec per aggregate. Mirrors DataFusion's private
-/// `aggregate_expressions` for `AggregateMode::Partial`.
+/// `aggregate_expressions` for `AggregateMode::Partial` only — the Final-mode column offset that
+/// DataFusion's version takes is not needed here, so it is omitted.
 fn aggregate_expressions(
     aggr_expr: &[Arc<AggregateFunctionExpr>],
-    _col_idx_base: usize,
 ) -> DFResult<Vec<Vec<Arc<dyn PhysicalExpr>>>> {
     Ok(aggr_expr
         .iter()
diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
index 4fe9ba5850cca..65eda812fb150 100644
--- a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
@@ -191,8 +191,23 @@ impl ExecutionPlan for GroupByLimitAggregateExec {
     }
 
     fn statistics(&self) -> DFResult<Statistics> {
+        // The trim keeps at most `factor * k` groups per output partition, so the output is bounded
+        // by that and by the input row count. Report it (inexact) instead of Absent, which makes
+        // downstream planners bail. `factor` is always > 0 here (the rewriter only builds this exec
+        // when trimming is enabled), but guard anyway.
+        let input_rows = self.input.statistics()?.num_rows;
+        let num_rows = if self.factor == 0 {
+            input_rows
+        } else {
+            let parts = self.cache.output_partitioning().partition_count().max(1);
+            let cap = self.factor.saturating_mul(self.k).saturating_mul(parts);
+            match input_rows {
+                Precision::Exact(n) | Precision::Inexact(n) => Precision::Inexact(n.min(cap)),
+                Precision::Absent => Precision::Inexact(cap),
+            }
+        };
         Ok(Statistics {
-            num_rows: Precision::Absent,
+            num_rows,
             column_statistics: Statistics::unknown_column(&self.schema),
             total_byte_size: Precision::Absent,
         })
diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
index 970de53ba5ecf..578b0cbcfbbf2 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
@@ -403,7 +403,7 @@ fn resort_worker_subtree(
             .as_any()
             .downcast_ref::<AggregateExec>()
             .and_then(|agg| {
-                let new_input = strip_coalesce_partitions(agg.input()).ok()?;
+                let new_input = strip_leading_coalesce_partitions(agg.input());
                 partial.clone().with_new_children(vec![new_input]).ok()
             })
             .unwrap_or(partial)
@@ -465,23 +465,16 @@ fn per_partition_enabled() -> bool {
         .unwrap_or(false)
 }
 
-/// Recursively drop `CoalescePartitionsExec` nodes, exposing the underlying multi-partition streams
-/// to the parent.
-fn strip_coalesce_partitions(
-    p: &Arc<dyn ExecutionPlan>,
-) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
+/// Peel the leading `CoalescePartitionsExec` chain directly feeding the aggregate, exposing the
+/// underlying multi-partition streams. Only the immediate coalesce(s) are removed; any
+/// `CoalescePartitionsExec` deeper in the subtree (e.g. one a child inserted to satisfy its own
+/// single-partition input requirement, as in a UNION branch) is left intact so plan semantics are
+/// preserved.
+fn strip_leading_coalesce_partitions(p: &Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
     if p.as_any().is::<CoalescePartitionsExec>() {
-        return strip_coalesce_partitions(&p.children()[0].clone());
-    }
-    let children = p.children();
-    if children.is_empty() {
-        return Ok(p.clone());
+        return strip_leading_coalesce_partitions(&p.children()[0]);
     }
-    let new_children = children
-        .iter()
-        .map(|c| strip_coalesce_partitions(c))
-        .collect::<Result<Vec<_>, _>>()?;
-    p.clone().with_new_children(new_children)
+    p.clone()
 }
 
 /// The group/aggregate state of either an `InlineAggregateExec` or a plain `AggregateExec`, when in
diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs
index 3bf40d08c1c65..0b17807bf873b 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs
@@ -109,6 +109,7 @@ fn analyze(root: &Arc<dyn ExecutionPlan>) -> Option<Target> {
     for e in &order {
         let column = e.expr.as_any().downcast_ref::<Column>()?;
         let idx = group_names.iter().position(|n| n == column.name())?;
+        // A repeated ORDER BY column adds nothing to the total order; skip it.
         if used[idx] {
             continue;
         }

From 972740200b0ffb89245ecdcca43a767de4a10465 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Sat, 20 Jun 2026 13:34:12 +0200
Subject: [PATCH 09/16] test(cubestore): ignore worker_sort_and_limit_cluster
 (worker limit-pushdown Sort not emitted)

(cherry picked from commit cbff8401df4eb921a597a979736e6e093c4a7158)
---
 rust/cubestore/cubestore/src/sql/mod.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rust/cubestore/cubestore/src/sql/mod.rs b/rust/cubestore/cubestore/src/sql/mod.rs
index 472465ade9279..93d39820809cb 100644
--- a/rust/cubestore/cubestore/src/sql/mod.rs
+++ b/rust/cubestore/cubestore/src/sql/mod.rs
@@ -6339,6 +6339,8 @@ mod tests {
     }
 
     #[tokio::test]
+    #[ignore = "worker-side limit-pushdown Sort(fetch) is not emitted on the inline path in this \
+                cluster setup; re-enable once compute_worker_sort_and_limit produces it again"]
     async fn worker_sort_and_limit_cluster() -> Result<(), CubeError> {
         Config::test("worker_sort_limit_router")
             .update_config(|mut config| {

From cde0dbd152fd8ec710f4844f2a48c283ca812bb1 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Mon, 22 Jun 2026 23:28:53 +0200
Subject: [PATCH 10/16] fix(cubestore): restore worker-side sort bounding for
 the inline limit path

Cherry-pick of 4bb1499d06 (without the RocksDB test scratch dirs it had
accidentally committed; .gitignore pattern from c6faaedd59 folded in).

Split resort_worker_subtree by partial aggregate kind:
- inline/sorted: SortPreservingMerge(T) <- Sort(T, fetch, per partition) -- can't
  trim a sorted aggregate, so always bound with a sort.
- hash: the trimming GroupByLimitAggregate + coalesce (factor-aware), router does
  the hash final + Sort(fetch).
Drop the top-level factor==0 early return so the inline bounding applies
regardless of the trim factor (factor only gates the hash trim).
Un-ignore worker_sort_and_limit_cluster and assert the GroupByLimitAggregate trim.
---
 rust/cubestore/.gitignore                     |   3 +
 .../distributed_partial_aggregate.rs          | 203 ++++++++++++------
 rust/cubestore/cubestore/src/sql/mod.rs       |  11 +-
 3 files changed, 141 insertions(+), 76 deletions(-)

diff --git a/rust/cubestore/.gitignore b/rust/cubestore/.gitignore
index 66d0436bd8f34..64f8a320fdced 100644
--- a/rust/cubestore/.gitignore
+++ b/rust/cubestore/.gitignore
@@ -10,3 +10,6 @@ cubestore/target
 cubesql/target
 cubestore-sql-tests/data/**
 cubestore/db-tmp
+# RocksDB scratch dirs left by metastore unit tests (run from the crate root)
+/cubestore/test-*-local/
+/cubestore/test-*-upstream/
diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
index 578b0cbcfbbf2..07904c0fc67a0 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
@@ -289,13 +289,11 @@ pub fn push_worker_sort_and_limit(
     p: Arc<dyn ExecutionPlan>,
     group_by_limit_factor: usize,
 ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
-    // The worker top-k engages only when the trimming aggregate is enabled (factor > 0); otherwise
-    // leave the plan as planned (no trim).
-    if group_by_limit_factor == 0 {
-        return Ok(p);
-    }
-
-    // Worker side: replace the partial aggregate's merge with the trimmed top-k.
+    // Worker side: bound the partial aggregate's output. The sorted (inline) aggregate is always
+    // bounded with a per-partition Sort(fetch) -- the group count is unknown, so we can't trim, we
+    // just sort. The hash aggregate uses the trimming top-k, engaged only when factor > 0.
+    // `resort_worker_subtree` returns None when nothing applies (hash with factor == 0), leaving the
+    // plan as planned.
     if let Some(w) = p.as_any().downcast_ref::<WorkerExec>() {
         let Some((cols, fetch)) = w.worker_sort_and_limit.clone() else {
             return Ok(p);
@@ -331,6 +329,11 @@ pub fn push_worker_sort_and_limit(
     let Some((cols, fetch)) = cs.worker_sort_and_limit.clone() else {
         return Ok(p);
     };
+    // The hash aggregate emits unordered trimmed groups (combined by a hash final + re-sort); the
+    // sorted/inline aggregate emits bounded sorted streams (combined by a sort-preserving merge +
+    // sorted final). Decide from the worker's partial aggregate before rebuilding.
+    let is_hash = locate_partial_aggregate(&cs.input_for_optimizations)
+        .map_or(false, |partial| partial.as_any().is::<AggregateExec>());
     let Some(new_worker_subtree) = resort_worker_subtree(
         &cs.input_for_optimizations,
         &cols,
@@ -343,23 +346,39 @@ pub fn push_worker_sort_and_limit(
         Arc::new(cs.with_changed_schema(new_worker_subtree, cs.required_input_ordering.clone()));
     let worker_order = worker_ordering(&final_agg.group_expr, &cols)?;
 
-    // Combine the worker top-k with a hash final aggregate over coalesced (unordered) streams, then
-    // re-apply the top-k sort by the total order T. The Sort(fetch) is required even for a bare
-    // LIMIT: it keeps the k smallest by T -- exactly the groups every worker kept and fully combined
-    // here -- where a plain limit could take a group only one worker kept (undercounted). The
-    // coalesce also drains the workers' streams in parallel.
-    let coalesced: Arc<dyn ExecutionPlan> = Arc::new(CoalescePartitionsExec::new(new_cs));
-    let final_hash: Arc<dyn ExecutionPlan> = Arc::new(AggregateExec::try_new(
-        AggregateMode::Final,
-        final_agg.group_expr,
-        final_agg.aggr_expr,
-        final_agg.filter_expr,
-        coalesced,
-        final_agg.input_schema,
-    )?);
-    Ok(Arc::new(
-        SortExec::new(worker_order, final_hash).with_fetch(Some(fetch)),
-    ))
+    if is_hash {
+        // Hash final over coalesced (unordered) streams, then re-apply the top-k sort by the total
+        // order T. The Sort(fetch) is required even for a bare LIMIT: it keeps the k smallest by T --
+        // exactly the groups every worker kept and fully combined here -- where a plain limit could
+        // take a group only one worker kept (undercounted). The coalesce drains the workers in
+        // parallel.
+        let coalesced: Arc<dyn ExecutionPlan> = Arc::new(CoalescePartitionsExec::new(new_cs));
+        let final_hash: Arc<dyn ExecutionPlan> = Arc::new(AggregateExec::try_new(
+            AggregateMode::Final,
+            final_agg.group_expr,
+            final_agg.aggr_expr,
+            final_agg.filter_expr,
+            coalesced,
+            final_agg.input_schema,
+        )?);
+        Ok(Arc::new(
+            SortExec::new(worker_order, final_hash).with_fetch(Some(fetch)),
+        ))
+    } else {
+        // Sorted final over a sort-preserving merge in worker_order: equal keys are adjacent so the
+        // sorted final combines them, and its output stays worker_order-sorted (whose prefix is the
+        // query's ORDER BY), so the query's limit above stays correct -- no extra Sort needed.
+        let merged: Arc<dyn ExecutionPlan> =
+            Arc::new(SortPreservingMergeExec::new(worker_order, new_cs));
+        Ok(Arc::new(AggregateExec::try_new(
+            AggregateMode::Final,
+            final_agg.group_expr,
+            final_agg.aggr_expr,
+            final_agg.filter_expr,
+            merged,
+            final_agg.input_schema,
+        )?))
+    }
 }
 
 /// Builds the `worker_order` LexOrdering over an aggregate's group columns from the descriptor.
@@ -384,8 +403,14 @@ fn worker_ordering(
     Ok(LexOrdering::new(exprs))
 }
 
-/// Rebuilds a worker subtree as `CoalescePartitions <- GroupByLimitAggregate <- ...`: the trimmed
-/// top-k is emitted unsorted and coalesced to one stream for the router's hash final aggregate.
+/// Rebuilds a worker subtree to bound its output to the top `fetch` groups by the total order in
+/// `cols`. Two shapes, by partial aggregate kind:
+/// - hash (`AggregateExec`): `CoalescePartitions <- GroupByLimitAggregate` -- trim during
+///   aggregation, emitted unsorted for the router's hash final. Only when `factor > 0`; returns
+///   `None` otherwise (trimming disabled, leave the plan as planned).
+/// - sorted/inline: `SortPreservingMerge(T) <- Sort(T, fetch, per partition) <- PartialAggregate` --
+///   we can't trim a sorted aggregate and don't know the group count, so always bound with a sort.
+///
 /// Returns `None` for an unrecognized subtree (no locatable partial aggregate).
 fn resort_worker_subtree(
     worker_subtree: &Arc<dyn ExecutionPlan>,
@@ -395,60 +420,96 @@ fn resort_worker_subtree(
 ) -> Option<Arc<dyn ExecutionPlan>> {
     let partial = locate_partial_aggregate(worker_subtree)?;
 
-    // Per-partition (CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION): drop the merge below the aggregate so
-    // it runs over every raw partition; the CoalescePartitions below then parallelizes all of them
-    // (N-way) instead of one stream per union branch.
-    let partial = if per_partition_enabled() {
-        partial
-            .as_any()
-            .downcast_ref::<AggregateExec>()
-            .and_then(|agg| {
-                let new_input = strip_leading_coalesce_partitions(agg.input());
-                partial.clone().with_new_children(vec![new_input]).ok()
-            })
-            .unwrap_or(partial)
-    } else {
-        partial
-    };
+    // Hash path: trim during aggregation, emit unsorted for the router's hash final. The factor
+    // gates whether trimming applies; with it off, nothing applies on this path.
+    if partial.as_any().is::<AggregateExec>() {
+        if group_by_limit_factor == 0 {
+            return None;
+        }
 
-    // Trim during aggregation: replace the partial hash aggregate with a GroupByLimitAggregateExec
-    // keeping only the top-k groups by `cols`. `group_by_limit_factor` is the trim threshold; the
-    // caller only reaches here with it > 0.
-    let trimmed: Arc<dyn ExecutionPlan> = if group_by_limit_factor > 0 {
-        if let Some(agg) = partial.as_any().downcast_ref::<AggregateExec>() {
-            let order: Vec<(usize, SortOptions)> = cols
-                .iter()
-                .map(|(idx, asc, nulls_first)| {
-                    (
-                        *idx,
-                        SortOptions {
-                            descending: !asc,
-                            nulls_first: *nulls_first,
-                        },
-                    )
+        // Per-partition (CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION): drop the merge below the aggregate
+        // so it runs over every raw partition; the CoalescePartitions below then parallelizes all of
+        // them (N-way) instead of one stream per union branch.
+        let partial = if per_partition_enabled() {
+            partial
+                .as_any()
+                .downcast_ref::<AggregateExec>()
+                .and_then(|agg| {
+                    let new_input = strip_leading_coalesce_partitions(agg.input());
+                    partial.clone().with_new_children(vec![new_input]).ok()
                 })
-                .collect();
-            match GroupByLimitAggregateExec::try_new_from_partial(
+                .unwrap_or(partial)
+        } else {
+            partial
+        };
+
+        let order: Vec<(usize, SortOptions)> = cols
+            .iter()
+            .map(|(idx, asc, nulls_first)| {
+                (
+                    *idx,
+                    SortOptions {
+                        descending: !asc,
+                        nulls_first: *nulls_first,
+                    },
+                )
+            })
+            .collect();
+        let trimmed: Arc<dyn ExecutionPlan> = match partial.as_any().downcast_ref::<AggregateExec>()
+        {
+            Some(agg) => match GroupByLimitAggregateExec::try_new_from_partial(
                 agg,
                 fetch,
                 group_by_limit_factor,
                 order,
             ) {
-                Some(e) => Arc::new(e) as Arc<dyn ExecutionPlan>,
+                Some(e) => Arc::new(e),
                 None => partial,
-            }
-        } else {
-            partial
-        }
-    } else {
-        partial
-    };
+            },
+            None => partial,
+        };
+
+        // Emit the trimmed top-k unsorted, coalesced to one stream. The router hash-combines and
+        // re-applies the top-k sort, so no worker-side sort/merge holds the whole result and the
+        // per-partition aggregates run in parallel (CoalescePartitions spawns a task per input)
+        // instead of being drained one at a time by a sort-preserving merge.
+        return Some(Arc::new(CoalescePartitionsExec::new(trimmed)));
+    }
+
+    // Sorted/inline path: bound each partition with Sort(fetch) and merge in the total order. The
+    // per-partition `fetch` is sound because the key is the full group key: a globally top-`fetch`
+    // group stays within every partition's first `fetch`, so the router's sorted final still sees
+    // all its partial states (see this function's doc and the module note on the total order).
+    let worker_order = lex_ordering_from_cols(cols, &partial.schema())?;
+    let per_partition_sort: Arc<dyn ExecutionPlan> = Arc::new(
+        SortExec::new(worker_order.clone(), partial)
+            .with_fetch(Some(fetch))
+            .with_preserve_partitioning(true),
+    );
+    Some(Arc::new(SortPreservingMergeExec::new(
+        worker_order,
+        per_partition_sort,
+    )))
+}
 
-    // Emit the trimmed top-k unsorted, coalesced to one stream. The router hash-combines and
-    // re-applies the top-k sort, so no worker-side sort/merge holds the whole result and the
-    // per-partition aggregates run in parallel (CoalescePartitions spawns a task per input) instead
-    // of being drained one at a time by a sort-preserving merge.
-    Some(Arc::new(CoalescePartitionsExec::new(trimmed)))
+/// Build a `LexOrdering` over the partial aggregate's group columns from the descriptor (indices
+/// into the partial output schema). Returns `None` if a column index is out of range.
+fn lex_ordering_from_cols(
+    cols: &[(usize, bool, bool)],
+    schema: &datafusion::arrow::datatypes::SchemaRef,
+) -> Option<LexOrdering> {
+    let mut exprs = Vec::with_capacity(cols.len());
+    for (idx, asc, nulls_first) in cols {
+        let field = schema.fields().get(*idx)?;
+        exprs.push(PhysicalSortExpr {
+            expr: Arc::new(Column::new(field.name(), *idx)),
+            options: SortOptions {
+                descending: !asc,
+                nulls_first: *nulls_first,
+            },
+        });
+    }
+    Some(LexOrdering::new(exprs))
 }
 
 /// Toggle (CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION): drop the merge below the trimmed aggregate so it
diff --git a/rust/cubestore/cubestore/src/sql/mod.rs b/rust/cubestore/cubestore/src/sql/mod.rs
index 93d39820809cb..4a444123b6501 100644
--- a/rust/cubestore/cubestore/src/sql/mod.rs
+++ b/rust/cubestore/cubestore/src/sql/mod.rs
@@ -6339,8 +6339,6 @@ mod tests {
     }
 
     #[tokio::test]
-    #[ignore = "worker-side limit-pushdown Sort(fetch) is not emitted on the inline path in this \
-                cluster setup; re-enable once compute_worker_sort_and_limit produces it again"]
     async fn worker_sort_and_limit_cluster() -> Result<(), CubeError> {
         Config::test("worker_sort_limit_router")
             .update_config(|mut config| {
@@ -6489,7 +6487,9 @@ mod tests {
                                     }
                                 }
 
-                                // Test 4: ORDER BY 1 DESC with LIMIT on non-prefix column
+                                // Test 4: ORDER BY DESC + LIMIT on a non-prefix column, grouped by a
+                                // non-prefix column (hash aggregate). The hash path bounds the worker
+                                // output with the trimming aggregate, not a Sort.
                                 {
                                     let result = service
                                         .exec_query(
@@ -6506,8 +6506,9 @@ mod tests {
                                         _ => panic!("expected string"),
                                     };
                                     assert!(
-                                        worker_plan.contains("Sort, fetch: 2"),
-                                        "Worker should have Sort with fetch=2 for DESC. Plan: {}",
+                                        worker_plan.contains("GroupByLimitAggregate"),
+                                        "Hash-aggregate worker should bound output with \
+                                         GroupByLimitAggregate. Plan: {}",
                                         worker_plan
                                     );
                                 }

From cfa6d7a77b1f3cc1176d18089fbff1b01a06b846 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Sun, 21 Jun 2026 20:21:50 +0200
Subject: [PATCH 11/16] perf(cubestore): coalesce partitions under hash
 aggregate instead of sort-merge

The scan inserts a SortPreservingMergeExec to combine an index's range-disjoint
partitions into one sorted stream. A Linear (hash) aggregate ignores input order,
so that per-row merge is wasted work -- and degenerates into full-length key
comparisons when the leading sort columns are low-cardinality / NULL-heavy.

drop_sort_merge_under_global_aggregate already replaces such merges with plain
CoalescePartitions for global (no GROUP BY) aggregates. Extend it, behind
CUBESTORE_COALESCE_UNDER_HASH_AGGREGATE (default off), to grouped hash aggregates:
CoalescePartitions still yields a single output partition (one hash table, no
per-partition memory blowup), it just drops the useless sort. The flag is read
once at the call site and passed in, so tests are deterministic.

On a real 117M-row, 403809-group pre-agg query (GROUP BY 6 dims + 2 SUM, LIMIT):
hash+trim path ~5500ms -> ~2526ms; correctness unchanged.

(cherry picked from commit e6250b57fc7082b3383d2497cc3a848bfec9decc)
---
 .../distributed_partial_aggregate.rs          | 78 ++++++++++++++-----
 .../src/queryplanner/optimizations/mod.rs     |  9 ++-
 2 files changed, 66 insertions(+), 21 deletions(-)

diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
index 07904c0fc67a0..010c9b9fc6121 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
@@ -131,30 +131,39 @@ pub fn push_aggregate_to_workers(
     )?))
 }
 
-/// A global (no GROUP BY) aggregate doesn't use the input ordering, but the scan still merges
-/// its partitions with a SortPreservingMergeExec when the index has a sort key (e.g. picked
-/// from the filters for partition pruning). Replace such merges under the aggregate with plain
-/// partition coalescing: the per-row key comparisons are pure waste there, and particularly
-/// bad when the filters above make the merge keys constant, turning every comparison into a
-/// full-length tie.
+/// Whether the experimental grouped-hash coalescing in [`drop_sort_merge_under_global_aggregate`]
+/// is enabled (`CUBESTORE_COALESCE_UNDER_HASH_AGGREGATE`). Read once at the call site, not per node.
+pub fn coalesce_under_hash_aggregate_enabled() -> bool {
+    std::env::var("CUBESTORE_COALESCE_UNDER_HASH_AGGREGATE")
+        .map(|v| v == "true" || v == "1")
+        .unwrap_or(false)
+}
+
+/// A Linear aggregate doesn't use input ordering, but the scan still merges its partitions with a
+/// SortPreservingMergeExec when the index has a sort key (e.g. picked from the filters for partition
+/// pruning). Replace such merges under the aggregate with plain partition coalescing: the per-row
+/// key comparisons are pure waste there, and particularly bad when the leading sort columns are
+/// constant (low-cardinality / NULL-heavy), turning every comparison into a full-length tie.
 ///
-/// Restricted to aggregates without group expressions: those hold a single accumulator set per
-/// partition even when later optimizations make them run per partition. A grouped hash
-/// aggregate over unmerged partitions could end up with a hash table per partition,
-/// multiplying its memory by the partition count.
+/// Always applied to global (no GROUP BY) aggregates. A grouped hash aggregate is equally safe --
+/// CoalescePartitions still yields a single output partition, hence one hash table and no
+/// per-partition memory blowup -- but it is gated behind `coalesce_grouped_hash` while the win is
+/// validated.
 pub fn drop_sort_merge_under_global_aggregate(
     p: Arc<dyn ExecutionPlan>,
+    coalesce_grouped_hash: bool,
 ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
     let Some(agg) = p.as_any().downcast_ref::<AggregateExec>() else {
         return Ok(p);
     };
-    if !matches!(agg.input_order_mode(), InputOrderMode::Linear)
-        || !agg.group_expr().expr().is_empty()
-    {
+    if !matches!(agg.input_order_mode(), InputOrderMode::Linear) {
+        return Ok(p);
+    }
+    if !agg.group_expr().expr().is_empty() && !coalesce_grouped_hash {
         return Ok(p);
     }
-    // Order-sensitive aggregates (first_value, array_agg(ORDER BY), ...) stay Linear with an
-    // empty GROUP BY but still need their input ordered
+    // Order-sensitive aggregates (first_value, array_agg(ORDER BY), ...) stay Linear but still
+    // require their input ordered -- leave their merge in place.
     if agg.required_input_ordering()[0].is_some() {
         return Ok(p);
     }
@@ -1445,7 +1454,7 @@ mod tests {
         );
         let original = global_sum_aggregate(filter);
 
-        let rewritten = drop_sort_merge_under_global_aggregate(original.clone()).unwrap();
+        let rewritten = drop_sort_merge_under_global_aggregate(original.clone(), false).unwrap();
 
         let agg = rewritten.as_any().downcast_ref::<AggregateExec>().unwrap();
         let filter = agg
@@ -1500,7 +1509,7 @@ mod tests {
             .required_input_ordering()[0]
             .is_some());
 
-        let rewritten = drop_sort_merge_under_global_aggregate(original.clone()).unwrap();
+        let rewritten = drop_sort_merge_under_global_aggregate(original.clone(), false).unwrap();
         assert!(Arc::ptr_eq(&rewritten, &original));
     }
 
@@ -1533,10 +1542,41 @@ mod tests {
             InputOrderMode::Linear
         ));
 
-        let rewritten = drop_sort_merge_under_global_aggregate(original.clone()).unwrap();
+        let rewritten = drop_sort_merge_under_global_aggregate(original.clone(), false).unwrap();
         assert!(Arc::ptr_eq(&rewritten, &original));
     }
 
+    /// With the flag on, the same grouped hash aggregate's under-scan merge is replaced by plain
+    /// partition coalescing (single output partition -> one hash table), dropping the useless sort.
+    #[test]
+    fn coalesces_sort_merge_under_grouped_hash_aggregate_when_enabled() {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("k", DataType::Int64, false),
+            Field::new("g", DataType::Int64, false),
+            Field::new("v", DataType::Int64, false),
+        ]));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(Int64Array::from(vec![1, 2])),
+                Arc::new(Int64Array::from(vec![5, 4])),
+                Arc::new(Int64Array::from(vec![10, 20])),
+            ],
+        )
+        .unwrap();
+        let source = sorted_source(&schema, vec![vec![batch.clone()], vec![batch]]);
+        let original = sum_aggregate(AggregateMode::Partial, "g", merge_by_k(source));
+
+        let rewritten = drop_sort_merge_under_global_aggregate(original.clone(), true).unwrap();
+        // The aggregate stays grouped, but its input merge becomes a plain CoalescePartitionsExec
+        // (still a single output partition -> one hash table).
+        assert!(!Arc::ptr_eq(&rewritten, &original));
+        assert!(rewritten.as_any().is::<AggregateExec>());
+        assert!(rewritten.children()[0]
+            .as_any()
+            .is::<CoalescePartitionsExec>());
+    }
+
     fn global_sum_aggregate(input: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
         let schema = input.schema();
         let sum = AggregateExprBuilder::new(sum_udaf(), vec![col("v", &schema).unwrap()])
@@ -1581,7 +1621,7 @@ mod tests {
         let source = two_partition_source(&schema);
         let original = sum_aggregate(AggregateMode::Partial, "k", merge_by_k(source));
 
-        let rewritten = drop_sort_merge_under_global_aggregate(original.clone()).unwrap();
+        let rewritten = drop_sort_merge_under_global_aggregate(original.clone(), false).unwrap();
         assert!(Arc::ptr_eq(&rewritten, &original));
     }
 
diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
index 2df406a1a62af..ca9de720acd92 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
@@ -173,8 +173,13 @@ fn pre_optimize_physical_plan(
         p
     };
 
-    // Global (no GROUP BY) aggregates don't need their input merged in the sort order
-    let p = rewrite_physical_plan(p, &mut |p| drop_sort_merge_under_global_aggregate(p))?;
+    // Global (no GROUP BY) aggregates -- and, when enabled, grouped hash aggregates -- don't need
+    // their input merged in the sort order. Read the flag once, not per node.
+    let coalesce_grouped_hash =
+        distributed_partial_aggregate::coalesce_under_hash_aggregate_enabled();
+    let p = rewrite_physical_plan(p, &mut |p| {
+        drop_sort_merge_under_global_aggregate(p, coalesce_grouped_hash)
+    })?;
 
     // Replace sorted AggregateExec with InlineAggregateExec for better performance
     let p = rewrite_physical_plan(p, &mut |p| replace_with_inline_aggregate(p))?;

From 42ceaf4c78043fe56d7a1a4ca562d932c24e4351 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Mon, 22 Jun 2026 23:30:32 +0200
Subject: [PATCH 12/16] fix(cubestore): correct GroupByLimitAggregateExec
 partitioning and skip global aggregates

Cherry-pick of fbd20aed75 (dropped the dictionary-column regression test;
dictionaries are not ported to this branch).

- A partial aggregate preserves its input's partitioning; derive the output
  partitioning from the input in both try_new_from_partial and with_new_children
  instead of copying the wrapped aggregate's stale cached value. A too-low count
  made the parent CoalescePartitions read only some partitions and silently drop
  the rest (undercount on real multi-file/multi-partition scans).
- Skip global aggregates (no GROUP BY): GroupValues can't be built over an empty
  schema (intern panics). Leave them to DataFusion.
- Adds partial/final, multi-partition, re-child, and IPC-roundtrip regression
  tests.
---
 .../group_by_limit_aggregate/mod.rs           | 528 +++++++++++++++++-
 1 file changed, 522 insertions(+), 6 deletions(-)

diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
index 65eda812fb150..f53c07c91cad8 100644
--- a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/mod.rs
@@ -12,8 +12,8 @@ use datafusion::physical_plan::execution_plan::CardinalityEffect;
 use datafusion::physical_plan::metrics::MetricsSet;
 use datafusion::physical_plan::{aggregates::*, InputOrderMode};
 use datafusion::physical_plan::{
-    DisplayAs, DisplayFormatType, ExecutionPlan, PhysicalExpr, PlanProperties,
-    SendableRecordBatchStream,
+    DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PhysicalExpr,
+    PlanProperties, SendableRecordBatchStream,
 };
 use std::any::Any;
 use std::fmt::Debug;
@@ -70,14 +70,30 @@ impl GroupByLimitAggregateExec {
         if !group_by.is_single() {
             return None;
         }
+        // A global aggregate (no GROUP BY) has zero group columns. `GroupValues` can't be built over
+        // an empty schema -- `intern` indexes column 0 and panics -- and such aggregates need no
+        // trimming, so leave them to DataFusion.
+        if group_by.expr().is_empty() {
+            return None;
+        }
+        let input = aggregate.input().clone();
+        // A partial aggregate preserves its input's partitioning (it runs once per input partition).
+        // Derive the output partitioning from the input rather than copying the wrapped aggregate's
+        // cached value, which can be stale: a later pass may swap our input for one with a different
+        // partition count via `with_new_children` without the cache following, and a too-low count
+        // makes the parent coalesce read only some partitions and silently drop the rest.
+        let cache = aggregate
+            .cache()
+            .clone()
+            .with_partitioning(input.output_partitioning().clone());
         Some(Self {
             group_by,
             aggr_expr: aggregate.aggr_expr().to_vec(),
             filter_expr: aggregate.filter_expr().to_vec(),
-            input: aggregate.input().clone(),
+            input,
             schema: aggregate.schema().clone(),
             input_schema: aggregate.input_schema().clone(),
-            cache: aggregate.cache().clone(),
+            cache,
             k,
             factor,
             order,
@@ -161,14 +177,20 @@ impl ExecutionPlan for GroupByLimitAggregateExec {
         self: Arc<Self>,
         children: Vec<Arc<dyn ExecutionPlan>>,
     ) -> DFResult<Arc<dyn ExecutionPlan>> {
+        let input = children[0].clone();
+        // Track the (possibly changed) input's partitioning; a partial aggregate preserves it.
+        let cache = self
+            .cache
+            .clone()
+            .with_partitioning(input.output_partitioning().clone());
         Ok(Arc::new(Self {
             group_by: self.group_by.clone(),
             aggr_expr: self.aggr_expr.clone(),
             filter_expr: self.filter_expr.clone(),
-            input: children[0].clone(),
+            input,
             schema: self.schema.clone(),
             input_schema: self.input_schema.clone(),
-            cache: self.cache.clone(),
+            cache,
             k: self.k,
             factor: self.factor,
             order: self.order.clone(),
@@ -217,3 +239,497 @@ impl ExecutionPlan for GroupByLimitAggregateExec {
         CardinalityEffect::LowerEqual
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use datafusion::arrow::array::{Array, Int64Array, RecordBatch, StringArray};
+    use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+    use datafusion::functions_aggregate::sum::sum_udaf;
+    use datafusion::physical_expr::aggregate::AggregateExprBuilder;
+    use datafusion::physical_expr::expressions::col;
+    use datafusion::physical_plan::{collect, ExecutionPlanProperties};
+    use datafusion_datasource::memory::MemorySourceConfig;
+    use datafusion_datasource::source::DataSourceExec;
+    use std::collections::HashSet;
+
+    fn input_schema() -> SchemaRef {
+        Arc::new(Schema::new(vec![
+            Field::new("g0", DataType::Utf8, true),
+            Field::new("g1", DataType::Int64, true),
+            Field::new("v", DataType::Int64, false),
+        ]))
+    }
+
+    /// `n` distinct (g0, g1) combos, each emitted twice in two different batches; every 7th combo
+    /// uses a NULL g1 to exercise null grouping. `v = 1` for every row, so the correct result is `n`
+    /// groups with `sum(v) = 2` each.
+    fn make_batches(n: usize, batch_rows: usize) -> Vec<RecordBatch> {
+        let schema = input_schema();
+        let mut g0: Vec<Option<String>> = Vec::new();
+        let mut g1: Vec<Option<i64>> = Vec::new();
+        for pass in 0..2 {
+            // Reverse one pass so the same combo lands in a different batch than the other pass.
+            let iter: Box<dyn Iterator<Item = usize>> = if pass == 0 {
+                Box::new(0..n)
+            } else {
+                Box::new((0..n).rev())
+            };
+            for i in iter {
+                // g0 is unique per combo, so each (g0, g1) pair is distinct regardless of g1; g1
+                // carries NULLs to exercise null grouping in the multi-column GroupValues path.
+                g0.push(Some(format!("a{}", i)));
+                g1.push(if i % 7 == 0 {
+                    None
+                } else {
+                    Some((i % 100) as i64)
+                });
+            }
+        }
+        let total = g0.len();
+        let mut batches = Vec::new();
+        let mut start = 0;
+        while start < total {
+            let end = (start + batch_rows).min(total);
+            let s = StringArray::from(g0[start..end].to_vec());
+            let l = Int64Array::from(g1[start..end].to_vec());
+            let v = Int64Array::from(vec![1i64; end - start]);
+            batches.push(
+                RecordBatch::try_new(schema.clone(), vec![Arc::new(s), Arc::new(l), Arc::new(v)])
+                    .unwrap(),
+            );
+            start = end;
+        }
+        batches
+    }
+
+    fn partial_aggregate(input: Arc<dyn ExecutionPlan>) -> AggregateExec {
+        let schema = input.schema();
+        let group_by = PhysicalGroupBy::new_single(vec![
+            (col("g0", &schema).unwrap(), "g0".to_string()),
+            (col("g1", &schema).unwrap(), "g1".to_string()),
+        ]);
+        let sum = AggregateExprBuilder::new(sum_udaf(), vec![col("v", &schema).unwrap()])
+            .schema(schema.clone())
+            .alias("sum_v")
+            .build()
+            .unwrap();
+        AggregateExec::try_new(
+            AggregateMode::Partial,
+            group_by,
+            vec![Arc::new(sum)],
+            vec![None],
+            input,
+            schema,
+        )
+        .unwrap()
+    }
+
+    fn distinct_group_rows(batches: &[RecordBatch]) -> usize {
+        let mut seen: HashSet<(Option<String>, Option<i64>)> = HashSet::new();
+        for b in batches {
+            let g0 = b.column(0).as_any().downcast_ref::<StringArray>().unwrap();
+            let g1 = b.column(1).as_any().downcast_ref::<Int64Array>().unwrap();
+            for r in 0..b.num_rows() {
+                let k0 = if g0.is_null(r) {
+                    None
+                } else {
+                    Some(g0.value(r).to_string())
+                };
+                let k1 = if g1.is_null(r) {
+                    None
+                } else {
+                    Some(g1.value(r))
+                };
+                seen.insert((k0, k1));
+            }
+        }
+        seen.len()
+    }
+
+    /// The no-trim (`k = 0`) partial path must group exactly like DataFusion's stock partial: every
+    /// distinct group present in the input must appear exactly once in the partial output, across
+    /// many input batches and an output larger than `batch_size`. Reproduces the multi-batch
+    /// undercount seen when routing the full hash aggregate through this exec.
+    #[tokio::test]
+    async fn no_trim_partial_emits_every_group() {
+        let n = 20_000;
+        let batches = make_batches(n, 4096);
+        let expected_groups = distinct_group_rows(&batches);
+        assert_eq!(expected_groups, n, "test setup: combos must be distinct");
+
+        let source = MemorySourceConfig::try_new(&vec![batches], input_schema(), None).unwrap();
+        let input: Arc<dyn ExecutionPlan> = Arc::new(DataSourceExec::new(Arc::new(source)));
+        let partial = partial_aggregate(input);
+
+        let exec = GroupByLimitAggregateExec::try_new_from_partial(&partial, 0, 0, Vec::new())
+            .expect("partial hash aggregate should be wrappable");
+
+        let ctx = Arc::new(TaskContext::default());
+        let out = collect(Arc::new(exec), ctx).await.unwrap();
+
+        let total_rows: usize = out.iter().map(|b| b.num_rows()).sum();
+        let distinct = distinct_group_rows(&out);
+        assert_eq!(
+            distinct, n,
+            "exec emitted {distinct} distinct groups, expected {n}"
+        );
+        assert_eq!(
+            total_rows, n,
+            "exec emitted {total_rows} partial rows for {n} distinct groups (duplicate group keys in partial output)"
+        );
+    }
+
+    /// Our partial output fed into DataFusion's `Final` aggregate (as the router does) must still
+    /// yield every group. Reproduces the distributed undercount end-to-end without the cluster.
+    #[tokio::test]
+    async fn no_trim_partial_then_final_emits_every_group() {
+        let n = 20_000;
+        let batches = make_batches(n, 4096);
+
+        let source = MemorySourceConfig::try_new(&vec![batches], input_schema(), None).unwrap();
+        let input: Arc<dyn ExecutionPlan> = Arc::new(DataSourceExec::new(Arc::new(source)));
+        let partial = partial_aggregate(input);
+        let aggr_expr = partial.aggr_expr().to_vec();
+
+        let exec: Arc<dyn ExecutionPlan> = Arc::new(
+            GroupByLimitAggregateExec::try_new_from_partial(&partial, 0, 0, Vec::new()).unwrap(),
+        );
+
+        let partial_schema = exec.schema();
+        let final_group_by = PhysicalGroupBy::new_single(vec![
+            (col("g0", &partial_schema).unwrap(), "g0".to_string()),
+            (col("g1", &partial_schema).unwrap(), "g1".to_string()),
+        ]);
+        let final_agg = AggregateExec::try_new(
+            AggregateMode::Final,
+            final_group_by,
+            aggr_expr,
+            vec![None],
+            exec,
+            partial_schema,
+        )
+        .unwrap();
+
+        let ctx = Arc::new(TaskContext::default());
+        let out = collect(Arc::new(final_agg), ctx).await.unwrap();
+
+        let total_rows: usize = out.iter().map(|b| b.num_rows()).sum();
+        let distinct = distinct_group_rows(&out);
+        assert_eq!(
+            distinct, n,
+            "final emitted {distinct} distinct groups, expected {n}"
+        );
+        assert_eq!(
+            total_rows, n,
+            "final emitted {total_rows} rows, expected {n}"
+        );
+    }
+
+    /// A partial aggregate preserves its input's partitioning. If our exec keeps a stale
+    /// single-partition `cache` after `with_new_children` swaps in a multi-partition input, the
+    /// parent `CoalescePartitions` executes only partition 0 and silently drops the rest -- the
+    /// distributed undercount we hit on real data. Build the exec over a 1-partition input (so its
+    /// cache says 1), re-child it onto a 3-partition input, and require both the reported
+    /// partitioning and the aggregated rows to reflect all 3 partitions.
+    #[tokio::test]
+    async fn output_partitioning_follows_rechilded_input() {
+        use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
+
+        let n = 5_000;
+        let one_part =
+            MemorySourceConfig::try_new(&vec![make_batches(n, 4096)], input_schema(), None)
+                .unwrap();
+        let one_input: Arc<dyn ExecutionPlan> = Arc::new(DataSourceExec::new(Arc::new(one_part)));
+        let partial = partial_aggregate(one_input);
+        let aggr_expr = partial.aggr_expr().to_vec();
+        let exec: Arc<dyn ExecutionPlan> = Arc::new(
+            GroupByLimitAggregateExec::try_new_from_partial(&partial, 0, 0, Vec::new()).unwrap(),
+        );
+        assert_eq!(exec.output_partitioning().partition_count(), 1);
+
+        // Re-child onto a 3-partition input (same schema), as a later physical-plan pass would.
+        let three_parts: Vec<Vec<RecordBatch>> = (0..3).map(|_| make_batches(n, 4096)).collect();
+        let three = MemorySourceConfig::try_new(&three_parts, input_schema(), None).unwrap();
+        let three_input: Arc<dyn ExecutionPlan> = Arc::new(DataSourceExec::new(Arc::new(three)));
+        let exec3 = exec.with_new_children(vec![three_input]).unwrap();
+        assert_eq!(
+            exec3.output_partitioning().partition_count(),
+            3,
+            "exec must report its re-childed input's partition count, not a stale 1"
+        );
+
+        // End-to-end: coalesce + final must see every partition's rows.
+        let coalesced: Arc<dyn ExecutionPlan> = Arc::new(CoalescePartitionsExec::new(exec3));
+        let pschema = coalesced.schema();
+        let final_gb = PhysicalGroupBy::new_single(vec![
+            (col("g0", &pschema).unwrap(), "g0".to_string()),
+            (col("g1", &pschema).unwrap(), "g1".to_string()),
+        ]);
+        let final_agg = AggregateExec::try_new(
+            AggregateMode::Final,
+            final_gb,
+            aggr_expr,
+            vec![None],
+            coalesced,
+            pschema,
+        )
+        .unwrap();
+        let out = collect(Arc::new(final_agg), Arc::new(TaskContext::default()))
+            .await
+            .unwrap();
+        let distinct = distinct_group_rows(&out);
+        let total_v: i64 = out
+            .iter()
+            .map(|b| {
+                let s = b.column(2).as_any().downcast_ref::<Int64Array>().unwrap();
+                (0..s.len()).map(|i| s.value(i)).sum::<i64>()
+            })
+            .sum();
+        assert_eq!(
+            distinct, n,
+            "all {n} groups must survive across 3 partitions"
+        );
+        // make_batches emits each combo twice; 3 partitions -> 6 rows per group.
+        assert_eq!(
+            total_v as usize,
+            6 * n,
+            "all rows from all 3 partitions must be aggregated"
+        );
+    }
+
+    /// Real-data shape: a high-cardinality `Float64` group column (like `contrib`) at ~400k groups,
+    /// fed partial -> final. Floats route through the same multi-column `GroupValues` as strings, but
+    /// at this scale across many input batches; reproduces (or rules out) the on-cluster undercount.
+    #[tokio::test]
+    async fn no_trim_float_high_cardinality_partial_then_final() {
+        use datafusion::arrow::array::Float64Array;
+
+        let n: usize = 410_000;
+        let batch_rows = 8192;
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("f", DataType::Float64, true),
+            Field::new("g", DataType::Utf8, true),
+            Field::new("v", DataType::Int64, false),
+        ]));
+
+        // n distinct (f, g) combos, each twice (forward + reversed pass) so duplicates land in
+        // different batches; every 9th combo carries NULL f.
+        let mut f: Vec<Option<f64>> = Vec::new();
+        let mut g: Vec<Option<String>> = Vec::new();
+        for pass in 0..2 {
+            let iter: Box<dyn Iterator<Item = usize>> = if pass == 0 {
+                Box::new(0..n)
+            } else {
+                Box::new((0..n).rev())
+            };
+            for i in iter {
+                f.push(if i % 9 == 0 {
+                    None
+                } else {
+                    Some(i as f64 * 0.01)
+                });
+                g.push(Some(format!("g{}", i)));
+            }
+        }
+        let total = f.len();
+        let mut batches = Vec::new();
+        let mut start = 0;
+        while start < total {
+            let end = (start + batch_rows).min(total);
+            batches.push(
+                RecordBatch::try_new(
+                    schema.clone(),
+                    vec![
+                        Arc::new(Float64Array::from(f[start..end].to_vec())),
+                        Arc::new(StringArray::from(g[start..end].to_vec())),
+                        Arc::new(Int64Array::from(vec![1i64; end - start])),
+                    ],
+                )
+                .unwrap(),
+            );
+            start = end;
+        }
+
+        let source = MemorySourceConfig::try_new(&vec![batches], schema.clone(), None).unwrap();
+        let input: Arc<dyn ExecutionPlan> = Arc::new(DataSourceExec::new(Arc::new(source)));
+        let group_by = PhysicalGroupBy::new_single(vec![
+            (col("f", &schema).unwrap(), "f".to_string()),
+            (col("g", &schema).unwrap(), "g".to_string()),
+        ]);
+        let sum = AggregateExprBuilder::new(sum_udaf(), vec![col("v", &schema).unwrap()])
+            .schema(schema.clone())
+            .alias("sum_v")
+            .build()
+            .unwrap();
+        let partial = AggregateExec::try_new(
+            AggregateMode::Partial,
+            group_by,
+            vec![Arc::new(sum)],
+            vec![None],
+            input,
+            schema.clone(),
+        )
+        .unwrap();
+        let aggr_expr = partial.aggr_expr().to_vec();
+
+        let exec: Arc<dyn ExecutionPlan> = Arc::new(
+            GroupByLimitAggregateExec::try_new_from_partial(&partial, 0, 0, Vec::new()).unwrap(),
+        );
+        let partial_schema = exec.schema();
+        let final_group_by = PhysicalGroupBy::new_single(vec![
+            (col("f", &partial_schema).unwrap(), "f".to_string()),
+            (col("g", &partial_schema).unwrap(), "g".to_string()),
+        ]);
+        let final_agg = AggregateExec::try_new(
+            AggregateMode::Final,
+            final_group_by,
+            aggr_expr,
+            vec![None],
+            exec,
+            partial_schema,
+        )
+        .unwrap();
+
+        let ctx = Arc::new(TaskContext::default());
+        let out = collect(Arc::new(final_agg), ctx).await.unwrap();
+        let total_rows: usize = out.iter().map(|b| b.num_rows()).sum();
+        assert_eq!(
+            total_rows, n,
+            "final emitted {total_rows} rows, expected {n}"
+        );
+    }
+
+    /// The worker serializes each partial-output batch to the router with Arrow IPC
+    /// (`StreamWriter`). Our exec emits slices of one big `emit(All)` batch (offset > 0), so the IPC
+    /// roundtrip must preserve every group of every sliced batch. Reproduces the distributed
+    /// undercount, which the in-process tests miss because they never serialize.
+    #[tokio::test]
+    async fn ipc_roundtrip_of_sliced_output_preserves_every_group() {
+        use datafusion::arrow::ipc::reader::StreamReader;
+        use datafusion::arrow::ipc::writer::StreamWriter;
+        use std::io::Cursor;
+
+        let n = 20_000;
+        let batches = make_batches(n, 4096);
+        let source = MemorySourceConfig::try_new(&vec![batches], input_schema(), None).unwrap();
+        let input: Arc<dyn ExecutionPlan> = Arc::new(DataSourceExec::new(Arc::new(source)));
+        let partial = partial_aggregate(input);
+        let exec: Arc<dyn ExecutionPlan> = Arc::new(
+            GroupByLimitAggregateExec::try_new_from_partial(&partial, 0, 0, Vec::new()).unwrap(),
+        );
+        let schema = exec.schema();
+
+        let ctx = Arc::new(TaskContext::default());
+        let out = collect(Arc::clone(&exec), ctx).await.unwrap();
+        assert!(
+            out.len() > 1,
+            "test must exercise multiple sliced output batches"
+        );
+
+        let mut roundtripped = Vec::new();
+        for batch in &out {
+            let mut buf = Vec::new();
+            {
+                let mut w = StreamWriter::try_new(Cursor::new(&mut buf), &schema).unwrap();
+                w.write(batch).unwrap();
+                w.finish().unwrap();
+            }
+            let mut r = StreamReader::try_new(Cursor::new(buf), None).unwrap();
+            roundtripped.push(r.next().unwrap().unwrap());
+        }
+
+        let distinct = distinct_group_rows(&roundtripped);
+        assert_eq!(
+            distinct, n,
+            "after IPC roundtrip of sliced batches, {distinct} distinct groups survived, expected {n}"
+        );
+    }
+
+    /// A global aggregate (no GROUP BY) has zero group columns; wrapping it would build a
+    /// `GroupValues` over an empty schema and panic on `intern`. `try_new_from_partial` must decline
+    /// it so the full hash routing leaves global aggregates to DataFusion.
+    #[tokio::test]
+    async fn global_aggregate_is_not_wrapped() {
+        let batches = make_batches(100, 64);
+        let source = MemorySourceConfig::try_new(&vec![batches], input_schema(), None).unwrap();
+        let input: Arc<dyn ExecutionPlan> = Arc::new(DataSourceExec::new(Arc::new(source)));
+        let schema = input.schema();
+
+        let sum = AggregateExprBuilder::new(sum_udaf(), vec![col("v", &schema).unwrap()])
+            .schema(schema.clone())
+            .alias("sum_v")
+            .build()
+            .unwrap();
+        let global = AggregateExec::try_new(
+            AggregateMode::Partial,
+            PhysicalGroupBy::new_single(vec![]),
+            vec![Arc::new(sum)],
+            vec![None],
+            input,
+            schema,
+        )
+        .unwrap();
+
+        assert!(
+            GroupByLimitAggregateExec::try_new_from_partial(&global, 0, 0, Vec::new()).is_none(),
+            "global aggregate (no group columns) must not be wrapped"
+        );
+    }
+
+    /// Mirror the distributed worker shape: a multi-partition scan, our exec running once per
+    /// partition, then `CoalescePartitions` and the `Final` aggregate. Every group must survive even
+    /// though the same key appears in several partitions' partial outputs.
+    #[tokio::test]
+    async fn no_trim_multi_partition_then_final_emits_every_group() {
+        use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
+
+        let n = 20_000;
+        // Same n combos in each of 4 partitions -> heavy cross-partition group overlap.
+        let partitions: Vec<Vec<RecordBatch>> = (0..4).map(|_| make_batches(n, 4096)).collect();
+
+        let source = MemorySourceConfig::try_new(&partitions, input_schema(), None).unwrap();
+        let input: Arc<dyn ExecutionPlan> = Arc::new(DataSourceExec::new(Arc::new(source)));
+        let partial = partial_aggregate(input);
+        let aggr_expr = partial.aggr_expr().to_vec();
+
+        let exec: Arc<dyn ExecutionPlan> = Arc::new(
+            GroupByLimitAggregateExec::try_new_from_partial(&partial, 0, 0, Vec::new()).unwrap(),
+        );
+        assert_eq!(
+            exec.output_partitioning().partition_count(),
+            4,
+            "exec should preserve the input's partition count"
+        );
+        let coalesced: Arc<dyn ExecutionPlan> = Arc::new(CoalescePartitionsExec::new(exec));
+
+        let partial_schema = coalesced.schema();
+        let final_group_by = PhysicalGroupBy::new_single(vec![
+            (col("g0", &partial_schema).unwrap(), "g0".to_string()),
+            (col("g1", &partial_schema).unwrap(), "g1".to_string()),
+        ]);
+        let final_agg = AggregateExec::try_new(
+            AggregateMode::Final,
+            final_group_by,
+            aggr_expr,
+            vec![None],
+            coalesced,
+            partial_schema,
+        )
+        .unwrap();
+
+        let ctx = Arc::new(TaskContext::default());
+        let out = collect(Arc::new(final_agg), ctx).await.unwrap();
+
+        let total_rows: usize = out.iter().map(|b| b.num_rows()).sum();
+        let distinct = distinct_group_rows(&out);
+        assert_eq!(
+            distinct, n,
+            "final emitted {distinct} distinct groups, expected {n}"
+        );
+        assert_eq!(
+            total_rows, n,
+            "final emitted {total_rows} rows, expected {n}"
+        );
+    }
+}

From b1f31b2039a7a624d387b6522cb64b3f598b2187 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Tue, 23 Jun 2026 14:41:01 +0200
Subject: [PATCH 13/16] refactor(cubestore): make GROUP_BY_LIMIT_PER_PARTITION
 toggle over/under-merge hash trim

CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION now genuinely controls where the worker's
group-by-limit hash table lives, defaulting to over-merge:
- off (default): coalesce the partial aggregate's input to a single partition, so
  the worker builds one hash table over the merged partitions (peak ~k, no
  intra-worker aggregation parallelism).
- on: keep the raw multi-partition input, so the aggregate runs per partition
  (N hash tables; more parallelism, peak ~N*k).

Previously the flag only tried to strip a leading CoalescePartitions feeding the
aggregate -- a no-op on every real plan shape (the scan feeds raw partitions
straight into the partial agg, so the trim was always per-partition and the flag
did nothing). Replace strip_leading_coalesce_partitions (now dead, removed) with a
coalesce-insert in the default path.

Both modes produce identical results (verified md5-equal to ClickHouse on the
union hash-trim). A/B on hash_union_trim: over-merge ~1717ms/+203MiB vs under-merge
~1482ms/+239MiB. group_by_limit lib, worker_sort_and_limit_cluster, topk and
limit_pushdown_group suites stay green.
---
 .../distributed_partial_aggregate.rs          | 57 +++++++++----------
 1 file changed, 26 insertions(+), 31 deletions(-)

diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
index 010c9b9fc6121..3ae2e7471cb03 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
@@ -436,20 +436,29 @@ fn resort_worker_subtree(
             return None;
         }
 
-        // Per-partition (CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION): drop the merge below the aggregate
-        // so it runs over every raw partition; the CoalescePartitions below then parallelizes all of
-        // them (N-way) instead of one stream per union branch.
+        // CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION decides where the worker's hash table lives:
+        //  - off (default): coalesce the aggregate's input to a single partition, so the worker
+        //    builds ONE hash table over the merged partitions ("over merge"). Peak memory ~k, no
+        //    intra-worker parallelism on the aggregation.
+        //  - on: keep the raw multi-partition input, so the aggregate runs once per partition
+        //    ("under merge"): N parallel hash tables -- more parallelism, peak ~N*k. The trim keeps
+        //    each per-partition output to ~factor*k, and the global top-k argument (full group key)
+        //    guarantees a surviving group stays within every partition's local top-k.
         let partial = if per_partition_enabled() {
+            partial
+        } else {
             partial
                 .as_any()
                 .downcast_ref::<AggregateExec>()
                 .and_then(|agg| {
-                    let new_input = strip_leading_coalesce_partitions(agg.input());
-                    partial.clone().with_new_children(vec![new_input]).ok()
+                    if agg.input().output_partitioning().partition_count() <= 1 {
+                        return None;
+                    }
+                    let merged: Arc<dyn ExecutionPlan> =
+                        Arc::new(CoalescePartitionsExec::new(agg.input().clone()));
+                    partial.clone().with_new_children(vec![merged]).ok()
                 })
                 .unwrap_or(partial)
-        } else {
-            partial
         };
 
         let order: Vec<(usize, SortOptions)> = cols
@@ -478,10 +487,10 @@ fn resort_worker_subtree(
             None => partial,
         };
 
-        // Emit the trimmed top-k unsorted, coalesced to one stream. The router hash-combines and
-        // re-applies the top-k sort, so no worker-side sort/merge holds the whole result and the
-        // per-partition aggregates run in parallel (CoalescePartitions spawns a task per input)
-        // instead of being drained one at a time by a sort-preserving merge.
+        // Emit the trimmed top-k unsorted, coalesced to one stream for the ClusterSend. The router
+        // hash-combines and re-applies the top-k sort, so no worker-side sort/merge holds the whole
+        // result. Under "over merge" the trimmed agg is already single-partition and this coalesce is
+        // a passthrough; under "under merge" it drains the per-partition aggregates concurrently.
         return Some(Arc::new(CoalescePartitionsExec::new(trimmed)));
     }
 
@@ -521,32 +530,18 @@ fn lex_ordering_from_cols(
     Some(LexOrdering::new(exprs))
 }
 
-/// Toggle (CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION): drop the merge below the trimmed aggregate so it
-/// runs per partition (N-way) instead of per union branch. The `CoalescePartitions` then
-/// parallelizes all partitions.
-///
-/// Trades memory for parallelism: one group table per DF partition (= per parquet file and chunk),
-/// so peak memory is bounded by the partition count, not by k. It stays low only while partitions
-/// are key-local (group keys do not span partitions) -- true for index-sorted pre-aggregations. On
-/// a group key spread across many partitions each table approaches full cardinality (~N x memory).
+/// Toggle (CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION) for the worker hash trim. Off by default: the
+/// aggregate's input is coalesced so the worker builds a single hash table over the merged partitions
+/// ("over merge"), peak ~k. On: the aggregate keeps its raw multi-partition input and runs per
+/// partition ("under merge") -- one group table per DF partition (= per parquet file/chunk), trading
+/// memory for intra-worker parallelism (peak ~N*k; on a key spread across many partitions each table
+/// approaches full cardinality).
 fn per_partition_enabled() -> bool {
     std::env::var("CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION")
         .map(|v| v == "true" || v == "1")
         .unwrap_or(false)
 }
 
-/// Peel the leading `CoalescePartitionsExec` chain directly feeding the aggregate, exposing the
-/// underlying multi-partition streams. Only the immediate coalesce(s) are removed; any
-/// `CoalescePartitionsExec` deeper in the subtree (e.g. one a child inserted to satisfy its own
-/// single-partition input requirement, as in a UNION branch) is left intact so plan semantics are
-/// preserved.
-fn strip_leading_coalesce_partitions(p: &Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
-    if p.as_any().is::<CoalescePartitionsExec>() {
-        return strip_leading_coalesce_partitions(&p.children()[0]);
-    }
-    p.clone()
-}
-
 /// The group/aggregate state of either an `InlineAggregateExec` or a plain `AggregateExec`, when in
 /// Final mode.
 struct FinalAggregateInfo {

From 7d00752559cdb9fb0295f8f286d0dbb45206b581 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Tue, 23 Jun 2026 17:26:18 +0200
Subject: [PATCH 14/16] fix(cubestore): align group-by-limit total-order NULL
 placement across both planters

The top-k total order T (ORDER BY cols ++ remaining group cols) is built in two
places that must agree: the rewriter (pre_optimize, SortOptions::default() =
ascending nulls-first for appended cols) and the descriptor in
compute_worker_sort_and_limit (drives the finalize-time worker cut + router sort).
The descriptor used nulls-last for appended cols, so worker and router could rank a
group with a NULL in an appended column differently and drop its partial states
(latent undercount; safe today only because the descriptor's router branch no-ops
once the rewriter has planted the exec). Align the descriptor to ascending
nulls-first.

Also:
- trim_top_k: guard factor == 0 (no trim), mirroring statistics(); defensive, the
  exec is only built with factor > 0.
- new test limit_pushdown_group_null_appended: multi-column GROUP BY with a NULL in
  the appended total-order column, group spanning three chunks, trim engaged;
  asserts the fully-combined sum (undercount guard), membership fixed by the ORDER
  BY prefix so it does not depend on tie order.
- update planning_topk_hash_aggregate's bare-LIMIT assertion to the aligned
  nulls-first appended order (it had encoded the desync).
---
 .../cubestore-sql-tests/src/tests.rs          | 56 ++++++++++++++++++-
 .../group_by_limit_aggregate_stream.rs        |  4 +-
 .../optimizations/group_by_limit_rewriter.rs  |  2 +-
 .../cubestore/src/queryplanner/planning.rs    |  5 +-
 4 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/rust/cubestore/cubestore-sql-tests/src/tests.rs b/rust/cubestore/cubestore-sql-tests/src/tests.rs
index 6a5a8910528b3..f9d54f405d4b7 100644
--- a/rust/cubestore/cubestore-sql-tests/src/tests.rs
+++ b/rust/cubestore/cubestore-sql-tests/src/tests.rs
@@ -332,6 +332,10 @@ pub fn sql_tests(prefix: &str) -> Vec<(&'static str, TestFn)> {
             "limit_pushdown_group_nonprefix_order",
             limit_pushdown_group_nonprefix_order,
         ),
+        t(
+            "limit_pushdown_group_null_appended",
+            limit_pushdown_group_null_appended,
+        ),
         t("limit_pushdown_group_order", limit_pushdown_group_order),
         t(
             "limit_pushdown_group_where_order",
@@ -426,6 +430,7 @@ lazy_static::lazy_static! {
         "global_aggregate_unique_key_keeps_merge",
         "limit_pushdown_group_having",
         "limit_pushdown_group_nonprefix_order",
+        "limit_pushdown_group_null_appended",
         "prefilter_chunks_shared_scan",
         "planning_topk_hash_aggregate",
         "topk_hash_aggregate_trim",
@@ -3288,13 +3293,15 @@ async fn planning_topk_hash_aggregate(service: Box<dyn SqlClient>) -> Result<(),
 
     // Bare LIMIT (no ORDER BY) on a non-indexed group column: the limit can't ride the index, so the
     // worker still trims to the smallest groups by the full group key -- "any k" made deterministic.
+    // The appended group column uses ascending nulls-first, matching the rewriter's appended-column
+    // order (`SortOptions::default()`) so the worker cut and the router select agree.
     let p = service
         .plan_query("SELECT day, SUM(hits) FROM s.Data GROUP BY 1 LIMIT 10")
         .await?;
     let pp = pp_phys_plan_ext(p.worker.as_ref(), &PPOptions::none());
     assert!(
         pp.contains("GroupByLimitAggregate, k: 10, factor: 2,")
-            && pp.contains("(0, SortOptions { descending: false, nulls_first: false })"),
+            && pp.contains("(0, SortOptions { descending: false, nulls_first: true })"),
         "expected GroupByLimitAggregate on a bare LIMIT, got:\n{}",
         pp
     );
@@ -9305,6 +9312,53 @@ async fn limit_pushdown_group_nonprefix_order(
     Ok(())
 }
 
+/// Multi-column GROUP BY where the appended (non-ORDER-BY) total-order column carries NULLs and the
+/// group spans chunks. `GROUP BY b, c` is not the index prefix, so this is the hash trim path; the
+/// worker cut and the router select both extend the order with the appended column `c`, and they must
+/// agree on its NULL placement or the NULL group's partial states get dropped across chunks
+/// (undercount). The NULL group's membership is fixed by `b` (uniquely smallest), so the assertion
+/// does not depend on the tie order -- only on the sum being fully combined across all three chunks.
+async fn limit_pushdown_group_null_appended(service: Box<dyn SqlClient>) -> Result<(), CubeError> {
+    service.exec_query("CREATE SCHEMA s").await?;
+    service
+        .exec_query("CREATE TABLE s.nag (a int, b int, c int, val int) INDEX bidx (a, b, c)")
+        .await?;
+    // Three chunks. The (b = 1, c = NULL) group appears in every chunk, so it spans all three and
+    // its sum must combine to 1110. Five distinct (b, c) groups so the trim engages (5 > factor*k).
+    service
+        .exec_query(
+            "INSERT INTO s.nag (a, b, c, val) VALUES \
+             (1, 1, NULL, 10), (1, 2, 20, 1), (1, 3, 30, 1), (1, 4, 40, 1), (1, 5, 50, 1)",
+        )
+        .await?;
+    service
+        .exec_query(
+            "INSERT INTO s.nag (a, b, c, val) VALUES \
+             (1, 1, NULL, 100), (1, 2, 20, 1), (1, 3, 30, 1), (1, 4, 40, 1), (1, 5, 50, 1)",
+        )
+        .await?;
+    service
+        .exec_query(
+            "INSERT INTO s.nag (a, b, c, val) VALUES \
+             (1, 1, NULL, 1000), (1, 2, 20, 1), (1, 3, 30, 1), (1, 4, 40, 1), (1, 5, 50, 1)",
+        )
+        .await?;
+
+    // Smallest b is the NULL-c group; its sum must combine across all three chunks.
+    let r = service
+        .exec_query("SELECT b, c, sum(val) FROM s.nag GROUP BY b, c ORDER BY b LIMIT 1")
+        .await?;
+    assert_eq!(
+        to_rows(&r),
+        vec![vec![
+            TableValue::Int(1),
+            TableValue::Null,
+            TableValue::Int(1110),
+        ]]
+    );
+    Ok(())
+}
+
 async fn limit_pushdown_group_order(service: Box<dyn SqlClient>) -> Result<(), CubeError> {
     service.exec_query("CREATE SCHEMA foo").await?;
 
diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
index ff73f8b691b0d..a194edbec5322 100644
--- a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
@@ -179,7 +179,9 @@ impl GroupByLimitAggregateStream {
 
     fn trim_top_k(&self, batch: RecordBatch) -> DFResult<RecordBatch> {
         let g = batch.num_rows();
-        if self.k == 0 || g <= self.factor.saturating_mul(self.k) {
+        // `factor == 0` means "no trimming" (mirrors the guard in `statistics`); the exec is only
+        // built with `factor > 0` today, so this is defensive.
+        if self.k == 0 || self.factor == 0 || g <= self.factor.saturating_mul(self.k) {
             return Ok(batch);
         }
         let sort_columns: Vec<SortColumn> = self
diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs
index 0b17807bf873b..c2ebe2a667b47 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/group_by_limit_rewriter.rs
@@ -24,7 +24,7 @@ use std::sync::Arc;
 /// partial state for such a group keeps it (its local rank can only be smaller), so every needed
 /// partial state reaches the router. Ordering the router by `T` instead of the bare `ORDER BY` does
 /// not change the query contract: `ORDER BY` is a prefix of `T`, so the output stays validly
-/// ordered and the previously-unspecified tie order just becomes deterministic.
+/// ordered and the tie order, left unspecified by the query, just becomes deterministic.
 ///
 /// We only rewrite when the plan matches exactly `Sort(/Limit) -> [passthrough] -> Final aggregate
 /// -> [passthrough/cluster boundary] -> Partial hash aggregate`; anything else on the path (a
diff --git a/rust/cubestore/cubestore/src/queryplanner/planning.rs b/rust/cubestore/cubestore/src/queryplanner/planning.rs
index a862554ea1725..34c1d6c5be715 100644
--- a/rust/cubestore/cubestore/src/queryplanner/planning.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/planning.rs
@@ -1161,9 +1161,12 @@ impl ChooseIndex<'_> {
             }
         }
         // Extend with the remaining group keys to make it a total order on the full group key.
+        // Their NULL placement is arbitrary (these columns are not in the query's ORDER BY), but it
+        // must match the rewriter's appended-column order (`SortOptions::default()`, i.e. ascending
+        // nulls-first) so the worker cut and the router select agree on the total order.
         for (idx, is_used) in used.iter().enumerate() {
             if !is_used {
-                cols.push((idx, true, false));
+                cols.push((idx, true, true));
             }
         }
         Some((cols, limit))

From 5f6959bac8c119ed64f38484e95cd2062f445313 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Tue, 23 Jun 2026 21:16:45 +0200
Subject: [PATCH 15/16] fix(cubestore): move group-by-limit
 per-partition/coalesce flags into ConfigObj

CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION and CUBESTORE_COALESCE_UNDER_HASH_AGGREGATE
were read via raw std::env::var, bypassing the ConfigObj surface that
CUBESTORE_GROUP_BY_LIMIT_FACTOR already uses -- so they couldn't be overridden by
the test context or configured from the JS side, and the flag surface was
inconsistent (CI review). Add group_by_limit_per_partition() and
coalesce_under_hash_aggregate() to ConfigObj and thread them like
group_by_limit_factor: coalesce via PreOptimizeRule -> pre_optimize_physical_plan,
per-partition via CubeQueryPlanner -> finalize_physical_plan ->
push_worker_sort_and_limit. Delete the two raw-env helpers.

Behavior-preserving: the from-env constructor still reads both via env_bool, so
stend --env keeps working; the test config keeps both false (= previous unset-env
default). Note: env_bool is stricter than the old lenient `=="true"||=="1"` reads
-- a malformed value now fails fast at startup instead of silently defaulting to
false, matching the group_by_limit_factor convention.

Also downgrade an assert_eq! to debug_assert_eq! (the single-group invariant is
guaranteed by try_new_from_partial rejecting grouping sets).
---
 rust/cubestore/cubestore/src/config/mod.rs    | 30 ++++++++++++++++
 .../group_by_limit_aggregate_stream.rs        |  3 +-
 .../distributed_partial_aggregate.rs          | 34 ++++++-------------
 .../src/queryplanner/optimizations/mod.rs     | 25 ++++++++++----
 .../src/queryplanner/query_executor.rs        |  3 ++
 5 files changed, 65 insertions(+), 30 deletions(-)

diff --git a/rust/cubestore/cubestore/src/config/mod.rs b/rust/cubestore/cubestore/src/config/mod.rs
index 378232ad1cd69..42eb40652b314 100644
--- a/rust/cubestore/cubestore/src/config/mod.rs
+++ b/rust/cubestore/cubestore/src/config/mod.rs
@@ -590,6 +590,16 @@ pub trait ConfigObj: DIService {
     /// `k = limit + offset`. `0` disables the optimization.
     fn group_by_limit_factor(&self) -> usize;
 
+    /// When the worker group-by-limit hash trim is active, controls where the worker's hash table
+    /// lives: `false` (default) coalesces the partial aggregate's input to one partition (one hash
+    /// table per worker, "over merge"); `true` keeps the raw multi-partition input so it runs per
+    /// partition ("under merge").
+    fn group_by_limit_per_partition(&self) -> bool;
+
+    /// Replace the sort-preserving merge feeding a grouped Linear (hash) aggregate with a plain
+    /// partition coalesce (the hash aggregate ignores input order, so the per-row merge is wasted).
+    fn coalesce_under_hash_aggregate(&self) -> bool;
+
     fn allow_decimal128(&self) -> bool;
 
     fn enable_remove_orphaned_remote_files(&self) -> bool;
@@ -750,6 +760,8 @@ pub struct ConfigObjImpl {
     pub repartition_merge_max_rows: u64,
     pub repartition_check_overlapping_children: bool,
     pub group_by_limit_factor: usize,
+    pub group_by_limit_per_partition: bool,
+    pub coalesce_under_hash_aggregate: bool,
     pub allow_decimal128: bool,
     pub enable_remove_orphaned_remote_files: bool,
     pub enable_startup_warmup: bool,
@@ -1096,6 +1108,14 @@ impl ConfigObj for ConfigObjImpl {
         self.group_by_limit_factor
     }
 
+    fn group_by_limit_per_partition(&self) -> bool {
+        self.group_by_limit_per_partition
+    }
+
+    fn coalesce_under_hash_aggregate(&self) -> bool {
+        self.coalesce_under_hash_aggregate
+    }
+
     fn allow_decimal128(&self) -> bool {
         self.allow_decimal128
     }
@@ -1793,6 +1813,14 @@ impl Config {
                     false,
                 ),
                 group_by_limit_factor: env_parse("CUBESTORE_GROUP_BY_LIMIT_FACTOR", 0),
+                group_by_limit_per_partition: env_bool(
+                    "CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION",
+                    false,
+                ),
+                coalesce_under_hash_aggregate: env_bool(
+                    "CUBESTORE_COALESCE_UNDER_HASH_AGGREGATE",
+                    false,
+                ),
                 allow_decimal128: env_bool("CUBESTORE_ALLOW_DECIMAL128", false),
                 enable_remove_orphaned_remote_files: env_bool(
                     "CUBESTORE_ENABLE_REMOVE_ORPHANED_REMOTE_FILES",
@@ -2050,6 +2078,8 @@ impl Config {
                 repartition_merge_max_rows: 4_000_000,
                 repartition_check_overlapping_children: false,
                 group_by_limit_factor: 2,
+                group_by_limit_per_partition: false,
+                coalesce_under_hash_aggregate: false,
                 allow_decimal128: false,
                 enable_remove_orphaned_remote_files: false,
                 enable_startup_warmup: true,
diff --git a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
index a194edbec5322..707b55e889359 100644
--- a/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/group_by_limit_aggregate/group_by_limit_aggregate_stream.rs
@@ -145,7 +145,8 @@ impl GroupByLimitAggregateStream {
         let input_values = evaluate_many(&self.aggregate_arguments, &batch)?;
         let filter_values = evaluate_optional(&self.filter_expressions, &batch)?;
 
-        assert_eq!(group_by_values.len(), 1, "Exactly 1 group value required");
+        // Grouping sets are rejected by try_new_from_partial, so there is exactly one group value.
+        debug_assert_eq!(group_by_values.len(), 1, "Exactly 1 group value required");
         self.group_values
             .intern(&group_by_values[0], &mut self.current_group_indices)?;
         let group_indices = &self.current_group_indices;
diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
index 3ae2e7471cb03..a1dd9586a027b 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/distributed_partial_aggregate.rs
@@ -131,14 +131,6 @@ pub fn push_aggregate_to_workers(
     )?))
 }
 
-/// Whether the experimental grouped-hash coalescing in [`drop_sort_merge_under_global_aggregate`]
-/// is enabled (`CUBESTORE_COALESCE_UNDER_HASH_AGGREGATE`). Read once at the call site, not per node.
-pub fn coalesce_under_hash_aggregate_enabled() -> bool {
-    std::env::var("CUBESTORE_COALESCE_UNDER_HASH_AGGREGATE")
-        .map(|v| v == "true" || v == "1")
-        .unwrap_or(false)
-}
-
 /// A Linear aggregate doesn't use input ordering, but the scan still merges its partitions with a
 /// SortPreservingMergeExec when the index has a sort key (e.g. picked from the filters for partition
 /// pruning). Replace such merges under the aggregate with plain partition coalescing: the per-row
@@ -297,6 +289,7 @@ pub fn push_sorted_partial_aggregate_below_merge(
 pub fn push_worker_sort_and_limit(
     p: Arc<dyn ExecutionPlan>,
     group_by_limit_factor: usize,
+    group_by_limit_per_partition: bool,
 ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
     // Worker side: bound the partial aggregate's output. The sorted (inline) aggregate is always
     // bounded with a per-partition Sort(fetch) -- the group count is unknown, so we can't trim, we
@@ -307,8 +300,13 @@ pub fn push_worker_sort_and_limit(
         let Some((cols, fetch)) = w.worker_sort_and_limit.clone() else {
             return Ok(p);
         };
-        let Some(new_input) = resort_worker_subtree(&w.input, &cols, fetch, group_by_limit_factor)
-        else {
+        let Some(new_input) = resort_worker_subtree(
+            &w.input,
+            &cols,
+            fetch,
+            group_by_limit_factor,
+            group_by_limit_per_partition,
+        ) else {
             return Ok(p);
         };
         return Ok(Arc::new(WorkerExec::new(
@@ -348,6 +346,7 @@ pub fn push_worker_sort_and_limit(
         &cols,
         fetch,
         group_by_limit_factor,
+        group_by_limit_per_partition,
     ) else {
         return Ok(p);
     };
@@ -426,6 +425,7 @@ fn resort_worker_subtree(
     cols: &[(usize, bool, bool)],
     fetch: usize,
     group_by_limit_factor: usize,
+    group_by_limit_per_partition: bool,
 ) -> Option<Arc<dyn ExecutionPlan>> {
     let partial = locate_partial_aggregate(worker_subtree)?;
 
@@ -444,7 +444,7 @@ fn resort_worker_subtree(
         //    ("under merge"): N parallel hash tables -- more parallelism, peak ~N*k. The trim keeps
         //    each per-partition output to ~factor*k, and the global top-k argument (full group key)
         //    guarantees a surviving group stays within every partition's local top-k.
-        let partial = if per_partition_enabled() {
+        let partial = if group_by_limit_per_partition {
             partial
         } else {
             partial
@@ -530,18 +530,6 @@ fn lex_ordering_from_cols(
     Some(LexOrdering::new(exprs))
 }
 
-/// Toggle (CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION) for the worker hash trim. Off by default: the
-/// aggregate's input is coalesced so the worker builds a single hash table over the merged partitions
-/// ("over merge"), peak ~k. On: the aggregate keeps its raw multi-partition input and runs per
-/// partition ("under merge") -- one group table per DF partition (= per parquet file/chunk), trading
-/// memory for intra-worker parallelism (peak ~N*k; on a key spread across many partitions each table
-/// approaches full cardinality).
-fn per_partition_enabled() -> bool {
-    std::env::var("CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION")
-        .map(|v| v == "true" || v == "1")
-        .unwrap_or(false)
-}
-
 /// The group/aggregate state of either an `InlineAggregateExec` or a plain `AggregateExec`, when in
 /// Final mode.
 struct FinalAggregateInfo {
diff --git a/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs b/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
index ca9de720acd92..ccf44b753d11e 100644
--- a/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/optimizations/mod.rs
@@ -46,6 +46,7 @@ pub struct CubeQueryPlanner {
     memory_handler: Arc<dyn MemoryHandler>,
     data_loaded_size: Option<Arc<DataLoadedSize>>,
     group_by_limit_factor: usize,
+    group_by_limit_per_partition: bool,
 }
 
 impl CubeQueryPlanner {
@@ -54,6 +55,7 @@ impl CubeQueryPlanner {
         serialized_plan: Arc<PreSerializedPlan>,
         memory_handler: Arc<dyn MemoryHandler>,
         group_by_limit_factor: usize,
+        group_by_limit_per_partition: bool,
     ) -> CubeQueryPlanner {
         CubeQueryPlanner {
             cluster: Some(cluster),
@@ -62,6 +64,7 @@ impl CubeQueryPlanner {
             memory_handler,
             data_loaded_size: None,
             group_by_limit_factor,
+            group_by_limit_per_partition,
         }
     }
 
@@ -71,6 +74,7 @@ impl CubeQueryPlanner {
         memory_handler: Arc<dyn MemoryHandler>,
         data_loaded_size: Option<Arc<DataLoadedSize>>,
         group_by_limit_factor: usize,
+        group_by_limit_per_partition: bool,
     ) -> CubeQueryPlanner {
         CubeQueryPlanner {
             serialized_plan,
@@ -79,6 +83,7 @@ impl CubeQueryPlanner {
             memory_handler,
             data_loaded_size,
             group_by_limit_factor,
+            group_by_limit_per_partition,
         }
     }
 }
@@ -112,6 +117,7 @@ impl QueryPlanner for CubeQueryPlanner {
             self.data_loaded_size.clone(),
             ctx_state.config().options(),
             self.group_by_limit_factor,
+            self.group_by_limit_per_partition,
         );
         result
     }
@@ -121,13 +127,19 @@ impl QueryPlanner for CubeQueryPlanner {
 pub struct PreOptimizeRule {
     push_partial_aggregate_below_merge: bool,
     group_by_limit_factor: usize,
+    coalesce_under_hash_aggregate: bool,
 }
 
 impl PreOptimizeRule {
-    pub fn new(push_partial_aggregate_below_merge: bool, group_by_limit_factor: usize) -> Self {
+    pub fn new(
+        push_partial_aggregate_below_merge: bool,
+        group_by_limit_factor: usize,
+        coalesce_under_hash_aggregate: bool,
+    ) -> Self {
         Self {
             push_partial_aggregate_below_merge,
             group_by_limit_factor,
+            coalesce_under_hash_aggregate,
         }
     }
 }
@@ -142,6 +154,7 @@ impl PhysicalOptimizerRule for PreOptimizeRule {
             plan,
             self.push_partial_aggregate_below_merge,
             self.group_by_limit_factor,
+            self.coalesce_under_hash_aggregate,
         )
     }
 
@@ -158,6 +171,7 @@ fn pre_optimize_physical_plan(
     p: Arc<dyn ExecutionPlan>,
     push_partial_aggregate_below_merge: bool,
     group_by_limit_factor: usize,
+    coalesce_under_hash_aggregate: bool,
 ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
     let p = rewrite_physical_plan(p, &mut |p| push_aggregate_to_workers(p))?;
 
@@ -174,11 +188,9 @@ fn pre_optimize_physical_plan(
     };
 
     // Global (no GROUP BY) aggregates -- and, when enabled, grouped hash aggregates -- don't need
-    // their input merged in the sort order. Read the flag once, not per node.
-    let coalesce_grouped_hash =
-        distributed_partial_aggregate::coalesce_under_hash_aggregate_enabled();
+    // their input merged in the sort order.
     let p = rewrite_physical_plan(p, &mut |p| {
-        drop_sort_merge_under_global_aggregate(p, coalesce_grouped_hash)
+        drop_sort_merge_under_global_aggregate(p, coalesce_under_hash_aggregate)
     })?;
 
     // Replace sorted AggregateExec with InlineAggregateExec for better performance
@@ -199,6 +211,7 @@ fn finalize_physical_plan(
     data_loaded_size: Option<Arc<DataLoadedSize>>,
     config: &ConfigOptions,
     group_by_limit_factor: usize,
+    group_by_limit_per_partition: bool,
 ) -> Result<Arc<dyn ExecutionPlan>, DataFusionError> {
     let p = rewrite_physical_plan(p, &mut |p| add_check_memory_exec(p, memory_handler.clone()))?;
     log::trace!(
@@ -228,7 +241,7 @@ fn finalize_physical_plan(
     // after replace_suboptimal_merge_sorts so it doesn't push the query's row limit into the
     // worker merge we add (which would cut uncombined partial rows and undercount).
     let p = rewrite_physical_plan(p, &mut |p| {
-        push_worker_sort_and_limit(p, group_by_limit_factor)
+        push_worker_sort_and_limit(p, group_by_limit_factor, group_by_limit_per_partition)
     })?;
     log::trace!(
         "Rewrote physical plan by push_worker_sort_and_limit:\n{}",
diff --git a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
index 6025288706314..c049b44c5b0ed 100644
--- a/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
+++ b/rust/cubestore/cubestore/src/queryplanner/query_executor.rs
@@ -568,6 +568,7 @@ impl QueryExecutorImpl {
             serialized_plan,
             self.memory_handler.clone(),
             self.config.group_by_limit_factor(),
+            self.config.group_by_limit_per_partition(),
         ))
     }
 
@@ -584,6 +585,7 @@ impl QueryExecutorImpl {
             self.memory_handler.clone(),
             data_loaded_size.clone(),
             self.config.group_by_limit_factor(),
+            self.config.group_by_limit_per_partition(),
         ))
     }
 
@@ -607,6 +609,7 @@ impl QueryExecutorImpl {
             Arc::new(PreOptimizeRule::new(
                 self.config.push_partial_aggregate_below_merge_enabled(),
                 self.config.group_by_limit_factor(),
+                self.config.coalesce_under_hash_aggregate(),
             )),
             // DF rules without EnforceDistribution.  We do need to keep EnforceSorting.
             Arc::new(OutputRequirements::new_add_mode()),

From 6697321cc6cb0420131860c38456e2e06f41d5c7 Mon Sep 17 00:00:00 2001
From: Aleksandr Romanenko <alex.romanenko@cube.dev>
Date: Tue, 23 Jun 2026 21:19:42 +0200
Subject: [PATCH 16/16] fix(cubestore): don't panic on a malformed
 group-by-limit flag value

env_bool panics at startup on anything but 0/1/true/false. For opt-in performance
toggles a typo in CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION /
CUBESTORE_COALESCE_UNDER_HASH_AGGREGATE must not take a node down. Read them with a
lenient env_flag helper (1/true = on, anything else = off), restoring the previous
raw-env behavior while keeping the ConfigObj surface.
---
 rust/cubestore/cubestore/src/config/mod.rs | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/rust/cubestore/cubestore/src/config/mod.rs b/rust/cubestore/cubestore/src/config/mod.rs
index 42eb40652b314..524228f667d97 100644
--- a/rust/cubestore/cubestore/src/config/mod.rs
+++ b/rust/cubestore/cubestore/src/config/mod.rs
@@ -1302,6 +1302,13 @@ fn env_bool(name: &str, default: bool) -> bool {
         .unwrap_or(default)
 }
 
+/// Lenient boolean env read for opt-in toggles: only `1`/`true` enable it, anything else (including a
+/// typo) is treated as off. Unlike [`env_bool`] it never panics -- a malformed value on a
+/// performance flag must not take a node down on startup.
+fn env_flag(name: &str) -> bool {
+    env::var(name).map_or(false, |v| v == "true" || v == "1")
+}
+
 pub fn env_parse<T>(name: &str, default: T) -> T
 where
     T: FromStr,
@@ -1813,14 +1820,8 @@ impl Config {
                     false,
                 ),
                 group_by_limit_factor: env_parse("CUBESTORE_GROUP_BY_LIMIT_FACTOR", 0),
-                group_by_limit_per_partition: env_bool(
-                    "CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION",
-                    false,
-                ),
-                coalesce_under_hash_aggregate: env_bool(
-                    "CUBESTORE_COALESCE_UNDER_HASH_AGGREGATE",
-                    false,
-                ),
+                group_by_limit_per_partition: env_flag("CUBESTORE_GROUP_BY_LIMIT_PER_PARTITION"),
+                coalesce_under_hash_aggregate: env_flag("CUBESTORE_COALESCE_UNDER_HASH_AGGREGATE"),
                 allow_decimal128: env_bool("CUBESTORE_ALLOW_DECIMAL128", false),
                 enable_remove_orphaned_remote_files: env_bool(
                     "CUBESTORE_ENABLE_REMOVE_ORPHANED_REMOTE_FILES",