From 7ade456d43df177d1e620bc6f394e1cf1906cab9 Mon Sep 17 00:00:00 2001 From: Akanksha Akkihal Date: Tue, 14 Apr 2026 02:29:07 +0000 Subject: [PATCH 01/10] support multiple SELECT columns with single aggregate in SQL matcher --- .../src/ast_matching/sqlparser_test.rs | 20 ++ .../src/ast_matching/sqlpattern_matcher.rs | 14 +- .../src/ast_matching/sqlpattern_parser.rs | 191 +++++++++++------- .../src/engines/simple_engine.rs | 37 ++-- .../src/tests/sql_pattern_matching_tests.rs | 1 + 5 files changed, 166 insertions(+), 97 deletions(-) diff --git a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs index 72b0940b..3bde87ed 100644 --- a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs +++ b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs @@ -60,6 +60,26 @@ mod tests { assert_eq!(result.error, expected_error); } + /// Multi-column SELECT: GROUP BY keys repeated as bare identifiers + one aggregate (ClickHouse style). + #[test] + fn test_multi_column_select_quantile_matches_single_aggregate_semantics() { + let sql = "\ + SELECT L1, L2, L3, L4, quantile(0.99)(value) AS p99 \ + FROM cpu_usage \ + WHERE time BETWEEN DATEADD(s, -10, '2025-10-01 00:00:00') AND '2025-10-01 00:00:00' \ + GROUP BY L1, L2, L3, L4"; + let q = parse_sql_query(sql).expect("multi-column quantile should parse"); + assert_eq!(q.metric, "cpu_usage"); + assert_eq!(q.aggregation_info.get_name(), "QUANTILE"); + assert_eq!( + q.labels, + HashSet::from_iter( + ["L1", "L2", "L3", "L4"] + .map(|s| s.to_string()) + ) + ); + } + // ── Basic smoke tests ──────────────────────────────────────────────────── #[test] diff --git a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_matcher.rs b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_matcher.rs index 1aac0da6..8d51c5e9 100644 --- a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_matcher.rs +++ b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_matcher.rs @@ -174,17 +174,18 @@ impl SQLPatternMatcher { )); } + // Wall-clock span in units of prometheus scrape interval (ratio, not seconds). let scrape_duration = time_info.get_duration(); scraped_intervals = scrape_duration / self.scrape_interval; - if scraped_intervals < self.scrape_interval { + if scraped_intervals < 1.0 { println!("Returned QueryError::SpatialDurationSmall"); return Err(( QueryError::SpatialDurationSmall, format!( - "scrape duration {} less than one interval {}", - scraped_intervals, self.scrape_interval + "query time span {:.4} is shorter than one scrape interval ({:.4} s)", + scrape_duration, self.scrape_interval ), )); } @@ -216,7 +217,8 @@ impl SQLPatternMatcher { let mut sql_query = SQLQuery::new(Vec::new(), None, None); - for (i, (metric, aggregation_info, scrape_duration, labels, time_info)) in + // Third tuple field is `wall_seconds / scrape_interval` (how many scrape steps). 
+ for (i, (metric, aggregation_info, scrape_steps, labels, time_info)) in query_data.iter().enumerate() { if i < query_data.len() - 1 { @@ -233,7 +235,7 @@ impl SQLPatternMatcher { // Last query // let time_info = TimeInfo::new("time".to_string(), *start, *scrape_duration); - if (scrape_duration - self.scrape_interval).abs() < f64::EPSILON { + if (scrape_steps - 1.0).abs() < f64::EPSILON { sql_query.add_subquery( QueryType::Spatial, aggregation_info.clone(), @@ -241,7 +243,7 @@ impl SQLPatternMatcher { labels.clone(), time_info.clone(), ); - } else if *scrape_duration > self.scrape_interval { + } else if *scrape_steps > 1.0 { // Check if labels match all metadata columns let has_all_labels = self .schema diff --git a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs index 3c833a08..0dc520ce 100644 --- a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs +++ b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs @@ -83,9 +83,8 @@ impl SQLPatternParser { fn parse_select(&self, select: &Select) -> Option { let (metric, has_subquery) = self.get_metric(select)?; - let aggregation = self.get_aggregation(select)?; - let group_bys = self.get_groupbys(select)?; + let aggregation = self.get_aggregation(select, &group_bys)?; if !has_subquery { let time_info = self.get_time_info(select, &metric)?; @@ -119,8 +118,9 @@ impl SQLPatternParser { let subquery = match &select.from[0].relation { TableFactor::Derived { subquery, .. } => match subquery.body.as_ref() { SetExpr::Select(inner_select) => { - let inner_aggregation = self.get_aggregation(inner_select)?; let inner_group_bys = self.get_groupbys(inner_select)?; + let inner_aggregation = + self.get_aggregation(inner_select, &inner_group_bys)?; let time_info = self.get_time_info(inner_select, &metric)?; Some(Box::new(SQLQueryData { @@ -205,89 +205,130 @@ impl SQLPatternParser { } } - fn get_aggregation(&self, select: &Select) -> Option { - if select.projection.len() != 1 { - return None; - } + /// Parses `func(...)` into `AggregationInfo` (SUM, COUNT, QUANTILE, etc.). + fn try_parse_aggregation_function(&self, func: &Function) -> Option { + let name = func.name.to_string().to_uppercase(); - match &select.projection[0] { - SelectItem::UnnamedExpr(Expr::Function(func)) - | SelectItem::ExprWithAlias { - expr: Expr::Function(func), - .. - } => { - let name = func.name.to_string().to_uppercase(); - - let args = self.get_quantile_args(func); - - // Get the column being aggregated - let col = match &func.args { - FunctionArguments::None => return None, - FunctionArguments::Subquery(_) => return None, - FunctionArguments::List(func_args) => { - if name == "QUANTILE" { - if let FunctionArguments::List(params) = &func.parameters { - if !params.args.is_empty() { - // ClickHouse parametric syntax: quantile(0.95)(column) - // Column is the sole argument in func.args. 
- match func_args.args.first() { - Some(FunctionArg::Unnamed(FunctionArgExpr::Expr( - Expr::Identifier(ident), - ))) => ident.value.clone(), - _ => return None, - } - } else { - return None; - } - } else { - // ASAP syntax: QUANTILE(0.95, value) - column is second argument - if func_args.args.len() < 2 { - return None; - } - match &func_args.args[1] { - FunctionArg::Unnamed(FunctionArgExpr::Expr( - Expr::Identifier(ident), - )) => ident.value.clone(), - _ => return None, - } - } - } else if name == "PERCENTILE" { - // PERCENTILE(value, 95) - column is first argument - if func_args.args.is_empty() { - return None; - } - match &func_args.args[0] { - FunctionArg::Unnamed(FunctionArgExpr::Expr(Expr::Identifier( - ident, + let args = self.get_quantile_args(func); + + // Get the column being aggregated + let col = match &func.args { + FunctionArguments::None => return None, + FunctionArguments::Subquery(_) => return None, + FunctionArguments::List(func_args) => { + if name == "QUANTILE" { + if let FunctionArguments::List(params) = &func.parameters { + if !params.args.is_empty() { + // ClickHouse parametric syntax: quantile(0.95)(column) + match func_args.args.first() { + Some(FunctionArg::Unnamed(FunctionArgExpr::Expr( + Expr::Identifier(ident), ))) => ident.value.clone(), _ => return None, } } else { - // For other aggregations - column is first argument - if func_args.args.is_empty() { - return None; - } - match &func_args.args[0] { - FunctionArg::Unnamed(FunctionArgExpr::Expr(Expr::Identifier( - ident, - ))) => ident.value.clone(), - _ => return None, - } + return None; + } + } else { + // ASAP syntax: QUANTILE(0.95, value) - column is second argument + if func_args.args.len() < 2 { + return None; + } + match &func_args.args[1] { + FunctionArg::Unnamed(FunctionArgExpr::Expr(Expr::Identifier( + ident, + ))) => ident.value.clone(), + _ => return None, } } - }; - - // Always store PERCENTILE as QUANTILE internally - let normalized_name = if name == "PERCENTILE" { - "QUANTILE".to_string() + } else if name == "PERCENTILE" { + if func_args.args.is_empty() { + return None; + } + match &func_args.args[0] { + FunctionArg::Unnamed(FunctionArgExpr::Expr(Expr::Identifier(ident))) => { + ident.value.clone() + } + _ => return None, + } } else { - name - }; + if func_args.args.is_empty() { + return None; + } + match &func_args.args[0] { + FunctionArg::Unnamed(FunctionArgExpr::Expr(Expr::Identifier(ident))) => { + ident.value.clone() + } + _ => return None, + } + } + } + }; + + let normalized_name = if name == "PERCENTILE" { + "QUANTILE".to_string() + } else { + name + }; + + Some(AggregationInfo::new(normalized_name, col, args)) + } + + /// Single aggregate in `SELECT`, or grouping columns (identifiers) plus exactly one aggregate. + /// In the multi-column form, bare identifiers must match `GROUP BY` exactly as a set. + fn get_aggregation( + &self, + select: &Select, + group_bys: &HashSet, + ) -> Option { + if select.projection.is_empty() { + return None; + } + + if select.projection.len() == 1 { + return match &select.projection[0] { + SelectItem::UnnamedExpr(Expr::Function(func)) + | SelectItem::ExprWithAlias { + expr: Expr::Function(func), + .. 
+ } => self.try_parse_aggregation_function(func), + _ => None, + }; + } + + let mut agg: Option = None; + let mut key_cols: HashSet = HashSet::new(); - Some(AggregationInfo::new(normalized_name, col, args)) + for item in &select.projection { + match item { + SelectItem::UnnamedExpr(Expr::Function(func)) + | SelectItem::ExprWithAlias { + expr: Expr::Function(func), + .. + } => { + let parsed = self.try_parse_aggregation_function(func)?; + if agg.replace(parsed).is_some() { + return None; + } + } + SelectItem::UnnamedExpr(Expr::Identifier(ident)) => { + key_cols.insert(ident.value.clone()); + } + SelectItem::ExprWithAlias { + expr: Expr::Identifier(ident), + .. + } => { + key_cols.insert(ident.value.clone()); + } + _ => return None, } - _ => None, } + + let agg = agg?; + if &key_cols != group_bys { + return None; + } + Some(agg) } fn get_metric(&self, select: &Select) -> Option<(String, bool)> { diff --git a/asap-query-engine/src/engines/simple_engine.rs b/asap-query-engine/src/engines/simple_engine.rs index 64da6ebb..f9543f21 100644 --- a/asap-query-engine/src/engines/simple_engine.rs +++ b/asap-query-engine/src/engines/simple_engine.rs @@ -395,14 +395,18 @@ impl SimpleEngine { ) -> u64 { match query_pattern_type { QueryPatternType::OnlyTemporal => { - let scrape_intervals = - match_result.query_data[0].time_info.clone().get_duration() as u64; - end_timestamp - (scrape_intervals * self.prometheus_scrape_interval * 1000) + let window_ms = (match_result.query_data[0].time_info.get_duration() + * self.prometheus_scrape_interval as f64 + * 1000.0) + .round() as u64; + end_timestamp.saturating_sub(window_ms) } QueryPatternType::OneTemporalOneSpatial => { - let scrape_intervals = - match_result.query_data[1].time_info.clone().get_duration() as u64; - end_timestamp - (scrape_intervals * self.prometheus_scrape_interval * 1000) + let window_ms = (match_result.query_data[1].time_info.get_duration() + * self.prometheus_scrape_interval as f64 + * 1000.0) + .round() as u64; + end_timestamp.saturating_sub(window_ms) } QueryPatternType::OnlySpatial => { end_timestamp - (self.prometheus_scrape_interval * 1000) @@ -485,7 +489,7 @@ impl SimpleEngine { match_result .query_data .first() - .map(|data| data.aggregation_info.get_args()[0].to_string()) + .and_then(|data| data.aggregation_info.get_args().first().map(|s| s.to_string())) } /// Extracts topk k parameter from PromQL match result @@ -551,7 +555,6 @@ impl SimpleEngine { .ok_or_else(|| "Missing quantile parameter for quantile query".to_string())?; query_kwargs.insert("quantile".to_string(), quantile); } - // Note: SQL doesn't support topk limiting yet Ok(query_kwargs) } @@ -1620,13 +1623,13 @@ impl SimpleEngine { let data_range_ms = match query_pattern_type { QueryPatternType::OnlySpatial => None, QueryPatternType::OnlyTemporal => { - let scrape_intervals = query_data.time_info.clone().get_duration() as u64; - Some(scrape_intervals * self.prometheus_scrape_interval * 1000) + let window_ms = (query_data.time_info.get_duration() * 1000.0).round() as u64; + Some(window_ms) } QueryPatternType::OneTemporalOneSpatial => { - let scrape_intervals = - match_result.query_data[1].time_info.clone().get_duration() as u64; - Some(scrape_intervals * self.prometheus_scrape_interval * 1000) + let window_ms = + (match_result.query_data[1].time_info.get_duration() * 1000.0).round() as u64; + Some(window_ms) } }; @@ -2028,9 +2031,11 @@ impl SimpleEngine { // Calculate timestamps - similar to OnlyTemporal let end_timestamp = self.validate_and_align_end_timestamp(query_time, 
QueryPatternType::OnlyTemporal); - let scrape_intervals = match_result.query_data[0].time_info.get_duration() as u64; - let start_timestamp = - end_timestamp - (scrape_intervals * self.prometheus_scrape_interval * 1000); + let window_ms = (match_result.query_data[0].time_info.get_duration() + * self.prometheus_scrape_interval as f64 + * 1000.0) + .round() as u64; + let start_timestamp = end_timestamp.saturating_sub(window_ms); let timestamps = QueryTimestamps { start_timestamp, diff --git a/asap-query-engine/src/tests/sql_pattern_matching_tests.rs b/asap-query-engine/src/tests/sql_pattern_matching_tests.rs index 44db4fc2..54270d23 100644 --- a/asap-query-engine/src/tests/sql_pattern_matching_tests.rs +++ b/asap-query-engine/src/tests/sql_pattern_matching_tests.rs @@ -191,4 +191,5 @@ mod tests { "Expected build_query_execution_context_sql to return None for a query that doesn't match the template, got Some." ); } + } From 9258c585bfa88cec84d26790236923749ebfda31 Mon Sep 17 00:00:00 2001 From: akan_27 Date: Tue, 14 Apr 2026 03:25:52 +0000 Subject: [PATCH 02/10] add SQL ORDER BY support for grouped aggregate results --- .../src/ast_matching/sqlparser_test.rs | 9 ++ .../src/ast_matching/sqlpattern_parser.rs | 1 - .../src/engines/simple_engine.rs | 148 +++++++++++++++++- asap-query-engine/src/main.rs | 22 ++- .../src/tests/sql_pattern_matching_tests.rs | 54 ++++++- 5 files changed, 218 insertions(+), 16 deletions(-) diff --git a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs index 3bde87ed..8a42c4d3 100644 --- a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs +++ b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs @@ -80,6 +80,15 @@ mod tests { ); } + #[test] + fn test_order_by_with_grouped_count_is_accepted() { + check_query( + "SELECT L1, COUNT(value) AS c FROM cpu_usage WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() GROUP BY L1 ORDER BY c DESC", + vec![QueryType::SpatioTemporal], + None, + ); + } + // ── Basic smoke tests ──────────────────────────────────────────────────── #[test] diff --git a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs index 0dc520ce..c93cce41 100644 --- a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs +++ b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs @@ -97,7 +97,6 @@ impl SQLPatternParser { || select.prewhere.is_some() || !select.cluster_by.is_empty() || !select.distribute_by.is_empty() - || !select.sort_by.is_empty() || select.having.is_some() || !select.named_window.is_empty() || select.window_before_qualify diff --git a/asap-query-engine/src/engines/simple_engine.rs b/asap-query-engine/src/engines/simple_engine.rs index f9543f21..b73a83cb 100644 --- a/asap-query-engine/src/engines/simple_engine.rs +++ b/asap-query-engine/src/engines/simple_engine.rs @@ -31,6 +31,7 @@ use promql_utilities::query_logics::parsing::{ use sql_utilities::ast_matching::QueryType; use sql_utilities::ast_matching::{SQLPatternMatcher, SQLPatternParser, SQLQuery}; use sql_utilities::sqlhelper::{AggregationInfo, SQLQueryData}; +use sqlparser::ast::{OrderByKind, SelectItem, SetExpr, Statement}; use sqlparser::dialect::*; use sqlparser::parser::Parser as parser; @@ -42,6 +43,18 @@ use elastic_dsl_utilities::types::{EsDslQueryPattern, GroupBySpec, 
MetricAggType // Type alias for merged outputs (single aggregate per key after merging) type MergedOutputsMap = HashMap, Box>; +#[derive(Debug, Clone, PartialEq, Eq)] +enum SqlOrderByTarget { + Value, + Label(usize), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct SqlOrderBySpec { + target: SqlOrderByTarget, + asc: bool, +} + /// Metadata extracted from a query, independent of query language #[derive(Debug, Clone)] pub struct QueryMetadata { @@ -1539,6 +1552,134 @@ impl SimpleEngine { .collect() } + fn extract_sql_value_alias(statement: &Statement) -> Option { + let Statement::Query(query) = statement else { + return None; + }; + + let SetExpr::Select(select) = query.body.as_ref() else { + return None; + }; + + for item in &select.projection { + if let SelectItem::ExprWithAlias { + expr: sqlparser::ast::Expr::Function(_), + alias, + } = item + { + return Some(alias.value.clone()); + } + } + + None + } + + fn parse_sql_order_by_spec(query: &str, query_output_labels: &KeyByLabelNames) -> Option { + let statements = parser::parse_sql(&GenericDialect {}, query).ok()?; + let statement = statements.first()?; + let Statement::Query(parsed_query) = statement else { + return None; + }; + + let order_by = parsed_query.order_by.as_ref()?; + let order_exprs = match &order_by.kind { + OrderByKind::Expressions(exprs) => exprs, + _ => return None, + }; + + let first_order_expr = order_exprs.first()?; + let asc = first_order_expr.options.asc.unwrap_or(true); + let value_alias = Self::extract_sql_value_alias(statement); + + let target = match &first_order_expr.expr { + sqlparser::ast::Expr::Identifier(ident) => { + if let Some(idx) = query_output_labels + .labels + .iter() + .position(|label| label == &ident.value) + { + SqlOrderByTarget::Label(idx) + } else if value_alias + .as_ref() + .map(|alias| alias.eq_ignore_ascii_case(&ident.value)) + .unwrap_or(false) + || ident.value.eq_ignore_ascii_case("value") + { + SqlOrderByTarget::Value + } else { + return None; + } + } + sqlparser::ast::Expr::CompoundIdentifier(idents) => { + let last = idents.last()?.value.as_str(); + if let Some(idx) = query_output_labels + .labels + .iter() + .position(|label| label.eq_ignore_ascii_case(last)) + { + SqlOrderByTarget::Label(idx) + } else if value_alias + .as_ref() + .map(|alias| alias.eq_ignore_ascii_case(last)) + .unwrap_or(false) + || last.eq_ignore_ascii_case("value") + { + SqlOrderByTarget::Value + } else { + return None; + } + } + sqlparser::ast::Expr::Function(_) => SqlOrderByTarget::Value, + sqlparser::ast::Expr::Value(v) => match &v.value { + sqlparser::ast::Value::Number(pos, _) => { + let position = pos.parse::().ok()?; + if position == 0 { + return None; + } + if position <= query_output_labels.labels.len() { + SqlOrderByTarget::Label(position - 1) + } else if position == query_output_labels.labels.len() + 1 { + SqlOrderByTarget::Value + } else { + return None; + } + } + _ => return None, + }, + _ => return None, + }; + + Some(SqlOrderBySpec { target, asc }) + } + + pub(crate) fn apply_sql_order_by( + mut results: Vec, + query_output_labels: &KeyByLabelNames, + query: &str, + ) -> Vec { + let Some(spec) = Self::parse_sql_order_by_spec(query, query_output_labels) else { + return results; + }; + + results.sort_by(|a, b| match &spec.target { + SqlOrderByTarget::Value => a + .value + .partial_cmp(&b.value) + .unwrap_or(std::cmp::Ordering::Equal), + SqlOrderByTarget::Label(idx) => { + let a_label = a.labels.labels.get(*idx).map(String::as_str).unwrap_or(""); + let b_label = 
b.labels.labels.get(*idx).map(String::as_str).unwrap_or(""); + a_label.cmp(b_label) + } + }); + + if !spec.asc { + results.reverse(); + } + + results + } + fn sql_get_is_collapsable( &self, temporal_aggregation: &AggregationInfo, @@ -1714,7 +1855,7 @@ impl SimpleEngine { query: String, time: f64, ) -> Option<(KeyByLabelNames, QueryResult)> { - let context = self.build_query_execution_context_sql(query, time)?; + let context = self.build_query_execution_context_sql(query.clone(), time)?; // Execute complete query pipeline let results = self .execute_query_pipeline(&context, false) // SQL: topk disabled @@ -1724,9 +1865,12 @@ impl SimpleEngine { }) .ok()?; + let ordered_results = + Self::apply_sql_order_by(results, &context.metadata.query_output_labels, &query); + Some(( context.metadata.query_output_labels, - QueryResult::vector(results, context.query_time), + QueryResult::vector(ordered_results, context.query_time), )) } diff --git a/asap-query-engine/src/main.rs b/asap-query-engine/src/main.rs index 0be1d13c..3950af93 100644 --- a/asap-query-engine/src/main.rs +++ b/asap-query-engine/src/main.rs @@ -330,20 +330,18 @@ async fn main() -> Result<()> { //info!("=== TEMPORARY: Using ClickHouse HTTP adapter ==="); //info!("ClickHouse endpoint will be available at: /clickhouse/query"); - //info!("ClickHouse fallback URL: http://localhost:8123/?database=default"); - - //let adapter_config = AdapterConfig::clickhouse_sql( - // "http://localhost:8123".to_string(), // ClickHouse server URL - // "default".to_string(), // Database name - // true, // Always forward (fallback for every query) - //); - - // Original Prometheus config (commented out temporarily): - let adapter_config = AdapterConfig::prometheus_promql( - args.prometheus_server.clone(), - args.forward_unsupported_queries, + let adapter_config = AdapterConfig::clickhouse_sql( + "http://localhost:8123".to_string(), + "default".to_string(), + true, ); + // Prometheus config (commented out): + // let adapter_config = AdapterConfig::prometheus_promql( + // args.prometheus_server.clone(), + // args.forward_unsupported_queries, + // ); + let http_config = HttpServerConfig { port: args.http_port, handle_http_requests: true, diff --git a/asap-query-engine/src/tests/sql_pattern_matching_tests.rs b/asap-query-engine/src/tests/sql_pattern_matching_tests.rs index 54270d23..2e5dd871 100644 --- a/asap-query-engine/src/tests/sql_pattern_matching_tests.rs +++ b/asap-query-engine/src/tests/sql_pattern_matching_tests.rs @@ -6,9 +6,10 @@ #[cfg(test)] mod tests { use crate::data_model::{ - AggregationConfig, AggregationReference, AggregationType, CleanupPolicy, InferenceConfig, + AggregationConfig, AggregationReference, AggregationType, CleanupPolicy, InferenceConfig, KeyByLabelValues, QueryConfig, QueryLanguage, SchemaConfig, StreamingConfig, WindowType, }; + use crate::engines::query_result::InstantVectorElement; use crate::engines::simple_engine::SimpleEngine; use crate::stores::simple_map_store::SimpleMapStore; use promql_utilities::data_model::KeyByLabelNames; @@ -192,4 +193,55 @@ mod tests { ); } + #[test] + fn test_sql_order_by_value_desc_applies_result_sorting() { + let labels = KeyByLabelNames::new(vec!["srcip".to_string()]); + let query = "SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY transfer_events DESC"; + let results = vec![ + InstantVectorElement::new( + KeyByLabelValues::new_with_labels(vec!["10.0.0.2".to_string()]), + 3.0, + ), + 
InstantVectorElement::new( + KeyByLabelValues::new_with_labels(vec!["10.0.0.1".to_string()]), + 9.0, + ), + InstantVectorElement::new( + KeyByLabelValues::new_with_labels(vec!["10.0.0.3".to_string()]), + 5.0, + ), + ]; + + let ordered = SimpleEngine::apply_sql_order_by(results, &labels, query); + let values: Vec = ordered.iter().map(|e| e.value).collect(); + assert_eq!(values, vec![9.0, 5.0, 3.0]); + } + + #[test] + fn test_sql_order_by_label_asc_applies_result_sorting() { + let labels = KeyByLabelNames::new(vec!["srcip".to_string()]); + let query = "SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY srcip ASC"; + let results = vec![ + InstantVectorElement::new( + KeyByLabelValues::new_with_labels(vec!["10.0.0.2".to_string()]), + 3.0, + ), + InstantVectorElement::new( + KeyByLabelValues::new_with_labels(vec!["10.0.0.1".to_string()]), + 9.0, + ), + InstantVectorElement::new( + KeyByLabelValues::new_with_labels(vec!["10.0.0.3".to_string()]), + 5.0, + ), + ]; + + let ordered = SimpleEngine::apply_sql_order_by(results, &labels, query); + let keys: Vec = ordered + .iter() + .map(|e| e.labels.labels.first().cloned().unwrap_or_default()) + .collect(); + assert_eq!(keys, vec!["10.0.0.1", "10.0.0.2", "10.0.0.3"]); + } + } From 7053dd1d4206cec4e7b25cbe0edc28962b0ff82d Mon Sep 17 00:00:00 2001 From: akan_27 Date: Tue, 14 Apr 2026 03:52:51 +0000 Subject: [PATCH 03/10] support SQL ORDER BY and LIMIT for grouped aggregate queries --- .../src/engines/simple_engine.rs | 43 +++++++++++++++- .../src/tests/sql_pattern_matching_tests.rs | 50 +++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) diff --git a/asap-query-engine/src/engines/simple_engine.rs b/asap-query-engine/src/engines/simple_engine.rs index b73a83cb..c0dabcfd 100644 --- a/asap-query-engine/src/engines/simple_engine.rs +++ b/asap-query-engine/src/engines/simple_engine.rs @@ -1680,6 +1680,46 @@ impl SimpleEngine { results } + fn parse_sql_limit(query: &str) -> Option { + let statements = parser::parse_sql(&GenericDialect {}, query).ok()?; + let statement = statements.first()?; + let Statement::Query(parsed_query) = statement else { + return None; + }; + + let limit_clause = parsed_query.limit_clause.as_ref()?; + match limit_clause { + sqlparser::ast::LimitClause::LimitOffset { + limit, + offset: _, + limit_by: _, + } => match limit { + Some(sqlparser::ast::Expr::Value(v)) => match &v.value { + sqlparser::ast::Value::Number(n, _) => n.parse::().ok(), + _ => None, + }, + _ => None, + }, + sqlparser::ast::LimitClause::OffsetCommaLimit { offset: _, limit } => match limit { + sqlparser::ast::Expr::Value(v) => match &v.value { + sqlparser::ast::Value::Number(n, _) => n.parse::().ok(), + _ => None, + }, + _ => None, + }, + } + } + + pub(crate) fn apply_sql_limit( + mut results: Vec, + query: &str, + ) -> Vec { + if let Some(limit) = Self::parse_sql_limit(query) { + results.truncate(limit); + } + results + } + fn sql_get_is_collapsable( &self, temporal_aggregation: &AggregationInfo, @@ -1867,10 +1907,11 @@ impl SimpleEngine { let ordered_results = Self::apply_sql_order_by(results, &context.metadata.query_output_labels, &query); + let final_results = Self::apply_sql_limit(ordered_results, &query); Some(( context.metadata.query_output_labels, - QueryResult::vector(ordered_results, context.query_time), + QueryResult::vector(final_results, context.query_time), )) } diff --git a/asap-query-engine/src/tests/sql_pattern_matching_tests.rs 
b/asap-query-engine/src/tests/sql_pattern_matching_tests.rs index 2e5dd871..b82cade8 100644 --- a/asap-query-engine/src/tests/sql_pattern_matching_tests.rs +++ b/asap-query-engine/src/tests/sql_pattern_matching_tests.rs @@ -244,4 +244,54 @@ mod tests { assert_eq!(keys, vec!["10.0.0.1", "10.0.0.2", "10.0.0.3"]); } + #[test] + fn test_sql_limit_truncates_results() { + let query = "SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip LIMIT 2"; + let results = vec![ + InstantVectorElement::new( + KeyByLabelValues::new_with_labels(vec!["10.0.0.2".to_string()]), + 3.0, + ), + InstantVectorElement::new( + KeyByLabelValues::new_with_labels(vec!["10.0.0.1".to_string()]), + 9.0, + ), + InstantVectorElement::new( + KeyByLabelValues::new_with_labels(vec!["10.0.0.3".to_string()]), + 5.0, + ), + ]; + + let limited = SimpleEngine::apply_sql_limit(results, query); + assert_eq!(limited.len(), 2); + } + + #[test] + fn test_sql_order_by_then_limit_matches_topk_shape() { + let labels = KeyByLabelNames::new(vec!["srcip".to_string()]); + let query = "SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY transfer_events DESC LIMIT 2"; + let results = vec![ + InstantVectorElement::new( + KeyByLabelValues::new_with_labels(vec!["10.0.0.2".to_string()]), + 3.0, + ), + InstantVectorElement::new( + KeyByLabelValues::new_with_labels(vec!["10.0.0.1".to_string()]), + 9.0, + ), + InstantVectorElement::new( + KeyByLabelValues::new_with_labels(vec!["10.0.0.3".to_string()]), + 5.0, + ), + ]; + + let ordered = SimpleEngine::apply_sql_order_by(results, &labels, query); + let top2 = SimpleEngine::apply_sql_limit(ordered, query); + let keys: Vec = top2 + .iter() + .map(|e| e.labels.labels.first().cloned().unwrap_or_default()) + .collect(); + assert_eq!(keys, vec!["10.0.0.1", "10.0.0.3"]); + } + } From 23cc3516ac86eda76256c4538963f29bef4b1819 Mon Sep 17 00:00:00 2001 From: Akanksha Akkihal Date: Tue, 14 Apr 2026 04:22:34 +0000 Subject: [PATCH 04/10] add benchmark scripts and planner configs for KLL and CountMinSketch SQL queries --- .../inference_config.yaml | 16 + .../streaming_config.yaml | 49 +++ .../inference_config.yaml | 15 + .../streaming_config.yaml | 29 ++ .../planner_sql_count_srcip_workload.yaml | 19 + asp_config/planner_sql_quantile_workload.yaml | 21 + .../benchmark_baseline_vs_asap.py | 365 ++++++++++++++++++ benchmark_script/benchmark_count_srcip_1s.py | 201 ++++++++++ benchmark_script/benchmark_quantile_1s.py | 199 ++++++++++ benchmark_script/benchmark_time_windows.py | 181 +++++++++ 10 files changed, 1095 insertions(+) create mode 100644 asp_config/planner_out_count_srcip/inference_config.yaml create mode 100644 asp_config/planner_out_count_srcip/streaming_config.yaml create mode 100644 asp_config/planner_out_quantile/inference_config.yaml create mode 100644 asp_config/planner_out_quantile/streaming_config.yaml create mode 100644 asp_config/planner_sql_count_srcip_workload.yaml create mode 100644 asp_config/planner_sql_quantile_workload.yaml create mode 100644 benchmark_script/benchmark_baseline_vs_asap.py create mode 100644 benchmark_script/benchmark_count_srcip_1s.py create mode 100644 benchmark_script/benchmark_quantile_1s.py create mode 100644 benchmark_script/benchmark_time_windows.py diff --git a/asp_config/planner_out_count_srcip/inference_config.yaml 
b/asp_config/planner_out_count_srcip/inference_config.yaml new file mode 100644 index 00000000..7fd0d61d --- /dev/null +++ b/asp_config/planner_out_count_srcip/inference_config.yaml @@ -0,0 +1,16 @@ +cleanup_policy: + name: no_cleanup +queries: +- aggregations: + - aggregation_id: 1 + - aggregation_id: 2 + query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip +tables: +- name: netflow_table + time_column: time + value_columns: + - pkt_len + metadata_columns: + - srcip + - dstip + - proto diff --git a/asp_config/planner_out_count_srcip/streaming_config.yaml b/asp_config/planner_out_count_srcip/streaming_config.yaml new file mode 100644 index 00000000..502aac5c --- /dev/null +++ b/asp_config/planner_out_count_srcip/streaming_config.yaml @@ -0,0 +1,49 @@ +aggregations: +- aggregationId: 1 + aggregationSubType: '' + aggregationType: DeltaSetAggregator + labels: + aggregated: + - srcip + grouping: [] + rollup: + - dstip + - proto + metric: netflow_table + parameters: {} + slideInterval: 1 + spatialFilter: '' + table_name: netflow_table + value_column: pkt_len + windowSize: 1 + windowType: tumbling +- aggregationId: 2 + aggregationSubType: count + aggregationType: CountMinSketch + labels: + aggregated: + - srcip + grouping: [] + rollup: + - dstip + - proto + metric: netflow_table + parameters: + depth: 3 + width: 1024 + _impl_mode: Sketchlib + slideInterval: 1 + spatialFilter: '' + table_name: netflow_table + value_column: pkt_len + windowSize: 1 + windowType: tumbling +tables: +- name: netflow_table + time_column: time + value_columns: + - pkt_len + metadata_columns: + - srcip + - dstip + - proto diff --git a/asp_config/planner_out_quantile/inference_config.yaml b/asp_config/planner_out_quantile/inference_config.yaml new file mode 100644 index 00000000..66468868 --- /dev/null +++ b/asp_config/planner_out_quantile/inference_config.yaml @@ -0,0 +1,15 @@ +cleanup_policy: + name: no_cleanup +queries: +- aggregations: + - aggregation_id: 21 + query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 +tables: +- name: netflow_table + time_column: time + value_columns: + - pkt_len + metadata_columns: + - srcip + - dstip + - proto diff --git a/asp_config/planner_out_quantile/streaming_config.yaml b/asp_config/planner_out_quantile/streaming_config.yaml new file mode 100644 index 00000000..b6a0bd6a --- /dev/null +++ b/asp_config/planner_out_quantile/streaming_config.yaml @@ -0,0 +1,29 @@ +aggregations: +- aggregationId: 21 + aggregationSubType: '' + aggregationType: DatasketchesKLL + labels: + aggregated: [] + grouping: + - dstip + - proto + - srcip + rollup: [] + metric: netflow_table + parameters: + K: 20 + slideInterval: 1 + spatialFilter: '' + table_name: netflow_table + value_column: pkt_len + windowSize: 1 + windowType: tumbling +tables: +- name: netflow_table + time_column: time + value_columns: + - pkt_len + metadata_columns: + - srcip + - dstip + - proto diff --git a/asp_config/planner_sql_count_srcip_workload.yaml b/asp_config/planner_sql_count_srcip_workload.yaml new file mode 100644 index 00000000..4a8677d0 --- /dev/null +++ b/asp_config/planner_sql_count_srcip_workload.yaml @@ -0,0 +1,19 @@ +tables: + - name: netflow_table + time_column: time + value_columns: [pkt_len] + metadata_columns: [srcip, dstip, proto] +query_groups: 
+ - id: 1 + repetition_delay: 1 + controller_options: + accuracy_sla: 0.95 + latency_sla: 100.0 + queries: + - >- + SELECT srcip, COUNT(pkt_len) AS transfer_events + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY srcip +aggregate_cleanup: + policy: no_cleanup diff --git a/asp_config/planner_sql_quantile_workload.yaml b/asp_config/planner_sql_quantile_workload.yaml new file mode 100644 index 00000000..a4fc7f15 --- /dev/null +++ b/asp_config/planner_sql_quantile_workload.yaml @@ -0,0 +1,21 @@ +tables: + - name: netflow_table + time_column: time + value_columns: [pkt_len] + metadata_columns: [srcip, dstip, proto] +query_groups: + - id: 1 + repetition_delay: 1 + controller_options: + accuracy_sla: 0.95 + latency_sla: 100.0 + queries: + - >- + SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY proto, srcip, dstip + ORDER BY p99_pkt_len DESC + LIMIT 10 +aggregate_cleanup: + policy: no_cleanup diff --git a/benchmark_script/benchmark_baseline_vs_asap.py b/benchmark_script/benchmark_baseline_vs_asap.py new file mode 100644 index 00000000..9f941f7c --- /dev/null +++ b/benchmark_script/benchmark_baseline_vs_asap.py @@ -0,0 +1,365 @@ +#!/usr/bin/env python3 +""" +Compare baseline ClickHouse vs ASAP query engine over increasing time windows. + +Default: widening windows (1 s, 5 s, 10 s, then 1 min → … → 6 h). Sub-minute rows use **settled** +ranges (end ~10s ago). For W≥60, windows use a **70s lag** on the upper bound: +DATEADD(s,-(W+70),NOW())..DATEADD(s,-70,NOW()) (same as netflow_inference agg2 templates). + +Run the query engine with **--prometheus-scrape-interval 1** (or equivalent) so 1s tumbling netflow +matches the SQL pattern matcher; a larger value can reject short windows and also yield fallback. +Baseline always scans flow_table_streaming on :8123. ASAP (:8088) uses sketches only +if netflow_inference_config.yaml has a matching template per window; otherwise +8088 forwards to ClickHouse (you are timing proxy+CH). + +template1s mode: single 1-second slice matching the default quantile template +(DATEADD -11 .. -10). + +Environment +----------- +BASELINE_URL, ASAP_URL, BASELINE_TABLE, ASAP_TABLE, BENCHMARK_RUNS, +BENCHMARK_DROP_CACHES (1=run sudo drop_caches between each timed curl) +""" + +from __future__ import annotations + +import argparse +import os +import statistics +import subprocess +import sys +import time +from datetime import datetime +from typing import List, Tuple + +BASELINE_URL = os.environ.get("BASELINE_URL", "http://localhost:8123/") +ASAP_URL = os.environ.get("ASAP_URL", "http://localhost:8088/clickhouse/query") +BASELINE_TABLE = os.environ.get("BASELINE_TABLE", "flow_table_streaming") +ASAP_TABLE = os.environ.get("ASAP_TABLE", "netflow_table") +DEFAULT_RUNS = int(os.environ.get("BENCHMARK_RUNS", "5")) +DROP_CACHES = os.environ.get("BENCHMARK_DROP_CACHES", "1") == "1" + +# Seconds before NOW() for the **end** of each W≥60 window (matches netflow_inference_config). 
+SETTLED_LAG_S = 70 + +# (label, seconds) — increasing windows +DEFAULT_WINDOWS: List[Tuple[str, int]] = [ + ("1 s", 1), + ("5 s", 5), + ("10 s", 10), + ("1 min", 60), + ("5 min", 300), + ("15 min", 900), + ("30 min", 1800), + ("1 hour", 3600), + ("3 hours", 10800), + ("6 hours", 21600), +] + + +def drop_caches() -> None: + if not DROP_CACHES: + return + r = subprocess.run( + "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'", + shell=True, + capture_output=True, + ) + if r.returncode != 0: + print("warning: drop_caches failed (run without sudo?); continuing", file=sys.stderr) + time.sleep(0.3) + + +def curl_timed(url: str, query: str) -> Tuple[float, str]: + result = subprocess.run( + [ + "curl", + "-o", + "/dev/null", + "-s", + "-S", + "-w", + "%{http_code} %{time_total}", + "-G", + url, + "--data-urlencode", + f"query={query}", + ], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError(f"curl failed: {result.stderr}") + parts = result.stdout.strip().split() + if len(parts) != 2: + raise RuntimeError(f"bad curl output: {result.stdout!r}") + code, t = parts[0], float(parts[1]) + if not code.startswith("2"): + raise RuntimeError(f"HTTP {code} — check ClickHouse / query_engine logs") + return t, code + + +def quantile_sql(table: str, where_clause: str, with_order_limit: bool) -> str: + core = f"""SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len +FROM {table} +WHERE {where_clause} +GROUP BY proto, srcip, dstip""" + if with_order_limit: + return core + "\nORDER BY p99_pkt_len DESC\nLIMIT 10\nFORMAT TabSeparated" + return core + "\nFORMAT TabSeparated" + + +def time_predicate(seconds: int, use_now: bool, fixed_end: str | None) -> str: + # Sub-minute: settled upper bound so 1s tumbling buckets tend to exist in the store. + if seconds == 1: + if use_now: + return "time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW())" + if not fixed_end: + raise ValueError("fixed_end required when not using NOW()") + return ( + f"time BETWEEN DATEADD(s, -11, '{fixed_end}') " + f"AND DATEADD(s, -10, '{fixed_end}')" + ) + if seconds == 5: + if use_now: + return "time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW())" + if not fixed_end: + raise ValueError("fixed_end required when not using NOW()") + return ( + f"time BETWEEN DATEADD(s, -15, '{fixed_end}') " + f"AND DATEADD(s, -10, '{fixed_end}')" + ) + if seconds == 10: + if use_now: + return "time BETWEEN DATEADD(s, -25, NOW()) AND DATEADD(s, -15, NOW())" + if not fixed_end: + raise ValueError("fixed_end required when not using NOW()") + return ( + f"time BETWEEN DATEADD(s, -25, '{fixed_end}') " + f"AND DATEADD(s, -15, '{fixed_end}')" + ) + if seconds >= 60: + lag = SETTLED_LAG_S + if use_now: + return ( + f"time BETWEEN DATEADD(s, -{seconds + lag}, NOW()) " + f"AND DATEADD(s, -{lag}, NOW())" + ) + if not fixed_end: + raise ValueError("fixed_end required when not using NOW()") + return ( + f"time BETWEEN DATEADD(s, -{seconds + lag}, '{fixed_end}') " + f"AND DATEADD(s, -{lag}, '{fixed_end}')" + ) + if use_now: + return f"time BETWEEN DATEADD(s, -{seconds}, NOW()) AND NOW()" + if not fixed_end: + raise ValueError("fixed_end required when not using NOW()") + return f"time BETWEEN DATEADD(s, -{seconds}, '{fixed_end}') AND '{fixed_end}'" + + +def parse_windows_arg(spec: str) -> List[Tuple[str, int]]: + """e.g. 
'60,300,900' or '60:1min,300:5min' (label optional).""" + out: List[Tuple[str, int]] = [] + for part in spec.split(","): + part = part.strip() + if not part: + continue + if ":" in part: + sec_s, label = part.split(":", 1) + out.append((label.strip(), int(sec_s.strip()))) + else: + s = int(part) + out.append((f"{s}s", s)) + if not out: + raise ValueError("empty --windows") + return out + + +def run_increasing_windows( + windows_sec: List[Tuple[str, int]], + runs: int, + order_limit: bool, + use_now: bool, + fixed_end: str | None, + verify_counts: bool, +) -> None: + windows: List[Tuple[str, str]] = [ + (label, time_predicate(sec, use_now, fixed_end)) for label, sec in windows_sec + ] + + print(f"Started: {datetime.now()}") + if use_now: + print( + f"Time range: W≥60 → DATEADD(s,-(W+{SETTLED_LAG_S}),NOW()).." + f"DATEADD(s,-{SETTLED_LAG_S},NOW()); " + "1s/5s/10s use settled sub-minute offsets (see docstring)." + ) + else: + print(f"Time range: fixed end '{fixed_end}' for all windows.") + print(f"Baseline: {BASELINE_URL} table={BASELINE_TABLE}") + print(f"ASAP: {ASAP_URL} table={ASAP_TABLE}") + print(f"Runs per (window, side): {runs}") + print( + "\nASAP: use --prometheus-scrape-interval 1 with 1s tumbling netflow; else short windows\n" + "may never match the SQL matcher. Without templates or store data, 8088 falls back —\n" + "check QUERY ENGINE SUCCESS vs FORWARDING TO CLICKHOUSE in logs.\n" + ) + print("=" * 88) + print( + f"{'Window':<12} {'B_avg':>9} {'B_med':>9} {'B_min':>9} {'B_max':>9} " + f"{'A_avg':>9} {'A_med':>9} {'B/A':>8}" + ) + print("=" * 88) + + summary: List[dict] = [] + + for label, pred in windows: + if verify_counts: + cq = f"SELECT count() FROM {BASELINE_TABLE} WHERE {pred}" + try: + r = subprocess.run( + ["curl", "-s", "-S", "-G", BASELINE_URL, "--data-urlencode", f"query={cq}"], + capture_output=True, + text=True, + ) + cnt = r.stdout.strip().split("\n")[0] if r.returncode == 0 else "?" + print(f" [{label}] rows in window (baseline table): {cnt}") + except Exception as e: + print(f" [{label}] count check failed: {e}") + + bt: List[float] = [] + at: List[float] = [] + for i in range(runs): + qb = quantile_sql(BASELINE_TABLE, pred, order_limit) + qa = quantile_sql(ASAP_TABLE, pred, order_limit) + + drop_caches() + tb, _ = curl_timed(BASELINE_URL, qb) + bt.append(tb) + drop_caches() + ta, _ = curl_timed(ASAP_URL, qa) + at.append(ta) + print( + f" [{label}] run {i + 1}/{runs}: baseline={tb:.4f}s asap={ta:.4f}s", + flush=True, + ) + + b_avg = statistics.mean(bt) + b_med = statistics.median(bt) + a_avg = statistics.mean(at) + a_med = statistics.median(at) + ratio = b_avg / a_avg if a_avg > 0 else 0.0 + print( + f"{label:<12} {b_avg:>8.4f}s {b_med:>8.4f}s {min(bt):>8.4f}s {max(bt):>8.4f}s " + f"{a_avg:>8.4f}s {a_med:>8.4f}s {ratio:>7.2f}x\n" + ) + summary.append( + { + "label": label, + "b_avg": b_avg, + "a_avg": a_avg, + "speedup": ratio, + } + ) + + print("=" * 88) + print("Summary (baseline avg / ASAP avg):") + for r in summary: + print(f" {r['label']:<12} {r['b_avg']:.4f}s / {r['a_avg']:.4f}s → {r['speedup']:.2f}x") + print(f"\nFinished: {datetime.now()}") + + +def main() -> None: + p = argparse.ArgumentParser( + description="Baseline vs ASAP over increasing time windows (default) or 1s template." 
+ ) + p.add_argument( + "mode", + nargs="?", + default="windows", + choices=("windows", "template1s"), + help="windows=increasing NOW() ranges; template1s=1s slice from inference YAML", + ) + p.add_argument("--runs", type=int, default=DEFAULT_RUNS) + p.add_argument( + "--windows", + metavar="SPEC", + help=( + "Comma-separated seconds for increasing windows, e.g. " + "'60,300,900,1800,3600'. Optional labels: '60:1min,300:5min'." + ), + ) + p.add_argument( + "--order-limit", + action="store_true", + help="ORDER BY p99 DESC LIMIT 10 (CH honors; ASAP local path ignores)", + ) + p.add_argument( + "--fixed-end", + metavar="TS", + help="Use fixed end timestamp instead of NOW()", + ) + p.add_argument( + "--no-verify-counts", + action="store_true", + help="Skip SELECT count() per window on baseline table", + ) + args = p.parse_args() + runs = max(1, args.runs) + use_now = args.fixed_end is None + verify = not args.no_verify_counts + + if args.mode == "template1s": + if args.windows: + print("warning: --windows ignored in template1s mode", file=sys.stderr) + where = "time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW())" + if not use_now: + where = ( + f"time BETWEEN DATEADD(s, -11, '{args.fixed_end}') " + f"AND DATEADD(s, -10, '{args.fixed_end}')" + ) + print(f"Started: {datetime.now()}") + print("Mode: template1s (matches default netflow_inference quantile window)") + print(f"Baseline: {BASELINE_URL} table={BASELINE_TABLE}") + print(f"ASAP: {ASAP_URL} table={ASAP_TABLE}") + print(f"Runs per side: {runs}\n") + bt, at = [], [] + for i in range(runs): + qb = quantile_sql(BASELINE_TABLE, where, args.order_limit) + qa = quantile_sql(ASAP_TABLE, where, args.order_limit) + drop_caches() + tb, _ = curl_timed(BASELINE_URL, qb) + bt.append(tb) + drop_caches() + ta, _ = curl_timed(ASAP_URL, qa) + at.append(ta) + print(f" run {i + 1}/{runs}: baseline={tb:.4f}s asap={ta:.4f}s", flush=True) + print( + f"\n1s slice {statistics.mean(bt):>8.4f}s {statistics.median(bt):>8.4f}s " + f"{statistics.mean(at):>8.4f}s {statistics.median(at):>8.4f}s " + f"{statistics.mean(bt) / statistics.mean(at):>7.2f}x\n" + ) + print( + "Check query_engine log for QUERY ENGINE SUCCESS vs FORWARDING TO CLICKHOUSE.\n" + f"Finished: {datetime.now()}" + ) + return + + if args.windows: + windows_sec = parse_windows_arg(args.windows) + else: + windows_sec = DEFAULT_WINDOWS + + run_increasing_windows( + windows_sec, + runs, + args.order_limit, + use_now, + args.fixed_end, + verify_counts=verify, + ) + + +if __name__ == "__main__": + main() diff --git a/benchmark_script/benchmark_count_srcip_1s.py b/benchmark_script/benchmark_count_srcip_1s.py new file mode 100644 index 00000000..d7fd3e81 --- /dev/null +++ b/benchmark_script/benchmark_count_srcip_1s.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +""" +Benchmark baseline ClickHouse vs ASAP query engine for three SQL variants: + +1) Basic: + SELECT ... GROUP BY srcip +2) Ordered: + SELECT ... GROUP BY srcip ORDER BY COUNT(pkt_len) DESC +3) Ordered + limited: + SELECT ... GROUP BY srcip ORDER BY COUNT(pkt_len) DESC LIMIT 10 + +The 1s window uses a settled slice (-11s..-10s) to better align with 1s tumbling output. 
+""" + +from __future__ import annotations + +import argparse +import os +import statistics +import subprocess +import sys +import time +from datetime import datetime + +BASELINE_URL = os.environ.get("BASELINE_URL", "http://localhost:8123/") +ASAP_URL = os.environ.get("ASAP_URL", "http://localhost:8088/clickhouse/query") +BASELINE_TABLE = os.environ.get("BASELINE_TABLE", "flow_table_streaming") +ASAP_TABLE = os.environ.get("ASAP_TABLE", "netflow_table") +DEFAULT_RUNS = int(os.environ.get("BENCHMARK_RUNS", "10")) +DROP_CACHES = os.environ.get("BENCHMARK_DROP_CACHES", "0") == "1" + + +def drop_caches() -> None: + if not DROP_CACHES: + return + result = subprocess.run( + "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'", + shell=True, + capture_output=True, + text=True, + ) + if result.returncode != 0: + print("warning: drop_caches failed; continuing", file=sys.stderr) + time.sleep(0.2) + + +def curl_timed(url: str, query: str) -> float: + result = subprocess.run( + [ + "curl", + "-o", + "/dev/null", + "-s", + "-S", + "-w", + "%{http_code} %{time_total}", + "-G", + url, + "--data-urlencode", + f"query={query}", + ], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError(f"curl failed: {result.stderr}") + parts = result.stdout.strip().split() + if len(parts) != 2: + raise RuntimeError(f"unexpected curl output: {result.stdout!r}") + status, elapsed = parts[0], float(parts[1]) + if not status.startswith("2"): + raise RuntimeError(f"HTTP {status}") + return elapsed + + +def build_query(table: str, fixed_end: str | None, query_type: str) -> str: + if fixed_end: + where = ( + f"time BETWEEN DATEADD(s, -11, '{fixed_end}') " + f"AND DATEADD(s, -10, '{fixed_end}')" + ) + else: + where = "time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW())" + + base = ( + "SELECT srcip, COUNT(pkt_len) AS transfer_events\n" + f"FROM {table}\n" + f"WHERE {where}\n" + "GROUP BY srcip\n" + ) + + if query_type == "basic": + suffix = "FORMAT TabSeparated" + elif query_type == "orderby": + suffix = "ORDER BY COUNT(pkt_len) DESC\nFORMAT TabSeparated" + elif query_type == "orderby_limit": + suffix = "ORDER BY COUNT(pkt_len) DESC\nLIMIT 10\nFORMAT TabSeparated" + else: + raise ValueError(f"unknown query_type: {query_type}") + + return base + suffix + + +def print_stats(label: str, values: list[float]) -> None: + print( + f"{label:<9} avg={statistics.mean(values):.4f}s " + f"med={statistics.median(values):.4f}s " + f"min={min(values):.4f}s max={max(values):.4f}s" + ) + + +def run_single_variant( + query_type: str, + runs: int, + warmup: int, + fixed_end: str | None, +) -> None: + baseline_query = build_query(BASELINE_TABLE, fixed_end, query_type) + asap_query = build_query(ASAP_TABLE, fixed_end, query_type) + + print(f"\n=== Variant: {query_type} ===") + print(f"Baseline: {BASELINE_URL} table={BASELINE_TABLE}") + print(f"ASAP: {ASAP_URL} table={ASAP_TABLE}") + print(f"Runs: {runs} warmup={warmup}") + print("-" * 78) + + for i in range(warmup): + drop_caches() + _ = curl_timed(BASELINE_URL, baseline_query) + drop_caches() + _ = curl_timed(ASAP_URL, asap_query) + print(f"warmup {i + 1}/{warmup} done") + + baseline_times: list[float] = [] + asap_times: list[float] = [] + + for i in range(runs): + drop_caches() + baseline_t = curl_timed(BASELINE_URL, baseline_query) + baseline_times.append(baseline_t) + + drop_caches() + asap_t = curl_timed(ASAP_URL, asap_query) + asap_times.append(asap_t) + + print( + f"run {i + 1:>2}/{runs}: baseline={baseline_t:.4f}s " + f"asap={asap_t:.4f}s 
speedup={baseline_t / asap_t if asap_t > 0 else 0.0:.2f}x" + ) + + print("-" * 78) + print_stats("baseline", baseline_times) + print_stats("asap", asap_times) + avg_speedup = statistics.mean(baseline_times) / statistics.mean(asap_times) + med_speedup = statistics.median(baseline_times) / statistics.median(asap_times) + print(f"speedup avg={avg_speedup:.2f}x med={med_speedup:.2f}x") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Benchmark COUNT(srcip) query for 1s tumbling window." + ) + parser.add_argument("--runs", type=int, default=DEFAULT_RUNS) + parser.add_argument( + "--fixed-end", + metavar="TS", + help="Use fixed end timestamp, e.g. '2026-04-13 23:50:00'", + ) + parser.add_argument( + "--warmup", + type=int, + default=2, + help="Untimed warmup requests per side before measurements.", + ) + parser.add_argument( + "--query-type", + choices=["all", "basic", "orderby", "orderby_limit"], + default="all", + help="Which query variant to benchmark.", + ) + args = parser.parse_args() + runs = max(1, args.runs) + + print(f"Started: {datetime.now()}") + print(f"Variants: {args.query_type}") + print(f"Caches: {'drop each request' if DROP_CACHES else 'normal OS cache'}") + + warmup = max(0, args.warmup) + variants = ( + ["basic", "orderby", "orderby_limit"] + if args.query_type == "all" + else [args.query_type] + ) + for variant in variants: + run_single_variant(variant, runs, warmup, args.fixed_end) + + print(f"Finished: {datetime.now()}") + + +if __name__ == "__main__": + main() diff --git a/benchmark_script/benchmark_quantile_1s.py b/benchmark_script/benchmark_quantile_1s.py new file mode 100644 index 00000000..605635bf --- /dev/null +++ b/benchmark_script/benchmark_quantile_1s.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 +""" +Benchmark baseline ClickHouse vs ASAP query engine for quantile SQL variants: + +1) basic: + SELECT ... quantile(0.99)(pkt_len) ... GROUP BY proto, srcip, dstip +2) orderby: + SELECT ... GROUP BY ... ORDER BY p99_pkt_len DESC +3) orderby_limit: + SELECT ... GROUP BY ... 
ORDER BY p99_pkt_len DESC LIMIT 10 +""" + +from __future__ import annotations + +import argparse +import os +import statistics +import subprocess +import sys +import time +from datetime import datetime + +BASELINE_URL = os.environ.get("BASELINE_URL", "http://localhost:8123/") +ASAP_URL = os.environ.get("ASAP_URL", "http://localhost:8088/clickhouse/query") +BASELINE_TABLE = os.environ.get("BASELINE_TABLE", "flow_table_streaming") +ASAP_TABLE = os.environ.get("ASAP_TABLE", "netflow_table") +DEFAULT_RUNS = int(os.environ.get("BENCHMARK_RUNS", "10")) +DROP_CACHES = os.environ.get("BENCHMARK_DROP_CACHES", "0") == "1" + + +def drop_caches() -> None: + if not DROP_CACHES: + return + result = subprocess.run( + "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'", + shell=True, + capture_output=True, + text=True, + ) + if result.returncode != 0: + print("warning: drop_caches failed; continuing", file=sys.stderr) + time.sleep(0.2) + + +def curl_timed(url: str, query: str) -> float: + result = subprocess.run( + [ + "curl", + "-o", + "/dev/null", + "-s", + "-S", + "-w", + "%{http_code} %{time_total}", + "-G", + url, + "--data-urlencode", + f"query={query}", + ], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError(f"curl failed: {result.stderr}") + parts = result.stdout.strip().split() + if len(parts) != 2: + raise RuntimeError(f"unexpected curl output: {result.stdout!r}") + status, elapsed = parts[0], float(parts[1]) + if not status.startswith("2"): + raise RuntimeError(f"HTTP {status}") + return elapsed + + +def build_query(table: str, fixed_end: str | None, query_type: str) -> str: + if fixed_end: + where = ( + f"time BETWEEN DATEADD(s, -11, '{fixed_end}') " + f"AND DATEADD(s, -10, '{fixed_end}')" + ) + else: + where = "time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW())" + + base = ( + "SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len\n" + f"FROM {table}\n" + f"WHERE {where}\n" + "GROUP BY proto, srcip, dstip\n" + ) + + if query_type == "basic": + suffix = "FORMAT TabSeparated" + elif query_type == "orderby": + suffix = "ORDER BY p99_pkt_len DESC\nFORMAT TabSeparated" + elif query_type == "orderby_limit": + suffix = "ORDER BY p99_pkt_len DESC\nLIMIT 10\nFORMAT TabSeparated" + else: + raise ValueError(f"unknown query_type: {query_type}") + + return base + suffix + + +def print_stats(label: str, values: list[float]) -> None: + print( + f"{label:<9} avg={statistics.mean(values):.4f}s " + f"med={statistics.median(values):.4f}s " + f"min={min(values):.4f}s max={max(values):.4f}s" + ) + + +def run_single_variant( + query_type: str, + runs: int, + warmup: int, + fixed_end: str | None, +) -> None: + baseline_query = build_query(BASELINE_TABLE, fixed_end, query_type) + asap_query = build_query(ASAP_TABLE, fixed_end, query_type) + + print(f"\n=== Variant: {query_type} ===") + print(f"Baseline: {BASELINE_URL} table={BASELINE_TABLE}") + print(f"ASAP: {ASAP_URL} table={ASAP_TABLE}") + print(f"Runs: {runs} warmup={warmup}") + print("-" * 78) + + for i in range(warmup): + drop_caches() + _ = curl_timed(BASELINE_URL, baseline_query) + drop_caches() + _ = curl_timed(ASAP_URL, asap_query) + print(f"warmup {i + 1}/{warmup} done") + + baseline_times: list[float] = [] + asap_times: list[float] = [] + + for i in range(runs): + drop_caches() + baseline_t = curl_timed(BASELINE_URL, baseline_query) + baseline_times.append(baseline_t) + + drop_caches() + asap_t = curl_timed(ASAP_URL, asap_query) + asap_times.append(asap_t) + + print( + f"run {i + 1:>2}/{runs}: 
baseline={baseline_t:.4f}s " + f"asap={asap_t:.4f}s speedup={baseline_t / asap_t if asap_t > 0 else 0.0:.2f}x" + ) + + print("-" * 78) + print_stats("baseline", baseline_times) + print_stats("asap", asap_times) + avg_speedup = statistics.mean(baseline_times) / statistics.mean(asap_times) + med_speedup = statistics.median(baseline_times) / statistics.median(asap_times) + print(f"speedup avg={avg_speedup:.2f}x med={med_speedup:.2f}x") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Benchmark quantile query for 1s tumbling window." + ) + parser.add_argument("--runs", type=int, default=DEFAULT_RUNS) + parser.add_argument( + "--fixed-end", + metavar="TS", + help="Use fixed end timestamp, e.g. '2026-04-13 23:50:00'", + ) + parser.add_argument( + "--warmup", + type=int, + default=2, + help="Untimed warmup requests per side before measurements.", + ) + parser.add_argument( + "--query-type", + choices=["all", "basic", "orderby", "orderby_limit"], + default="all", + help="Which query variant to benchmark.", + ) + args = parser.parse_args() + runs = max(1, args.runs) + warmup = max(0, args.warmup) + + print(f"Started: {datetime.now()}") + print(f"Variants: {args.query_type}") + print(f"Caches: {'drop each request' if DROP_CACHES else 'normal OS cache'}") + + variants = ( + ["basic", "orderby", "orderby_limit"] + if args.query_type == "all" + else [args.query_type] + ) + for variant in variants: + run_single_variant(variant, runs, warmup, args.fixed_end) + + print(f"Finished: {datetime.now()}") + + +if __name__ == "__main__": + main() diff --git a/benchmark_script/benchmark_time_windows.py b/benchmark_script/benchmark_time_windows.py new file mode 100644 index 00000000..816b3e66 --- /dev/null +++ b/benchmark_script/benchmark_time_windows.py @@ -0,0 +1,181 @@ +import os +import subprocess +import statistics +import time +from datetime import datetime + +# Use the same time predicate for baseline and ASAP so both read the same rows. +# (Replay uses wall-clock ms; a fixed BASELINE_END in the past makes baseline scan +# almost nothing while ASAP still hits live data — that inverts speedups.) +USE_NOW_FOR_BASELINE = True +# If False, set BASELINE_END to a moment inside your ingested data range (e.g. bulk load). +BASELINE_END = "2026-04-12 17:35:36" + +# Time windows to test +TIME_WINDOWS = [ + ("1 min", 60), + ("5 min", 300), + ("15 min", 900), + ("30 min", 1800), + ("1 hour", 3600), + ("3 hours", 10800), + ("6 hours", 21600), +] + +RUNS = 5 +VERIFY_COUNTS = os.environ.get("BENCHMARK_VERIFY_COUNTS", "1") == "1" + + +def drop_cache(): + subprocess.run("sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'", shell=True) + time.sleep(0.5) + + +def run_query(url, query): + result = subprocess.run( + [ + "curl", + "-o", + "/dev/null", + "-s", + "-S", + "-w", + "%{http_code} %{time_total}", + "-G", + url, + "--data-urlencode", + f"query={query}", + ], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError(f"curl failed: {result.stderr}") + parts = result.stdout.strip().split() + if len(parts) != 2: + raise RuntimeError(f"bad curl output: {result.stdout!r}") + code, t = parts[0], float(parts[1]) + if not code.startswith("2"): + raise RuntimeError(f"HTTP {code} for query (check ClickHouse / QueryEngine logs)") + return t + + +def run_scalar(url, query): + """Return first line of body (e.g. 
count).""" + result = subprocess.run( + ["curl", "-s", "-S", "-G", url, "--data-urlencode", f"query={query}"], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError(result.stderr) + return result.stdout.strip().split("\n")[0] + + +def time_predicate(window_seconds): + if USE_NOW_FOR_BASELINE: + return f"time BETWEEN DATEADD(s, -{window_seconds}, NOW()) AND NOW()" + return f"time BETWEEN DATEADD(s, -{window_seconds}, '{BASELINE_END}') AND '{BASELINE_END}'" + + +def baseline_query(window_seconds): + return f"""SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len +FROM flow_table_streaming +WHERE {time_predicate(window_seconds)} +GROUP BY proto, srcip, dstip +ORDER BY p99_pkt_len DESC +LIMIT 10""" + + +def asap_query(window_seconds): + return f"""SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len +FROM netflow_table +WHERE {time_predicate(window_seconds)} +GROUP BY proto, srcip, dstip +ORDER BY p99_pkt_len DESC +LIMIT 10""" + + +def count_sql(window_seconds, table): + return f"SELECT count() FROM {table} WHERE {time_predicate(window_seconds)}" + + +print(f"Benchmark started at: {datetime.now()}") +if USE_NOW_FOR_BASELINE: + print("Time range: NOW() for both baseline and ASAP (same live window).") +else: + print(f"Time range: fixed end {BASELINE_END} for both (only if data spans this range).") +print(f"Runs per window: {RUNS}") +print("=" * 80) +print( + f"{'Window':<12} {'B_avg':>8} {'B_med':>8} {'B_min':>8} {'B_max':>8} " + f"{'A_avg':>8} {'A_med':>8} {'Speedup':>8}" +) +print("=" * 80) + +results = [] + +for label, seconds in TIME_WINDOWS: + if VERIFY_COUNTS: + try: + cb = run_scalar("http://localhost:8123/", count_sql(seconds, "flow_table_streaming")) + ca = run_scalar("http://localhost:8123/", count_sql(seconds, "netflow_table")) + print(f" [{label}] rows in window — baseline table: {cb}, asap table: {ca}") + except Exception as e: + print(f" [{label}] count check failed: {e}") + + baseline_times = [] + asap_times = [] + + for i in range(RUNS): + drop_cache() + b = run_query("http://localhost:8123/", baseline_query(seconds)) + baseline_times.append(b) + + a = run_query("http://localhost:8088/clickhouse/query", asap_query(seconds)) + asap_times.append(a) + print(f" [{label}] run {i+1}: baseline={b:.3f}s asap={a:.3f}s", flush=True) + + b_avg = statistics.mean(baseline_times) + b_med = statistics.median(baseline_times) + b_min = min(baseline_times) + b_max = max(baseline_times) + a_avg = statistics.mean(asap_times) + a_med = statistics.median(asap_times) + speedup = b_avg / a_avg if a_avg > 0 else 0 + + print( + f"\n{label:<12} {b_avg:>7.3f}s {b_med:>7.3f}s {b_min:>7.3f}s {b_max:>7.3f}s " + f"{a_avg:>7.3f}s {a_med:>7.3f}s {speedup:>7.2f}x\n" + ) + + results.append( + { + "window": label, + "baseline_avg": b_avg, + "baseline_med": b_med, + "asap_avg": a_avg, + "asap_med": a_med, + "speedup": speedup, + } + ) + +print("=" * 80) +print("\nFinal Summary:") +print(f"{'Window':<12} {'Baseline Avg':>14} {'ASAP Avg':>10} {'Speedup':>8}") +print("-" * 50) +for r in results: + print( + f"{r['window']:<12} {r['baseline_avg']:>13.3f}s {r['asap_avg']:>9.3f}s {r['speedup']:>7.2f}x" + ) + +print(f"\nBenchmark completed at: {datetime.now()}") +print( + """ +Interpretation hints: +- If row counts stop growing for larger windows (e.g. 1 h == 3 h), you only have that much + history in MergeTree; widen the '6 hours' story accordingly. +- Short windows: baseline scan is already cheap; QueryEngine + merging many 1s-tumbling + sketches can lose. 
Speedup usually appears once raw scan + per-row quantile work dominates. +- Small baseline vs ASAP row deltas are often Kafka consumer lag between the two topics.""" +) From 2c87c12df6e411060b2d577a85156f30904881a3 Mon Sep 17 00:00:00 2001 From: Akanksha Akkihal Date: Wed, 15 Apr 2026 14:51:31 +0000 Subject: [PATCH 05/10] fix: MultipleMinMax Arroyo deserialization and subtype handling for MAX/MIN queries --- .../src/data_model/precomputed_output.rs | 16 +- .../multiple_min_max_accumulator.rs | 53 +++++ .../inference_config.yaml | 24 ++ .../streaming_config.yaml | 28 +++ .../planner_sql_max_dstip_workload.yaml | 42 ++++ .../benchmark_max_dstip_windows.py | 218 ++++++++++++++++++ 6 files changed, 376 insertions(+), 5 deletions(-) create mode 100644 asp_config/planner_out_max_dstip/inference_config.yaml create mode 100644 asp_config/planner_out_max_dstip/streaming_config.yaml create mode 100644 asp_config/planner_sql_max_dstip_workload.yaml create mode 100644 benchmark_script/benchmark_max_dstip_windows.py diff --git a/asap-query-engine/src/data_model/precomputed_output.rs b/asap-query-engine/src/data_model/precomputed_output.rs index 88810dab..80ea302a 100644 --- a/asap-query-engine/src/data_model/precomputed_output.rs +++ b/asap-query-engine/src/data_model/precomputed_output.rs @@ -232,6 +232,7 @@ impl PrecomputedOutput { let precompute = Self::create_precompute_from_bytes( config.aggregation_type, + &config.aggregation_sub_type, Vec::as_slice(&precompute_bytes), )?; @@ -363,6 +364,7 @@ impl PrecomputedOutput { /// Factory method to create precompute accumulator from bytes fn create_precompute_from_bytes( precompute_type: AggregationType, + aggregation_sub_type: &str, buffer: &[u8], ) -> Result, Box> { @@ -390,11 +392,15 @@ impl PrecomputedOutput { Ok(Box::new(accumulator)) } AggregationType::MultipleMinMax => { - let accumulator = - MultipleMinMaxAccumulator::deserialize_from_bytes(buffer, "min".to_string()) - .map_err(|e| { - format!("Failed to deserialize MultipleMinMaxAccumulator: {e}") - })?; + let sub_type = if aggregation_sub_type.eq_ignore_ascii_case("max") { + "max".to_string() + } else { + "min".to_string() + }; + let accumulator = MultipleMinMaxAccumulator::deserialize_from_bytes_arroyo( + buffer, sub_type, + ) + .map_err(|e| format!("Failed to deserialize MultipleMinMaxAccumulator: {e}"))?; Ok(Box::new(accumulator)) } AggregationType::MultipleIncrease => { diff --git a/asap-query-engine/src/precompute_operators/multiple_min_max_accumulator.rs b/asap-query-engine/src/precompute_operators/multiple_min_max_accumulator.rs index a8f14a3d..b777c88d 100644 --- a/asap-query-engine/src/precompute_operators/multiple_min_max_accumulator.rs +++ b/asap-query-engine/src/precompute_operators/multiple_min_max_accumulator.rs @@ -156,6 +156,44 @@ impl MultipleMinMaxAccumulator { Ok(Self { values, sub_type }) } + + /// Deserialize from Arroyo-compatible format (MessagePack HashMap) + pub fn deserialize_from_bytes_arroyo( + buffer: &[u8], + sub_type: String, + ) -> Result> { + if sub_type != "min" && sub_type != "max" { + return Err("sub_type must be 'min' or 'max'".into()); + } + + let precompute: HashMap = rmp_serde::from_slice(buffer).map_err(|e| { + format!("Failed to deserialize MultipleMinMaxAccumulator from MessagePack: {e}") + })?; + + let mut values = HashMap::new(); + for (key_str, value) in precompute { + let key_values: Vec = key_str.split(';').map(|s| s.to_string()).collect(); + let key = KeyByLabelValues::new_with_labels(key_values); + values.insert(key, value); + } + + Ok(Self { 
values, sub_type }) + } + + /// Serialize to Arroyo-compatible format (MessagePack HashMap) + pub fn serialize_to_bytes_arroyo(&self) -> Vec { + let per_key_storage: HashMap = self + .values + .iter() + .map(|(key, &value)| (key.labels.join(";"), value)) + .collect(); + + let mut buf = Vec::new(); + per_key_storage + .serialize(&mut rmp_serde::Serializer::new(&mut buf)) + .expect("Failed to serialize MultipleMinMaxAccumulator to MessagePack"); + buf + } } impl SerializableToSink for MultipleMinMaxAccumulator { @@ -455,6 +493,21 @@ mod tests { assert_eq!(deserialized_bytes.sub_type, "min"); } + #[test] + fn test_multiple_min_max_accumulator_arroyo_msgpack_roundtrip_max() { + let mut acc = MultipleMinMaxAccumulator::new_max(); + let key = KeyByLabelValues::new_with_labels(vec!["1.2.3.4".to_string()]); + acc.add_value(key.clone(), 1500.0); + + let bytes = acc.serialize_to_bytes_arroyo(); + let deserialized = + MultipleMinMaxAccumulator::deserialize_from_bytes_arroyo(&bytes, "max".to_string()) + .unwrap(); + + assert_eq!(deserialized.values.get(&key), Some(&1500.0)); + assert_eq!(deserialized.sub_type, "max"); + } + #[test] fn test_trait_object() { let mut acc = MultipleMinMaxAccumulator::new_min(); diff --git a/asp_config/planner_out_max_dstip/inference_config.yaml b/asp_config/planner_out_max_dstip/inference_config.yaml new file mode 100644 index 00000000..90432d15 --- /dev/null +++ b/asp_config/planner_out_max_dstip/inference_config.yaml @@ -0,0 +1,24 @@ +cleanup_policy: + name: no_cleanup +queries: +- aggregations: + - aggregation_id: 31 + query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 31 + query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 31 + query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 31 + query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 +tables: +- name: netflow_table + time_column: time + value_columns: + - pkt_len + metadata_columns: + - srcip + - dstip + - proto diff --git a/asp_config/planner_out_max_dstip/streaming_config.yaml b/asp_config/planner_out_max_dstip/streaming_config.yaml new file mode 100644 index 00000000..8e15fe4c --- /dev/null +++ b/asp_config/planner_out_max_dstip/streaming_config.yaml @@ -0,0 +1,28 @@ +aggregations: +- aggregationId: 31 + aggregationSubType: max + aggregationType: MultipleMinMax + labels: + aggregated: + - dstip + grouping: [] + rollup: + - proto + - srcip + metric: netflow_table + parameters: {} + slideInterval: 1 + spatialFilter: '' + table_name: netflow_table + value_column: pkt_len + windowSize: 1 + windowType: tumbling +tables: +- name: netflow_table + time_column: time + value_columns: + - pkt_len + metadata_columns: + - srcip + - dstip + - proto diff --git a/asp_config/planner_sql_max_dstip_workload.yaml b/asp_config/planner_sql_max_dstip_workload.yaml new file mode 100644 index 00000000..242cd579 --- /dev/null +++ b/asp_config/planner_sql_max_dstip_workload.yaml @@ -0,0 +1,42 @@ 
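+# Note (assumed rationale, derived from the offsets below): every window ends at DATEADD(s, -10, NOW());
+# the first DATEADD offset sets the window span (e.g. -11 => 1 s, -310 => 300 s).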
+tables: + - name: netflow_table + time_column: time + value_columns: [pkt_len] + metadata_columns: [srcip, dstip, proto] +query_groups: + - id: 1 + repetition_delay: 1 + controller_options: + accuracy_sla: 0.95 + latency_sla: 100.0 + queries: + - >- + SELECT dstip, MAX(pkt_len) AS max_pkt_len + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY dstip + ORDER BY max_pkt_len DESC + LIMIT 10 + - >- + SELECT dstip, MAX(pkt_len) AS max_pkt_len + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY dstip + ORDER BY max_pkt_len DESC + LIMIT 10 + - >- + SELECT dstip, MAX(pkt_len) AS max_pkt_len + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY dstip + ORDER BY max_pkt_len DESC + LIMIT 10 + - >- + SELECT dstip, MAX(pkt_len) AS max_pkt_len + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY dstip + ORDER BY max_pkt_len DESC + LIMIT 10 +aggregate_cleanup: + policy: no_cleanup diff --git a/benchmark_script/benchmark_max_dstip_windows.py b/benchmark_script/benchmark_max_dstip_windows.py new file mode 100644 index 00000000..b07aa7c1 --- /dev/null +++ b/benchmark_script/benchmark_max_dstip_windows.py @@ -0,0 +1,218 @@ +#!/usr/bin/env python3 +""" +Benchmark baseline ClickHouse vs ASAP query engine for MAX(pkt_len) queries. + +Supports: +- Window presets: 1s, 5s, 1m, 5m +- Query variants: basic, orderby, orderby_limit +""" + +from __future__ import annotations + +import argparse +import os +import statistics +import subprocess +import sys +import time +from datetime import datetime + +BASELINE_URL = os.environ.get("BASELINE_URL", "http://localhost:8123/") +ASAP_URL = os.environ.get("ASAP_URL", "http://localhost:8088/clickhouse/query") +BASELINE_TABLE = os.environ.get("BASELINE_TABLE", "flow_table_streaming") +ASAP_TABLE = os.environ.get("ASAP_TABLE", "netflow_table") +DEFAULT_RUNS = int(os.environ.get("BENCHMARK_RUNS", "10")) +DROP_CACHES = os.environ.get("BENCHMARK_DROP_CACHES", "0") == "1" + +WINDOWS = { + "1s": (11, 10), # [-11s, -10s] => 1 second span + "5s": (15, 10), # [-15s, -10s] => 5 second span + "1m": (70, 10), # [-70s, -10s] => 60 second span + "5m": (310, 10), # [-310s, -10s] => 300 second span +} + + +def drop_caches() -> None: + if not DROP_CACHES: + return + result = subprocess.run( + "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'", + shell=True, + capture_output=True, + text=True, + ) + if result.returncode != 0: + print("warning: drop_caches failed; continuing", file=sys.stderr) + time.sleep(0.2) + + +def curl_timed(url: str, query: str) -> float: + result = subprocess.run( + [ + "curl", + "-o", + "/dev/null", + "-s", + "-S", + "-w", + "%{http_code} %{time_total}", + "-G", + url, + "--data-urlencode", + f"query={query}", + ], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError(f"curl failed: {result.stderr}") + parts = result.stdout.strip().split() + if len(parts) != 2: + raise RuntimeError(f"unexpected curl output: {result.stdout!r}") + status, elapsed = parts[0], float(parts[1]) + if not status.startswith("2"): + raise RuntimeError(f"HTTP {status}") + return elapsed + + +def build_query( + table: str, + query_type: str, + start_ago_s: int, + end_ago_s: int, + fixed_end: str | None, +) -> str: + if fixed_end: + where = ( + f"time BETWEEN DATEADD(s, -{start_ago_s}, '{fixed_end}') " + f"AND DATEADD(s, -{end_ago_s}, '{fixed_end}')" + ) + else: + where 
= ( + f"time BETWEEN DATEADD(s, -{start_ago_s}, NOW()) " + f"AND DATEADD(s, -{end_ago_s}, NOW())" + ) + + base = ( + "SELECT dstip, MAX(pkt_len) AS max_pkt_len\n" + f"FROM {table}\n" + f"WHERE {where}\n" + "GROUP BY dstip\n" + ) + + if query_type == "basic": + suffix = "FORMAT TabSeparated" + elif query_type == "orderby": + suffix = "ORDER BY max_pkt_len DESC\nFORMAT TabSeparated" + elif query_type == "orderby_limit": + suffix = "ORDER BY max_pkt_len DESC\nLIMIT 10\nFORMAT TabSeparated" + else: + raise ValueError(f"unknown query_type: {query_type}") + + return base + suffix + + +def print_stats(label: str, values: list[float]) -> None: + print( + f"{label:<9} avg={statistics.mean(values):.4f}s " + f"med={statistics.median(values):.4f}s " + f"min={min(values):.4f}s max={max(values):.4f}s" + ) + + +def run_variant( + window_name: str, + query_type: str, + runs: int, + warmup: int, + fixed_end: str | None, +) -> None: + start_ago, end_ago = WINDOWS[window_name] + baseline_query = build_query(BASELINE_TABLE, query_type, start_ago, end_ago, fixed_end) + asap_query = build_query(ASAP_TABLE, query_type, start_ago, end_ago, fixed_end) + + print(f"\n=== Window: {window_name} | Variant: {query_type} ===") + print(f"Baseline: {BASELINE_URL} table={BASELINE_TABLE}") + print(f"ASAP: {ASAP_URL} table={ASAP_TABLE}") + print(f"Runs: {runs} warmup={warmup}") + print("-" * 78) + + for i in range(warmup): + drop_caches() + _ = curl_timed(BASELINE_URL, baseline_query) + drop_caches() + _ = curl_timed(ASAP_URL, asap_query) + print(f"warmup {i + 1}/{warmup} done") + + baseline_times: list[float] = [] + asap_times: list[float] = [] + for i in range(runs): + drop_caches() + baseline_t = curl_timed(BASELINE_URL, baseline_query) + baseline_times.append(baseline_t) + + drop_caches() + asap_t = curl_timed(ASAP_URL, asap_query) + asap_times.append(asap_t) + + print( + f"run {i + 1:>2}/{runs}: baseline={baseline_t:.4f}s " + f"asap={asap_t:.4f}s speedup={baseline_t / asap_t if asap_t > 0 else 0.0:.2f}x" + ) + + print("-" * 78) + print_stats("baseline", baseline_times) + print_stats("asap", asap_times) + avg_speedup = statistics.mean(baseline_times) / statistics.mean(asap_times) + med_speedup = statistics.median(baseline_times) / statistics.median(asap_times) + print(f"speedup avg={avg_speedup:.2f}x med={med_speedup:.2f}x") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Benchmark MAX(dstip) query across windows and variants." + ) + parser.add_argument("--runs", type=int, default=DEFAULT_RUNS) + parser.add_argument("--warmup", type=int, default=2) + parser.add_argument( + "--window", + choices=["all", "1s", "5s", "1m", "5m"], + default="all", + help="Window size preset to benchmark.", + ) + parser.add_argument( + "--query-type", + choices=["all", "basic", "orderby", "orderby_limit"], + default="all", + help="Query variant to benchmark.", + ) + parser.add_argument( + "--fixed-end", + metavar="TS", + help="Use fixed end timestamp, e.g. 
'2026-04-13 23:50:00'", + ) + args = parser.parse_args() + + runs = max(1, args.runs) + warmup = max(0, args.warmup) + windows = ["1s", "5s", "1m", "5m"] if args.window == "all" else [args.window] + variants = ( + ["basic", "orderby", "orderby_limit"] + if args.query_type == "all" + else [args.query_type] + ) + + print(f"Started: {datetime.now()}") + print(f"Windows: {', '.join(windows)}") + print(f"Variants: {', '.join(variants)}") + print(f"Caches: {'drop each request' if DROP_CACHES else 'normal OS cache'}") + + for window_name in windows: + for query_type in variants: + run_variant(window_name, query_type, runs, warmup, args.fixed_end) + + print(f"Finished: {datetime.now()}") + + +if __name__ == "__main__": + main() From 360244894797a0d6b1b4352d77807bee451e53ce Mon Sep 17 00:00:00 2001 From: Akanksha Akkihal Date: Wed, 15 Apr 2026 16:35:26 +0000 Subject: [PATCH 06/10] feat(sql): add COUNT(DISTINCT ...) support with SetAggregator --- .../src/query_logics/logics.rs | 5 +- .../src/ast_matching/sqlparser_test.rs | 16 ++ .../src/ast_matching/sqlpattern_parser.rs | 13 +- .../src/planner/sql_single_query.rs | 36 ++- .../src/data_model/precomputed_output.rs | 32 +++ .../src/engines/simple_engine.rs | 85 +++---- .../src/tests/sql_pattern_matching_tests.rs | 15 ++ asap-summary-ingest/run_arroyosketch.py | 8 + asap-summary-ingest/tests/test_integration.py | 24 ++ .../inference_config.yaml | 28 +++ .../streaming_config.yaml | 29 +++ ...ner_sql_count_distinct_dstip_workload.yaml | 49 ++++ .../benchmark_count_distinct_dstip_windows.py | 232 ++++++++++++++++++ 13 files changed, 524 insertions(+), 48 deletions(-) create mode 100644 asp_config/planner_out_count_distinct_dstip/inference_config.yaml create mode 100644 asp_config/planner_out_count_distinct_dstip/streaming_config.yaml create mode 100644 asp_config/planner_sql_count_distinct_dstip_workload.yaml create mode 100644 benchmark_script/benchmark_count_distinct_dstip_windows.py diff --git a/asap-common/dependencies/rs/promql_utilities/src/query_logics/logics.rs b/asap-common/dependencies/rs/promql_utilities/src/query_logics/logics.rs index d9eb4cad..259238a3 100644 --- a/asap-common/dependencies/rs/promql_utilities/src/query_logics/logics.rs +++ b/asap-common/dependencies/rs/promql_utilities/src/query_logics/logics.rs @@ -50,7 +50,7 @@ pub fn map_statistic_to_precompute_operator( Ok((AggregationType::MultipleIncrease, "".to_string())) } Statistic::Topk => Ok((AggregationType::CountMinSketchWithHeap, "topk".to_string())), - _ => Err(format!("Statistic {statistic:?} not supported")), + Statistic::Cardinality => Ok((AggregationType::SetAggregator, "".to_string())), } } @@ -81,6 +81,9 @@ pub fn does_precompute_operator_support_subpopulations( // CountMinSketchWithHeap is only supported for Topk — does not support subpopulations AggregationType::CountMinSketchWithHeap if matches!(statistic, Statistic::Topk) => false, + AggregationType::SetAggregator => false, + AggregationType::DeltaSetAggregator => false, + AggregationType::HLL => false, // Default: not supported _ => panic!("Unexpected precompute operator: {}", precompute_operator), diff --git a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs index 8a42c4d3..bb5ac0aa 100644 --- a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs +++ b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs @@ -89,6 +89,22 @@ mod tests { ); } + #[test] + fn 
test_count_distinct_marks_distinct_arg() { + let q = parse_sql_query( + "SELECT L1, COUNT(DISTINCT value) AS distinct_values FROM cpu_usage WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() GROUP BY L1", + ) + .expect("count distinct should parse"); + assert_eq!(q.aggregation_info.get_name(), "COUNT"); + assert!( + q.aggregation_info + .get_args() + .iter() + .any(|arg| arg.eq_ignore_ascii_case("distinct")), + "COUNT(DISTINCT ...) should carry a distinct marker in AggregationInfo args", + ); + } + // ── Basic smoke tests ──────────────────────────────────────────────────── #[test] diff --git a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs index c93cce41..9fc71bb1 100644 --- a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs +++ b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs @@ -208,7 +208,18 @@ impl SQLPatternParser { fn try_parse_aggregation_function(&self, func: &Function) -> Option { let name = func.name.to_string().to_uppercase(); - let args = self.get_quantile_args(func); + let mut args = self.get_quantile_args(func); + let is_distinct = matches!( + &func.args, + FunctionArguments::List(func_args) + if matches!( + func_args.duplicate_treatment, + Some(DuplicateTreatment::Distinct) + ) + ); + if is_distinct && name == "COUNT" { + args.push("distinct".to_string()); + } // Get the column being aggregated let col = match &func.args { diff --git a/asap-planner-rs/src/planner/sql_single_query.rs b/asap-planner-rs/src/planner/sql_single_query.rs index 057b3209..6eaa6424 100644 --- a/asap-planner-rs/src/planner/sql_single_query.rs +++ b/asap-planner-rs/src/planner/sql_single_query.rs @@ -3,7 +3,7 @@ use std::collections::HashSet; use asap_types::enums::{CleanupPolicy, WindowType}; use promql_utilities::data_model::KeyByLabelNames; use promql_utilities::query_logics::enums::{AggregationType, QueryTreatmentType, Statistic}; -use sql_utilities::ast_matching::sqlhelper::Table; +use sql_utilities::ast_matching::sqlhelper::{AggregationInfo, Table}; use sql_utilities::ast_matching::sqlpattern_matcher::{QueryType, SQLPatternMatcher}; use sql_utilities::ast_matching::sqlpattern_parser::SQLPatternParser; use sql_utilities::ast_matching::SQLSchema; @@ -108,7 +108,7 @@ impl SQLSingleQueryProcessor { let rollup = all_metadata.difference(&spatial_output); let treatment_type = get_sql_treatment_type(agg_info.get_name()); - let statistics = get_sql_statistics(agg_info.get_name())?; + let statistics = get_sql_statistics(agg_info)?; let configs = build_agg_configs_for_statistics( &statistics, @@ -171,11 +171,22 @@ fn get_sql_treatment_type(name: &str) -> QueryTreatmentType { } } -fn get_sql_statistics(name: &str) -> Result, ControllerError> { +fn get_sql_statistics(agg_info: &AggregationInfo) -> Result, ControllerError> { + let name = agg_info.get_name(); match name.to_uppercase().as_str() { "QUANTILE" => Ok(vec![Statistic::Quantile]), "SUM" => Ok(vec![Statistic::Sum]), - "COUNT" => Ok(vec![Statistic::Count]), + "COUNT" => { + if agg_info + .get_args() + .iter() + .any(|arg| arg.eq_ignore_ascii_case("distinct")) + { + Ok(vec![Statistic::Cardinality]) + } else { + Ok(vec![Statistic::Count]) + } + } "AVG" => Ok(vec![Statistic::Sum, Statistic::Count]), "MIN" => Ok(vec![Statistic::Min]), "MAX" => Ok(vec![Statistic::Max]), @@ -212,3 +223,20 @@ fn get_all_metadata_columns( .ok_or_else(|| 
ControllerError::UnknownTable(table_name.to_string()))?; Ok(KeyByLabelNames::new(table.metadata_columns.clone())) } + +#[cfg(test)] +mod tests { + use super::*; + use sql_utilities::ast_matching::sqlhelper::AggregationInfo; + + #[test] + fn sql_statistics_count_distinct_maps_to_cardinality() { + let agg = AggregationInfo::new( + "COUNT".to_string(), + "dstip".to_string(), + vec!["distinct".to_string()], + ); + let stats = get_sql_statistics(&agg).expect("COUNT(DISTINCT) should be supported"); + assert_eq!(stats, vec![Statistic::Cardinality]); + } +} diff --git a/asap-query-engine/src/data_model/precomputed_output.rs b/asap-query-engine/src/data_model/precomputed_output.rs index 80ea302a..636a267c 100644 --- a/asap-query-engine/src/data_model/precomputed_output.rs +++ b/asap-query-engine/src/data_model/precomputed_output.rs @@ -442,6 +442,11 @@ impl PrecomputedOutput { .map_err(|e| format!("Failed to deserialize DeltaSetAggregatorAccumulator: {e}"))?; Ok(Box::new(accumulator)) } + AggregationType::SetAggregator => { + let accumulator = SetAggregatorAccumulator::deserialize_from_bytes_arroyo(buffer) + .map_err(|e| format!("Failed to deserialize SetAggregatorAccumulator: {e}"))?; + Ok(Box::new(accumulator)) + } _ => Err(format!("Unknown precompute type: {precompute_type:?}").into()), } } @@ -618,3 +623,30 @@ impl SerializableToSink for PrecomputedOutput { // assert_eq!(deserialized_accumulator.sum, 42.5); // } // } + +#[cfg(test)] +mod tests { + use super::PrecomputedOutput; + use crate::data_model::AggregationType; + use crate::data_model::KeyByLabelValues; + use crate::precompute_operators::set_aggregator_accumulator::SetAggregatorAccumulator; + + #[test] + fn create_precompute_from_bytes_supports_set_aggregator_arroyo() { + let mut acc = SetAggregatorAccumulator::new(); + acc.add_key(KeyByLabelValues::new_with_labels(vec![ + "src-1".to_string(), + "dst-1".to_string(), + ])); + let bytes = acc.serialize_to_bytes_arroyo(); + + let restored = PrecomputedOutput::create_precompute_from_bytes( + AggregationType::SetAggregator, + "", + &bytes, + ) + .expect("SetAggregator should deserialize from Arroyo bytes"); + + assert_eq!(restored.get_accumulator_type(), AggregationType::SetAggregator); + } +} diff --git a/asap-query-engine/src/engines/simple_engine.rs b/asap-query-engine/src/engines/simple_engine.rs index c0dabcfd..58085edf 100644 --- a/asap-query-engine/src/engines/simple_engine.rs +++ b/asap-query-engine/src/engines/simple_engine.rs @@ -159,6 +159,21 @@ pub struct SimpleEngine { } impl SimpleEngine { + fn sql_aggregation_statistics(aggregation_info: &AggregationInfo) -> Vec { + let name = aggregation_info.get_name().to_lowercase(); + if name == "count" + && aggregation_info + .get_args() + .iter() + .any(|arg| arg.eq_ignore_ascii_case("distinct")) + { + return vec![Statistic::Cardinality]; + } + name.parse::() + .map(|o| o.to_statistics()) + .unwrap_or_else(|_| panic!("Unsupported statistic: {}", name)) + } + pub fn new( store: Arc, // promsketch_store: Option>, @@ -1788,19 +1803,13 @@ impl SimpleEngine { let query_data = &match_result.query_data[0]; let metric = query_data.metric.clone(); - let statistic_name = match query_pattern_type { - QueryPatternType::OneTemporalOneSpatial => match_result.query_data[1] - .aggregation_info - .get_name() - .to_lowercase(), - _ => query_data.aggregation_info.get_name().to_lowercase(), + let statistics = match query_pattern_type { + QueryPatternType::OneTemporalOneSpatial => { + 
Self::sql_aggregation_statistics(&match_result.query_data[1].aggregation_info) + } + _ => Self::sql_aggregation_statistics(&query_data.aggregation_info), }; - let statistics: Vec = statistic_name - .parse::() - .map(|o| o.to_statistics()) - .unwrap_or_default(); - let data_range_ms = match query_pattern_type { QueryPatternType::OnlySpatial => None, QueryPatternType::OnlyTemporal => { @@ -2039,35 +2048,18 @@ impl SimpleEngine { }; // Statistic - determine based on query pattern type - let statistic_name = match query_pattern_type { + let statistics_to_compute = match query_pattern_type { QueryPatternType::OnlyTemporal => { - // Use the temporal aggregation (first subquery) - match_result.query_data[0] - .aggregation_info - .get_name() - .to_lowercase() + Self::sql_aggregation_statistics(&match_result.query_data[0].aggregation_info) } QueryPatternType::OneTemporalOneSpatial => { - // Use the temporal aggregation (second subquery contains temporal) - match_result.query_data[1] - .aggregation_info - .get_name() - .to_lowercase() + Self::sql_aggregation_statistics(&match_result.query_data[1].aggregation_info) } QueryPatternType::OnlySpatial => { - // Use the spatial aggregation (first subquery) - match_result.query_data[0] - .aggregation_info - .get_name() - .to_lowercase() + Self::sql_aggregation_statistics(&match_result.query_data[0].aggregation_info) } }; - let statistics_to_compute: Vec = statistic_name - .parse::() - .map(|o| o.to_statistics()) - .unwrap_or_else(|_| panic!("Unsupported statistic: {}", statistic_name)); - if statistics_to_compute.len() != 1 { warn!( "Expected exactly one statistic to compute, found {}", @@ -2180,15 +2172,8 @@ impl SimpleEngine { ); // Get the statistic from the aggregation - let statistic_name = match_result.query_data[0] - .aggregation_info - .get_name() - .to_lowercase(); - - let statistics_to_compute: Vec = statistic_name - .parse::() - .map(|o| o.to_statistics()) - .unwrap_or_else(|_| panic!("Unsupported statistic: {}", statistic_name)); + let statistics_to_compute = + Self::sql_aggregation_statistics(&match_result.query_data[0].aggregation_info); if statistics_to_compute.len() != 1 { warn!( @@ -3241,6 +3226,19 @@ impl SimpleEngine { let mut unformatted_results = HashMap::new(); for (key, precompute) in merged_outputs { + // SetAggregator stores an internal set (e.g., distinct dstip values). + // For cardinality queries we must keep the *outer* grouping key + // (e.g., srcip), not iterate internal set keys. + if *statistic == Statistic::Cardinality + && precompute.get_accumulator_type() == AggregationType::SetAggregator + { + let value = self + .query_precompute_for_statistic(precompute.as_ref(), statistic, &None, query_kwargs) + .map_err(|e| format!("Query failed: {}", e))?; + unformatted_results.insert(key.clone(), value); + continue; + } + if let Some(unwrapped_keys) = precompute.get_keys() { let keys_to_process = if enable_topk_limiting { self.limit_keys_for_topk(unwrapped_keys, statistic, query_kwargs)? @@ -3432,7 +3430,10 @@ impl SimpleEngine { } AggregationType::SetAggregator => { if let Some(set_acc) = precompute.as_any().downcast_ref::() { - if let Some(key_val) = key { + if *statistic == Statistic::Cardinality { + // COUNT(DISTINCT ...) is represented by SetAggregator; cardinality is set size. 
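+                        // `added` is the accumulator's internal set of distinct keys for this group,
+                        // so its length is the COUNT(DISTINCT ...) value returned here.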
+ Ok(set_acc.added.len() as f64) + } else if let Some(key_val) = key { use crate::data_model::MultipleSubpopulationAggregate; set_acc.query(*statistic, key_val, Some(query_kwargs)) } else { diff --git a/asap-query-engine/src/tests/sql_pattern_matching_tests.rs b/asap-query-engine/src/tests/sql_pattern_matching_tests.rs index b82cade8..1d88b242 100644 --- a/asap-query-engine/src/tests/sql_pattern_matching_tests.rs +++ b/asap-query-engine/src/tests/sql_pattern_matching_tests.rs @@ -13,6 +13,7 @@ mod tests { use crate::engines::simple_engine::SimpleEngine; use crate::stores::simple_map_store::SimpleMapStore; use promql_utilities::data_model::KeyByLabelNames; + use promql_utilities::query_logics::enums::Statistic; use sql_utilities::sqlhelper::{SQLSchema, Table}; use std::collections::{HashMap, HashSet}; use std::sync::Arc; @@ -294,4 +295,18 @@ mod tests { assert_eq!(keys, vec!["10.0.0.1", "10.0.0.3"]); } + #[test] + fn test_count_distinct_query_matches_now_template() { + let template = "SELECT L1, COUNT(DISTINCT value) AS unique_values FROM cpu_usage WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() GROUP BY L1"; + let engine = build_sql_engine(template, 1, 10); + + let incoming = "SELECT L1, COUNT(DISTINCT value) AS unique_values FROM cpu_usage WHERE time BETWEEN DATEADD(s, -10, '2025-10-01 00:00:10') AND '2025-10-01 00:00:10' GROUP BY L1"; + let query_time = 1727740810.0_f64; + + let context = engine.build_query_execution_context_sql(incoming.to_string(), query_time); + assert!(context.is_some(), "COUNT(DISTINCT ...) SQL query should match template"); + let context = context.unwrap(); + assert_eq!(context.metadata.statistic_to_compute, Statistic::Cardinality); + } + } diff --git a/asap-summary-ingest/run_arroyosketch.py b/asap-summary-ingest/run_arroyosketch.py index 0513cd69..7e308c14 100644 --- a/asap-summary-ingest/run_arroyosketch.py +++ b/asap-summary-ingest/run_arroyosketch.py @@ -909,6 +909,14 @@ def main(args): # Choose appropriate SQL template if streaming_aggregation_config.aggregationType == "deltasetaggregator": sql_template = deltasetaggregator_sql_template + elif ( + streaming_aggregation_config.aggregationType == "setaggregator" + and query_language == "sql" + ): + # SQL COUNT(DISTINCT value_column) must feed value_column into SetAggregator. + # The labels template ignores value_column and produces setaggregator_(''), + # which collapses cardinality to 1 per group. + sql_template = value_only_sql_template elif is_labels_accumulator: sql_template = labels_sql_template elif is_value_only_aggregation: diff --git a/asap-summary-ingest/tests/test_integration.py b/asap-summary-ingest/tests/test_integration.py index 86e47ad0..5c8581fa 100644 --- a/asap-summary-ingest/tests/test_integration.py +++ b/asap-summary-ingest/tests/test_integration.py @@ -253,6 +253,30 @@ def test_sql_query_no_label_prefix( assert '"region"' in sql_query +class TestTemplateSelectionForSetAggregatorSQL: + """Tests for SQL SetAggregator template selection in run_arroyosketch main loop.""" + + def test_sql_setaggregator_uses_value_only_template(self): + # This mirrors the template selection branch in run_arroyosketch.main(). 
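+        # (The branch is re-implemented here rather than imported, so keep it in sync if the
+        # selection logic in run_arroyosketch.main() changes.)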
+ query_language = "sql" + aggregation_type = "setaggregator" + is_labels_accumulator = aggregation_type in {"setaggregator", "deltasetaggregator"} + is_value_only_aggregation = aggregation_type == "datasketcheskll" + + if aggregation_type == "deltasetaggregator": + selected = "deltasetaggregator_sql_template" + elif aggregation_type == "setaggregator" and query_language == "sql": + selected = "value_only_sql_template" + elif is_labels_accumulator: + selected = "labels_sql_template" + elif is_value_only_aggregation: + selected = "value_only_sql_template" + else: + selected = "aggregation_sql_template" + + assert selected == "value_only_sql_template" + + class TestGetSqlQueryPromQL: """Tests for get_sql_query with PromQL mode (backward compatibility).""" diff --git a/asp_config/planner_out_count_distinct_dstip/inference_config.yaml b/asp_config/planner_out_count_distinct_dstip/inference_config.yaml new file mode 100644 index 00000000..3f574976 --- /dev/null +++ b/asp_config/planner_out_count_distinct_dstip/inference_config.yaml @@ -0,0 +1,28 @@ +cleanup_policy: + name: no_cleanup +queries: +- aggregations: + - aggregation_id: 41 + query: SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY unique_peers DESC LIMIT 10 +- aggregations: + - aggregation_id: 41 + query: SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY unique_peers DESC LIMIT 10 +- aggregations: + - aggregation_id: 41 + query: SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers FROM netflow_table WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY unique_peers DESC LIMIT 10 +- aggregations: + - aggregation_id: 41 + query: SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY unique_peers DESC LIMIT 10 +- aggregations: + - aggregation_id: 41 + query: SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers FROM netflow_table WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY unique_peers DESC LIMIT 10 +tables: +- name: netflow_table + time_column: time + value_columns: + - pkt_len + - dstip + metadata_columns: + - srcip + - dstip + - proto diff --git a/asp_config/planner_out_count_distinct_dstip/streaming_config.yaml b/asp_config/planner_out_count_distinct_dstip/streaming_config.yaml new file mode 100644 index 00000000..872aadce --- /dev/null +++ b/asp_config/planner_out_count_distinct_dstip/streaming_config.yaml @@ -0,0 +1,29 @@ +aggregations: +- aggregationId: 41 + aggregationSubType: '' + aggregationType: SetAggregator + labels: + aggregated: [] + grouping: + - srcip + rollup: + - dstip + - proto + metric: netflow_table + parameters: {} + slideInterval: 1 + spatialFilter: '' + table_name: netflow_table + value_column: dstip + windowSize: 1 + windowType: tumbling +tables: +- name: netflow_table + time_column: time + value_columns: + - pkt_len + - dstip + metadata_columns: + - srcip + - dstip + - proto diff --git a/asp_config/planner_sql_count_distinct_dstip_workload.yaml b/asp_config/planner_sql_count_distinct_dstip_workload.yaml new file mode 100644 index 00000000..a2a1e45d --- /dev/null +++ b/asp_config/planner_sql_count_distinct_dstip_workload.yaml @@ -0,0 +1,49 @@ +tables: + - name: netflow_table + time_column: time + 
value_columns: [pkt_len, dstip] + metadata_columns: [srcip, dstip, proto] +query_groups: + - id: 1 + repetition_delay: 1 + controller_options: + accuracy_sla: 0.95 + latency_sla: 100.0 + queries: + - >- + SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY srcip + ORDER BY unique_peers DESC + LIMIT 10 + - >- + SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY srcip + ORDER BY unique_peers DESC + LIMIT 10 + - >- + SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY srcip + ORDER BY unique_peers DESC + LIMIT 10 + - >- + SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY srcip + ORDER BY unique_peers DESC + LIMIT 10 + - >- + SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY srcip + ORDER BY unique_peers DESC + LIMIT 10 +aggregate_cleanup: + policy: no_cleanup diff --git a/benchmark_script/benchmark_count_distinct_dstip_windows.py b/benchmark_script/benchmark_count_distinct_dstip_windows.py new file mode 100644 index 00000000..a7279fb3 --- /dev/null +++ b/benchmark_script/benchmark_count_distinct_dstip_windows.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python3 +""" +Benchmark baseline ClickHouse vs ASAP query engine for: + COUNT(DISTINCT dstip) GROUP BY srcip + +Supports: +- Window presets: 1s, 5s, 10s, 1m, 5m +- Query variants: basic, orderby, orderby_limit +""" + +from __future__ import annotations + +import argparse +import os +import statistics +import subprocess +import sys +import time +from datetime import datetime + +BASELINE_URL = os.environ.get("BASELINE_URL", "http://localhost:8123/") +ASAP_URL = os.environ.get("ASAP_URL", "http://localhost:8088/clickhouse/query") +BASELINE_TABLE = os.environ.get("BASELINE_TABLE", "flow_table_streaming") +ASAP_TABLE = os.environ.get("ASAP_TABLE", "netflow_table") +DEFAULT_RUNS = int(os.environ.get("BENCHMARK_RUNS", "10")) +DROP_CACHES = os.environ.get("BENCHMARK_DROP_CACHES", "0") == "1" + +WINDOWS = { + "1s": (11, 10), # [-11s, -10s] => 1 second span + "5s": (15, 10), # [-15s, -10s] => 5 second span + "10s": (20, 10), # [-20s, -10s] => 10 second span + "1m": (70, 10), # [-70s, -10s] => 60 second span + "5m": (310, 10), # [-310s, -10s] => 300 second span +} + + +def drop_caches() -> None: + if not DROP_CACHES: + return + result = subprocess.run( + "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'", + shell=True, + capture_output=True, + text=True, + ) + if result.returncode != 0: + print("warning: drop_caches failed; continuing", file=sys.stderr) + time.sleep(0.2) + + +def curl_timed(url: str, query: str) -> float: + result = subprocess.run( + [ + "curl", + "-o", + "/dev/null", + "-s", + "-S", + "-w", + "%{http_code} %{time_total}", + "-G", + url, + "--data-urlencode", + f"query={query}", + ], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError(f"curl failed: {result.stderr}") + parts = result.stdout.strip().split() + if len(parts) != 2: + raise RuntimeError(f"unexpected curl output: {result.stdout!r}") + status, elapsed = parts[0], float(parts[1]) + if not status.startswith("2"): + raise RuntimeError(f"HTTP 
{status}") + return elapsed + + +def build_query( + table: str, + query_type: str, + start_ago_s: int, + end_ago_s: int, + fixed_end: str | None, +) -> str: + if fixed_end: + where = ( + f"time BETWEEN DATEADD(s, -{start_ago_s}, '{fixed_end}') " + f"AND DATEADD(s, -{end_ago_s}, '{fixed_end}')" + ) + else: + where = ( + f"time BETWEEN DATEADD(s, -{start_ago_s}, NOW()) " + f"AND DATEADD(s, -{end_ago_s}, NOW())" + ) + + base = ( + "SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers\n" + f"FROM {table}\n" + f"WHERE {where}\n" + "GROUP BY srcip\n" + ) + + if query_type == "basic": + suffix = "FORMAT TabSeparated" + elif query_type == "orderby": + suffix = "ORDER BY unique_peers DESC\nFORMAT TabSeparated" + elif query_type == "orderby_limit": + suffix = "ORDER BY unique_peers DESC\nLIMIT 10\nFORMAT TabSeparated" + else: + raise ValueError(f"unknown query_type: {query_type}") + + return base + suffix + + +def print_stats(label: str, values: list[float]) -> None: + print( + f"{label:<9} avg={statistics.mean(values):.4f}s " + f"med={statistics.median(values):.4f}s " + f"min={min(values):.4f}s max={max(values):.4f}s" + ) + + +def run_variant( + window_name: str, + query_type: str, + runs: int, + warmup: int, + fixed_end: str | None, +) -> None: + start_ago, end_ago = WINDOWS[window_name] + baseline_query = build_query(BASELINE_TABLE, query_type, start_ago, end_ago, fixed_end) + asap_query = build_query(ASAP_TABLE, query_type, start_ago, end_ago, fixed_end) + + print(f"\n=== Window: {window_name} | Variant: {query_type} ===") + print(f"Baseline: {BASELINE_URL} table={BASELINE_TABLE}") + print(f"ASAP: {ASAP_URL} table={ASAP_TABLE}") + print(f"Runs: {runs} warmup={warmup}") + print("-" * 78) + + for i in range(warmup): + drop_caches() + _ = curl_timed(BASELINE_URL, baseline_query) + drop_caches() + _ = curl_timed(ASAP_URL, asap_query) + print(f"warmup {i + 1}/{warmup} done") + + baseline_times: list[float] = [] + asap_times: list[float] = [] + for i in range(runs): + drop_caches() + baseline_t = curl_timed(BASELINE_URL, baseline_query) + baseline_times.append(baseline_t) + + drop_caches() + asap_t = curl_timed(ASAP_URL, asap_query) + asap_times.append(asap_t) + + print( + f"run {i + 1:>2}/{runs}: baseline={baseline_t:.4f}s " + f"asap={asap_t:.4f}s speedup={baseline_t / asap_t if asap_t > 0 else 0.0:.2f}x" + ) + + print("-" * 78) + print_stats("baseline", baseline_times) + print_stats("asap", asap_times) + avg_speedup = statistics.mean(baseline_times) / statistics.mean(asap_times) + med_speedup = statistics.median(baseline_times) / statistics.median(asap_times) + print(f"speedup avg={avg_speedup:.2f}x med={med_speedup:.2f}x") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Benchmark COUNT(DISTINCT dstip) query across windows and variants." + ) + parser.add_argument("--runs", type=int, default=DEFAULT_RUNS) + parser.add_argument("--warmup", type=int, default=2) + parser.add_argument( + "--window", + choices=["all", "1s", "5s", "10s", "1m", "5m"], + default="all", + help="Window size preset to benchmark.", + ) + parser.add_argument( + "--query-type", + choices=["all", "basic", "orderby", "orderby_limit"], + default="all", + help="Query variant to benchmark.", + ) + parser.add_argument( + "--fixed-end", + metavar="TS", + help="Use fixed end timestamp, e.g. 
'2026-04-13 23:50:00'", + ) + parser.add_argument( + "--use-live-now", + action="store_true", + help="Deprecated no-op: live NOW() is already the default.", + ) + args = parser.parse_args() + + runs = max(1, args.runs) + warmup = max(0, args.warmup) + windows = ["1s", "5s", "10s", "1m", "5m"] if args.window == "all" else [args.window] + variants = ( + ["basic", "orderby", "orderby_limit"] + if args.query_type == "all" + else [args.query_type] + ) + + if args.fixed_end: + effective_fixed_end = args.fixed_end + else: + # Default behavior: live NOW() window (matches existing benchmark workflow). + effective_fixed_end = None + + print(f"Started: {datetime.now()}") + print(f"Windows: {', '.join(windows)}") + print(f"Variants: {', '.join(variants)}") + print(f"FixedEnd: {effective_fixed_end if effective_fixed_end else 'LIVE_NOW'}") + print(f"Caches: {'drop each request' if DROP_CACHES else 'normal OS cache'}") + + for window_name in windows: + for query_type in variants: + run_variant(window_name, query_type, runs, warmup, effective_fixed_end) + + print(f"Finished: {datetime.now()}") + + +if __name__ == "__main__": + main() From 37f9d1cae02accba7cc8bd31128d28588c520d09 Mon Sep 17 00:00:00 2001 From: Akanksha Akkihal Date: Sun, 19 Apr 2026 22:19:15 +0000 Subject: [PATCH 07/10] updated configs --- .../inference_config.yaml | 20 +++- .../streaming_config.yaml | 5 +- .../inference_config.yaml | 3 + .../inference_config.yaml | 12 ++ .../inference_config.yaml | 78 +++++++++++++ .../streaming_config.yaml | 104 ++++++++++++++++++ .../planner_sql_count_srcip_workload.yaml | 20 ++++ .../planner_sql_max_dstip_workload.yaml | 7 ++ asp_config/planner_sql_quantile_workload.yaml | 28 +++++ 9 files changed, 272 insertions(+), 5 deletions(-) create mode 100644 asp_config/planner_out_universal/inference_config.yaml create mode 100644 asp_config/planner_out_universal/streaming_config.yaml diff --git a/asp_config/planner_out_count_srcip/inference_config.yaml b/asp_config/planner_out_count_srcip/inference_config.yaml index 7fd0d61d..8e16feeb 100644 --- a/asp_config/planner_out_count_srcip/inference_config.yaml +++ b/asp_config/planner_out_count_srcip/inference_config.yaml @@ -2,9 +2,25 @@ cleanup_policy: name: no_cleanup queries: - aggregations: - - aggregation_id: 1 - - aggregation_id: 2 + - aggregation_id: 51 + - aggregation_id: 52 query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip +- aggregations: + - aggregation_id: 51 + - aggregation_id: 52 + query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip +- aggregations: + - aggregation_id: 51 + - aggregation_id: 52 + query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip +- aggregations: + - aggregation_id: 51 + - aggregation_id: 52 + query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip +- aggregations: + - aggregation_id: 51 + - aggregation_id: 52 + query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip tables: - name: netflow_table time_column: time diff --git a/asp_config/planner_out_count_srcip/streaming_config.yaml 
b/asp_config/planner_out_count_srcip/streaming_config.yaml index 502aac5c..321b563e 100644 --- a/asp_config/planner_out_count_srcip/streaming_config.yaml +++ b/asp_config/planner_out_count_srcip/streaming_config.yaml @@ -1,5 +1,5 @@ aggregations: -- aggregationId: 1 +- aggregationId: 51 aggregationSubType: '' aggregationType: DeltaSetAggregator labels: @@ -17,7 +17,7 @@ aggregations: value_column: pkt_len windowSize: 1 windowType: tumbling -- aggregationId: 2 +- aggregationId: 52 aggregationSubType: count aggregationType: CountMinSketch labels: @@ -31,7 +31,6 @@ aggregations: parameters: depth: 3 width: 1024 - _impl_mode: Sketchlib slideInterval: 1 spatialFilter: '' table_name: netflow_table diff --git a/asp_config/planner_out_max_dstip/inference_config.yaml b/asp_config/planner_out_max_dstip/inference_config.yaml index 90432d15..f7fc5dc8 100644 --- a/asp_config/planner_out_max_dstip/inference_config.yaml +++ b/asp_config/planner_out_max_dstip/inference_config.yaml @@ -7,6 +7,9 @@ queries: - aggregations: - aggregation_id: 31 query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 31 + query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 - aggregations: - aggregation_id: 31 query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 diff --git a/asp_config/planner_out_quantile/inference_config.yaml b/asp_config/planner_out_quantile/inference_config.yaml index 66468868..c7749d1a 100644 --- a/asp_config/planner_out_quantile/inference_config.yaml +++ b/asp_config/planner_out_quantile/inference_config.yaml @@ -4,6 +4,18 @@ queries: - aggregations: - aggregation_id: 21 query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 21 + query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 21 + query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 21 + query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 21 + query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 tables: - name: netflow_table time_column: time diff --git a/asp_config/planner_out_universal/inference_config.yaml b/asp_config/planner_out_universal/inference_config.yaml new file mode 100644 index 00000000..5c3e500c --- /dev/null 
+++ b/asp_config/planner_out_universal/inference_config.yaml @@ -0,0 +1,78 @@ +cleanup_policy: + name: no_cleanup +queries: +- aggregations: + - aggregation_id: 21 + query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 21 + query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 21 + query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 21 + query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 21 + query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 31 + query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 31 + query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 31 + query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 31 + query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 31 + query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 +- aggregations: + - aggregation_id: 41 + query: SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY unique_peers DESC LIMIT 10 +- aggregations: + - aggregation_id: 41 + query: SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY unique_peers DESC LIMIT 10 +- aggregations: + - aggregation_id: 41 + query: SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers FROM netflow_table WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY unique_peers DESC LIMIT 10 +- aggregations: + - aggregation_id: 41 + query: SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY 
unique_peers DESC LIMIT 10 +- aggregations: + - aggregation_id: 41 + query: SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers FROM netflow_table WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY unique_peers DESC LIMIT 10 +- aggregations: + - aggregation_id: 51 + - aggregation_id: 52 + query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip +- aggregations: + - aggregation_id: 51 + - aggregation_id: 52 + query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip +- aggregations: + - aggregation_id: 51 + - aggregation_id: 52 + query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip +- aggregations: + - aggregation_id: 51 + - aggregation_id: 52 + query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip +- aggregations: + - aggregation_id: 51 + - aggregation_id: 52 + query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip +tables: +- name: netflow_table + time_column: time + value_columns: + - pkt_len + - dstip + metadata_columns: + - srcip + - dstip + - proto diff --git a/asp_config/planner_out_universal/streaming_config.yaml b/asp_config/planner_out_universal/streaming_config.yaml new file mode 100644 index 00000000..e04a3d09 --- /dev/null +++ b/asp_config/planner_out_universal/streaming_config.yaml @@ -0,0 +1,104 @@ +aggregations: +- aggregationId: 21 + aggregationSubType: '' + aggregationType: DatasketchesKLL + labels: + aggregated: [] + grouping: + - dstip + - proto + - srcip + rollup: [] + metric: netflow_table + parameters: + K: 20 + slideInterval: 1 + spatialFilter: '' + table_name: netflow_table + value_column: pkt_len + windowSize: 1 + windowType: tumbling +- aggregationId: 31 + aggregationSubType: max + aggregationType: MultipleMinMax + labels: + aggregated: + - dstip + grouping: [] + rollup: + - proto + - srcip + metric: netflow_table + parameters: {} + slideInterval: 1 + spatialFilter: '' + table_name: netflow_table + value_column: pkt_len + windowSize: 1 + windowType: tumbling +- aggregationId: 41 + aggregationSubType: '' + aggregationType: SetAggregator + labels: + aggregated: [] + grouping: + - srcip + rollup: + - dstip + - proto + metric: netflow_table + parameters: {} + slideInterval: 1 + spatialFilter: '' + table_name: netflow_table + value_column: dstip + windowSize: 1 + windowType: tumbling +- aggregationId: 51 + aggregationSubType: '' + aggregationType: DeltaSetAggregator + labels: + aggregated: + - srcip + grouping: [] + rollup: + - dstip + - proto + metric: netflow_table + parameters: {} + slideInterval: 1 + spatialFilter: '' + table_name: netflow_table + value_column: pkt_len + windowSize: 1 + windowType: tumbling +- aggregationId: 52 + aggregationSubType: count + aggregationType: CountMinSketch + labels: + aggregated: + - srcip + grouping: [] + rollup: + - dstip + - proto + metric: netflow_table + parameters: + depth: 3 + width: 1024 + slideInterval: 1 + spatialFilter: '' + table_name: netflow_table + value_column: pkt_len + windowSize: 1 + windowType: tumbling +tables: +- name: netflow_table + time_column: time + 
value_columns: + - pkt_len + - dstip + metadata_columns: + - srcip + - dstip + - proto diff --git a/asp_config/planner_sql_count_srcip_workload.yaml b/asp_config/planner_sql_count_srcip_workload.yaml index 4a8677d0..bd001a59 100644 --- a/asp_config/planner_sql_count_srcip_workload.yaml +++ b/asp_config/planner_sql_count_srcip_workload.yaml @@ -15,5 +15,25 @@ query_groups: FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip + - >- + SELECT srcip, COUNT(pkt_len) AS transfer_events + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY srcip + - >- + SELECT srcip, COUNT(pkt_len) AS transfer_events + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY srcip + - >- + SELECT srcip, COUNT(pkt_len) AS transfer_events + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY srcip + - >- + SELECT srcip, COUNT(pkt_len) AS transfer_events + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY srcip aggregate_cleanup: policy: no_cleanup diff --git a/asp_config/planner_sql_max_dstip_workload.yaml b/asp_config/planner_sql_max_dstip_workload.yaml index 242cd579..6c2f5ef1 100644 --- a/asp_config/planner_sql_max_dstip_workload.yaml +++ b/asp_config/planner_sql_max_dstip_workload.yaml @@ -24,6 +24,13 @@ query_groups: GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 + - >- + SELECT dstip, MAX(pkt_len) AS max_pkt_len + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY dstip + ORDER BY max_pkt_len DESC + LIMIT 10 - >- SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table diff --git a/asp_config/planner_sql_quantile_workload.yaml b/asp_config/planner_sql_quantile_workload.yaml index a4fc7f15..437fc127 100644 --- a/asp_config/planner_sql_quantile_workload.yaml +++ b/asp_config/planner_sql_quantile_workload.yaml @@ -17,5 +17,33 @@ query_groups: GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 + - >- + SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY proto, srcip, dstip + ORDER BY p99_pkt_len DESC + LIMIT 10 + - >- + SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY proto, srcip, dstip + ORDER BY p99_pkt_len DESC + LIMIT 10 + - >- + SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY proto, srcip, dstip + ORDER BY p99_pkt_len DESC + LIMIT 10 + - >- + SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len + FROM netflow_table + WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) + GROUP BY proto, srcip, dstip + ORDER BY p99_pkt_len DESC + LIMIT 10 aggregate_cleanup: policy: no_cleanup From ca044241c34a026fd5dd958bc342676a6bf40b08 Mon Sep 17 00:00:00 2001 From: Akanksha Akkihal Date: Sun, 19 Apr 2026 22:57:20 +0000 Subject: [PATCH 08/10] error fix --- asap-summary-ingest/run_arroyosketch.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/asap-summary-ingest/run_arroyosketch.py b/asap-summary-ingest/run_arroyosketch.py index 7e308c14..93d53e7a 100644 --- 
a/asap-summary-ingest/run_arroyosketch.py +++ b/asap-summary-ingest/run_arroyosketch.py @@ -953,13 +953,15 @@ def main(args): parameters = dict(parameters) if agg_function in ("countminsketch_count", "countminsketch_sum"): - parameters["impl_mode"] = getattr( - args, "sketch_cms_impl", "legacy" - ).capitalize() + impl_mode = getattr(args, "sketch_cms_impl", "legacy").capitalize() + # Some Jinja templates expose `_impl_mode` via internal set/default usage. + # Provide both keys so template variable discovery and rendering are robust. + parameters["impl_mode"] = impl_mode + parameters["_impl_mode"] = impl_mode elif agg_function == "countminsketchwithheap_topk": - parameters["impl_mode"] = getattr( - args, "sketch_cmwh_impl", "legacy" - ).capitalize() + impl_mode = getattr(args, "sketch_cmwh_impl", "legacy").capitalize() + parameters["impl_mode"] = impl_mode + parameters["_impl_mode"] = impl_mode sql_queries.append(sql_query) # if not is_labels_accumulator: From 6798b5438d0a103e94f5c70ac3f0bd62360f6360 Mon Sep 17 00:00:00 2001 From: Akanksha Akkihal Date: Sun, 19 Apr 2026 23:39:50 +0000 Subject: [PATCH 09/10] Revert "error fix" This reverts commit ca044241c34a026fd5dd958bc342676a6bf40b08. --- asap-summary-ingest/run_arroyosketch.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/asap-summary-ingest/run_arroyosketch.py b/asap-summary-ingest/run_arroyosketch.py index 93d53e7a..7e308c14 100644 --- a/asap-summary-ingest/run_arroyosketch.py +++ b/asap-summary-ingest/run_arroyosketch.py @@ -953,15 +953,13 @@ def main(args): parameters = dict(parameters) if agg_function in ("countminsketch_count", "countminsketch_sum"): - impl_mode = getattr(args, "sketch_cms_impl", "legacy").capitalize() - # Some Jinja templates expose `_impl_mode` via internal set/default usage. - # Provide both keys so template variable discovery and rendering are robust. - parameters["impl_mode"] = impl_mode - parameters["_impl_mode"] = impl_mode + parameters["impl_mode"] = getattr( + args, "sketch_cms_impl", "legacy" + ).capitalize() elif agg_function == "countminsketchwithheap_topk": - impl_mode = getattr(args, "sketch_cmwh_impl", "legacy").capitalize() - parameters["impl_mode"] = impl_mode - parameters["_impl_mode"] = impl_mode + parameters["impl_mode"] = getattr( + args, "sketch_cmwh_impl", "legacy" + ).capitalize() sql_queries.append(sql_query) # if not is_labels_accumulator: From ec9e6920b6cede700f592f14c1f28a9f94f46721 Mon Sep 17 00:00:00 2001 From: Akanksha Akkihal Date: Sun, 19 Apr 2026 23:41:11 +0000 Subject: [PATCH 10/10] Revert "updated configs" This reverts commit 37f9d1cae02accba7cc8bd31128d28588c520d09. 
--- .../inference_config.yaml | 20 +--- .../streaming_config.yaml | 5 +- .../inference_config.yaml | 3 - .../inference_config.yaml | 12 -- .../inference_config.yaml | 78 ------------- .../streaming_config.yaml | 104 ------------------ .../planner_sql_count_srcip_workload.yaml | 20 ---- .../planner_sql_max_dstip_workload.yaml | 7 -- asp_config/planner_sql_quantile_workload.yaml | 28 ----- 9 files changed, 5 insertions(+), 272 deletions(-) delete mode 100644 asp_config/planner_out_universal/inference_config.yaml delete mode 100644 asp_config/planner_out_universal/streaming_config.yaml diff --git a/asp_config/planner_out_count_srcip/inference_config.yaml b/asp_config/planner_out_count_srcip/inference_config.yaml index 8e16feeb..7fd0d61d 100644 --- a/asp_config/planner_out_count_srcip/inference_config.yaml +++ b/asp_config/planner_out_count_srcip/inference_config.yaml @@ -2,25 +2,9 @@ cleanup_policy: name: no_cleanup queries: - aggregations: - - aggregation_id: 51 - - aggregation_id: 52 + - aggregation_id: 1 + - aggregation_id: 2 query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip -- aggregations: - - aggregation_id: 51 - - aggregation_id: 52 - query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip -- aggregations: - - aggregation_id: 51 - - aggregation_id: 52 - query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip -- aggregations: - - aggregation_id: 51 - - aggregation_id: 52 - query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip -- aggregations: - - aggregation_id: 51 - - aggregation_id: 52 - query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip tables: - name: netflow_table time_column: time diff --git a/asp_config/planner_out_count_srcip/streaming_config.yaml b/asp_config/planner_out_count_srcip/streaming_config.yaml index 321b563e..502aac5c 100644 --- a/asp_config/planner_out_count_srcip/streaming_config.yaml +++ b/asp_config/planner_out_count_srcip/streaming_config.yaml @@ -1,5 +1,5 @@ aggregations: -- aggregationId: 51 +- aggregationId: 1 aggregationSubType: '' aggregationType: DeltaSetAggregator labels: @@ -17,7 +17,7 @@ aggregations: value_column: pkt_len windowSize: 1 windowType: tumbling -- aggregationId: 52 +- aggregationId: 2 aggregationSubType: count aggregationType: CountMinSketch labels: @@ -31,6 +31,7 @@ aggregations: parameters: depth: 3 width: 1024 + _impl_mode: Sketchlib slideInterval: 1 spatialFilter: '' table_name: netflow_table diff --git a/asp_config/planner_out_max_dstip/inference_config.yaml b/asp_config/planner_out_max_dstip/inference_config.yaml index f7fc5dc8..90432d15 100644 --- a/asp_config/planner_out_max_dstip/inference_config.yaml +++ b/asp_config/planner_out_max_dstip/inference_config.yaml @@ -7,9 +7,6 @@ queries: - aggregations: - aggregation_id: 31 query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 -- aggregations: - - aggregation_id: 31 - query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table 
WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 - aggregations: - aggregation_id: 31 query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 diff --git a/asp_config/planner_out_quantile/inference_config.yaml b/asp_config/planner_out_quantile/inference_config.yaml index c7749d1a..66468868 100644 --- a/asp_config/planner_out_quantile/inference_config.yaml +++ b/asp_config/planner_out_quantile/inference_config.yaml @@ -4,18 +4,6 @@ queries: - aggregations: - aggregation_id: 21 query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 -- aggregations: - - aggregation_id: 21 - query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 -- aggregations: - - aggregation_id: 21 - query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 -- aggregations: - - aggregation_id: 21 - query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 -- aggregations: - - aggregation_id: 21 - query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 tables: - name: netflow_table time_column: time diff --git a/asp_config/planner_out_universal/inference_config.yaml b/asp_config/planner_out_universal/inference_config.yaml deleted file mode 100644 index 5c3e500c..00000000 --- a/asp_config/planner_out_universal/inference_config.yaml +++ /dev/null @@ -1,78 +0,0 @@ -cleanup_policy: - name: no_cleanup -queries: -- aggregations: - - aggregation_id: 21 - query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 -- aggregations: - - aggregation_id: 21 - query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 -- aggregations: - - aggregation_id: 21 - query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 -- aggregations: - - aggregation_id: 21 - query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 -- aggregations: - - aggregation_id: 21 - query: SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len FROM netflow_table WHERE 
time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 -- aggregations: - - aggregation_id: 31 - query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 -- aggregations: - - aggregation_id: 31 - query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 -- aggregations: - - aggregation_id: 31 - query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 -- aggregations: - - aggregation_id: 31 - query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 -- aggregations: - - aggregation_id: 31 - query: SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 -- aggregations: - - aggregation_id: 41 - query: SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY unique_peers DESC LIMIT 10 -- aggregations: - - aggregation_id: 41 - query: SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY unique_peers DESC LIMIT 10 -- aggregations: - - aggregation_id: 41 - query: SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers FROM netflow_table WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY unique_peers DESC LIMIT 10 -- aggregations: - - aggregation_id: 41 - query: SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY unique_peers DESC LIMIT 10 -- aggregations: - - aggregation_id: 41 - query: SELECT srcip, COUNT(DISTINCT dstip) AS unique_peers FROM netflow_table WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip ORDER BY unique_peers DESC LIMIT 10 -- aggregations: - - aggregation_id: 51 - - aggregation_id: 52 - query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip -- aggregations: - - aggregation_id: 51 - - aggregation_id: 52 - query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip -- aggregations: - - aggregation_id: 51 - - aggregation_id: 52 - query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip -- aggregations: - - aggregation_id: 51 - - aggregation_id: 52 - query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip -- aggregations: - - aggregation_id: 51 - - aggregation_id: 52 - query: SELECT srcip, COUNT(pkt_len) AS transfer_events FROM netflow_table WHERE time BETWEEN DATEADD(s, -310, NOW()) AND 
DATEADD(s, -10, NOW()) GROUP BY srcip -tables: -- name: netflow_table - time_column: time - value_columns: - - pkt_len - - dstip - metadata_columns: - - srcip - - dstip - - proto diff --git a/asp_config/planner_out_universal/streaming_config.yaml b/asp_config/planner_out_universal/streaming_config.yaml deleted file mode 100644 index e04a3d09..00000000 --- a/asp_config/planner_out_universal/streaming_config.yaml +++ /dev/null @@ -1,104 +0,0 @@ -aggregations: -- aggregationId: 21 - aggregationSubType: '' - aggregationType: DatasketchesKLL - labels: - aggregated: [] - grouping: - - dstip - - proto - - srcip - rollup: [] - metric: netflow_table - parameters: - K: 20 - slideInterval: 1 - spatialFilter: '' - table_name: netflow_table - value_column: pkt_len - windowSize: 1 - windowType: tumbling -- aggregationId: 31 - aggregationSubType: max - aggregationType: MultipleMinMax - labels: - aggregated: - - dstip - grouping: [] - rollup: - - proto - - srcip - metric: netflow_table - parameters: {} - slideInterval: 1 - spatialFilter: '' - table_name: netflow_table - value_column: pkt_len - windowSize: 1 - windowType: tumbling -- aggregationId: 41 - aggregationSubType: '' - aggregationType: SetAggregator - labels: - aggregated: [] - grouping: - - srcip - rollup: - - dstip - - proto - metric: netflow_table - parameters: {} - slideInterval: 1 - spatialFilter: '' - table_name: netflow_table - value_column: dstip - windowSize: 1 - windowType: tumbling -- aggregationId: 51 - aggregationSubType: '' - aggregationType: DeltaSetAggregator - labels: - aggregated: - - srcip - grouping: [] - rollup: - - dstip - - proto - metric: netflow_table - parameters: {} - slideInterval: 1 - spatialFilter: '' - table_name: netflow_table - value_column: pkt_len - windowSize: 1 - windowType: tumbling -- aggregationId: 52 - aggregationSubType: count - aggregationType: CountMinSketch - labels: - aggregated: - - srcip - grouping: [] - rollup: - - dstip - - proto - metric: netflow_table - parameters: - depth: 3 - width: 1024 - slideInterval: 1 - spatialFilter: '' - table_name: netflow_table - value_column: pkt_len - windowSize: 1 - windowType: tumbling -tables: -- name: netflow_table - time_column: time - value_columns: - - pkt_len - - dstip - metadata_columns: - - srcip - - dstip - - proto diff --git a/asp_config/planner_sql_count_srcip_workload.yaml b/asp_config/planner_sql_count_srcip_workload.yaml index bd001a59..4a8677d0 100644 --- a/asp_config/planner_sql_count_srcip_workload.yaml +++ b/asp_config/planner_sql_count_srcip_workload.yaml @@ -15,25 +15,5 @@ query_groups: FROM netflow_table WHERE time BETWEEN DATEADD(s, -11, NOW()) AND DATEADD(s, -10, NOW()) GROUP BY srcip - - >- - SELECT srcip, COUNT(pkt_len) AS transfer_events - FROM netflow_table - WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) - GROUP BY srcip - - >- - SELECT srcip, COUNT(pkt_len) AS transfer_events - FROM netflow_table - WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) - GROUP BY srcip - - >- - SELECT srcip, COUNT(pkt_len) AS transfer_events - FROM netflow_table - WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) - GROUP BY srcip - - >- - SELECT srcip, COUNT(pkt_len) AS transfer_events - FROM netflow_table - WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) - GROUP BY srcip aggregate_cleanup: policy: no_cleanup diff --git a/asp_config/planner_sql_max_dstip_workload.yaml b/asp_config/planner_sql_max_dstip_workload.yaml index 6c2f5ef1..242cd579 100644 --- 
a/asp_config/planner_sql_max_dstip_workload.yaml +++ b/asp_config/planner_sql_max_dstip_workload.yaml @@ -24,13 +24,6 @@ query_groups: GROUP BY dstip ORDER BY max_pkt_len DESC LIMIT 10 - - >- - SELECT dstip, MAX(pkt_len) AS max_pkt_len - FROM netflow_table - WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) - GROUP BY dstip - ORDER BY max_pkt_len DESC - LIMIT 10 - >- SELECT dstip, MAX(pkt_len) AS max_pkt_len FROM netflow_table diff --git a/asp_config/planner_sql_quantile_workload.yaml b/asp_config/planner_sql_quantile_workload.yaml index 437fc127..a4fc7f15 100644 --- a/asp_config/planner_sql_quantile_workload.yaml +++ b/asp_config/planner_sql_quantile_workload.yaml @@ -17,33 +17,5 @@ query_groups: GROUP BY proto, srcip, dstip ORDER BY p99_pkt_len DESC LIMIT 10 - - >- - SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len - FROM netflow_table - WHERE time BETWEEN DATEADD(s, -15, NOW()) AND DATEADD(s, -10, NOW()) - GROUP BY proto, srcip, dstip - ORDER BY p99_pkt_len DESC - LIMIT 10 - - >- - SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len - FROM netflow_table - WHERE time BETWEEN DATEADD(s, -20, NOW()) AND DATEADD(s, -10, NOW()) - GROUP BY proto, srcip, dstip - ORDER BY p99_pkt_len DESC - LIMIT 10 - - >- - SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len - FROM netflow_table - WHERE time BETWEEN DATEADD(s, -70, NOW()) AND DATEADD(s, -10, NOW()) - GROUP BY proto, srcip, dstip - ORDER BY p99_pkt_len DESC - LIMIT 10 - - >- - SELECT proto, srcip, dstip, quantile(0.99)(pkt_len) AS p99_pkt_len - FROM netflow_table - WHERE time BETWEEN DATEADD(s, -310, NOW()) AND DATEADD(s, -10, NOW()) - GROUP BY proto, srcip, dstip - ORDER BY p99_pkt_len DESC - LIMIT 10 aggregate_cleanup: policy: no_cleanup