From fe928f5c79b91402ff6fba42892f86609599bd62 Mon Sep 17 00:00:00 2001 From: STWang Date: Thu, 2 Apr 2026 15:42:51 -0600 Subject: [PATCH 01/10] initial --- .../execution-utilities/benchmark/README.md | 344 ++++++++++++++ .../configs/clickbench_hits_init.sql | 115 +++++ .../configs/clickbench_inference.yaml | 21 + .../configs/clickbench_streaming.yaml | 26 ++ .../benchmark/configs/h2o_inference.yaml | 20 + .../benchmark/configs/h2o_init.sql | 20 + .../benchmark/configs/h2o_streaming.yaml | 26 ++ .../benchmark/download_dataset.py | 164 +++++++ .../benchmark/export_to_arroyo.py | 254 ++++++++++ .../benchmark/export_to_database.py | 353 ++++++++++++++ .../benchmark/generate_queries.py | 390 ++++++++++++++++ .../benchmark/prepare_data.py | 187 ++++++++ .../benchmark/requirements.txt | 5 + .../benchmark/run_benchmark.py | 434 ++++++++++++++++++ 14 files changed, 2359 insertions(+) create mode 100644 asap-tools/execution-utilities/benchmark/README.md create mode 100644 asap-tools/execution-utilities/benchmark/configs/clickbench_hits_init.sql create mode 100644 asap-tools/execution-utilities/benchmark/configs/clickbench_inference.yaml create mode 100644 asap-tools/execution-utilities/benchmark/configs/clickbench_streaming.yaml create mode 100644 asap-tools/execution-utilities/benchmark/configs/h2o_inference.yaml create mode 100644 asap-tools/execution-utilities/benchmark/configs/h2o_init.sql create mode 100644 asap-tools/execution-utilities/benchmark/configs/h2o_streaming.yaml create mode 100644 asap-tools/execution-utilities/benchmark/download_dataset.py create mode 100644 asap-tools/execution-utilities/benchmark/export_to_arroyo.py create mode 100644 asap-tools/execution-utilities/benchmark/export_to_database.py create mode 100644 asap-tools/execution-utilities/benchmark/generate_queries.py create mode 100644 asap-tools/execution-utilities/benchmark/prepare_data.py create mode 100644 asap-tools/execution-utilities/benchmark/requirements.txt create mode 100644 
asap-tools/execution-utilities/benchmark/run_benchmark.py diff --git a/asap-tools/execution-utilities/benchmark/README.md b/asap-tools/execution-utilities/benchmark/README.md new file mode 100644 index 00000000..9a0608cc --- /dev/null +++ b/asap-tools/execution-utilities/benchmark/README.md @@ -0,0 +1,344 @@ +# ASAP Generalized Benchmark Pipeline + +Measures ASAP query latency (KLL sketch) against ClickHouse baseline for +arbitrary datasets. Supports ClickBench and H2O groupby out of the box. + +## Architecture + +``` +data_file → prepare_data.py → arroyo_file.json + ↓ + export_to_arroyo.py (file source) + ↓ + sketch_topic (Kafka) + ↓ + QueryEngineRust :8088 + ↓ +data_file → export_to_database.py run_benchmark.py → results/ + ↓ + ClickHouse :8123 (baseline) +``` + +**Key difference from the old pipeline:** Arroyo reads directly from a local +file (`single_file_custom` connector) rather than from a Kafka input topic. +Kafka is still required for the **sketch output** topic (`sketch_topic`). + +--- + +## Prerequisites + +```bash +export INSTALL_DIR=/scratch/sketch_db_for_prometheus +pip3 install --user -r requirements.txt + +# Build binaries (one-time) +cd ~/ASAPQuery/asap-query-engine && cargo build --release +``` + +--- + +## ClickBench + ClickHouse End-to-End Example + +### Step 1 — Download dataset + +```bash +cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark +python download_dataset.py --dataset clickbench --output-dir ./data +``` + +Optionally limit to 1M rows: + +```bash +cd ./data +mv hits.json.gz hits_full.json.gz +zcat hits_full.json.gz | head -n 1000000 | gzip > hits.json.gz +``` + +### Step 2 — Prepare data for Arroyo file source + +The Arroyo file source requires RFC3339 timestamps and string metadata columns. 
+This step converts the raw ClickBench JSON: + +```bash +python prepare_data.py \ + --dataset clickbench \ + --input ./data/hits.json.gz \ + --output ./data/hits_arroyo.json \ + --max-rows 1000000 +``` + +This produces `hits_arroyo.json` with: +- `EventTime` converted from `"2013-07-14 20:38:47"` → `"2013-07-14T20:38:47Z"` +- `RegionID`, `OS`, `UserAgent`, `TraficSourceID` as strings +- Records sorted by `EventTime` + +### Step 3 — Start infrastructure + +```bash +# Kafka +~/ASAPQuery/asap-tools/installation/kafka/run.sh $INSTALL_DIR/kafka + +# Create sketch output topic +KAFKA=$INSTALL_DIR/kafka/bin +$KAFKA/kafka-topics.sh --bootstrap-server localhost:9092 --create \ + --topic sketch_topic --partitions 1 --replication-factor 1 \ + --config max.message.bytes=20971520 + +# ClickHouse +~/ASAPQuery/asap-tools/installation/clickhouse/run.sh $INSTALL_DIR +``` + +### Step 4 — Start Arroyo cluster + +```bash +~/ASAPQuery/asap-summary-ingest/target/release/arroyo \ + --config ~/ASAPQuery/asap-summary-ingest/config.yaml cluster \ + > /tmp/arroyo.log 2>&1 & +``` + +### Step 5 — Launch Arroyo sketch pipeline (file source) + +```bash +python export_to_arroyo.py \ + --streaming-config ./configs/clickbench_streaming.yaml \ + --source-type file \ + --input-file ./data/hits_arroyo.json \ + --file-format json \ + --ts-format rfc3339 \ + --pipeline-name clickbench_pipeline \ + --arroyosketch-dir ~/ASAPQuery/asap-summary-ingest \ + --output-dir ./arroyo_outputs +``` + +### Step 6 — Start QueryEngineRust + +```bash +cd ~/ASAPQuery/asap-query-engine +nohup ./target/release/query_engine_rust \ + --kafka-topic sketch_topic --input-format json \ + --config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/clickbench_inference.yaml \ + --streaming-config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/clickbench_streaming.yaml \ + --http-port 8088 --delete-existing-db --log-level DEBUG \ + --output-dir ./output --streaming-engine arroyo \ + --query-language SQL 
--lock-strategy per-key \ + --prometheus-scrape-interval 1 > /tmp/query_engine.log 2>&1 & +``` + +### Step 7 — Load data into ClickHouse (baseline) + +```bash +cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark +python export_to_database.py \ + --dataset clickbench \ + --file-path ./data/hits.json.gz \ + --clickhouse-url "http://localhost:8123/" \ + --init-sql-file ./configs/clickbench_hits_init.sql +``` + +Verify: `$INSTALL_DIR/clickhouse client --query "SELECT count(*) FROM hits"` + +### Step 8 — Generate SQL query files + +```bash +python generate_queries.py \ + --table-name hits \ + --ts-column EventTime \ + --value-column ResolutionWidth \ + --group-by-columns RegionID,OS,UserAgent,TraficSourceID \ + --window-size 10 \ + --num-queries 50 \ + --ts-format datetime \ + --window-form dateadd \ + --auto-detect-timestamps \ + --data-file ./data/hits_arroyo.json \ + --data-file-format json \ + --output-prefix ./queries/clickbench +``` + +This writes `queries/clickbench_asap.sql` and `queries/clickbench_clickhouse.sql`. + +### Step 9 — Run benchmark + +```bash +python run_benchmark.py \ + --mode both \ + --asap-sql-file ./queries/clickbench_asap.sql \ + --baseline-sql-file ./queries/clickbench_clickhouse.sql \ + --output-dir ./results \ + --output-prefix clickbench +``` + +Results: `results/clickbench_asap.csv`, `results/clickbench_baseline.csv`, +`results/clickbench_comparison.png`. 
+ +--- + +## H2O GroupBy End-to-End Example + +### Step 1 — Download dataset + +```bash +python download_dataset.py --dataset h2o --output-dir ./data +``` + +### Step 2 — Prepare data for Arroyo file source + +```bash +python prepare_data.py \ + --dataset h2o \ + --input ./data/G1_1e7_1e2_0_0.csv \ + --output ./data/h2o_arroyo.json \ + --max-rows 1000000 +``` + +### Steps 3–4 — Start infrastructure and Arroyo (same as ClickBench) + +### Step 5 — Launch Arroyo sketch pipeline + +```bash +python export_to_arroyo.py \ + --streaming-config ./configs/h2o_streaming.yaml \ + --source-type file \ + --input-file ./data/h2o_arroyo.json \ + --file-format json \ + --ts-format rfc3339 \ + --pipeline-name h2o_pipeline \ + --arroyosketch-dir ~/ASAPQuery/asap-summary-ingest \ + --output-dir ./arroyo_outputs +``` + +### Step 6 — Start QueryEngineRust + +```bash +cd ~/ASAPQuery/asap-query-engine +nohup ./target/release/query_engine_rust \ + --kafka-topic sketch_topic --input-format json \ + --config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/h2o_inference.yaml \ + --streaming-config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/h2o_streaming.yaml \ + --http-port 8088 --delete-existing-db --log-level DEBUG \ + --output-dir ./output --streaming-engine arroyo \ + --query-language SQL --lock-strategy per-key \ + --prometheus-scrape-interval 1 > /tmp/query_engine.log 2>&1 & +``` + +### Step 7 — Load data into ClickHouse (baseline) + +```bash +python export_to_database.py \ + --dataset h2o \ + --file-path ./data/G1_1e7_1e2_0_0.csv \ + --init-sql-file ./configs/h2o_init.sql \ + --max-rows 1000000 +``` + +### Step 8 — Generate SQL query files + +```bash +python generate_queries.py \ + --table-name h2o_groupby \ + --ts-column timestamp \ + --value-column v1 \ + --group-by-columns id1,id2 \ + --window-size 10 \ + --num-queries 50 \ + --ts-format iso \ + --auto-detect-timestamps \ + --data-file ./data/h2o_arroyo.json \ + --data-file-format json \ + 
--output-prefix ./queries/h2o +``` + +### Step 9 — Run benchmark + +```bash +python run_benchmark.py \ + --mode both \ + --asap-sql-file ./queries/h2o_asap.sql \ + --baseline-sql-file ./queries/h2o_clickhouse.sql \ + --output-dir ./results \ + --output-prefix h2o +``` + +--- + +## Custom Dataset + +```bash +# 1. Download (any HTTP URL) +python download_dataset.py --dataset custom \ + --custom-url https://example.com/mydata.json.gz \ + --output-dir ./data + +# 2. Prepare (edit prepare_data.py for your schema, or skip if already RFC3339) + +# 3. Export to Arroyo +python export_to_arroyo.py \ + --streaming-config ./configs/my_streaming.yaml \ + --source-type file \ + --input-file ./data/mydata.json \ + --file-format json \ + --ts-format rfc3339 \ + --pipeline-name my_pipeline \ + --arroyosketch-dir ~/ASAPQuery/asap-summary-ingest + +# 4. Export to ClickHouse +python export_to_database.py \ + --dataset custom \ + --file-path ./data/mydata.json \ + --init-sql-file ./configs/my_init.sql \ + --table-name my_table + +# 5. Generate queries +python generate_queries.py \ + --table-name my_table \ + --ts-column event_time \ + --value-column metric_value \ + --group-by-columns region,host \ + --window-size 10 \ + --num-queries 50 \ + --auto-detect-timestamps \ + --data-file ./data/mydata.json \ + --output-prefix ./queries/my_dataset + +# 6. 
Run benchmark +python run_benchmark.py \ + --mode both \ + --asap-sql-file ./queries/my_dataset_asap.sql \ + --baseline-sql-file ./queries/my_dataset_clickhouse.sql \ + --output-dir ./results +``` + +--- + +## Reset + +```bash +pkill -f "arroyo"; pkill -f "query_engine_rust" +sleep 2 +pkill -f "kafka-server-start.sh"; pkill -f "clickhouse server" +sleep 2 +rm -rf /tmp/arroyo/ + +KAFKA=$INSTALL_DIR/kafka/bin +$KAFKA/kafka-topics.sh --bootstrap-server localhost:9092 --delete --topic sketch_topic + +cd ~/ASAPQuery/asap-summary-ingest +python3 delete_pipeline.py --all_pipelines + +$INSTALL_DIR/clickhouse client --query "TRUNCATE TABLE hits" +# or for H2O: $INSTALL_DIR/clickhouse client --query "TRUNCATE TABLE h2o_groupby" +``` + +--- + +## Files + +| File | Purpose | +|------|---------| +| `download_dataset.py` | Download ClickBench, H2O, or custom datasets | +| `prepare_data.py` | Convert raw data to Arroyo file source format (RFC3339, string columns) | +| `export_to_arroyo.py` | Launch Arroyo sketch pipeline (file or kafka source) | +| `export_to_database.py` | Load data into ClickHouse for baseline | +| `generate_queries.py` | Generate paired ASAP + ClickHouse SQL query files | +| `run_benchmark.py` | Run queries and produce CSV results + plots | +| `configs/` | Dataset-specific streaming/inference YAML and ClickHouse init SQL | diff --git a/asap-tools/execution-utilities/benchmark/configs/clickbench_hits_init.sql b/asap-tools/execution-utilities/benchmark/configs/clickbench_hits_init.sql new file mode 100644 index 00000000..b462faec --- /dev/null +++ b/asap-tools/execution-utilities/benchmark/configs/clickbench_hits_init.sql @@ -0,0 +1,115 @@ +-- ClickHouse init for ClickBench baseline (MergeTree only, no Kafka engine) +-- Use this with export_to_database.py --dataset clickbench --init-sql-file + +CREATE TABLE IF NOT EXISTS hits +( + WatchID Int64, + JavaEnable UInt8, + Title String, + GoodEvent Int16, + EventTime DateTime, + EventDate Date, + CounterID UInt32, + 
ClientIP Int32, + RegionID UInt32, + UserID Int64, + CounterClass Int8, + OS UInt8, + UserAgent UInt8, + URL String, + Referer String, + IsRefresh UInt8, + RefererCategoryID UInt16, + RefererRegionID UInt32, + URLCategoryID UInt16, + URLRegionID UInt32, + ResolutionWidth UInt16, + ResolutionHeight UInt16, + ResolutionDepth UInt8, + FlashMajor UInt8, + FlashMinor UInt8, + FlashMinor2 String, + NetMajor UInt8, + NetMinor UInt8, + UserAgentMajor UInt16, + UserAgentMinor String, + CookieEnable UInt8, + JavascriptEnable UInt8, + IsMobile UInt8, + MobilePhone UInt8, + MobilePhoneModel String, + Params String, + IPNetworkID UInt32, + TraficSourceID Int8, + SearchEngineID UInt16, + SearchPhrase String, + AdvEngineID UInt8, + IsArtifical UInt8, + WindowClientWidth UInt16, + WindowClientHeight UInt16, + ClientTimeZone Int16, + ClientEventTime DateTime, + SilverlightVersion1 UInt8, + SilverlightVersion2 UInt8, + SilverlightVersion3 UInt32, + SilverlightVersion4 UInt16, + PageCharset String, + CodeVersion UInt32, + IsLink UInt8, + IsDownload UInt8, + IsNotBounce UInt8, + FUniqID Int64, + OriginalURL String, + HID UInt32, + IsOldCounter UInt8, + IsEvent UInt8, + IsParameter UInt8, + DontCountHits UInt8, + WithHash UInt8, + HitColor String, + LocalEventTime DateTime, + Age UInt8, + Sex UInt8, + Income UInt8, + Interests UInt16, + Robotness UInt8, + RemoteIP Int32, + WindowName Int32, + OpenerName Int32, + HistoryLength Int16, + BrowserLanguage String, + BrowserCountry String, + SocialNetwork String, + SocialAction String, + HTTPError UInt16, + SendTiming UInt32, + DNSTiming UInt32, + ConnectTiming UInt32, + ResponseStartTiming UInt32, + ResponseEndTiming UInt32, + FetchTiming UInt32, + SocialSourceNetworkID UInt8, + SocialSourcePage String, + ParamPrice Int64, + ParamOrderID String, + ParamCurrency String, + ParamCurrencyID UInt16, + OpenstatServiceName String, + OpenstatCampaignID String, + OpenstatAdID String, + OpenstatSourceID String, + UTMSource String, + UTMMedium String, 
+ UTMCampaign String, + UTMContent String, + UTMTerm String, + FromTag String, + HasGCLID UInt8, + RefererHash Int64, + URLHash Int64, + CLID UInt32 +) +ENGINE = MergeTree +PARTITION BY toYYYYMM(EventDate) +ORDER BY (CounterID, EventDate, intHash32(UserID), EventTime, WatchID) +SETTINGS index_granularity = 8192; diff --git a/asap-tools/execution-utilities/benchmark/configs/clickbench_inference.yaml b/asap-tools/execution-utilities/benchmark/configs/clickbench_inference.yaml new file mode 100644 index 00000000..7c4af097 --- /dev/null +++ b/asap-tools/execution-utilities/benchmark/configs/clickbench_inference.yaml @@ -0,0 +1,21 @@ +# ASAP Inference Config for ClickBench Hits Dataset +# Source: asap_query_latency/inference_config.yaml + +tables: + - name: hits + time_column: EventTime + metadata_columns: [RegionID, OS, UserAgent, TraficSourceID] + value_columns: [ResolutionWidth] + +cleanup_policy: + name: read_based + +queries: + # Temporal queries (10s window, all labels) - QUANTILE + - aggregations: + - aggregation_id: 12 + read_count_threshold: 999999 + query: | + SELECT QUANTILE(0.95, ResolutionWidth) FROM hits + WHERE EventTime BETWEEN DATEADD(s, -10, NOW()) AND NOW() + GROUP BY RegionID, OS, UserAgent, TraficSourceID diff --git a/asap-tools/execution-utilities/benchmark/configs/clickbench_streaming.yaml b/asap-tools/execution-utilities/benchmark/configs/clickbench_streaming.yaml new file mode 100644 index 00000000..3d18e1ed --- /dev/null +++ b/asap-tools/execution-utilities/benchmark/configs/clickbench_streaming.yaml @@ -0,0 +1,26 @@ +# ASAP Streaming Config for ClickBench Hits Dataset +# Defines sketch aggregations for Arroyo to compute +# Source: asap_query_latency/streaming_config.yaml + +tables: + - name: hits + time_column: EventTime + metadata_columns: [RegionID, OS, UserAgent, TraficSourceID] + value_columns: [ResolutionWidth] + +aggregations: + # Temporal queries (10s window, all labels) - QUANTILE (DatasketchesKLL) + - aggregationId: 12 + 
aggregationType: DatasketchesKLL + aggregationSubType: '' + labels: + grouping: [RegionID, OS, UserAgent, TraficSourceID] + rollup: [] + aggregated: [] + table_name: hits + value_column: ResolutionWidth + parameters: + K: 200 + windowSize: 10 + windowType: tumbling + spatialFilter: '' diff --git a/asap-tools/execution-utilities/benchmark/configs/h2o_inference.yaml b/asap-tools/execution-utilities/benchmark/configs/h2o_inference.yaml new file mode 100644 index 00000000..0d1e45b0 --- /dev/null +++ b/asap-tools/execution-utilities/benchmark/configs/h2o_inference.yaml @@ -0,0 +1,20 @@ +# ASAP Inference Config for H2O GroupBy Dataset +# Source: asap_benchmark_pipeline/inference_config.yaml + +tables: + - name: h2o_groupby + time_column: timestamp + metadata_columns: [id1, id2] + value_columns: [v1] + +cleanup_policy: + name: read_based + +queries: + - aggregations: + - aggregation_id: 12 + read_count_threshold: 999999 + query: |- + SELECT QUANTILE(0.95, v1) FROM h2o_groupby + WHERE timestamp BETWEEN DATEADD(s, -10, NOW()) AND NOW() + GROUP BY id1, id2; diff --git a/asap-tools/execution-utilities/benchmark/configs/h2o_init.sql b/asap-tools/execution-utilities/benchmark/configs/h2o_init.sql new file mode 100644 index 00000000..dbaf81c0 --- /dev/null +++ b/asap-tools/execution-utilities/benchmark/configs/h2o_init.sql @@ -0,0 +1,20 @@ +-- ClickHouse init for H2O GroupBy baseline (MergeTree, direct load) +-- Use this with export_to_database.py --dataset h2o --init-sql-file +-- Source: asap_benchmark_pipeline/h2o_init.sql + +DROP TABLE IF EXISTS h2o_groupby; + +CREATE TABLE IF NOT EXISTS h2o_groupby +( + timestamp DateTime, + id1 String, + id2 String, + id3 String, + id4 Int32, + id5 Int32, + id6 Int32, + v1 Int32, + v2 Int32, + v3 Float64 +) ENGINE = MergeTree() +ORDER BY (id1, id2); diff --git a/asap-tools/execution-utilities/benchmark/configs/h2o_streaming.yaml b/asap-tools/execution-utilities/benchmark/configs/h2o_streaming.yaml new file mode 100644 index 
00000000..c500d696 --- /dev/null +++ b/asap-tools/execution-utilities/benchmark/configs/h2o_streaming.yaml @@ -0,0 +1,26 @@ +# ASAP Streaming Config for H2O GroupBy Dataset +# Source: asap_benchmark_pipeline/streaming_config.yaml + +tables: + - name: h2o_groupby + time_column: timestamp + metadata_columns: [id1, id2] + value_columns: [v1] + +aggregations: + # Temporal queries (10s window, all labels) - QUANTILE (DatasketchesKLL) + - aggregationId: 12 + aggregationType: DatasketchesKLL + aggregationSubType: '' + labels: + grouping: [id1, id2] + rollup: [] + aggregated: [] + table_name: h2o_groupby + value_column: v1 + parameters: + K: 200 + tumblingWindowSize: 10 + windowSize: 10 + windowType: tumbling + spatialFilter: '' diff --git a/asap-tools/execution-utilities/benchmark/download_dataset.py b/asap-tools/execution-utilities/benchmark/download_dataset.py new file mode 100644 index 00000000..5226ae53 --- /dev/null +++ b/asap-tools/execution-utilities/benchmark/download_dataset.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +""" +Unified dataset downloader for the ASAP benchmark pipeline. + +Supports ClickBench (hits.json.gz), H2O groupby (G1_1e7_1e2_0_0.csv), +or any custom HTTP URL. + +Usage: + python download_dataset.py --dataset clickbench --output-dir ./data + python download_dataset.py --dataset h2o --output-dir ./data + python download_dataset.py --dataset custom --custom-url https://... 
--output-dir ./data +""" + +import argparse +import os +import sys +import urllib.request + + +CLICKBENCH_URL = "https://datasets.clickhouse.com/hits_compatible/hits.json.gz" +CLICKBENCH_FILENAME = "hits.json.gz" + +H2O_FILE_ID = "15SVQjQ2QehzYDLoDonio4aP7xqdMiNyi" +H2O_FILENAME = "G1_1e7_1e2_0_0.csv" + + +def _http_download(url: str, output_path: str) -> str: + """Download a file via HTTP with progress reporting.""" + print(f"Downloading from {url}") + request = urllib.request.Request( + url, headers={"User-Agent": "Mozilla/5.0 (compatible; ASAP-Benchmark/1.0)"} + ) + try: + with urllib.request.urlopen(request) as response: + total_size = int(response.headers.get("Content-Length", 0)) + downloaded = 0 + last_percent = -1 + block_size = 8192 * 128 # ~1 MB blocks + + with open(output_path, "wb") as f: + while True: + block = response.read(block_size) + if not block: + break + f.write(block) + downloaded += len(block) + if total_size > 0: + percent = downloaded * 100 // total_size + if percent != last_percent: + last_percent = percent + mb = downloaded / (1024 * 1024) + total_mb = total_size / (1024 * 1024) + sys.stdout.write( + f"\rProgress: {percent}% ({mb:.1f}/{total_mb:.1f} MB)" + ) + sys.stdout.flush() + + print("\nDownload complete!") + return output_path + + except urllib.error.HTTPError as e: + print(f"\nDownload failed: HTTP {e.code} - {e.reason}") + raise + + +def download_clickbench(output_path: str, force: bool = False) -> str: + """Download hits.json.gz from ClickHouse datasets CDN.""" + if not force and os.path.exists(output_path): + print(f"Using existing file: {output_path}") + return output_path + print("Downloading ClickBench dataset (~14 GB compressed). 
Please wait...") + return _http_download(CLICKBENCH_URL, output_path) + + +def download_h2o(output_path: str, force: bool = False) -> str: + """Download H2O groupby CSV (~300 MB) from Google Drive via gdown.""" + if not force and os.path.exists(output_path) and os.path.getsize(output_path) > 100 * 1024 * 1024: + print(f"Using existing file: {output_path}") + return output_path + + try: + import gdown + except ImportError: + print("Installing gdown...") + import subprocess + subprocess.check_call([sys.executable, "-m", "pip", "install", "gdown"]) + import gdown + + print(f"Downloading H2O dataset via gdown (ID: {H2O_FILE_ID})...") + url = f"https://drive.google.com/uc?id={H2O_FILE_ID}" + gdown.download(url, output_path, quiet=False) + return output_path + + +def download_custom(url: str, output_path: str, force: bool = False) -> str: + """Download a dataset from an arbitrary HTTP URL.""" + if not force and os.path.exists(output_path): + print(f"Using existing file: {output_path}") + return output_path + return _http_download(url, output_path) + + +def main(): + parser = argparse.ArgumentParser( + description="Download benchmark datasets", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "--dataset", + choices=["clickbench", "h2o", "custom"], + required=True, + help="Dataset to download", + ) + parser.add_argument( + "--output-dir", + required=True, + help="Directory to save the downloaded file", + ) + parser.add_argument( + "--output-file", + default=None, + help="Exact output file path (overrides --output-dir)", + ) + parser.add_argument( + "--custom-url", + default=None, + help="URL to download (required when --dataset custom)", + ) + parser.add_argument( + "--force-redownload", + action="store_true", + help="Re-download even if the file already exists", + ) + args = parser.parse_args() + + if args.dataset == "custom" and not args.custom_url: + parser.error("--custom-url is required when --dataset custom") + + 
os.makedirs(args.output_dir, exist_ok=True) + + if args.output_file: + output_path = args.output_file + os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True) + elif args.dataset == "clickbench": + output_path = os.path.join(args.output_dir, CLICKBENCH_FILENAME) + elif args.dataset == "h2o": + output_path = os.path.join(args.output_dir, H2O_FILENAME) + else: + filename = args.custom_url.rstrip("/").split("/")[-1] or "data" + output_path = os.path.join(args.output_dir, filename) + + if args.dataset == "clickbench": + download_clickbench(output_path, force=args.force_redownload) + elif args.dataset == "h2o": + download_h2o(output_path, force=args.force_redownload) + else: + download_custom(args.custom_url, output_path, force=args.force_redownload) + + print(f"Dataset saved to: {output_path}") + + +if __name__ == "__main__": + main() diff --git a/asap-tools/execution-utilities/benchmark/export_to_arroyo.py b/asap-tools/execution-utilities/benchmark/export_to_arroyo.py new file mode 100644 index 00000000..6e72af72 --- /dev/null +++ b/asap-tools/execution-utilities/benchmark/export_to_arroyo.py @@ -0,0 +1,254 @@ +#!/usr/bin/env python3 +""" +Launch an Arroyo sketch pipeline against a dataset. + +Supports two source modes: + file (default): Arroyo reads directly from a local JSON/Parquet file. + No Kafka input topic is required. + kafka: Arroyo reads from a Kafka topic (legacy path). + +In both cases the sketch output is written to a Kafka topic (default: +sketch_topic) for consumption by QueryEngineRust. 
+ +Usage: + # File source (recommended) + python export_to_arroyo.py \\ + --streaming-config configs/clickbench_streaming.yaml \\ + --source-type file \\ + --input-file ./data/hits.json.gz \\ + --file-format json \\ + --ts-format rfc3339 \\ + --pipeline-name clickbench_pipeline \\ + --arroyosketch-dir ~/ASAPQuery/asap-summary-ingest + + # Kafka source (legacy) + python export_to_arroyo.py \\ + --streaming-config configs/h2o_streaming.yaml \\ + --source-type kafka \\ + --input-kafka-topic h2o_groupby \\ + --pipeline-name h2o_pipeline \\ + --arroyosketch-dir ~/ASAPQuery/asap-summary-ingest +""" + +import argparse +import os +import subprocess +import sys +import time + +import requests + +DEFAULT_ARROYO_URL = "http://localhost:5115/api/v1" +DEFAULT_OUTPUT_KAFKA_TOPIC = "sketch_topic" +DEFAULT_PARALLELISM = 1 +DEFAULT_WAIT_TIMEOUT = 300 + + +def wait_for_pipeline_running( + pipeline_name: str, + arroyo_url: str = DEFAULT_ARROYO_URL, + timeout: int = DEFAULT_WAIT_TIMEOUT, +) -> bool: + """Poll the Arroyo API until the named pipeline reaches RUNNING state. + + Translated from asap_benchmark_pipeline/run_pipeline.sh lines 107-141. + A pipeline is considered running when its 'state' field is None and + 'stop' is 'none' (Arroyo's representation of a healthy running pipeline). 
+ """ + print(f"Waiting for pipeline '{pipeline_name}' to reach RUNNING state...") + elapsed = 0 + while True: + state = "error" + try: + r = requests.get(f"{arroyo_url}/pipelines", timeout=5) + if r.ok: + data = r.json() + for p in data.get("data", []): + if p.get("name") == pipeline_name: + s = p.get("state") + stop = p.get("stop", "") + if s is None and stop == "none": + state = "running" + else: + state = str(s).lower() if s else "unknown" + break + else: + state = "not_found" + except Exception: + state = "error" + + if state == "running": + print(f"Pipeline '{pipeline_name}' is RUNNING") + return True + + print(f" Pipeline state: {state} (elapsed: {elapsed}s)") + time.sleep(5) + elapsed += 5 + if elapsed >= timeout: + print( + f"ERROR: Pipeline did not reach RUNNING state within {timeout}s" + ) + return False + + +def build_arroyosketch_cmd(args, arroyosketch_script: str) -> list: + """Build the run_arroyosketch.py command from our CLI arguments.""" + cmd = [ + sys.executable, + arroyosketch_script, + "--source_type", args.source_type, + "--output_format", "json", + "--pipeline_name", args.pipeline_name, + "--config_file_path", os.path.abspath(args.streaming_config), + "--output_kafka_topic", args.output_kafka_topic, + "--output_dir", os.path.abspath(args.output_dir), + "--parallelism", str(args.parallelism), + "--query_language", "sql", + ] + + if args.source_type == "file": + cmd += [ + "--input_file_path", os.path.abspath(args.input_file), + "--file_format", args.file_format, + "--ts_format", args.ts_format, + ] + elif args.source_type == "kafka": + cmd += [ + "--kafka_input_format", "json", + "--input_kafka_topic", args.input_kafka_topic, + ] + + return cmd + + +def main(): + parser = argparse.ArgumentParser( + description="Launch Arroyo sketch pipeline (file or kafka source)", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "--streaming-config", + required=True, + help="Path to 
streaming_config.yaml", + ) + parser.add_argument( + "--source-type", + choices=["file", "kafka"], + default="file", + help="Data source type (default: file)", + ) + # File source args + parser.add_argument( + "--input-file", + default=None, + help="Path to input data file (required for --source-type file)", + ) + parser.add_argument( + "--file-format", + choices=["json", "parquet"], + default="json", + help="File format (default: json)", + ) + parser.add_argument( + "--ts-format", + choices=["unix_millis", "unix_seconds", "rfc3339"], + default="rfc3339", + help="Timestamp format in the data file (default: rfc3339)", + ) + # Kafka source args + parser.add_argument( + "--input-kafka-topic", + default=None, + help="Kafka topic to read from (required for --source-type kafka)", + ) + # Common args + parser.add_argument( + "--output-kafka-topic", + default=DEFAULT_OUTPUT_KAFKA_TOPIC, + help=f"Kafka topic for sketch output (default: {DEFAULT_OUTPUT_KAFKA_TOPIC})", + ) + parser.add_argument( + "--pipeline-name", + required=True, + help="Arroyo pipeline name", + ) + parser.add_argument( + "--parallelism", + type=int, + default=DEFAULT_PARALLELISM, + help=f"Arroyo pipeline parallelism (default: {DEFAULT_PARALLELISM})", + ) + parser.add_argument( + "--arroyosketch-dir", + required=True, + help="Path to asap-summary-ingest/ directory (contains run_arroyosketch.py)", + ) + parser.add_argument( + "--arroyo-url", + default=DEFAULT_ARROYO_URL, + help=f"Arroyo API base URL (default: {DEFAULT_ARROYO_URL})", + ) + parser.add_argument( + "--output-dir", + default="./arroyo_outputs", + help="Directory for Arroyo pipeline output artifacts (default: ./arroyo_outputs)", + ) + parser.add_argument( + "--wait-for-pipeline", + action="store_true", + default=True, + help="Poll until pipeline reaches RUNNING state (default: True)", + ) + parser.add_argument( + "--no-wait", + action="store_true", + help="Do not wait for pipeline to reach RUNNING state", + ) + parser.add_argument( + 
"--wait-timeout", + type=int, + default=DEFAULT_WAIT_TIMEOUT, + help=f"Seconds to wait for RUNNING state (default: {DEFAULT_WAIT_TIMEOUT})", + ) + + args = parser.parse_args() + + # Validate source-specific required args + if args.source_type == "file" and not args.input_file: + parser.error("--input-file is required when --source-type file") + if args.source_type == "kafka" and not args.input_kafka_topic: + parser.error("--input-kafka-topic is required when --source-type kafka") + + arroyosketch_script = os.path.join( + os.path.abspath(args.arroyosketch_dir), "run_arroyosketch.py" + ) + if not os.path.exists(arroyosketch_script): + print(f"ERROR: run_arroyosketch.py not found at {arroyosketch_script}") + sys.exit(1) + + os.makedirs(args.output_dir, exist_ok=True) + + cmd = build_arroyosketch_cmd(args, arroyosketch_script) + print(f"Launching Arroyo pipeline '{args.pipeline_name}' ({args.source_type} source)...") + print(f"Command: {' '.join(cmd)}") + + result = subprocess.run(cmd) + if result.returncode != 0: + print(f"ERROR: run_arroyosketch.py exited with code {result.returncode}") + sys.exit(result.returncode) + + if not args.no_wait: + success = wait_for_pipeline_running( + args.pipeline_name, + arroyo_url=args.arroyo_url, + timeout=args.wait_timeout, + ) + if not success: + sys.exit(1) + + print("Done.") + + +if __name__ == "__main__": + main() diff --git a/asap-tools/execution-utilities/benchmark/export_to_database.py b/asap-tools/execution-utilities/benchmark/export_to_database.py new file mode 100644 index 00000000..d9583641 --- /dev/null +++ b/asap-tools/execution-utilities/benchmark/export_to_database.py @@ -0,0 +1,353 @@ +#!/usr/bin/env python3 +""" +Load a dataset into ClickHouse for baseline comparison. + +Supports ClickBench (hits.json.gz), H2O groupby CSV, or a custom table. 
import argparse
import gzip
import os
import shlex
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path

import requests

DEFAULT_CLICKHOUSE_URL = "http://localhost:8123/"
H2O_BATCH_SIZE = 50_000
H2O_ROWS_PER_SECOND = 1000
H2O_BASE_EPOCH = 1704067200  # 2024-01-01T00:00:00Z


def _exec_clickhouse_sql(clickhouse_url: str, sql: str, label: str = ""):
    """Execute one SQL statement via the ClickHouse HTTP API.

    Failures are reported as warnings (not raised) so that a partially
    applied init script — e.g. "table already exists" — does not abort
    the data load.
    """
    r = requests.post(clickhouse_url, data=sql.encode())
    if not r.ok:
        print(f"  WARN [{label}]: {r.text.strip()[:200]}")
    else:
        short = sql.strip()[:80].replace("\n", " ")
        print(f"  OK: {short}")


def run_init_sql(clickhouse_url: str, init_sql_file: str):
    """Execute DDL statements from a SQL file.

    The file is split on ';' because the ClickHouse HTTP endpoint accepts
    only one statement per request; empty fragments are skipped.
    """
    print(f"Running init SQL from {init_sql_file}...")
    with open(init_sql_file) as f:
        content = f.read()
    stmts = [s.strip() for s in content.split(";") if s.strip()]
    for stmt in stmts:
        _exec_clickhouse_sql(clickhouse_url, stmt, label=stmt[:40])


def check_row_count(clickhouse_url: str, table_name: str) -> int:
    """Return the row count of *table_name*, or 0 if the query fails
    (e.g. the table does not exist yet)."""
    r = requests.post(clickhouse_url, data=f"SELECT count(*) FROM {table_name}")
    if r.ok:
        return int(r.text.strip())
    return 0


def load_clickbench(
    clickhouse_url: str,
    file_path: str,
    init_sql_file: str = None,
    skip_table_init: bool = False,
    skip_if_loaded: bool = False,
    max_rows: int = 0,
):
    """Load hits.json.gz into ClickHouse.

    Uses `zcat | clickhouse-client INSERT` for gzip-compressed JSON.
    Adapted from asap_query_latency/run_benchmark.py:load_clickbench_data().

    Returns True on success, False on a missing file or a failed insert.
    """
    if not skip_table_init and init_sql_file:
        run_init_sql(clickhouse_url, init_sql_file)

    if skip_if_loaded:
        count = check_row_count(clickhouse_url, "hits")
        if count > 0:
            print(f"Data already loaded ({count:,} rows). Skipping.")
            return True

    if not os.path.exists(file_path):
        print(f"ERROR: Data file not found: {file_path}")
        return False

    print(f"Loading ClickBench data from {file_path}...")
    # The insert has to go through a shell pipeline, so quote the path to
    # survive spaces and shell metacharacters.
    src = shlex.quote(file_path)
    insert = "clickhouse-client --query='INSERT INTO hits FORMAT JSONEachRow'"
    if max_rows > 0:
        # Pipe through head to limit rows.
        cmd = f"zcat {src} | head -n {max_rows} | {insert}"
    else:
        cmd = f"zcat {src} | {insert}"

    result = subprocess.run(cmd, shell=True)
    if result.returncode != 0:
        print("ERROR: ClickHouse insert failed")
        return False

    count = check_row_count(clickhouse_url, "hits")
    print(f"Loaded {count:,} rows into ClickHouse (hits)")
    return True
def _flush_h2o_batch(clickhouse_url: str, rows: list):
    """Flush a batch of pre-rendered H2O VALUES tuples via HTTP INSERT.

    Raises RuntimeError if ClickHouse rejects the insert.
    """
    sql = "INSERT INTO h2o_groupby VALUES " + ",".join(rows)
    r = requests.post(clickhouse_url, data=sql.encode())
    if not r.ok:
        raise RuntimeError(f"ClickHouse insert failed: {r.text[:200]}")


def load_h2o(
    clickhouse_url: str,
    file_path: str,
    init_sql_file: str = None,
    skip_table_init: bool = False,
    skip_if_loaded: bool = False,
    max_rows: int = 0,
):
    """Load H2O groupby CSV into ClickHouse with synthetic timestamps.

    Timestamps are assigned at H2O_ROWS_PER_SECOND rows/sec starting from
    H2O_BASE_EPOCH (2024-01-01T00:00:00Z).
    Adapted from asap_benchmark_pipeline/run_benchmark.py:load_h2o_data_clickhouse().

    Returns True on success, False on a missing file.
    NOTE(review): CSV fields are interpolated into the VALUES list without
    escaping — fine for the synthetic H2O ids (id001, ...), but not for
    arbitrary CSV input.
    """
    if not skip_table_init and init_sql_file:
        run_init_sql(clickhouse_url, init_sql_file)

    if skip_if_loaded:
        count = check_row_count(clickhouse_url, "h2o_groupby")
        if count > 0:
            print(f"Data already loaded ({count:,} rows). Skipping.")
            return True

    if not os.path.exists(file_path):
        print(f"ERROR: Data file not found: {file_path}")
        return False

    print(f"Inserting H2O data from {file_path} into ClickHouse...")
    batch: list = []
    total = 0

    with open(file_path, "r", encoding="utf-8") as f:
        f.readline()  # skip header
        for i, line in enumerate(f):
            if max_rows > 0 and i >= max_rows:
                break
            parts = line.rstrip("\n").split(",")
            # Synthetic event time: H2O_ROWS_PER_SECOND rows share each second.
            abs_sec = H2O_BASE_EPOCH + i // H2O_ROWS_PER_SECOND
            ts = datetime.fromtimestamp(abs_sec, tz=timezone.utc)
            ts_str = ts.strftime("%Y-%m-%d %H:%M:%S")

            batch.append(
                f"('{ts_str}','{parts[0]}','{parts[1]}','{parts[2]}',"
                f"{parts[3]},{parts[4]},{parts[5]},"
                f"{parts[6]},{parts[7]},{parts[8]})"
            )

            if len(batch) >= H2O_BATCH_SIZE:
                _flush_h2o_batch(clickhouse_url, batch)
                total += len(batch)
                batch = []
                if total % 500_000 == 0:
                    print(f" Inserted {total:,} rows...")

    if batch:
        _flush_h2o_batch(clickhouse_url, batch)
        total += len(batch)

    print(f"Loaded {total:,} rows into ClickHouse (h2o_groupby)")
    return True


def load_custom(
    clickhouse_url: str,
    file_path: str,
    table_name: str,
    ts_column: str,
    ts_assignment: str = "passthrough",
    init_sql_file: str = None,
    skip_table_init: bool = False,
    skip_if_loaded: bool = False,
    max_rows: int = 0,
):
    """Load a custom JSON/JSONL (optionally gzipped) file into ClickHouse.

    Uses INSERT FORMAT JSONEachRow via clickhouse-client. CSV input is not
    handled here (use --dataset h2o).

    NOTE(review): ts_column and ts_assignment are accepted but not used by
    this loader — timestamps must already be present in the JSON rows.

    Returns True on success, False otherwise.

    Bug fixed: the previous uncompressed-JSON branch combined
    `head -n N file | clickhouse-client ... < file`; the stdin redirection
    overrode the pipe, so --max-rows was silently ignored and the whole
    file was loaded. Both branches now feed clickhouse-client from a single
    pipeline, with the path shell-quoted.
    """
    import shlex  # local: file-level import block is outside this section

    if not skip_table_init and init_sql_file:
        run_init_sql(clickhouse_url, init_sql_file)

    if skip_if_loaded:
        count = check_row_count(clickhouse_url, table_name)
        if count > 0:
            print(f"Data already loaded ({count:,} rows). Skipping.")
            return True

    if not os.path.exists(file_path):
        print(f"ERROR: Data file not found: {file_path}")
        return False

    path_lower = file_path.lower()
    src = shlex.quote(file_path)
    if path_lower.endswith((".json.gz", ".jsonl.gz")):
        reader = f"zcat {src}"
    elif path_lower.endswith((".json", ".jsonl")):
        reader = f"cat {src}"
    else:
        print(f"ERROR: Unsupported file format for {file_path}. Use --dataset h2o for CSV.")
        return False

    insert = f"clickhouse-client --query='INSERT INTO {table_name} FORMAT JSONEachRow'"
    if max_rows > 0:
        cmd = f"{reader} | head -n {max_rows} | {insert}"
    else:
        cmd = f"{reader} | {insert}"

    print(f"Loading {file_path} into ClickHouse ({table_name})...")
    result = subprocess.run(cmd, shell=True)
    if result.returncode != 0:
        print("ERROR: ClickHouse insert failed")
        return False

    count = check_row_count(clickhouse_url, table_name)
    print(f"Loaded {count:,} rows into ClickHouse ({table_name})")
    return True
def main():
    """CLI entry point: parse arguments and dispatch to the right loader."""
    parser = argparse.ArgumentParser(
        description="Load a dataset into ClickHouse for baseline comparison",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--dataset",
        choices=["clickbench", "h2o", "custom"],
        required=True,
        help="Dataset type",
    )
    parser.add_argument(
        "--file-path",
        required=True,
        help="Path to the source data file",
    )
    parser.add_argument(
        "--clickhouse-url",
        default=DEFAULT_CLICKHOUSE_URL,
        help=f"ClickHouse HTTP URL (default: {DEFAULT_CLICKHOUSE_URL})",
    )
    parser.add_argument(
        "--init-sql-file",
        default=None,
        help="DDL SQL file to run before loading (CREATE TABLE ...)",
    )
    parser.add_argument(
        "--table-name",
        default=None,
        help="Target table name (required for --dataset custom)",
    )
    parser.add_argument(
        "--ts-column",
        default=None,
        help="Timestamp column name (for --dataset custom)",
    )
    parser.add_argument(
        "--ts-assignment",
        choices=["synthetic", "passthrough"],
        default="passthrough",
        help="How to assign timestamps for custom CSV data (default: passthrough)",
    )
    parser.add_argument(
        "--skip-table-init",
        action="store_true",
        help="Skip CREATE TABLE (assume tables already exist)",
    )
    parser.add_argument(
        "--skip-if-loaded",
        action="store_true",
        help="Skip insert if the table already has rows",
    )
    parser.add_argument(
        "--max-rows",
        type=int,
        default=0,
        help="Maximum rows to load (0 = all)",
    )

    opts = parser.parse_args()

    if opts.dataset == "custom" and not opts.table_name:
        parser.error("--table-name is required when --dataset custom")

    # Keyword arguments shared by all three loaders.
    shared = dict(
        init_sql_file=opts.init_sql_file,
        skip_table_init=opts.skip_table_init,
        skip_if_loaded=opts.skip_if_loaded,
        max_rows=opts.max_rows,
    )

    if opts.dataset == "clickbench":
        ok = load_clickbench(opts.clickhouse_url, opts.file_path, **shared)
    elif opts.dataset == "h2o":
        ok = load_h2o(opts.clickhouse_url, opts.file_path, **shared)
    else:
        ok = load_custom(
            opts.clickhouse_url,
            opts.file_path,
            table_name=opts.table_name,
            ts_column=opts.ts_column,
            ts_assignment=opts.ts_assignment,
            **shared,
        )

    sys.exit(0 if ok else 1)


if __name__ == "__main__":
    main()
+ +Output: + {prefix}_asap.sql QUANTILE(q, col) syntax for QueryEngineRust + {prefix}_clickhouse.sql quantile(q)(col) syntax for ClickHouse baseline + +Usage: + # Auto-detect timestamps from data file + python generate_queries.py \\ + --table-name hits \\ + --ts-column EventTime \\ + --value-column ResolutionWidth \\ + --group-by-columns RegionID,OS,UserAgent,TraficSourceID \\ + --window-size 10 \\ + --num-queries 50 \\ + --auto-detect-timestamps \\ + --data-file ./data/hits.json.gz \\ + --data-file-format json.gz \\ + --output-prefix ./queries/clickbench + + # Explicit timestamp file (one ISO timestamp per line) + python generate_queries.py \\ + --table-name h2o_groupby \\ + --ts-column timestamp \\ + --value-column v1 \\ + --group-by-columns id1,id2 \\ + --window-size 10 \\ + --num-queries 50 \\ + --timestamps-file ./my_timestamps.txt \\ + --output-prefix ./queries/h2o +""" + +import argparse +import gzip +import json +import sys +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import List, Optional + + +SAMPLE_SIZE = 10_000 # rows to read for timestamp auto-detection + + +def _parse_timestamp(value: str) -> Optional[datetime]: + """Try to parse a timestamp string in common formats.""" + value = str(value).strip() + for fmt in ( + "%Y-%m-%dT%H:%M:%SZ", + "%Y-%m-%dT%H:%M:%S.%fZ", + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d", + ): + try: + return datetime.strptime(value, fmt).replace(tzinfo=timezone.utc) + except ValueError: + pass + # Try unix seconds/millis (numeric string) + try: + v = float(value) + if v > 1e12: # millis + return datetime.fromtimestamp(v / 1000, tz=timezone.utc) + return datetime.fromtimestamp(v, tz=timezone.utc) + except ValueError: + pass + return None + + +def _read_timestamps_from_json( + file_path: str, ts_column: str, compressed: bool +) -> List[datetime]: + """Read up to SAMPLE_SIZE timestamps from a JSON-lines file.""" + timestamps = [] + opener = gzip.open if compressed else open 
+ mode = "rt" if compressed else "r" + with opener(file_path, mode) as f: + for i, line in enumerate(f): + if i >= SAMPLE_SIZE: + break + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + val = obj.get(ts_column) + if val is not None: + ts = _parse_timestamp(val) + if ts: + timestamps.append(ts) + except (json.JSONDecodeError, KeyError): + continue + return timestamps + + +def _read_timestamps_from_csv( + file_path: str, ts_column: str +) -> List[datetime]: + """Read up to SAMPLE_SIZE timestamps from a CSV file.""" + import csv + timestamps = [] + with open(file_path, "r", newline="") as f: + reader = csv.DictReader(f) + if ts_column not in (reader.fieldnames or []): + print( + f"WARNING: Column '{ts_column}' not found in CSV. " + f"Available: {reader.fieldnames}" + ) + return [] + for i, row in enumerate(reader): + if i >= SAMPLE_SIZE: + break + ts = _parse_timestamp(row[ts_column]) + if ts: + timestamps.append(ts) + return timestamps + + +def detect_timestamps( + data_file: str, data_file_format: str, ts_column: str +) -> tuple: + """Return (min_ts, max_ts) from a sample of the data file.""" + fmt = data_file_format.lower() + if fmt in ("json.gz", "jsonl.gz"): + timestamps = _read_timestamps_from_json(data_file, ts_column, compressed=True) + elif fmt in ("json", "jsonl"): + timestamps = _read_timestamps_from_json(data_file, ts_column, compressed=False) + elif fmt == "csv": + timestamps = _read_timestamps_from_csv(data_file, ts_column) + else: + print(f"ERROR: Unsupported data file format: {data_file_format}") + sys.exit(1) + + if not timestamps: + print( + f"ERROR: No '{ts_column}' timestamps found in the first {SAMPLE_SIZE} " + f"rows of {data_file}" + ) + sys.exit(1) + + return min(timestamps), max(timestamps) + + +def _snap_to_window_boundary(ts: datetime, window_size: int) -> datetime: + """Round a timestamp up to the next window boundary (epoch-aligned). + + Arroyo tumbling windows are aligned to epoch multiples of window_size. 
+ Querying at a non-boundary timestamp will miss the sketch. + """ + epoch_sec = int(ts.timestamp()) + remainder = epoch_sec % window_size + if remainder == 0: + return ts + snapped = epoch_sec + (window_size - remainder) + return datetime.fromtimestamp(snapped, tz=timezone.utc) + + +def generate_window_ends( + min_ts: datetime, + max_ts: datetime, + window_size: int, + stride: int, + num_queries: int, +) -> List[datetime]: + """Generate evenly-spaced window-end timestamps within [min_ts, max_ts]. + + Timestamps are snapped to epoch-aligned window boundaries so that + Arroyo's tumbling window sketches can be found by QueryEngineRust. + """ + # First valid window-end: snap to next boundary after min_ts + window_size + earliest = min_ts + timedelta(seconds=window_size) + start = _snap_to_window_boundary(earliest, window_size) + if start >= max_ts: + print( + f"WARNING: window_size ({window_size}s) exceeds the data time range " + f"({(max_ts - min_ts).total_seconds():.0f}s). Using max_ts as only endpoint." 
def format_ts(ts: datetime, ts_format: str) -> str:
    """Render *ts* for embedding in generated SQL text."""
    fmt = "%Y-%m-%dT%H:%M:%SZ" if ts_format == "iso" else "%Y-%m-%d %H:%M:%S"
    return ts.strftime(fmt)


def generate_sql_files(
    table_name: str,
    ts_column: str,
    value_column: str,
    group_by_columns: List[str],
    quantile: float,
    window_size: int,
    window_ends: List[datetime],
    ts_format: str,
    window_form: str,
    output_prefix: str,
):
    """Write the paired ASAP and ClickHouse SQL files.

    Every query carries a `-- T{NNN}: ...` annotation so run_benchmark.py
    can pair the two files by query id. The ASAP file uses
    QUANTILE(q, col) syntax; the ClickHouse file uses quantile(q)(col).
    """
    group_by_clause = ", ".join(group_by_columns)
    asap_stmts = []
    ch_stmts = []

    for idx, window_end in enumerate(window_ends):
        end_str = format_ts(window_end, ts_format)
        start_str = format_ts(window_end - timedelta(seconds=window_size), ts_format)
        header = f"-- T{idx:03d}: quantile window ending at {end_str}"

        if window_form == "dateadd":
            where_clause = (
                f"{ts_column} BETWEEN DATEADD(s, -{window_size}, '{end_str}') AND '{end_str}'"
            )
        else:
            where_clause = f"{ts_column} BETWEEN '{start_str}' AND '{end_str}'"

        tail = f"FROM {table_name} WHERE {where_clause} GROUP BY {group_by_clause};"
        asap_stmts.append(f"{header}\nSELECT QUANTILE({quantile}, {value_column}) {tail}")
        ch_stmts.append(f"{header}\nSELECT quantile({quantile})({value_column}) {tail}")

    asap_file = f"{output_prefix}_asap.sql"
    ch_file = f"{output_prefix}_clickhouse.sql"

    Path(asap_file).parent.mkdir(parents=True, exist_ok=True)
    Path(asap_file).write_text("\n".join(asap_stmts) + "\n")
    Path(ch_file).write_text("\n".join(ch_stmts) + "\n")

    print(f"Generated {len(window_ends)} queries:")
    print(f" ASAP: {asap_file}")
    print(f" ClickHouse: {ch_file}")
def _window_ends_from_file(timestamps_file: str, num_queries: int) -> List[datetime]:
    """Load explicit window-end timestamps (one per line); exit if none parse."""
    parsed: List[datetime] = []
    with open(timestamps_file) as f:
        for raw in f:
            raw = raw.strip()
            if not raw:
                continue
            ts = _parse_timestamp(raw)
            if ts:
                parsed.append(ts)
            else:
                print(f"WARNING: Could not parse timestamp: {raw!r}")
    if not parsed:
        print("ERROR: No valid timestamps found in --timestamps-file")
        sys.exit(1)
    parsed = parsed[: num_queries]
    print(
        f"Using {len(parsed)} timestamps from {timestamps_file} "
        f"({parsed[0]} – {parsed[-1]})"
    )
    return parsed


def main():
    """CLI entry point: derive window endpoints, then emit both SQL files."""
    parser = argparse.ArgumentParser(
        description="Generate paired ASAP + ClickHouse SQL query files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    # Table/column config
    parser.add_argument("--table-name", required=True)
    parser.add_argument("--ts-column", required=True, help="Timestamp column name")
    parser.add_argument("--value-column", required=True, help="Column to compute quantile on")
    parser.add_argument(
        "--group-by-columns",
        required=True,
        help="Comma-separated GROUP BY columns",
    )
    # Query parameters
    parser.add_argument("--quantile", type=float, default=0.95)
    parser.add_argument("--window-size", type=int, default=10, help="Window size in seconds")
    parser.add_argument("--num-queries", type=int, default=50)
    parser.add_argument(
        "--ts-format",
        choices=["iso", "datetime"],
        default="iso",
        help="Timestamp format in SQL: iso='YYYY-MM-DDTHH:MM:SSZ', datetime='YYYY-MM-DD HH:MM:SS' (default: iso)",
    )
    parser.add_argument(
        "--window-form",
        choices=["explicit", "dateadd"],
        default="explicit",
        help="SQL window form: explicit='BETWEEN start AND end', dateadd='BETWEEN DATEADD(s,-N,end) AND end' (default: explicit)",
    )
    parser.add_argument(
        "--output-prefix",
        required=True,
        help="Output file prefix (e.g. ./queries/clickbench → clickbench_asap.sql + clickbench_clickhouse.sql)",
    )
    # Timestamp sources (mutually exclusive)
    ts_group = parser.add_mutually_exclusive_group(required=True)
    ts_group.add_argument(
        "--auto-detect-timestamps",
        action="store_true",
        help="Scan data file to determine time range",
    )
    ts_group.add_argument(
        "--timestamps-file",
        default=None,
        help="File with explicit window-end timestamps (one ISO timestamp per line)",
    )
    # Auto-detect options
    parser.add_argument(
        "--data-file",
        default=None,
        help="Path to data file (required with --auto-detect-timestamps)",
    )
    parser.add_argument(
        "--data-file-format",
        choices=["json", "jsonl", "json.gz", "jsonl.gz", "csv"],
        default="json",
        help="Data file format (default: json)",
    )
    parser.add_argument(
        "--stride-seconds",
        type=int,
        default=None,
        help="Spacing between window-end timestamps (default: window-size * 3)",
    )

    args = parser.parse_args()

    if args.auto_detect_timestamps and not args.data_file:
        parser.error("--data-file is required when --auto-detect-timestamps is set")

    group_by_columns = [c.strip() for c in args.group_by_columns.split(",")]
    stride = args.stride_seconds if args.stride_seconds else args.window_size * 3

    if args.timestamps_file:
        window_ends = _window_ends_from_file(args.timestamps_file, args.num_queries)
    else:
        print(f"Scanning {args.data_file} for timestamp range...")
        min_ts, max_ts = detect_timestamps(
            args.data_file, args.data_file_format, args.ts_column
        )
        print(f" Detected range: {min_ts} – {max_ts}")
        window_ends = generate_window_ends(
            min_ts, max_ts, args.window_size, stride, args.num_queries
        )
        print(
            f" Generated {len(window_ends)} window endpoints "
            f"(stride={stride}s, window={args.window_size}s)"
        )

    generate_sql_files(
        table_name=args.table_name,
        ts_column=args.ts_column,
        value_column=args.value_column,
        group_by_columns=group_by_columns,
        quantile=args.quantile,
        window_size=args.window_size,
        window_ends=window_ends,
        ts_format=args.ts_format,
        window_form=args.window_form,
        output_prefix=args.output_prefix,
    )


if __name__ == "__main__":
    main()
+ +Usage: + # ClickBench: convert hits.json.gz → hits_arroyo.json + python prepare_data.py --dataset clickbench \\ + --input ./data/hits.json.gz \\ + --output ./data/hits_arroyo.json \\ + [--max-rows 1000000] + + # H2O: convert G1_1e7_1e2_0_0.csv → h2o_arroyo.json (adds synthetic timestamps) + python prepare_data.py --dataset h2o \\ + --input ./data/G1_1e7_1e2_0_0.csv \\ + --output ./data/h2o_arroyo.json \\ + [--max-rows 1000000] +""" + +import argparse +import gzip +import json +import sys +from datetime import datetime, timedelta, timezone +from pathlib import Path + +# Synthetic timestamp base for H2O (2024-01-01T00:00:00Z) +H2O_BASE_EPOCH = 1704067200 +H2O_ROWS_PER_SECOND = 1000 + +# ClickBench columns needed by Arroyo (must match streaming_config.yaml) +CB_TIMESTAMP_FIELD = "EventTime" +CB_VALUE_FIELDS = ["ResolutionWidth"] +CB_METADATA_FIELDS = ["RegionID", "OS", "UserAgent", "TraficSourceID"] +CB_KEEP_FIELDS = [CB_TIMESTAMP_FIELD] + CB_VALUE_FIELDS + CB_METADATA_FIELDS + +# H2O columns +H2O_TIMESTAMP_FIELD = "timestamp" +H2O_METADATA_FIELDS = ["id1", "id2"] +H2O_VALUE_FIELDS = ["v1"] + + +def _parse_clickbench_ts(ts_str: str) -> str: + """Convert 'YYYY-MM-DD HH:MM:SS' → 'YYYY-MM-DDTHH:MM:SSZ' (RFC3339).""" + try: + dt = datetime.strptime(ts_str, "%Y-%m-%d %H:%M:%S") + return dt.strftime("%Y-%m-%dT%H:%M:%SZ") + except ValueError: + return ts_str # already RFC3339 or unknown format + + +def prepare_clickbench(input_path: str, output_path: str, max_rows: int = 0): + """Convert hits.json.gz to Arroyo-compatible JSON. 
+ + - Converts EventTime to RFC3339 + - Stringifies integer metadata columns (RegionID, OS, UserAgent, TraficSourceID) + - Sorts by EventTime (required for Arroyo event-time watermarks) + - Writes only the fields needed by the streaming config + """ + print(f"Reading {input_path}...") + records = [] + + opener = gzip.open if input_path.endswith(".gz") else open + with opener(input_path, "rt") as f: + for i, line in enumerate(f): + if max_rows > 0 and i >= max_rows: + break + if i % 100_000 == 0 and i > 0: + print(f" Read {i:,} rows...", end="\r") + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + except json.JSONDecodeError: + continue + + ts = _parse_clickbench_ts(str(obj.get(CB_TIMESTAMP_FIELD, ""))) + record = {CB_TIMESTAMP_FIELD: ts} + for col in CB_VALUE_FIELDS: + record[col] = float(obj.get(col, 0)) + for col in CB_METADATA_FIELDS: + record[col] = str(obj.get(col, "")) + records.append(record) + + print(f"\nSorting {len(records):,} records by {CB_TIMESTAMP_FIELD}...") + records.sort(key=lambda r: r[CB_TIMESTAMP_FIELD]) + + print(f"Writing to {output_path}...") + with open(output_path, "w") as f: + for record in records: + f.write(json.dumps(record) + "\n") + + print(f"Done. {len(records):,} records written.") + if records: + print(f" Time range: {records[0][CB_TIMESTAMP_FIELD]} – {records[-1][CB_TIMESTAMP_FIELD]}") + + +def prepare_h2o(input_path: str, output_path: str, max_rows: int = 0): + """Convert H2O CSV to Arroyo-compatible JSON with synthetic timestamps. 
+ + - Adds synthetic RFC3339 timestamps at H2O_ROWS_PER_SECOND rows/sec + starting from 2024-01-01T00:00:00Z + - Converts id4, id5, id6 to strings (metadata columns are expected as strings) + """ + print(f"Reading {input_path}...") + count = 0 + + with open(input_path, "r", encoding="utf-8") as fin, \ + open(output_path, "w") as fout: + + header = fin.readline().strip() + cols = header.split(",") + id_idx = {c: i for i, c in enumerate(cols)} + + for i, line in enumerate(fin): + if max_rows > 0 and i >= max_rows: + break + if i % 100_000 == 0 and i > 0: + print(f" Written {i:,} rows...", end="\r") + + parts = line.rstrip("\n").split(",") + abs_sec = H2O_BASE_EPOCH + i // H2O_ROWS_PER_SECOND + ms = i % H2O_ROWS_PER_SECOND + ts = datetime.fromtimestamp(abs_sec, tz=timezone.utc) + ts_str = ts.strftime("%Y-%m-%dT%H:%M:%S") + f".{ms:03d}Z" + + record = { + H2O_TIMESTAMP_FIELD: ts_str, + "id1": parts[id_idx["id1"]], + "id2": parts[id_idx["id2"]], + "id3": parts[id_idx["id3"]], + "id4": int(parts[id_idx["id4"]]), + "id5": int(parts[id_idx["id5"]]), + "id6": int(parts[id_idx["id6"]]), + "v1": float(parts[id_idx["v1"]]), + "v2": float(parts[id_idx["v2"]]), + "v3": float(parts[id_idx["v3"]]), + } + fout.write(json.dumps(record) + "\n") + count += 1 + + print(f"\nDone. 
def main():
    """CLI entry point: pick the converter for the requested dataset."""
    parser = argparse.ArgumentParser(
        description="Prepare dataset files for Arroyo file source",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--dataset",
        choices=["clickbench", "h2o"],
        required=True,
        help="Dataset type to prepare",
    )
    parser.add_argument("--input", required=True, help="Path to raw input file")
    parser.add_argument("--output", required=True, help="Path to write prepared JSON file")
    parser.add_argument(
        "--max-rows",
        type=int,
        default=0,
        help="Max rows to process (0 = all, default: 0)",
    )
    opts = parser.parse_args()

    Path(opts.output).parent.mkdir(parents=True, exist_ok=True)

    # --dataset choices guarantee the key is present.
    converters = {"clickbench": prepare_clickbench, "h2o": prepare_h2o}
    converters[opts.dataset](opts.input, opts.output, opts.max_rows)


if __name__ == "__main__":
    main()
import argparse
import csv
import re
import time
import urllib.parse
from pathlib import Path
from typing import List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import requests

DEFAULT_ASAP_URL = "http://localhost:8088/clickhouse/query"
DEFAULT_CLICKHOUSE_URL = "http://localhost:8123/?session_timezone=UTC"
DEFAULT_OUTPUT_DIR = "./results"
DEFAULT_OUTPUT_PREFIX = "benchmark"


# ---------------------------------------------------------------------------
# Query extraction
# Reused from asap_query_latency/run_benchmark.py:extract_queries_from_sql()
# ---------------------------------------------------------------------------


def extract_queries_from_sql(sql_file: Path) -> List[Tuple[str, str]]:
    """Extract (query_id, sql) pairs from an annotated SQL file.

    Expects lines of the form:
        -- T001: description
        SELECT ... ;

    Each statement runs from the SELECT up to its first ';' (DOTALL, so
    multi-line statements are fine).
    """
    text = Path(sql_file).read_text()
    matches = re.findall(
        r"-- ([A-Za-z0-9_]+):[^\n]*\n(SELECT[^;]+;)",
        text,
        re.DOTALL | re.IGNORECASE,
    )
    return [(query_id, statement.strip()) for query_id, statement in matches]
# ---------------------------------------------------------------------------
# Query runner
# Adapted from asap_benchmark_pipeline/run_benchmark.py:run_query()
# Uses requests.Session for connection reuse across queries.
# ---------------------------------------------------------------------------


def run_query(
    query: str,
    endpoint_url: str,
    session: requests.Session,
    timeout: int = 30,
    debug: bool = False,
) -> Tuple[float, Optional[str], Optional[str]]:
    """Send a single SQL query and return (latency_ms, result_text, error).

    Exactly one of result_text/error is non-None. On timeout the latency is
    reported as the full timeout budget; on any other failure it is 0.0.
    """
    encoded_query = urllib.parse.quote(query)
    separator = "&" if "?" in endpoint_url else "?"
    url = f"{endpoint_url}{separator}query={encoded_query}"

    try:
        # perf_counter is monotonic — unlike time.time(), it cannot go
        # backwards or jump under NTP adjustments, which matters for
        # sub-millisecond latency measurements.
        start = time.perf_counter()
        response = session.get(url, timeout=timeout)
        latency_ms = (time.perf_counter() - start) * 1000

        if debug:
            source = "OK" if response.status_code == 200 else f"HTTP {response.status_code}"
            print(f" [{source}] {latency_ms:.2f}ms")

        if response.status_code == 200:
            return latency_ms, response.text.strip(), None
        else:
            return latency_ms, None, f"HTTP {response.status_code}: {response.text[:200]}"
    except requests.Timeout:
        return timeout * 1000.0, None, "Timeout"
    except Exception as e:
        # Deliberately broad: any transport failure becomes a per-query
        # error row instead of aborting the whole benchmark.
        return 0.0, None, str(e)
+# --------------------------------------------------------------------------- + + +def _infer_pattern(query_id: str) -> str: + if query_id.startswith("ST"): + return "SpatioTemporal" + if query_id.startswith("S"): + return "Spatial" + if query_id.startswith("T"): + return "Temporal" + if query_id.startswith("N"): + return "Nested" + if query_id.startswith("D"): + return "Dated" + if query_id.startswith("L"): + return "LongRange" + return "Unknown" + + +def _latency_summary(latencies: List[float], label: str): + if not latencies: + return + s = sorted(latencies) + n = len(s) + print(f"\n{label} ({n} successful queries):") + print( + f" min={s[0]:.2f}ms avg={sum(s)/n:.2f}ms " + f"p50={s[int(n*0.50)]:.2f}ms p95={s[int(n*0.95)]:.2f}ms max={s[-1]:.2f}ms" + ) + + +def run_benchmark( + sql_file: Path, + endpoint_url: str, + output_csv: Path, + mode: str, + query_filter: Optional[List[str]] = None, + timeout: int = 30, + repeat: int = 1, + debug: bool = False, + no_plot: bool = False, +): + """Run all queries and write results to CSV. 

    CSV columns: query_id, query_pattern, latency_ms, result_rows,
    result_full, error, mode
    """
    print(f"\nRunning benchmark in {mode.upper()} mode...")
    print(f"Endpoint: {endpoint_url}")
    print(f"SQL file: {sql_file}")
    print(f"Output: {output_csv}")
    if debug:
        print("Debug: per-request HTTP status shown.")

    queries = extract_queries_from_sql(sql_file)
    if query_filter:
        # Keep only the explicitly requested query IDs.
        queries = [(qid, sql) for qid, sql in queries if qid in query_filter]
    print(f"Found {len(queries)} queries (repeat={repeat})")

    output_csv.parent.mkdir(parents=True, exist_ok=True)
    # One Session for the whole run: reuses the HTTP connection across queries.
    session = requests.Session()
    latencies_ok: List[float] = []
    plot_latencies: List[float] = []

    with open(output_csv, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(
            ["query_id", "query_pattern", "latency_ms", "result_rows", "result_full", "error", "mode"]
        )

        for query_id, sql in queries:
            pattern = _infer_pattern(query_id)
            print(f"Running {query_id}...", end=" " if not debug else "\n", flush=True)

            # Repeat and take median
            trial_latencies = []
            last_result, last_error = None, None
            for _ in range(repeat):
                lat, result, error = run_query(sql, endpoint_url, session, timeout, debug)
                trial_latencies.append(lat)
                last_result, last_error = result, error
                if error:
                    break # don't retry on error

            # Median of the trials (upper median for an even count).
            latency_ms = sorted(trial_latencies)[len(trial_latencies) // 2]

            if last_error:
                print(f"ERROR {last_error}")
                writer.writerow([query_id, pattern, f"{latency_ms:.2f}", 0, "", last_error, mode])
                # Failed queries contribute a 0 bar so the plot keeps one
                # slot per query in execution order.
                plot_latencies.append(0.0)
            else:
                result_lines = last_result.strip().split("\n") if last_result else []
                num_rows = len(result_lines)
                # Flatten newlines and cap at 200 chars for the CSV preview.
                preview = last_result.replace("\n", " | ")[:200] if last_result else ""
                latencies_ok.append(latency_ms)
                plot_latencies.append(latency_ms)
                print(f"{latency_ms:.2f}ms ({num_rows} rows)")
                writer.writerow(
                    [query_id, pattern, f"{latency_ms:.2f}", num_rows, preview, "", mode]
                )

            # Small pause between queries to avoid hammering the endpoint.
            time.sleep(0.1)

print(f"\nResults saved to {output_csv}") + _latency_summary(latencies_ok, f"Latency summary") + + if not no_plot and plot_latencies: + _plot_single(plot_latencies, mode, output_csv.with_suffix(".png")) + + +def _plot_single(latencies: List[float], mode: str, out_path: Path): + """Bar chart of per-query latency for a single mode.""" + color = "#4682b4" if mode == "asap" else "#f4a460" + x = list(range(1, len(latencies) + 1)) + plt.figure(figsize=(12, 5)) + plt.bar(x, latencies, color=color, edgecolor="black") + plt.xlabel("Query Execution Order") + plt.ylabel("Latency (ms)") + plt.title(f"Query Latency — {mode.upper()} mode") + plt.grid(axis="y", linestyle="--", alpha=0.7) + plt.tight_layout() + plt.savefig(out_path, dpi=150) + plt.close() + print(f"Plot saved to {out_path}") + + +def _plot_comparison(asap_csv: Path, baseline_csv: Path, out_path: Path): + """Two-panel comparison plot: per-query bars + speedup bars. + + Adapted from asap_query_latency/plot_latency.py. + """ + def _load(path): + rows = {} + with open(path) as f: + for row in csv.DictReader(f): + if not row["error"]: + rows[row["query_id"]] = float(row["latency_ms"]) + return rows + + asap = _load(asap_csv) + base = _load(baseline_csv) + qids = sorted(set(asap) & set(base)) + if not qids: + print("WARNING: No common query IDs for comparison plot.") + return + + x = np.arange(len(qids)) + a_vals = [asap[q] for q in qids] + b_vals = [base[q] for q in qids] + speedup = [b / a if a > 0 else 0 for a, b in zip(a_vals, b_vals)] + + fig, (ax1, ax2) = plt.subplots( + 2, 1, figsize=(14, 7), gridspec_kw={"height_ratios": [3, 1]} + ) + + w = 0.4 + ax1.bar(x - w / 2, b_vals, w, label="ClickHouse baseline", color="#f4a460") + ax1.bar(x + w / 2, a_vals, w, label="ASAP (KLL sketch)", color="#4682b4") + ax1.set_xticks(x) + ax1.set_xticklabels(qids, rotation=90, fontsize=7) + ax1.set_ylabel("Latency (ms)") + ax1.set_title( + f"Query latency: ASAP vs ClickHouse baseline " + f"(p50: {np.median(a_vals):.1f}ms vs 
{np.median(b_vals):.1f}ms)"
    )
    ax1.legend()
    # Tight x-limits so the first/last bars are not clipped.
    ax1.set_xlim(-0.6, len(qids) - 0.4)

    ax2.bar(x, speedup, color="#2e8b57", width=0.7)
    # Horizontal reference line at the mean speedup.
    ax2.axhline(
        np.mean(speedup),
        color="red",
        linewidth=1,
        linestyle="--",
        label=f"mean {np.mean(speedup):.1f}×",
    )
    ax2.set_xticks(x)
    ax2.set_xticklabels(qids, rotation=90, fontsize=7)
    ax2.set_ylabel("Speedup (×)")
    ax2.legend(fontsize=8)
    ax2.set_xlim(-0.6, len(qids) - 0.4)

    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()
    print(f"Comparison plot saved to {out_path}")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


def main():
    # RawDescriptionHelpFormatter keeps the module docstring's formatting
    # intact when it is shown as the --help epilog.
    parser = argparse.ArgumentParser(
        description="Benchmark ASAP vs ClickHouse baseline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--mode",
        choices=["asap", "baseline", "both"],
        default="both",
        help="Which mode(s) to run (default: both)",
    )
    # SQL-file requirements are validated after parsing, per selected mode.
    parser.add_argument(
        "--asap-sql-file",
        default=None,
        help="SQL file for ASAP mode (required if mode is asap or both)",
    )
    parser.add_argument(
        "--baseline-sql-file",
        default=None,
        help="SQL file for baseline mode (required if mode is baseline or both)",
    )
    parser.add_argument(
        "--asap-url",
        default=DEFAULT_ASAP_URL,
        help=f"QueryEngineRust endpoint (default: {DEFAULT_ASAP_URL})",
    )
    parser.add_argument(
        "--clickhouse-url",
        default=DEFAULT_CLICKHOUSE_URL,
        help=f"ClickHouse HTTP URL (default: {DEFAULT_CLICKHOUSE_URL})",
    )
    parser.add_argument(
        "--output-dir",
        default=DEFAULT_OUTPUT_DIR,
        help=f"Directory for results (default: {DEFAULT_OUTPUT_DIR})",
    )
    parser.add_argument(
        "--output-prefix",
        default=DEFAULT_OUTPUT_PREFIX,
        help=f"Prefix for output files (default: {DEFAULT_OUTPUT_PREFIX})",
    )
    parser.add_argument(
        "--query-filter",
        default=None,
        help="Comma-separated query IDs 
to run (e.g. T000,T001)",
    )
    parser.add_argument(
        "--repeat",
        type=int,
        default=1,
        help="Run each query N times and report the median (default: 1)",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=30,
        help="Per-query timeout in seconds (default: 30)",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Show per-query HTTP status",
    )
    parser.add_argument(
        "--no-plot",
        action="store_true",
        help="Do not generate any plots",
    )
    # Ignored flag for backward compatibility
    parser.add_argument(
        "--measure-pipeline-overhead",
        action="store_true",
        help="(No-op) Pipeline overhead measurement is not applicable with file source",
    )

    args = parser.parse_args()

    if args.measure_pipeline_overhead:
        print(
            "WARNING: --measure-pipeline-overhead is not applicable when using "
            "file source (no Kafka ingest). Ignoring."
        )

    # Validate required SQL files
    if args.mode in ("asap", "both") and not args.asap_sql_file:
        parser.error("--asap-sql-file is required when --mode is asap or both")
    if args.mode in ("baseline", "both") and not args.baseline_sql_file:
        parser.error("--baseline-sql-file is required when --mode is baseline or both")

    output_dir = Path(args.output_dir)
    prefix = args.output_prefix
    # Split the comma-separated filter into trimmed IDs; None means "run all".
    query_filter = [q.strip() for q in args.query_filter.split(",")] if args.query_filter else None

    asap_csv = output_dir / f"{prefix}_asap.csv"
    baseline_csv = output_dir / f"{prefix}_baseline.csv"

    # Baseline first so the comparison plot has both CSVs when mode=both.
    if args.mode in ("baseline", "both"):
        run_benchmark(
            sql_file=Path(args.baseline_sql_file),
            endpoint_url=args.clickhouse_url,
            output_csv=baseline_csv,
            mode="baseline",
            query_filter=query_filter,
            timeout=args.timeout,
            repeat=args.repeat,
            debug=args.debug,
            no_plot=args.no_plot,
        )

    if args.mode in ("asap", "both"):
        run_benchmark(
            sql_file=Path(args.asap_sql_file),
            endpoint_url=args.asap_url,
            output_csv=asap_csv,
            mode="asap",
            query_filter=query_filter,
            
timeout=args.timeout, + repeat=args.repeat, + debug=args.debug, + no_plot=args.no_plot, + ) + + if args.mode == "both" and not args.no_plot: + comparison_png = output_dir / f"{prefix}_comparison.png" + _plot_comparison(asap_csv, baseline_csv, comparison_png) + + +if __name__ == "__main__": + main() From c0446360e181ccb898e4715f02a48f1fcfa117ac Mon Sep 17 00:00:00 2001 From: STWang Date: Wed, 8 Apr 2026 12:59:00 -0600 Subject: [PATCH 02/10] format; clickhouse can run --- asap-common/.gitignore | 1 + asap-query-engine/.gitignore | 1 + asap-query-engine/src/main.rs | 2 +- asap-summary-ingest/.gitignore | 1 + .../templates/udfs/countminsketch_count.rs.j2 | 2 +- asap-tools/execution-utilities/.gitignore | 3 + .../execution-utilities/benchmark/README.md | 57 +++++---- .../benchmark/download_dataset.py | 7 +- .../benchmark/export_to_arroyo.py | 116 +++++------------- .../benchmark/export_to_database.py | 78 ++++++------ .../benchmark/generate_queries.py | 76 +++++------- .../benchmark/prepare_data.py | 22 ++-- .../benchmark/run_benchmark.py | 58 +++++---- 13 files changed, 200 insertions(+), 224 deletions(-) diff --git a/asap-common/.gitignore b/asap-common/.gitignore index 102b6eac..a3b80cf2 100644 --- a/asap-common/.gitignore +++ b/asap-common/.gitignore @@ -5,6 +5,7 @@ .vscode/ dependencies/py/promql_utilities/promql_utilities.egg-info/ +dependencies/py/promql_utilities/build/ dependencies/rs/**/target/ tests/**/*.json diff --git a/asap-query-engine/.gitignore b/asap-query-engine/.gitignore index eb5a316c..5c63ba3f 100644 --- a/asap-query-engine/.gitignore +++ b/asap-query-engine/.gitignore @@ -1 +1,2 @@ target +output/ diff --git a/asap-query-engine/src/main.rs b/asap-query-engine/src/main.rs index fa589aa0..842fe284 100644 --- a/asap-query-engine/src/main.rs +++ b/asap-query-engine/src/main.rs @@ -541,4 +541,4 @@ fn setup_logging( info!("Logging initialized (respects RUST_LOG environment variable)"); info!("Logs will be written to: {}/query_engine.log", output_dir); 
Ok(guard) -} +} \ No newline at end of file diff --git a/asap-summary-ingest/.gitignore b/asap-summary-ingest/.gitignore index f7ee054e..49407f65 100644 --- a/asap-summary-ingest/.gitignore +++ b/asap-summary-ingest/.gitignore @@ -1,3 +1,4 @@ __pycache__ **/*.pyc **/*.swp +outputs/ diff --git a/asap-summary-ingest/templates/udfs/countminsketch_count.rs.j2 b/asap-summary-ingest/templates/udfs/countminsketch_count.rs.j2 index b8c3d54d..b720e603 100644 --- a/asap-summary-ingest/templates/udfs/countminsketch_count.rs.j2 +++ b/asap-summary-ingest/templates/udfs/countminsketch_count.rs.j2 @@ -110,4 +110,4 @@ fn countminsketch_count(keys: Vec<&str>, values: Vec) -> Option> { .ok()?; Some(buf) } -} +} \ No newline at end of file diff --git a/asap-tools/execution-utilities/.gitignore b/asap-tools/execution-utilities/.gitignore index c8760c7f..7704e7d1 100644 --- a/asap-tools/execution-utilities/.gitignore +++ b/asap-tools/execution-utilities/.gitignore @@ -7,6 +7,9 @@ clickhouse-benchmark-pipeline/benchmark_results/ **/data/ +benchmark/arroyo_outputs/ +benchmark/queries/ +benchmark/results/ **/*.csv **/*.png diff --git a/asap-tools/execution-utilities/benchmark/README.md b/asap-tools/execution-utilities/benchmark/README.md index 9a0608cc..c45b171c 100644 --- a/asap-tools/execution-utilities/benchmark/README.md +++ b/asap-tools/execution-utilities/benchmark/README.md @@ -19,10 +19,6 @@ data_file → export_to_database.py run_benchmark.py → results/ ClickHouse :8123 (baseline) ``` -**Key difference from the old pipeline:** Arroyo reads directly from a local -file (`single_file_custom` connector) rather than from a Kafka input topic. -Kafka is still required for the **sketch output** topic (`sketch_topic`). - --- ## Prerequisites @@ -31,8 +27,8 @@ Kafka is still required for the **sketch output** topic (`sketch_topic`). 
export INSTALL_DIR=/scratch/sketch_db_for_prometheus pip3 install --user -r requirements.txt -# Build binaries (one-time) -cd ~/ASAPQuery/asap-query-engine && cargo build --release +# Build binaries (one-time) — workspace target is at ~/ASAPQuery/target/release/ +cd ~/ASAPQuery && cargo build --release ``` --- @@ -60,6 +56,7 @@ The Arroyo file source requires RFC3339 timestamps and string metadata columns. This step converts the raw ClickBench JSON: ```bash +cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python prepare_data.py \ --dataset clickbench \ --input ./data/hits.json.gz \ @@ -74,17 +71,19 @@ This produces `hits_arroyo.json` with: ### Step 3 — Start infrastructure +Skip any service that is already running. + ```bash -# Kafka +# Kafka — skip if `kafka-topics.sh --list` succeeds ~/ASAPQuery/asap-tools/installation/kafka/run.sh $INSTALL_DIR/kafka -# Create sketch output topic +# Create sketch output topic — skip if sketch_topic already exists KAFKA=$INSTALL_DIR/kafka/bin $KAFKA/kafka-topics.sh --bootstrap-server localhost:9092 --create \ --topic sketch_topic --partitions 1 --replication-factor 1 \ --config max.message.bytes=20971520 -# ClickHouse +# ClickHouse — skip if port 8123 is already listening ~/ASAPQuery/asap-tools/installation/clickhouse/run.sh $INSTALL_DIR ``` @@ -99,9 +98,9 @@ $KAFKA/kafka-topics.sh --bootstrap-server localhost:9092 --create \ ### Step 5 — Launch Arroyo sketch pipeline (file source) ```bash +cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python export_to_arroyo.py \ --streaming-config ./configs/clickbench_streaming.yaml \ - --source-type file \ --input-file ./data/hits_arroyo.json \ --file-format json \ --ts-format rfc3339 \ @@ -113,13 +112,13 @@ python export_to_arroyo.py \ ### Step 6 — Start QueryEngineRust ```bash -cd ~/ASAPQuery/asap-query-engine +cd ~/ASAPQuery nohup ./target/release/query_engine_rust \ --kafka-topic sketch_topic --input-format json \ --config 
~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/clickbench_inference.yaml \ --streaming-config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/clickbench_streaming.yaml \ --http-port 8088 --delete-existing-db --log-level DEBUG \ - --output-dir ./output --streaming-engine arroyo \ + --output-dir ./asap-query-engine/output --streaming-engine arroyo \ --query-language SQL --lock-strategy per-key \ --prometheus-scrape-interval 1 > /tmp/query_engine.log 2>&1 & ``` @@ -140,6 +139,7 @@ Verify: `$INSTALL_DIR/clickhouse client --query "SELECT count(*) FROM hits"` ### Step 8 — Generate SQL query files ```bash +cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python generate_queries.py \ --table-name hits \ --ts-column EventTime \ @@ -155,15 +155,16 @@ python generate_queries.py \ --output-prefix ./queries/clickbench ``` -This writes `queries/clickbench_asap.sql` and `queries/clickbench_clickhouse.sql`. +This writes `queries/clickbench.sql`. ### Step 9 — Run benchmark ```bash +cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python run_benchmark.py \ --mode both \ - --asap-sql-file ./queries/clickbench_asap.sql \ - --baseline-sql-file ./queries/clickbench_clickhouse.sql \ + --asap-sql-file ./queries/clickbench.sql \ + --baseline-sql-file ./queries/clickbench.sql \ --output-dir ./results \ --output-prefix clickbench ``` @@ -178,12 +179,14 @@ Results: `results/clickbench_asap.csv`, `results/clickbench_baseline.csv`, ### Step 1 — Download dataset ```bash +cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python download_dataset.py --dataset h2o --output-dir ./data ``` ### Step 2 — Prepare data for Arroyo file source ```bash +cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python prepare_data.py \ --dataset h2o \ --input ./data/G1_1e7_1e2_0_0.csv \ @@ -196,9 +199,9 @@ python prepare_data.py \ ### Step 5 — Launch Arroyo sketch pipeline ```bash +cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python export_to_arroyo.py \ 
--streaming-config ./configs/h2o_streaming.yaml \ - --source-type file \ --input-file ./data/h2o_arroyo.json \ --file-format json \ --ts-format rfc3339 \ @@ -210,13 +213,13 @@ python export_to_arroyo.py \ ### Step 6 — Start QueryEngineRust ```bash -cd ~/ASAPQuery/asap-query-engine +cd ~/ASAPQuery nohup ./target/release/query_engine_rust \ --kafka-topic sketch_topic --input-format json \ --config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/h2o_inference.yaml \ --streaming-config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/h2o_streaming.yaml \ --http-port 8088 --delete-existing-db --log-level DEBUG \ - --output-dir ./output --streaming-engine arroyo \ + --output-dir ./asap-query-engine/output --streaming-engine arroyo \ --query-language SQL --lock-strategy per-key \ --prometheus-scrape-interval 1 > /tmp/query_engine.log 2>&1 & ``` @@ -224,6 +227,7 @@ nohup ./target/release/query_engine_rust \ ### Step 7 — Load data into ClickHouse (baseline) ```bash +cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python export_to_database.py \ --dataset h2o \ --file-path ./data/G1_1e7_1e2_0_0.csv \ @@ -234,6 +238,7 @@ python export_to_database.py \ ### Step 8 — Generate SQL query files ```bash +cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python generate_queries.py \ --table-name h2o_groupby \ --ts-column timestamp \ @@ -251,10 +256,11 @@ python generate_queries.py \ ### Step 9 — Run benchmark ```bash +cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python run_benchmark.py \ --mode both \ - --asap-sql-file ./queries/h2o_asap.sql \ - --baseline-sql-file ./queries/h2o_clickhouse.sql \ + --asap-sql-file ./queries/h2o.sql \ + --baseline-sql-file ./queries/h2o.sql \ --output-dir ./results \ --output-prefix h2o ``` @@ -264,6 +270,8 @@ python run_benchmark.py \ ## Custom Dataset ```bash +cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark + # 1. 
Download (any HTTP URL) python download_dataset.py --dataset custom \ --custom-url https://example.com/mydata.json.gz \ @@ -274,7 +282,6 @@ python download_dataset.py --dataset custom \ # 3. Export to Arroyo python export_to_arroyo.py \ --streaming-config ./configs/my_streaming.yaml \ - --source-type file \ --input-file ./data/mydata.json \ --file-format json \ --ts-format rfc3339 \ @@ -303,8 +310,8 @@ python generate_queries.py \ # 6. Run benchmark python run_benchmark.py \ --mode both \ - --asap-sql-file ./queries/my_dataset_asap.sql \ - --baseline-sql-file ./queries/my_dataset_clickhouse.sql \ + --asap-sql-file ./queries/my_dataset.sql \ + --baseline-sql-file ./queries/my_dataset.sql \ --output-dir ./results ``` @@ -337,8 +344,8 @@ $INSTALL_DIR/clickhouse client --query "TRUNCATE TABLE hits" |------|---------| | `download_dataset.py` | Download ClickBench, H2O, or custom datasets | | `prepare_data.py` | Convert raw data to Arroyo file source format (RFC3339, string columns) | -| `export_to_arroyo.py` | Launch Arroyo sketch pipeline (file or kafka source) | +| `export_to_arroyo.py` | Launch Arroyo sketch pipeline from a local file source | | `export_to_database.py` | Load data into ClickHouse for baseline | -| `generate_queries.py` | Generate paired ASAP + ClickHouse SQL query files | +| `generate_queries.py` | Generate a single SQL query file (database-style, compatible with both ASAP and ClickHouse) | | `run_benchmark.py` | Run queries and produce CSV results + plots | | `configs/` | Dataset-specific streaming/inference YAML and ClickHouse init SQL | diff --git a/asap-tools/execution-utilities/benchmark/download_dataset.py b/asap-tools/execution-utilities/benchmark/download_dataset.py index 5226ae53..750b5502 100644 --- a/asap-tools/execution-utilities/benchmark/download_dataset.py +++ b/asap-tools/execution-utilities/benchmark/download_dataset.py @@ -74,7 +74,11 @@ def download_clickbench(output_path: str, force: bool = False) -> str: def 
download_h2o(output_path: str, force: bool = False) -> str: """Download H2O groupby CSV (~300 MB) from Google Drive via gdown.""" - if not force and os.path.exists(output_path) and os.path.getsize(output_path) > 100 * 1024 * 1024: + if ( + not force + and os.path.exists(output_path) + and os.path.getsize(output_path) > 100 * 1024 * 1024 + ): print(f"Using existing file: {output_path}") return output_path @@ -83,6 +87,7 @@ def download_h2o(output_path: str, force: bool = False) -> str: except ImportError: print("Installing gdown...") import subprocess + subprocess.check_call([sys.executable, "-m", "pip", "install", "gdown"]) import gdown diff --git a/asap-tools/execution-utilities/benchmark/export_to_arroyo.py b/asap-tools/execution-utilities/benchmark/export_to_arroyo.py index 6e72af72..38533668 100644 --- a/asap-tools/execution-utilities/benchmark/export_to_arroyo.py +++ b/asap-tools/execution-utilities/benchmark/export_to_arroyo.py @@ -1,33 +1,18 @@ #!/usr/bin/env python3 """ -Launch an Arroyo sketch pipeline against a dataset. +Launch an Arroyo sketch pipeline from a local file source. -Supports two source modes: - file (default): Arroyo reads directly from a local JSON/Parquet file. - No Kafka input topic is required. - kafka: Arroyo reads from a Kafka topic (legacy path). - -In both cases the sketch output is written to a Kafka topic (default: -sketch_topic) for consumption by QueryEngineRust. +Arroyo reads directly from a local JSON/Parquet file and writes sketches to +a Kafka topic (default: sketch_topic) for consumption by QueryEngineRust. 
Usage: - # File source (recommended) python export_to_arroyo.py \\ --streaming-config configs/clickbench_streaming.yaml \\ - --source-type file \\ - --input-file ./data/hits.json.gz \\ + --input-file ./data/hits_arroyo.json \\ --file-format json \\ --ts-format rfc3339 \\ --pipeline-name clickbench_pipeline \\ --arroyosketch-dir ~/ASAPQuery/asap-summary-ingest - - # Kafka source (legacy) - python export_to_arroyo.py \\ - --streaming-config configs/h2o_streaming.yaml \\ - --source-type kafka \\ - --input-kafka-topic h2o_groupby \\ - --pipeline-name h2o_pipeline \\ - --arroyosketch-dir ~/ASAPQuery/asap-summary-ingest """ import argparse @@ -49,12 +34,7 @@ def wait_for_pipeline_running( arroyo_url: str = DEFAULT_ARROYO_URL, timeout: int = DEFAULT_WAIT_TIMEOUT, ) -> bool: - """Poll the Arroyo API until the named pipeline reaches RUNNING state. - - Translated from asap_benchmark_pipeline/run_pipeline.sh lines 107-141. - A pipeline is considered running when its 'state' field is None and - 'stop' is 'none' (Arroyo's representation of a healthy running pipeline). 
- """ + """Poll the Arroyo API until the named pipeline reaches RUNNING state.""" print(f"Waiting for pipeline '{pipeline_name}' to reach RUNNING state...") elapsed = 0 while True: @@ -85,45 +65,43 @@ def wait_for_pipeline_running( time.sleep(5) elapsed += 5 if elapsed >= timeout: - print( - f"ERROR: Pipeline did not reach RUNNING state within {timeout}s" - ) + print(f"ERROR: Pipeline did not reach RUNNING state within {timeout}s") return False def build_arroyosketch_cmd(args, arroyosketch_script: str) -> list: """Build the run_arroyosketch.py command from our CLI arguments.""" - cmd = [ + return [ sys.executable, arroyosketch_script, - "--source_type", args.source_type, - "--output_format", "json", - "--pipeline_name", args.pipeline_name, - "--config_file_path", os.path.abspath(args.streaming_config), - "--output_kafka_topic", args.output_kafka_topic, - "--output_dir", os.path.abspath(args.output_dir), - "--parallelism", str(args.parallelism), - "--query_language", "sql", + "--source_type", + "file", + "--output_format", + "json", + "--pipeline_name", + args.pipeline_name, + "--config_file_path", + os.path.abspath(args.streaming_config), + "--output_kafka_topic", + args.output_kafka_topic, + "--output_dir", + os.path.abspath(args.output_dir), + "--parallelism", + str(args.parallelism), + "--query_language", + "sql", + "--input_file_path", + os.path.abspath(args.input_file), + "--file_format", + args.file_format, + "--ts_format", + args.ts_format, ] - if args.source_type == "file": - cmd += [ - "--input_file_path", os.path.abspath(args.input_file), - "--file_format", args.file_format, - "--ts_format", args.ts_format, - ] - elif args.source_type == "kafka": - cmd += [ - "--kafka_input_format", "json", - "--input_kafka_topic", args.input_kafka_topic, - ] - - return cmd - def main(): parser = argparse.ArgumentParser( - description="Launch Arroyo sketch pipeline (file or kafka source)", + description="Launch Arroyo sketch pipeline from a local file source", 
formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__, ) @@ -132,17 +110,10 @@ def main(): required=True, help="Path to streaming_config.yaml", ) - parser.add_argument( - "--source-type", - choices=["file", "kafka"], - default="file", - help="Data source type (default: file)", - ) - # File source args parser.add_argument( "--input-file", - default=None, - help="Path to input data file (required for --source-type file)", + required=True, + help="Path to input data file (JSON or Parquet)", ) parser.add_argument( "--file-format", @@ -156,13 +127,6 @@ def main(): default="rfc3339", help="Timestamp format in the data file (default: rfc3339)", ) - # Kafka source args - parser.add_argument( - "--input-kafka-topic", - default=None, - help="Kafka topic to read from (required for --source-type kafka)", - ) - # Common args parser.add_argument( "--output-kafka-topic", default=DEFAULT_OUTPUT_KAFKA_TOPIC, @@ -194,12 +158,6 @@ def main(): default="./arroyo_outputs", help="Directory for Arroyo pipeline output artifacts (default: ./arroyo_outputs)", ) - parser.add_argument( - "--wait-for-pipeline", - action="store_true", - default=True, - help="Poll until pipeline reaches RUNNING state (default: True)", - ) parser.add_argument( "--no-wait", action="store_true", @@ -214,12 +172,6 @@ def main(): args = parser.parse_args() - # Validate source-specific required args - if args.source_type == "file" and not args.input_file: - parser.error("--input-file is required when --source-type file") - if args.source_type == "kafka" and not args.input_kafka_topic: - parser.error("--input-kafka-topic is required when --source-type kafka") - arroyosketch_script = os.path.join( os.path.abspath(args.arroyosketch_dir), "run_arroyosketch.py" ) @@ -230,10 +182,10 @@ def main(): os.makedirs(args.output_dir, exist_ok=True) cmd = build_arroyosketch_cmd(args, arroyosketch_script) - print(f"Launching Arroyo pipeline '{args.pipeline_name}' ({args.source_type} source)...") + print(f"Launching 
Arroyo pipeline '{args.pipeline_name}'...") print(f"Command: {' '.join(cmd)}") - result = subprocess.run(cmd) + result = subprocess.run(cmd, cwd=os.path.abspath(args.arroyosketch_dir)) if result.returncode != 0: print(f"ERROR: run_arroyosketch.py exited with code {result.returncode}") sys.exit(result.returncode) diff --git a/asap-tools/execution-utilities/benchmark/export_to_database.py b/asap-tools/execution-utilities/benchmark/export_to_database.py index d9583641..9811917c 100644 --- a/asap-tools/execution-utilities/benchmark/export_to_database.py +++ b/asap-tools/execution-utilities/benchmark/export_to_database.py @@ -29,10 +29,8 @@ import argparse import gzip import os -import subprocess import sys from datetime import datetime, timezone -from pathlib import Path import requests @@ -77,11 +75,7 @@ def load_clickbench( skip_if_loaded: bool = False, max_rows: int = 0, ): - """Load hits.json.gz into ClickHouse. - - Uses `zcat | clickhouse-client INSERT` for gzip-compressed JSON. - Adapted from asap_query_latency/run_benchmark.py:load_clickbench_data(). 
- """ + """Load hits.json.gz into ClickHouse via HTTP INSERT.""" if not skip_table_init and init_sql_file: run_init_sql(clickhouse_url, init_sql_file) @@ -96,21 +90,18 @@ def load_clickbench( return False print(f"Loading ClickBench data from {file_path}...") - if max_rows > 0: - # Pipe through head to limit rows - cmd = ( - f"zcat {file_path} | head -n {max_rows} | " - f"clickhouse-client --query='INSERT INTO hits FORMAT JSONEachRow'" - ) - else: - cmd = ( - f"zcat {file_path} | " - f"clickhouse-client --query='INSERT INTO hits FORMAT JSONEachRow'" - ) - result = subprocess.run(cmd, shell=True) - if result.returncode != 0: - print("ERROR: ClickHouse insert failed") + def _row_stream(): + with gzip.open(file_path, "rt") as f: + for i, line in enumerate(f): + if max_rows > 0 and i >= max_rows: + break + yield line.encode() + + url = clickhouse_url.rstrip("/") + "/?query=INSERT+INTO+hits+FORMAT+JSONEachRow" + r = requests.post(url, data=_row_stream(), stream=True) + if not r.ok: + print(f"ERROR: ClickHouse insert failed: {r.text[:200]}") return False count = check_row_count(clickhouse_url, "hits") @@ -218,30 +209,41 @@ def load_custom( return False path_lower = file_path.lower() + url = ( + clickhouse_url.rstrip("/") + + f"/?query=INSERT+INTO+{table_name}+FORMAT+JSONEachRow" + ) + + def _stream_gzip(): + with gzip.open(file_path, "rt") as f: + for i, line in enumerate(f): + if max_rows > 0 and i >= max_rows: + break + yield line.encode() + + def _stream_plain(): + with open(file_path, "r") as f: + for i, line in enumerate(f): + if max_rows > 0 and i >= max_rows: + break + yield line.encode() + if path_lower.endswith(".json.gz") or path_lower.endswith(".jsonl.gz"): - head_cmd = f"| head -n {max_rows}" if max_rows > 0 else "" - cmd = ( - f"zcat {file_path} {head_cmd} | " - f"clickhouse-client --query='INSERT INTO {table_name} FORMAT JSONEachRow'" - ) print(f"Loading {file_path} into ClickHouse ({table_name})...") - result = subprocess.run(cmd, shell=True) - if 
result.returncode != 0: - print("ERROR: ClickHouse insert failed") + r = requests.post(url, data=_stream_gzip(), stream=True) + if not r.ok: + print(f"ERROR: ClickHouse insert failed: {r.text[:200]}") return False elif path_lower.endswith(".json") or path_lower.endswith(".jsonl"): - head_cmd = f"head -n {max_rows} {file_path} | " if max_rows > 0 else "" - cmd = ( - f"{head_cmd}clickhouse-client --query='INSERT INTO {table_name} FORMAT JSONEachRow' " - f"< {file_path}" - ) print(f"Loading {file_path} into ClickHouse ({table_name})...") - result = subprocess.run(cmd, shell=True) - if result.returncode != 0: - print("ERROR: ClickHouse insert failed") + r = requests.post(url, data=_stream_plain(), stream=True) + if not r.ok: + print(f"ERROR: ClickHouse insert failed: {r.text[:200]}") return False else: - print(f"ERROR: Unsupported file format for {file_path}. Use --dataset h2o for CSV.") + print( + f"ERROR: Unsupported file format for {file_path}. Use --dataset h2o for CSV." + ) return False count = check_row_count(clickhouse_url, table_name) diff --git a/asap-tools/execution-utilities/benchmark/generate_queries.py b/asap-tools/execution-utilities/benchmark/generate_queries.py index 13989100..730b2efd 100644 --- a/asap-tools/execution-utilities/benchmark/generate_queries.py +++ b/asap-tools/execution-utilities/benchmark/generate_queries.py @@ -1,16 +1,17 @@ #!/usr/bin/env python3 """ -Generate paired ASAP and ClickHouse SQL query files for benchmarking. +Generate a SQL query file for benchmarking ASAP and ClickHouse. -Each query targets a fixed time window (window-end timestamp) and matches the +Each query uses database-style quantile(q)(col) syntax, compatible with both +QueryEngineRust and ClickHouse. Queries target fixed time windows and match the annotation format `-- T{NNN}: description` expected by run_benchmark.py. 
Output: - {prefix}_asap.sql QUANTILE(q, col) syntax for QueryEngineRust - {prefix}_clickhouse.sql quantile(q)(col) syntax for ClickHouse baseline + {prefix}.sql quantile(q)(col) database-style syntax, compatible with both + QueryEngineRust and ClickHouse baseline Usage: - # Auto-detect timestamps from data file + # Auto-detect timestamps from data file → ./queries/clickbench.sql python generate_queries.py \\ --table-name hits \\ --ts-column EventTime \\ @@ -23,7 +24,7 @@ --data-file-format json.gz \\ --output-prefix ./queries/clickbench - # Explicit timestamp file (one ISO timestamp per line) + # Explicit timestamp file (one ISO timestamp per line) → ./queries/h2o.sql python generate_queries.py \\ --table-name h2o_groupby \\ --ts-column timestamp \\ @@ -98,11 +99,10 @@ def _read_timestamps_from_json( return timestamps -def _read_timestamps_from_csv( - file_path: str, ts_column: str -) -> List[datetime]: +def _read_timestamps_from_csv(file_path: str, ts_column: str) -> List[datetime]: """Read up to SAMPLE_SIZE timestamps from a CSV file.""" import csv + timestamps = [] with open(file_path, "r", newline="") as f: reader = csv.DictReader(f) @@ -121,9 +121,7 @@ def _read_timestamps_from_csv( return timestamps -def detect_timestamps( - data_file: str, data_file_format: str, ts_column: str -) -> tuple: +def detect_timestamps(data_file: str, data_file_format: str, ts_column: str) -> tuple: """Return (min_ts, max_ts) from a sample of the data file.""" fmt = data_file_format.lower() if fmt in ("json.gz", "jsonl.gz"): @@ -199,7 +197,7 @@ def format_ts(ts: datetime, ts_format: str) -> str: return ts.strftime("%Y-%m-%d %H:%M:%S") -def generate_sql_files( +def generate_sql_file( table_name: str, ts_column: str, value_column: str, @@ -211,10 +209,9 @@ def generate_sql_files( window_form: str, output_prefix: str, ): - """Write the paired ASAP and ClickHouse SQL files.""" + """Write a single SQL file compatible with both ASAP and ClickHouse.""" group_by_clause = ", 
".join(group_by_columns) - asap_lines = [] - ch_lines = [] + lines = [] for i, end_ts in enumerate(window_ends): end_str = format_ts(end_ts, ts_format) @@ -224,42 +221,23 @@ def generate_sql_files( desc = f"quantile window ending at {end_str}" if window_form == "dateadd": - where_clause = ( - f"{ts_column} BETWEEN DATEADD(s, -{window_size}, '{end_str}') AND '{end_str}'" - ) + where_clause = f"{ts_column} BETWEEN DATEADD(s, -{window_size}, '{end_str}') AND '{end_str}'" else: - where_clause = ( - f"{ts_column} BETWEEN '{start_str}' AND '{end_str}'" - ) + where_clause = f"{ts_column} BETWEEN '{start_str}' AND '{end_str}'" - asap_sql = ( - f"-- {label}: {desc}\n" - f"SELECT QUANTILE({quantile}, {value_column}) FROM {table_name} " - f"WHERE {where_clause} GROUP BY {group_by_clause};" - ) - ch_sql = ( + lines.append( f"-- {label}: {desc}\n" f"SELECT quantile({quantile})({value_column}) FROM {table_name} " f"WHERE {where_clause} GROUP BY {group_by_clause};" ) - asap_lines.append(asap_sql) - ch_lines.append(ch_sql) + sql_file = f"{output_prefix}.sql" + Path(sql_file).parent.mkdir(parents=True, exist_ok=True) - asap_file = f"{output_prefix}_asap.sql" - ch_file = f"{output_prefix}_clickhouse.sql" + with open(sql_file, "w") as f: + f.write("\n".join(lines) + "\n") - Path(asap_file).parent.mkdir(parents=True, exist_ok=True) - - with open(asap_file, "w") as f: - f.write("\n".join(asap_lines) + "\n") - - with open(ch_file, "w") as f: - f.write("\n".join(ch_lines) + "\n") - - print(f"Generated {len(window_ends)} queries:") - print(f" ASAP: {asap_file}") - print(f" ClickHouse: {ch_file}") + print(f"Generated {len(window_ends)} queries → {sql_file}") def main(): @@ -271,7 +249,9 @@ def main(): # Table/column config parser.add_argument("--table-name", required=True) parser.add_argument("--ts-column", required=True, help="Timestamp column name") - parser.add_argument("--value-column", required=True, help="Column to compute quantile on") + parser.add_argument( + "--value-column", 
required=True, help="Column to compute quantile on" + ) parser.add_argument( "--group-by-columns", required=True, @@ -279,7 +259,9 @@ def main(): ) # Query parameters parser.add_argument("--quantile", type=float, default=0.95) - parser.add_argument("--window-size", type=int, default=10, help="Window size in seconds") + parser.add_argument( + "--window-size", type=int, default=10, help="Window size in seconds" + ) parser.add_argument("--num-queries", type=int, default=50) parser.add_argument( "--ts-format", @@ -296,7 +278,7 @@ def main(): parser.add_argument( "--output-prefix", required=True, - help="Output file prefix (e.g. ./queries/clickbench → clickbench_asap.sql + clickbench_clickhouse.sql)", + help="Output file prefix (e.g. ./queries/clickbench → clickbench.sql)", ) # Timestamp sources (mutually exclusive) ts_group = parser.add_mutually_exclusive_group(required=True) @@ -372,7 +354,7 @@ def main(): f"(stride={stride}s, window={args.window_size}s)" ) - generate_sql_files( + generate_sql_file( table_name=args.table_name, ts_column=args.ts_column, value_column=args.value_column, diff --git a/asap-tools/execution-utilities/benchmark/prepare_data.py b/asap-tools/execution-utilities/benchmark/prepare_data.py index 33bc207d..5b2d7b56 100644 --- a/asap-tools/execution-utilities/benchmark/prepare_data.py +++ b/asap-tools/execution-utilities/benchmark/prepare_data.py @@ -27,8 +27,7 @@ import argparse import gzip import json -import sys -from datetime import datetime, timedelta, timezone +from datetime import datetime, timezone from pathlib import Path # Synthetic timestamp base for H2O (2024-01-01T00:00:00Z) @@ -100,7 +99,9 @@ def prepare_clickbench(input_path: str, output_path: str, max_rows: int = 0): print(f"Done. 
{len(records):,} records written.") if records: - print(f" Time range: {records[0][CB_TIMESTAMP_FIELD]} – {records[-1][CB_TIMESTAMP_FIELD]}") + print( + f" Time range: {records[0][CB_TIMESTAMP_FIELD]} – {records[-1][CB_TIMESTAMP_FIELD]}" + ) def prepare_h2o(input_path: str, output_path: str, max_rows: int = 0): @@ -113,8 +114,7 @@ def prepare_h2o(input_path: str, output_path: str, max_rows: int = 0): print(f"Reading {input_path}...") count = 0 - with open(input_path, "r", encoding="utf-8") as fin, \ - open(output_path, "w") as fout: + with open(input_path, "r", encoding="utf-8") as fin, open(output_path, "w") as fout: header = fin.readline().strip() cols = header.split(",") @@ -148,8 +148,12 @@ def prepare_h2o(input_path: str, output_path: str, max_rows: int = 0): count += 1 print(f"\nDone. {count:,} records written to {output_path}.") - first_ts = datetime.fromtimestamp(H2O_BASE_EPOCH, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") - last_ts = datetime.fromtimestamp(H2O_BASE_EPOCH + count // H2O_ROWS_PER_SECOND, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + first_ts = datetime.fromtimestamp(H2O_BASE_EPOCH, tz=timezone.utc).strftime( + "%Y-%m-%dT%H:%M:%SZ" + ) + last_ts = datetime.fromtimestamp( + H2O_BASE_EPOCH + count // H2O_ROWS_PER_SECOND, tz=timezone.utc + ).strftime("%Y-%m-%dT%H:%M:%SZ") print(f" Time range: {first_ts} – {last_ts}") @@ -166,7 +170,9 @@ def main(): help="Dataset type to prepare", ) parser.add_argument("--input", required=True, help="Path to raw input file") - parser.add_argument("--output", required=True, help="Path to write prepared JSON file") + parser.add_argument( + "--output", required=True, help="Path to write prepared JSON file" + ) parser.add_argument( "--max-rows", type=int, diff --git a/asap-tools/execution-utilities/benchmark/run_benchmark.py b/asap-tools/execution-utilities/benchmark/run_benchmark.py index a196aced..85c637a7 100644 --- a/asap-tools/execution-utilities/benchmark/run_benchmark.py +++ 
b/asap-tools/execution-utilities/benchmark/run_benchmark.py @@ -92,13 +92,19 @@ def run_query( latency_ms = (time.time() - start) * 1000 if debug: - source = "OK" if response.status_code == 200 else f"HTTP {response.status_code}" + source = ( + "OK" if response.status_code == 200 else f"HTTP {response.status_code}" + ) print(f" [{source}] {latency_ms:.2f}ms") if response.status_code == 200: return latency_ms, response.text.strip(), None else: - return latency_ms, None, f"HTTP {response.status_code}: {response.text[:200]}" + return ( + latency_ms, + None, + f"HTTP {response.status_code}: {response.text[:200]}", + ) except requests.Timeout: return timeout * 1000.0, None, "Timeout" except Exception as e: @@ -176,7 +182,15 @@ def run_benchmark( with open(output_csv, "w", newline="") as csvfile: writer = csv.writer(csvfile) writer.writerow( - ["query_id", "query_pattern", "latency_ms", "result_rows", "result_full", "error", "mode"] + [ + "query_id", + "query_pattern", + "latency_ms", + "result_rows", + "result_full", + "error", + "mode", + ] ) for query_id, sql in queries: @@ -187,7 +201,9 @@ def run_benchmark( trial_latencies = [] last_result, last_error = None, None for _ in range(repeat): - lat, result, error = run_query(sql, endpoint_url, session, timeout, debug) + lat, result, error = run_query( + sql, endpoint_url, session, timeout, debug + ) trial_latencies.append(lat) last_result, last_error = result, error if error: @@ -197,7 +213,9 @@ def run_benchmark( if last_error: print(f"ERROR {last_error}") - writer.writerow([query_id, pattern, f"{latency_ms:.2f}", 0, "", last_error, mode]) + writer.writerow( + [query_id, pattern, f"{latency_ms:.2f}", 0, "", last_error, mode] + ) plot_latencies.append(0.0) else: result_lines = last_result.strip().split("\n") if last_result else [] @@ -207,13 +225,21 @@ def run_benchmark( plot_latencies.append(latency_ms) print(f"{latency_ms:.2f}ms ({num_rows} rows)") writer.writerow( - [query_id, pattern, f"{latency_ms:.2f}", num_rows, 
preview, "", mode] + [ + query_id, + pattern, + f"{latency_ms:.2f}", + num_rows, + preview, + "", + mode, + ] ) time.sleep(0.1) print(f"\nResults saved to {output_csv}") - _latency_summary(latencies_ok, f"Latency summary") + _latency_summary(latencies_ok, "Latency summary") if not no_plot and plot_latencies: _plot_single(plot_latencies, mode, output_csv.with_suffix(".png")) @@ -240,6 +266,7 @@ def _plot_comparison(asap_csv: Path, baseline_csv: Path, out_path: Path): Adapted from asap_query_latency/plot_latency.py. """ + def _load(path): rows = {} with open(path) as f: @@ -371,21 +398,8 @@ def main(): action="store_true", help="Do not generate any plots", ) - # Ignored flag for backward compatibility - parser.add_argument( - "--measure-pipeline-overhead", - action="store_true", - help="(No-op) Pipeline overhead measurement is not applicable with file source", - ) - args = parser.parse_args() - if args.measure_pipeline_overhead: - print( - "WARNING: --measure-pipeline-overhead is not applicable when using " - "file source (no Kafka ingest). Ignoring." 
- ) - # Validate required SQL files if args.mode in ("asap", "both") and not args.asap_sql_file: parser.error("--asap-sql-file is required when --mode is asap or both") @@ -394,7 +408,9 @@ def main(): output_dir = Path(args.output_dir) prefix = args.output_prefix - query_filter = [q.strip() for q in args.query_filter.split(",")] if args.query_filter else None + query_filter = ( + [q.strip() for q in args.query_filter.split(",")] if args.query_filter else None + ) asap_csv = output_dir / f"{prefix}_asap.csv" baseline_csv = output_dir / f"{prefix}_baseline.csv" From 14dbd06d03c8a0090c706c42a4ee449543f3a606 Mon Sep 17 00:00:00 2001 From: Kavya Bhat Date: Sat, 11 Apr 2026 07:32:19 -0600 Subject: [PATCH 03/10] rebase and add Elastic to pipeline --- .../benchmark/configs/h2o_inference.yaml | 8 +- .../benchmark/configs/h2o_streaming.yaml | 8 +- .../benchmark/export_to_database.py | 195 ++++++++++++++++-- .../benchmark/generate_queries.py | 40 +++- .../benchmark/prepare_data.py | 14 +- .../benchmark/run_benchmark.py | 173 ++++++++++++---- 6 files changed, 358 insertions(+), 80 deletions(-) diff --git a/asap-tools/execution-utilities/benchmark/configs/h2o_inference.yaml b/asap-tools/execution-utilities/benchmark/configs/h2o_inference.yaml index 0d1e45b0..fde732f9 100644 --- a/asap-tools/execution-utilities/benchmark/configs/h2o_inference.yaml +++ b/asap-tools/execution-utilities/benchmark/configs/h2o_inference.yaml @@ -4,8 +4,8 @@ tables: - name: h2o_groupby time_column: timestamp - metadata_columns: [id1, id2] - value_columns: [v1] + metadata_columns: [id1, id2, id3, id4, id5, id6] + value_columns: [v1, v2, v3] cleanup_policy: name: read_based @@ -15,6 +15,6 @@ queries: - aggregation_id: 12 read_count_threshold: 999999 query: |- - SELECT QUANTILE(0.95, v1) FROM h2o_groupby + SELECT PERCENTILE(v3, 95) FROM h2o_groupby WHERE timestamp BETWEEN DATEADD(s, -10, NOW()) AND NOW() - GROUP BY id1, id2; + GROUP BY id1, id2 ORDER BY id1, id2; diff --git 
a/asap-tools/execution-utilities/benchmark/configs/h2o_streaming.yaml b/asap-tools/execution-utilities/benchmark/configs/h2o_streaming.yaml index c500d696..9a7e6299 100644 --- a/asap-tools/execution-utilities/benchmark/configs/h2o_streaming.yaml +++ b/asap-tools/execution-utilities/benchmark/configs/h2o_streaming.yaml @@ -4,8 +4,8 @@ tables: - name: h2o_groupby time_column: timestamp - metadata_columns: [id1, id2] - value_columns: [v1] + metadata_columns: [id1, id2, id3, id4, id5, id6] + value_columns: [v1, v2, v3] aggregations: # Temporal queries (10s window, all labels) - QUANTILE (DatasketchesKLL) @@ -14,10 +14,10 @@ aggregations: aggregationSubType: '' labels: grouping: [id1, id2] - rollup: [] + rollup: [id3, id4, id5, id6] aggregated: [] table_name: h2o_groupby - value_column: v1 + value_column: v3 parameters: K: 200 tumblingWindowSize: 10 diff --git a/asap-tools/execution-utilities/benchmark/export_to_database.py b/asap-tools/execution-utilities/benchmark/export_to_database.py index 9811917c..79511a71 100644 --- a/asap-tools/execution-utilities/benchmark/export_to_database.py +++ b/asap-tools/execution-utilities/benchmark/export_to_database.py @@ -1,25 +1,35 @@ #!/usr/bin/env python3 """ -Load a dataset into ClickHouse for baseline comparison. +Load a dataset into ClickHouse or Elasticsearch for baseline comparison. Supports ClickBench (hits.json.gz), H2O groupby CSV, or a custom table. 
Usage: - # ClickBench + # ClickBench to Clickhouse python export_to_database.py \\ - --dataset clickbench \\ + --dataset clickbench --database clickhouse \\ --file-path ./data/hits.json.gz \\ --init-sql-file ../clickhouse-benchmark-pipeline/clickhouse/clickbench_init.sql - # H2O + # H2O to Clickhouse python export_to_database.py \\ - --dataset h2o \\ + --dataset h2o --database clickhouse \\ --file-path ./data/G1_1e7_1e2_0_0.csv \\ --init-sql-file ../asap_benchmark_pipeline/h2o_init.sql - # Custom JSON file + # H2O to Elasticsearch python export_to_database.py \\ - --dataset custom \\ + --dataset h2o --database elasticsearch \\ + --file-path ./data/G1_1e7_1e2_0_0.csv \\ + --es-host localhost \\ + --es-port 9200 \\ + --es-index h2o_benchmark \\ + --es-api-key your_api_key_here \\ + --es-bulk-size 5000 + + # Custom JSON to ClickHouse + python export_to_database.py \\ + --dataset custom --database clickhouse \\ --file-path ./data/mydata.json \\ --table-name mytable \\ --ts-column event_time \\ @@ -27,7 +37,6 @@ """ import argparse -import gzip import os import sys from datetime import datetime, timezone @@ -39,6 +48,14 @@ H2O_ROWS_PER_SECOND = 1000 H2O_BASE_EPOCH = 1704067200 # 2024-01-01T00:00:00Z +# Valid (dataset, database) combinations tested so far +VALID_COMBINATIONS = { + ("clickbench", "clickhouse"), + ("h2o", "clickhouse"), + ("h2o", "elasticsearch"), + ("custom", "clickhouse"), +} + def _exec_clickhouse_sql(clickhouse_url: str, sql: str, label: str = ""): """Execute a SQL statement via the ClickHouse HTTP API.""" @@ -117,7 +134,7 @@ def _flush_h2o_batch(clickhouse_url: str, rows: list): raise RuntimeError(f"ClickHouse insert failed: {r.text[:200]}") -def load_h2o( +def load_h2o_clickhouse( clickhouse_url: str, file_path: str, init_sql_file: str = None, @@ -178,6 +195,112 @@ def load_h2o( print(f"Loaded {total:,} rows into ClickHouse (h2o_groupby)") return True +def load_h2o_elasticsearch( + es_host: str, + es_port: int, + index_name: str, + file_path: str, + 
api_key: str = None, + skip_if_loaded: bool = False, + max_rows: int = 0, +): + """Load H2O groupby CSV into Elasticsearch with synthetic timestamps.""" + try: + from elasticsearch import Elasticsearch, helpers + except ImportError: + print("ERROR: elasticsearch-py not installed. Run: pip install elasticsearch") + return False + + auth = {"api_key": api_key} if api_key else {} + es = Elasticsearch(f"http://{es_host}:{es_port}", **auth) + + if not es.ping(): + print(f"ERROR: Cannot connect to Elasticsearch at {es_host}:{es_port}") + return False + + if skip_if_loaded and es.indices.exists(index=index_name): + count = es.count(index=index_name)["count"] + if count > 0: + print(f"Data already loaded ({count:,} rows). Skipping.") + return True + + if es.indices.exists(index=index_name): + print(f"Deleting existing index: {index_name}") + es.indices.delete(index=index_name) + + print(f"Creating index: {index_name}") + es.indices.create(index=index_name, body={ + "settings": { + "number_of_shards": 1, + "number_of_replicas": 0, + "refresh_interval": "30s", + }, + "mappings": { + "properties": { + "timestamp": {"type": "date", "format": "epoch_millis"}, + "id1": {"type": "keyword"}, + "id2": {"type": "keyword"}, + "id3": {"type": "keyword"}, + "id4": {"type": "long"}, + "id5": {"type": "long"}, + "id6": {"type": "long"}, + "v1": {"type": "long"}, + "v2": {"type": "long"}, + "v3": {"type": "double"}, + } + }, + }) + + if not os.path.exists(file_path): + print(f"ERROR: Data file not found: {file_path}") + return False + + print(f"Importing H2O data from {file_path} into Elasticsearch ({index_name})...") + + base_timestamp_ms = 1704067200000 # 2024-01-01T00:00:00Z in millis + + def generate_docs(): + with open(file_path, "r", encoding="utf-8") as f: + f.readline() # skip header + for row_num, line in enumerate(f): + if max_rows > 0 and row_num >= max_rows: + break + parts = line.rstrip("\n").split(",") + if len(parts) < 9: + continue + yield { + "_index": index_name, + 
"_source": { + "timestamp": base_timestamp_ms + row_num * 10, + "id1": parts[0], + "id2": parts[1], + "id3": parts[2], + "id4": int(parts[3] or 0), + "id5": int(parts[4] or 0), + "id6": int(parts[5] or 0), + "v1": int(parts[6] or 0), + "v2": int(parts[7] or 0), + "v3": float(parts[8] or 0.0), + }, + } + + total = 0 + errors = 0 + for ok, _ in helpers.streaming_bulk( + es, generate_docs(), chunk_size=H2O_BATCH_SIZE, raise_on_error=False + ): + if ok: + total += 1 + else: + errors += 1 + if total % 500_000 == 0 and total > 0: + print(f" Indexed {total:,} documents...") + + print(f"Indexed {total:,} documents ({errors} errors)") + print("Refreshing index...") + es.indices.refresh(index=index_name) + print(f"✓ Import complete! Index: {index_name}") + return True def load_custom( clickhouse_url: str, @@ -253,7 +376,7 @@ def _stream_plain(): def main(): parser = argparse.ArgumentParser( - description="Load a dataset into ClickHouse for baseline comparison", + description="Load a dataset into ClickHouse or Elasticsearch for baseline comparison", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__, ) @@ -263,6 +386,12 @@ def main(): required=True, help="Dataset type", ) + parser.add_argument( + "--database", + choices=["clickhouse", "elasticsearch"], + required=True, + help="Target database", + ) parser.add_argument( "--file-path", required=True, @@ -311,8 +440,25 @@ def main(): help="Maximum rows to load (0 = all)", ) + # Elasticsearch-specific flags + es_group = parser.add_argument_group("Elasticsearch options (--database elasticsearch)") + es_group.add_argument("--es-host", default="localhost", help="Elasticsearch host") + es_group.add_argument("--es-port", type=int, default=9200, help="Elasticsearch port") + es_group.add_argument("--es-index", default="h2o_benchmark", help="Elasticsearch index name") + es_group.add_argument("--es-api-key", default=None, help="Elasticsearch API key") + es_group.add_argument("--es-bulk-size", type=int, default=5000, 
help="Bulk insert batch size") + args = parser.parse_args() + # Validate (dataset, database) combination + combo = (args.dataset, args.database) + if combo not in VALID_COMBINATIONS: + valid = ", ".join(f"({d}/{db})" for d, db in sorted(VALID_COMBINATIONS)) + parser.error( + f"--dataset {args.dataset} is not supported with --database {args.database}. " + f"Valid combinations: {valid}" + ) + if args.dataset == "custom" and not args.table_name: parser.error("--table-name is required when --dataset custom") @@ -327,15 +473,26 @@ def main(): max_rows=args.max_rows, ) elif args.dataset == "h2o": - success = load_h2o( - args.clickhouse_url, - args.file_path, - init_sql_file=args.init_sql_file, - skip_table_init=args.skip_table_init, - skip_if_loaded=args.skip_if_loaded, - max_rows=args.max_rows, - ) - else: + if args.database == "elasticsearch": + success = load_h2o_elasticsearch( + es_host=args.es_host, + es_port=args.es_port, + index_name=args.es_index, + file_path=args.file_path, + api_key=args.es_api_key, + skip_if_loaded=args.skip_if_loaded, + max_rows=args.max_rows, + ) + else: + success = load_h2o_clickhouse( + args.clickhouse_url, + args.file_path, + init_sql_file=args.init_sql_file, + skip_table_init=args.skip_table_init, + skip_if_loaded=args.skip_if_loaded, + max_rows=args.max_rows, + ) + elif args.dataset == "custom": success = load_custom( args.clickhouse_url, args.file_path, diff --git a/asap-tools/execution-utilities/benchmark/generate_queries.py b/asap-tools/execution-utilities/benchmark/generate_queries.py index 730b2efd..44590e67 100644 --- a/asap-tools/execution-utilities/benchmark/generate_queries.py +++ b/asap-tools/execution-utilities/benchmark/generate_queries.py @@ -211,7 +211,12 @@ def generate_sql_file( ): """Write a single SQL file compatible with both ASAP and ClickHouse.""" group_by_clause = ", ".join(group_by_columns) - lines = [] + percentile = quantile * 100 + # Strip trailing zero: 95.0 -> 95, 99.5 -> 99.5 + percentile_str = 
f"{percentile:.1f}".rstrip("0").rstrip(".") + + ch_lines = [] + es_lines = [] for i, end_ts in enumerate(window_ends): end_str = format_ts(end_ts, ts_format) @@ -225,19 +230,40 @@ def generate_sql_file( else: where_clause = f"{ts_column} BETWEEN '{start_str}' AND '{end_str}'" - lines.append( + # Elasticsearch uses DATEADD + CAST form + es_where = ( + f"{ts_column} BETWEEN DATEADD('s', -{window_size}, CAST('{end_str}' AS DATETIME)) " + f"AND CAST('{end_str}' AS DATETIME)" + ) + + ch_sql = ( f"-- {label}: {desc}\n" f"SELECT quantile({quantile})({value_column}) FROM {table_name} " f"WHERE {where_clause} GROUP BY {group_by_clause};" ) - sql_file = f"{output_prefix}.sql" - Path(sql_file).parent.mkdir(parents=True, exist_ok=True) + asap_lines.append(asap_sql) + ch_lines.append(ch_sql) + es_lines.append( + f"-- {label}: {desc}\n" + f"SELECT PERCENTILE({value_column}, {percentile_str}) FROM {table_name} " + f"WHERE {es_where} GROUP BY {group_by_clause};" + ) + + ch_file = f"{output_prefix}_clickhouse.sql" + es_file = f"{output_prefix}_elasticsearch.sql" + + Path(asap_file).parent.mkdir(parents=True, exist_ok=True) + + with open(ch_file, "w") as f: + f.write("\n".join(ch_lines) + "\n") - with open(sql_file, "w") as f: - f.write("\n".join(lines) + "\n") + with open(es_file, "w") as f: + f.write("\n".join(es_lines) + "\n") - print(f"Generated {len(window_ends)} queries → {sql_file}") + print(f"Generated {len(window_ends)} queries:") + print(f" ClickHouse: {ch_file}") + print(f" Elasticsearch: {es_file}") def main(): diff --git a/asap-tools/execution-utilities/benchmark/prepare_data.py b/asap-tools/execution-utilities/benchmark/prepare_data.py index 5b2d7b56..043c6e06 100644 --- a/asap-tools/execution-utilities/benchmark/prepare_data.py +++ b/asap-tools/execution-utilities/benchmark/prepare_data.py @@ -127,19 +127,15 @@ def prepare_h2o(input_path: str, output_path: str, max_rows: int = 0): print(f" Written {i:,} rows...", end="\r") parts = line.rstrip("\n").split(",") - 
abs_sec = H2O_BASE_EPOCH + i // H2O_ROWS_PER_SECOND - ms = i % H2O_ROWS_PER_SECOND - ts = datetime.fromtimestamp(abs_sec, tz=timezone.utc) - ts_str = ts.strftime("%Y-%m-%dT%H:%M:%S") + f".{ms:03d}Z" - + abs_ms = H2O_BASE_EPOCH * 1000 + i * 10 # 10 ms per row record = { - H2O_TIMESTAMP_FIELD: ts_str, + H2O_TIMESTAMP_FIELD: abs_ms, "id1": parts[id_idx["id1"]], "id2": parts[id_idx["id2"]], "id3": parts[id_idx["id3"]], - "id4": int(parts[id_idx["id4"]]), - "id5": int(parts[id_idx["id5"]]), - "id6": int(parts[id_idx["id6"]]), + "id4": str(parts[id_idx["id4"]]), + "id5": str(parts[id_idx["id5"]]), + "id6": str(parts[id_idx["id6"]]), "v1": float(parts[id_idx["v1"]]), "v2": float(parts[id_idx["v2"]]), "v3": float(parts[id_idx["v3"]]), diff --git a/asap-tools/execution-utilities/benchmark/run_benchmark.py b/asap-tools/execution-utilities/benchmark/run_benchmark.py index 85c637a7..50ce4e84 100644 --- a/asap-tools/execution-utilities/benchmark/run_benchmark.py +++ b/asap-tools/execution-utilities/benchmark/run_benchmark.py @@ -1,28 +1,39 @@ #!/usr/bin/env python3 """ -Unified benchmark runner: ASAP (QueryEngineRust) vs ClickHouse baseline. +Unified benchmark runner: ASAP (QueryEngineRust) vs ClickHouse/Elasticsearch baseline. Reads SQL files generated by generate_queries.py, sends each query to the configured endpoint, and writes results to CSV. With --mode both, runs baseline then ASAP and generates a latency comparison plot. 
Usage: - # Both modes with comparison plot + # Both modes, ClickHouse baseline python run_benchmark.py \\ - --mode both \\ + --mode both --database clickhouse \\ --asap-sql-file ./queries/clickbench_asap.sql \\ --baseline-sql-file ./queries/clickbench_clickhouse.sql \\ --output-dir ./results + # Both modes, Elasticsearch baseline + python run_benchmark.py \\ + --mode both --database elasticsearch \\ + --asap-sql-file ./queries/h2o_asap.sql \\ + --baseline-sql-file ./queries/h2o_elasticsearch.sql \\ + --elastic-host localhost \\ + --elastic-port 9200 \\ + --elastic-api-key your_api_key_here \\ + --output-dir ./results \\ + --output-prefix h2o + # ASAP only python run_benchmark.py \\ - --mode asap \\ + --mode asap --database clickhouse \\ --asap-sql-file ./queries/h2o_asap.sql \\ --output-dir ./results # Baseline only python run_benchmark.py \\ - --mode baseline \\ + --mode baseline --database clickhouse \\ --baseline-sql-file ./queries/h2o_clickhouse.sql \\ --output-dir ./results """ @@ -38,8 +49,12 @@ import matplotlib.pyplot as plt import numpy as np import requests +import json -DEFAULT_ASAP_URL = "http://localhost:8088/clickhouse/query" +DEFAULT_ELASTIC_HOST = "localhost" +DEFAULT_ELASTIC_PORT = 9200 +DEFAULT_ASAP_CLICKHOUSE_URL = "http://localhost:8088/clickhouse/query" +DEFAULT_ASAP_ELASTIC_URL = "http://localhost:8088/_sql?format=json" DEFAULT_CLICKHOUSE_URL = "http://localhost:8123/?session_timezone=UTC" DEFAULT_OUTPUT_DIR = "./results" DEFAULT_OUTPUT_PREFIX = "benchmark" @@ -80,15 +95,26 @@ def run_query( session: requests.Session, timeout: int = 30, debug: bool = False, -) -> Tuple[float, Optional[str], Optional[str]]: - """Send a single SQL query and return (latency_ms, result_text, error).""" - encoded_query = urllib.parse.quote(query) - separator = "&" if "?" in endpoint_url else "?" 
- url = f"{endpoint_url}{separator}query={encoded_query}" - + database: str = "clickhouse", + api_key: Optional[str] = None, + fetch_size: int = 1000, +) -> Tuple[float, Optional[str], int, Optional[str]]: + """Send a single SQL query and return (latency_ms, result_text, num_rows, error).""" try: start = time.time() - response = session.get(url, timeout=timeout) + + if database == "elasticsearch": + headers = {"Content-Type": "application/json"} + if api_key: + headers["Authorization"] = f"ApiKey {api_key}" + body = {"query": query.strip().rstrip(";"), "fetch_size": fetch_size} + response = session.post(endpoint_url, headers=headers, json=body, timeout=timeout) + else: + encoded_query = urllib.parse.quote(query) + separator = "&" if "?" in endpoint_url else "?" + url = f"{endpoint_url}{separator}query={encoded_query}" + response = session.get(url, timeout=timeout) + latency_ms = (time.time() - start) * 1000 if debug: @@ -98,7 +124,45 @@ def run_query( print(f" [{source}] {latency_ms:.2f}ms") if response.status_code == 200: - return latency_ms, response.text.strip(), None + if database == "elasticsearch": + data = response.json() + + if "hits" in data: + hits = data["hits"].get("hits", []) + if hits: + col_names = list(hits[0].get("_source", {}).keys()) + formatted_rows = [ + ", ".join(f"{k}={hit.get('_source', {}).get(k)}" for k in col_names) + for hit in hits + ] + result_text = "\n".join(formatted_rows) + num_rows = len(hits) + else: + result_text = "" + num_rows = 0 + + elif "rows" in data: + rows = data.get("rows", []) + columns = data.get("columns", []) + col_names = [c.get("name", f"col{i}") for i, c in enumerate(columns)] + formatted_rows = [ + ", ".join( + f"{col_names[i]}={v}" if i < len(col_names) else str(v) + for i, v in enumerate(row) + ) if isinstance(row, (list, tuple)) else str(row) + for row in rows + ] + result_text = "\n".join(formatted_rows) + num_rows = len(rows) + + else: + result_text = "" + num_rows = 0 + else: + result_text = 
response.text.strip() + num_rows = len(result_text.split("\n")) if result_text else 0 + + return latency_ms, result_text, num_rows, None else: return ( latency_ms, @@ -106,9 +170,9 @@ def run_query( f"HTTP {response.status_code}: {response.text[:200]}", ) except requests.Timeout: - return timeout * 1000.0, None, "Timeout" + return timeout * 1000.0, None, 0, "Timeout" except Exception as e: - return 0.0, None, str(e) + return 0.0, None, 0, str(e) # --------------------------------------------------------------------------- @@ -156,6 +220,8 @@ def run_benchmark( repeat: int = 1, debug: bool = False, no_plot: bool = False, + database: str = "clickhouse", + api_key: Optional[str] = None, ): """Run all queries and write results to CSV. @@ -197,17 +263,17 @@ def run_benchmark( pattern = _infer_pattern(query_id) print(f"Running {query_id}...", end=" " if not debug else "\n", flush=True) - # Repeat and take median trial_latencies = [] - last_result, last_error = None, None + last_result, last_error, last_row_count = None, None, 0 for _ in range(repeat): - lat, result, error = run_query( - sql, endpoint_url, session, timeout, debug + lat, result, row_count, error = run_query( + sql, endpoint_url, session, timeout, debug, + database=database, api_key=api_key, ) trial_latencies.append(lat) - last_result, last_error = result, error + last_result, last_error, last_row_count = result, error, row_count if error: - break # don't retry on error + break latency_ms = sorted(trial_latencies)[len(trial_latencies) // 2] @@ -218,18 +284,16 @@ def run_benchmark( ) plot_latencies.append(0.0) else: - result_lines = last_result.strip().split("\n") if last_result else [] - num_rows = len(result_lines) preview = last_result.replace("\n", " | ")[:200] if last_result else "" latencies_ok.append(latency_ms) plot_latencies.append(latency_ms) - print(f"{latency_ms:.2f}ms ({num_rows} rows)") + print(f"{latency_ms:.2f}ms ({last_row_count} rows)") writer.writerow( [ query_id, pattern, 
f"{latency_ms:.2f}", - num_rows, + last_row_count, preview, "", mode, @@ -292,13 +356,13 @@ def _load(path): ) w = 0.4 - ax1.bar(x - w / 2, b_vals, w, label="ClickHouse baseline", color="#f4a460") + ax1.bar(x - w / 2, b_vals, w, label="Baseline", color="#f4a460") ax1.bar(x + w / 2, a_vals, w, label="ASAP (KLL sketch)", color="#4682b4") ax1.set_xticks(x) ax1.set_xticklabels(qids, rotation=90, fontsize=7) ax1.set_ylabel("Latency (ms)") ax1.set_title( - f"Query latency: ASAP vs ClickHouse baseline " + f"Query latency: ASAP vs baseline " f"(p50: {np.median(a_vals):.1f}ms vs {np.median(b_vals):.1f}ms)" ) ax1.legend() @@ -331,7 +395,7 @@ def _load(path): def main(): parser = argparse.ArgumentParser( - description="Benchmark ASAP vs ClickHouse baseline", + description="Benchmark ASAP vs ClickHouse/Elasticsearch baseline", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__, ) @@ -341,6 +405,12 @@ def main(): default="both", help="Which mode(s) to run (default: both)", ) + parser.add_argument( + "--database", + choices=["clickhouse", "elasticsearch"], + required=True, + help="Baseline database to benchmark against", + ) parser.add_argument( "--asap-sql-file", default=None, @@ -351,16 +421,30 @@ def main(): default=None, help="SQL file for baseline mode (required if mode is baseline or both)", ) - parser.add_argument( + + # ClickHouse flags + ch_group = parser.add_argument_group("ClickHouse options (--database clickhouse)") + ch_group.add_argument( "--asap-url", - default=DEFAULT_ASAP_URL, - help=f"QueryEngineRust endpoint (default: {DEFAULT_ASAP_URL})", + default=None, + help=f"ASAP endpoint for ClickHouse mode (default: {DEFAULT_ASAP_CLICKHOUSE_URL})", ) - parser.add_argument( + ch_group.add_argument( "--clickhouse-url", default=DEFAULT_CLICKHOUSE_URL, help=f"ClickHouse HTTP URL (default: {DEFAULT_CLICKHOUSE_URL})", ) + + # Elasticsearch flags + es_group = parser.add_argument_group("Elasticsearch options (--database elasticsearch)") + 
es_group.add_argument("--elastic-host", default=DEFAULT_ELASTIC_HOST, + help="Elasticsearch host") + es_group.add_argument("--elastic-port", type=int, default=DEFAULT_ELASTIC_PORT, + help="Elasticsearch port") + es_group.add_argument("--elastic-api-key", default=None, + help="Elasticsearch API key") + + # Shared flags parser.add_argument( "--output-dir", default=DEFAULT_OUTPUT_DIR, @@ -406,6 +490,18 @@ def main(): if args.mode in ("baseline", "both") and not args.baseline_sql_file: parser.error("--baseline-sql-file is required when --mode is baseline or both") + # Resolve endpoints based on --database + use_elastic = args.database == "elasticsearch" + + baseline_url = ( + f"http://{args.elastic_host}:{args.elastic_port}/_sql?format=json" + if use_elastic + else args.clickhouse_url + ) + asap_url = ( + args.asap_url or (DEFAULT_ASAP_ELASTIC_URL if use_elastic else DEFAULT_ASAP_CLICKHOUSE_URL) + ) + output_dir = Path(args.output_dir) prefix = args.output_prefix query_filter = ( @@ -418,9 +514,11 @@ def main(): if args.mode in ("baseline", "both"): run_benchmark( sql_file=Path(args.baseline_sql_file), - endpoint_url=args.clickhouse_url, + endpoint_url=baseline_url, output_csv=baseline_csv, mode="baseline", + database=args.database, + api_key=args.elastic_api_key if use_elastic else None, query_filter=query_filter, timeout=args.timeout, repeat=args.repeat, @@ -431,9 +529,11 @@ def main(): if args.mode in ("asap", "both"): run_benchmark( sql_file=Path(args.asap_sql_file), - endpoint_url=args.asap_url, + endpoint_url=asap_url, output_csv=asap_csv, mode="asap", + database=args.database, + api_key=args.elastic_api_key if use_elastic else None, query_filter=query_filter, timeout=args.timeout, repeat=args.repeat, @@ -442,8 +542,7 @@ def main(): ) if args.mode == "both" and not args.no_plot: - comparison_png = output_dir / f"{prefix}_comparison.png" - _plot_comparison(asap_csv, baseline_csv, comparison_png) + _plot_comparison(asap_csv, baseline_csv, output_dir / 
f"{prefix}_comparison.png") if __name__ == "__main__": From f7110f815d8d9923acfcbd543343414b911ca0d3 Mon Sep 17 00:00:00 2001 From: Kavya Bhat Date: Sat, 11 Apr 2026 07:43:02 -0600 Subject: [PATCH 04/10] fix formatting --- .../benchmark/export_to_database.py | 68 +++++++++++-------- .../benchmark/generate_queries.py | 3 - .../benchmark/run_benchmark.py | 64 +++++++++++------ 3 files changed, 86 insertions(+), 49 deletions(-) diff --git a/asap-tools/execution-utilities/benchmark/export_to_database.py b/asap-tools/execution-utilities/benchmark/export_to_database.py index 79511a71..1e6359d1 100644 --- a/asap-tools/execution-utilities/benchmark/export_to_database.py +++ b/asap-tools/execution-utilities/benchmark/export_to_database.py @@ -37,6 +37,7 @@ """ import argparse +import gzip import os import sys from datetime import datetime, timezone @@ -195,6 +196,7 @@ def load_h2o_clickhouse( print(f"Loaded {total:,} rows into ClickHouse (h2o_groupby)") return True + def load_h2o_elasticsearch( es_host: str, es_port: int, @@ -229,27 +231,30 @@ def load_h2o_elasticsearch( es.indices.delete(index=index_name) print(f"Creating index: {index_name}") - es.indices.create(index=index_name, body={ - "settings": { - "number_of_shards": 1, - "number_of_replicas": 0, - "refresh_interval": "30s", - }, - "mappings": { - "properties": { - "timestamp": {"type": "date", "format": "epoch_millis"}, - "id1": {"type": "keyword"}, - "id2": {"type": "keyword"}, - "id3": {"type": "keyword"}, - "id4": {"type": "long"}, - "id5": {"type": "long"}, - "id6": {"type": "long"}, - "v1": {"type": "long"}, - "v2": {"type": "long"}, - "v3": {"type": "double"}, - } + es.indices.create( + index=index_name, + body={ + "settings": { + "number_of_shards": 1, + "number_of_replicas": 0, + "refresh_interval": "30s", + }, + "mappings": { + "properties": { + "timestamp": {"type": "date", "format": "epoch_millis"}, + "id1": {"type": "keyword"}, + "id2": {"type": "keyword"}, + "id3": {"type": "keyword"}, + "id4": 
{"type": "long"}, + "id5": {"type": "long"}, + "id6": {"type": "long"}, + "v1": {"type": "long"}, + "v2": {"type": "long"}, + "v3": {"type": "double"}, + } + }, }, - }) + ) if not os.path.exists(file_path): print(f"ERROR: Data file not found: {file_path}") @@ -278,9 +283,9 @@ def generate_docs(): "id4": int(parts[3] or 0), "id5": int(parts[4] or 0), "id6": int(parts[5] or 0), - "v1": int(parts[6] or 0), - "v2": int(parts[7] or 0), - "v3": float(parts[8] or 0.0), + "v1": int(parts[6] or 0), + "v2": int(parts[7] or 0), + "v3": float(parts[8] or 0.0), }, } @@ -302,6 +307,7 @@ def generate_docs(): print(f"✓ Import complete! Index: {index_name}") return True + def load_custom( clickhouse_url: str, file_path: str, @@ -441,12 +447,20 @@ def main(): ) # Elasticsearch-specific flags - es_group = parser.add_argument_group("Elasticsearch options (--database elasticsearch)") + es_group = parser.add_argument_group( + "Elasticsearch options (--database elasticsearch)" + ) es_group.add_argument("--es-host", default="localhost", help="Elasticsearch host") - es_group.add_argument("--es-port", type=int, default=9200, help="Elasticsearch port") - es_group.add_argument("--es-index", default="h2o_benchmark", help="Elasticsearch index name") + es_group.add_argument( + "--es-port", type=int, default=9200, help="Elasticsearch port" + ) + es_group.add_argument( + "--es-index", default="h2o_benchmark", help="Elasticsearch index name" + ) es_group.add_argument("--es-api-key", default=None, help="Elasticsearch API key") - es_group.add_argument("--es-bulk-size", type=int, default=5000, help="Bulk insert batch size") + es_group.add_argument( + "--es-bulk-size", type=int, default=5000, help="Bulk insert batch size" + ) args = parser.parse_args() diff --git a/asap-tools/execution-utilities/benchmark/generate_queries.py b/asap-tools/execution-utilities/benchmark/generate_queries.py index 44590e67..003dc52d 100644 --- a/asap-tools/execution-utilities/benchmark/generate_queries.py +++ 
b/asap-tools/execution-utilities/benchmark/generate_queries.py @@ -242,7 +242,6 @@ def generate_sql_file( f"WHERE {where_clause} GROUP BY {group_by_clause};" ) - asap_lines.append(asap_sql) ch_lines.append(ch_sql) es_lines.append( f"-- {label}: {desc}\n" @@ -253,8 +252,6 @@ def generate_sql_file( ch_file = f"{output_prefix}_clickhouse.sql" es_file = f"{output_prefix}_elasticsearch.sql" - Path(asap_file).parent.mkdir(parents=True, exist_ok=True) - with open(ch_file, "w") as f: f.write("\n".join(ch_lines) + "\n") diff --git a/asap-tools/execution-utilities/benchmark/run_benchmark.py b/asap-tools/execution-utilities/benchmark/run_benchmark.py index 50ce4e84..696dee0a 100644 --- a/asap-tools/execution-utilities/benchmark/run_benchmark.py +++ b/asap-tools/execution-utilities/benchmark/run_benchmark.py @@ -108,7 +108,9 @@ def run_query( if api_key: headers["Authorization"] = f"ApiKey {api_key}" body = {"query": query.strip().rstrip(";"), "fetch_size": fetch_size} - response = session.post(endpoint_url, headers=headers, json=body, timeout=timeout) + response = session.post( + endpoint_url, headers=headers, json=body, timeout=timeout + ) else: encoded_query = urllib.parse.quote(query) separator = "&" if "?" in endpoint_url else "?" 
@@ -132,7 +134,10 @@ def run_query( if hits: col_names = list(hits[0].get("_source", {}).keys()) formatted_rows = [ - ", ".join(f"{k}={hit.get('_source', {}).get(k)}" for k in col_names) + ", ".join( + f"{k}={hit.get('_source', {}).get(k)}" + for k in col_names + ) for hit in hits ] result_text = "\n".join(formatted_rows) @@ -144,12 +149,18 @@ def run_query( elif "rows" in data: rows = data.get("rows", []) columns = data.get("columns", []) - col_names = [c.get("name", f"col{i}") for i, c in enumerate(columns)] + col_names = [ + c.get("name", f"col{i}") for i, c in enumerate(columns) + ] formatted_rows = [ - ", ".join( - f"{col_names[i]}={v}" if i < len(col_names) else str(v) - for i, v in enumerate(row) - ) if isinstance(row, (list, tuple)) else str(row) + ( + ", ".join( + f"{col_names[i]}={v}" if i < len(col_names) else str(v) + for i, v in enumerate(row) + ) + if isinstance(row, (list, tuple)) + else str(row) + ) for row in rows ] result_text = "\n".join(formatted_rows) @@ -267,8 +278,13 @@ def run_benchmark( last_result, last_error, last_row_count = None, None, 0 for _ in range(repeat): lat, result, row_count, error = run_query( - sql, endpoint_url, session, timeout, debug, - database=database, api_key=api_key, + sql, + endpoint_url, + session, + timeout, + debug, + database=database, + api_key=api_key, ) trial_latencies.append(lat) last_result, last_error, last_row_count = result, error, row_count @@ -436,13 +452,21 @@ def main(): ) # Elasticsearch flags - es_group = parser.add_argument_group("Elasticsearch options (--database elasticsearch)") - es_group.add_argument("--elastic-host", default=DEFAULT_ELASTIC_HOST, - help="Elasticsearch host") - es_group.add_argument("--elastic-port", type=int, default=DEFAULT_ELASTIC_PORT, - help="Elasticsearch port") - es_group.add_argument("--elastic-api-key", default=None, - help="Elasticsearch API key") + es_group = parser.add_argument_group( + "Elasticsearch options (--database elasticsearch)" + ) + es_group.add_argument( 
+ "--elastic-host", default=DEFAULT_ELASTIC_HOST, help="Elasticsearch host" + ) + es_group.add_argument( + "--elastic-port", + type=int, + default=DEFAULT_ELASTIC_PORT, + help="Elasticsearch port", + ) + es_group.add_argument( + "--elastic-api-key", default=None, help="Elasticsearch API key" + ) # Shared flags parser.add_argument( @@ -498,8 +522,8 @@ def main(): if use_elastic else args.clickhouse_url ) - asap_url = ( - args.asap_url or (DEFAULT_ASAP_ELASTIC_URL if use_elastic else DEFAULT_ASAP_CLICKHOUSE_URL) + asap_url = args.asap_url or ( + DEFAULT_ASAP_ELASTIC_URL if use_elastic else DEFAULT_ASAP_CLICKHOUSE_URL ) output_dir = Path(args.output_dir) @@ -542,7 +566,9 @@ def main(): ) if args.mode == "both" and not args.no_plot: - _plot_comparison(asap_csv, baseline_csv, output_dir / f"{prefix}_comparison.png") + _plot_comparison( + asap_csv, baseline_csv, output_dir / f"{prefix}_comparison.png" + ) if __name__ == "__main__": From 9f32c3a6a56e3e014bc5f537dd67a58c6b68e370 Mon Sep 17 00:00:00 2001 From: Kavya Bhat Date: Sat, 11 Apr 2026 07:49:19 -0600 Subject: [PATCH 05/10] rust format fix --- asap-query-engine/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asap-query-engine/src/main.rs b/asap-query-engine/src/main.rs index 842fe284..fa589aa0 100644 --- a/asap-query-engine/src/main.rs +++ b/asap-query-engine/src/main.rs @@ -541,4 +541,4 @@ fn setup_logging( info!("Logging initialized (respects RUST_LOG environment variable)"); info!("Logs will be written to: {}/query_engine.log", output_dir); Ok(guard) -} \ No newline at end of file +} From 80909ddb3347ade3b445f8ad396c954a92b59ca3 Mon Sep 17 00:00:00 2001 From: benjamib112 Date: Wed, 15 Apr 2026 05:33:34 -0400 Subject: [PATCH 06/10] added automatic timestamp detection, updated query generation script to generate both query files in one run, and added automatic streaming/inference config generation --- .../execution-utilities/benchmark/README.md | 193 +++++------ 
.../benchmark/generate_queries.py | 314 +++++++++++++----- 2 files changed, 329 insertions(+), 178 deletions(-) diff --git a/asap-tools/execution-utilities/benchmark/README.md b/asap-tools/execution-utilities/benchmark/README.md index c45b171c..a63beded 100644 --- a/asap-tools/execution-utilities/benchmark/README.md +++ b/asap-tools/execution-utilities/benchmark/README.md @@ -19,6 +19,10 @@ data_file → export_to_database.py run_benchmark.py → results/ ClickHouse :8123 (baseline) ``` +**Key difference from the old pipeline:** Arroyo reads directly from a local +file (`single_file_custom` connector) rather than from a Kafka input topic. +Kafka is still required for the **sketch output** topic (`sketch_topic`). + --- ## Prerequisites @@ -27,8 +31,8 @@ data_file → export_to_database.py run_benchmark.py → results/ export INSTALL_DIR=/scratch/sketch_db_for_prometheus pip3 install --user -r requirements.txt -# Build binaries (one-time) — workspace target is at ~/ASAPQuery/target/release/ -cd ~/ASAPQuery && cargo build --release +# Build binaries (one-time) +cd ~/ASAPQuery/asap-query-engine && cargo build --release ``` --- @@ -56,7 +60,6 @@ The Arroyo file source requires RFC3339 timestamps and string metadata columns. This step converts the raw ClickBench JSON: ```bash -cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python prepare_data.py \ --dataset clickbench \ --input ./data/hits.json.gz \ @@ -71,19 +74,17 @@ This produces `hits_arroyo.json` with: ### Step 3 — Start infrastructure -Skip any service that is already running. 
- ```bash -# Kafka — skip if `kafka-topics.sh --list` succeeds +# Kafka ~/ASAPQuery/asap-tools/installation/kafka/run.sh $INSTALL_DIR/kafka -# Create sketch output topic — skip if sketch_topic already exists +# Create sketch output topic KAFKA=$INSTALL_DIR/kafka/bin $KAFKA/kafka-topics.sh --bootstrap-server localhost:9092 --create \ --topic sketch_topic --partitions 1 --replication-factor 1 \ --config max.message.bytes=20971520 -# ClickHouse — skip if port 8123 is already listening +# ClickHouse ~/ASAPQuery/asap-tools/installation/clickhouse/run.sh $INSTALL_DIR ``` @@ -95,12 +96,36 @@ $KAFKA/kafka-topics.sh --bootstrap-server localhost:9092 --create \ > /tmp/arroyo.log 2>&1 & ``` -### Step 5 — Launch Arroyo sketch pipeline (file source) +### Step 5 — Generate queries and configs + +```bash +python generate_queries.py \ + --table-name hits \ + --ts-column EventTime \ + --value-column ResolutionWidth \ + --group-by-columns RegionID,OS,UserAgent,TraficSourceID \ + --window-size 10 \ + --num-queries 50 \ + --window-form dateadd \ + --generate-configs \ + --auto-detect-timestamps \ + --data-file ./data/hits_arroyo.json \ + --data-file-format json \ + --output-prefix ./queries/clickbench +``` + +This writes: +- `queries/clickbench_asap.sql` — ASAP queries (ISO timestamps) +- `queries/clickbench_clickhouse.sql` — ClickHouse queries (datetime timestamps) +- `queries/clickbench_streaming.yaml` — Arroyo streaming config +- `queries/clickbench_inference.yaml` — QueryEngineRust inference config + +### Step 6 — Launch Arroyo sketch pipeline (file source) ```bash -cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python export_to_arroyo.py \ - --streaming-config ./configs/clickbench_streaming.yaml \ + --streaming-config ./queries/clickbench_streaming.yaml \ + --source-type file \ --input-file ./data/hits_arroyo.json \ --file-format json \ --ts-format rfc3339 \ @@ -109,21 +134,21 @@ python export_to_arroyo.py \ --output-dir ./arroyo_outputs ``` -### Step 6 — Start 
QueryEngineRust +### Step 7 — Start QueryEngineRust ```bash -cd ~/ASAPQuery +cd ~/ASAPQuery/asap-query-engine nohup ./target/release/query_engine_rust \ --kafka-topic sketch_topic --input-format json \ - --config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/clickbench_inference.yaml \ - --streaming-config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/clickbench_streaming.yaml \ + --config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/queries/clickbench_inference.yaml \ + --streaming-config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/queries/clickbench_streaming.yaml \ --http-port 8088 --delete-existing-db --log-level DEBUG \ - --output-dir ./asap-query-engine/output --streaming-engine arroyo \ + --output-dir ./output --streaming-engine arroyo \ --query-language SQL --lock-strategy per-key \ --prometheus-scrape-interval 1 > /tmp/query_engine.log 2>&1 & ``` -### Step 7 — Load data into ClickHouse (baseline) +### Step 8 — Load data into ClickHouse (baseline) ```bash cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark @@ -136,35 +161,14 @@ python export_to_database.py \ Verify: `$INSTALL_DIR/clickhouse client --query "SELECT count(*) FROM hits"` -### Step 8 — Generate SQL query files - -```bash -cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark -python generate_queries.py \ - --table-name hits \ - --ts-column EventTime \ - --value-column ResolutionWidth \ - --group-by-columns RegionID,OS,UserAgent,TraficSourceID \ - --window-size 10 \ - --num-queries 50 \ - --ts-format datetime \ - --window-form dateadd \ - --auto-detect-timestamps \ - --data-file ./data/hits_arroyo.json \ - --data-file-format json \ - --output-prefix ./queries/clickbench -``` - -This writes `queries/clickbench.sql`. 
- ### Step 9 — Run benchmark ```bash -cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python run_benchmark.py \ --mode both \ - --asap-sql-file ./queries/clickbench.sql \ - --baseline-sql-file ./queries/clickbench.sql \ + --asap-sql-file ./queries/clickbench_asap.sql \ + --baseline-sql-file ./queries/clickbench_clickhouse.sql \ + --asap-url "http://localhost:8088/api/v1/query" \ --output-dir ./results \ --output-prefix clickbench ``` @@ -179,14 +183,12 @@ Results: `results/clickbench_asap.csv`, `results/clickbench_baseline.csv`, ### Step 1 — Download dataset ```bash -cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python download_dataset.py --dataset h2o --output-dir ./data ``` ### Step 2 — Prepare data for Arroyo file source ```bash -cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python prepare_data.py \ --dataset h2o \ --input ./data/G1_1e7_1e2_0_0.csv \ @@ -196,12 +198,29 @@ python prepare_data.py \ ### Steps 3–4 — Start infrastructure and Arroyo (same as ClickBench) -### Step 5 — Launch Arroyo sketch pipeline +### Step 5 — Generate queries and configs + +```bash +python generate_queries.py \ + --table-name h2o_groupby \ + --ts-column timestamp \ + --value-column v1 \ + --group-by-columns id1,id2 \ + --window-size 10 \ + --num-queries 50 \ + --generate-configs \ + --auto-detect-timestamps \ + --data-file ./data/h2o_arroyo.json \ + --data-file-format json \ + --output-prefix ./queries/h2o +``` + +### Step 6 — Launch Arroyo sketch pipeline ```bash -cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python export_to_arroyo.py \ - --streaming-config ./configs/h2o_streaming.yaml \ + --streaming-config ./queries/h2o_streaming.yaml \ + --source-type file \ --input-file ./data/h2o_arroyo.json \ --file-format json \ --ts-format rfc3339 \ @@ -210,24 +229,23 @@ python export_to_arroyo.py \ --output-dir ./arroyo_outputs ``` -### Step 6 — Start QueryEngineRust +### Step 7 — Start QueryEngineRust ```bash -cd ~/ASAPQuery +cd 
~/ASAPQuery/asap-query-engine nohup ./target/release/query_engine_rust \ --kafka-topic sketch_topic --input-format json \ - --config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/h2o_inference.yaml \ - --streaming-config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/h2o_streaming.yaml \ + --config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/queries/h2o_inference.yaml \ + --streaming-config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/queries/h2o_streaming.yaml \ --http-port 8088 --delete-existing-db --log-level DEBUG \ - --output-dir ./asap-query-engine/output --streaming-engine arroyo \ + --output-dir ./output --streaming-engine arroyo \ --query-language SQL --lock-strategy per-key \ --prometheus-scrape-interval 1 > /tmp/query_engine.log 2>&1 & ``` -### Step 7 — Load data into ClickHouse (baseline) +### Step 8 — Load data into ClickHouse (baseline) ```bash -cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python export_to_database.py \ --dataset h2o \ --file-path ./data/G1_1e7_1e2_0_0.csv \ @@ -235,32 +253,14 @@ python export_to_database.py \ --max-rows 1000000 ``` -### Step 8 — Generate SQL query files - -```bash -cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark -python generate_queries.py \ - --table-name h2o_groupby \ - --ts-column timestamp \ - --value-column v1 \ - --group-by-columns id1,id2 \ - --window-size 10 \ - --num-queries 50 \ - --ts-format iso \ - --auto-detect-timestamps \ - --data-file ./data/h2o_arroyo.json \ - --data-file-format json \ - --output-prefix ./queries/h2o -``` - ### Step 9 — Run benchmark ```bash -cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark python run_benchmark.py \ --mode both \ - --asap-sql-file ./queries/h2o.sql \ - --baseline-sql-file ./queries/h2o.sql \ + --asap-sql-file ./queries/h2o_asap.sql \ + --baseline-sql-file ./queries/h2o_clickhouse.sql \ + --asap-url "http://localhost:8088/api/v1/query" \ --output-dir ./results \ --output-prefix h2o ``` @@ -270,8 
+270,6 @@ python run_benchmark.py \ ## Custom Dataset ```bash -cd ~/ASAPQuery/asap-tools/execution-utilities/benchmark - # 1. Download (any HTTP URL) python download_dataset.py --dataset custom \ --custom-url https://example.com/mydata.json.gz \ @@ -279,39 +277,42 @@ python download_dataset.py --dataset custom \ # 2. Prepare (edit prepare_data.py for your schema, or skip if already RFC3339) -# 3. Export to Arroyo +# 3. Generate queries and configs +python generate_queries.py \ + --table-name my_table \ + --ts-column event_time \ + --value-column metric_value \ + --group-by-columns region,host \ + --window-size 10 \ + --num-queries 50 \ + --generate-configs \ + --auto-detect-timestamps \ + --data-file ./data/mydata.json \ + --output-prefix ./queries/my_dataset + +# 4. Export to Arroyo python export_to_arroyo.py \ - --streaming-config ./configs/my_streaming.yaml \ + --streaming-config ./queries/my_dataset_streaming.yaml \ + --source-type file \ --input-file ./data/mydata.json \ --file-format json \ --ts-format rfc3339 \ --pipeline-name my_pipeline \ --arroyosketch-dir ~/ASAPQuery/asap-summary-ingest -# 4. Export to ClickHouse +# 5. Export to ClickHouse python export_to_database.py \ --dataset custom \ --file-path ./data/mydata.json \ --init-sql-file ./configs/my_init.sql \ --table-name my_table -# 5. Generate queries -python generate_queries.py \ - --table-name my_table \ - --ts-column event_time \ - --value-column metric_value \ - --group-by-columns region,host \ - --window-size 10 \ - --num-queries 50 \ - --auto-detect-timestamps \ - --data-file ./data/mydata.json \ - --output-prefix ./queries/my_dataset - # 6. 
Run benchmark python run_benchmark.py \ --mode both \ - --asap-sql-file ./queries/my_dataset.sql \ - --baseline-sql-file ./queries/my_dataset.sql \ + --asap-sql-file ./queries/my_dataset_asap.sql \ + --baseline-sql-file ./queries/my_dataset_clickhouse.sql \ + --asap-url "http://localhost:8088/api/v1/query" \ --output-dir ./results ``` @@ -344,8 +345,8 @@ $INSTALL_DIR/clickhouse client --query "TRUNCATE TABLE hits" |------|---------| | `download_dataset.py` | Download ClickBench, H2O, or custom datasets | | `prepare_data.py` | Convert raw data to Arroyo file source format (RFC3339, string columns) | -| `export_to_arroyo.py` | Launch Arroyo sketch pipeline from a local file source | +| `export_to_arroyo.py` | Launch Arroyo sketch pipeline (file or kafka source) | | `export_to_database.py` | Load data into ClickHouse for baseline | -| `generate_queries.py` | Generate a single SQL query file (database-style, compatible with both ASAP and ClickHouse) | +| `generate_queries.py` | Generate paired ASAP + ClickHouse SQL query files and streaming/inference YAML configs | | `run_benchmark.py` | Run queries and produce CSV results + plots | -| `configs/` | Dataset-specific streaming/inference YAML and ClickHouse init SQL | +| `configs/` | ClickHouse init SQL (CREATE TABLE statements) | diff --git a/asap-tools/execution-utilities/benchmark/generate_queries.py b/asap-tools/execution-utilities/benchmark/generate_queries.py index 003dc52d..0754a843 100644 --- a/asap-tools/execution-utilities/benchmark/generate_queries.py +++ b/asap-tools/execution-utilities/benchmark/generate_queries.py @@ -1,17 +1,35 @@ #!/usr/bin/env python3 """ -Generate a SQL query file for benchmarking ASAP and ClickHouse. +Generate paired ASAP and ClickHouse SQL query files for benchmarking, +and optionally generate streaming/inference YAML configs. -Each query uses database-style quantile(q)(col) syntax, compatible with both -QueryEngineRust and ClickHouse. 
Queries target fixed time windows and match the +Each query targets a fixed time window (window-end timestamp) and matches the annotation format `-- T{NNN}: description` expected by run_benchmark.py. -Output: - {prefix}.sql quantile(q)(col) database-style syntax, compatible with both - QueryEngineRust and ClickHouse baseline +Output (always): + {prefix}_asap.sql QUANTILE(q, col) syntax for QueryEngineRust + {prefix}_clickhouse.sql quantile(q)(col) syntax for ClickHouse baseline + +Output (with --generate-configs): + {prefix}_streaming.yaml Arroyo streaming config + {prefix}_inference.yaml QueryEngineRust inference config Usage: - # Auto-detect timestamps from data file → ./queries/clickbench.sql + # Generate queries + configs in one shot + python generate_queries.py \\ + --table-name h2o_groupby \\ + --ts-column timestamp \\ + --value-column v1 \\ + --group-by-columns id1,id2 \\ + --window-size 30 \\ + --num-queries 50 \\ + --generate-configs \\ + --auto-detect-timestamps \\ + --data-file ./data/h2o_arroyo_full.json \\ + --data-file-format json \\ + --output-prefix ./queries/h2o_30s + + # Queries only (no configs) python generate_queries.py \\ --table-name hits \\ --ts-column EventTime \\ @@ -24,7 +42,7 @@ --data-file-format json.gz \\ --output-prefix ./queries/clickbench - # Explicit timestamp file (one ISO timestamp per line) → ./queries/h2o.sql + # Override timestamp format for both outputs python generate_queries.py \\ --table-name h2o_groupby \\ --ts-column timestamp \\ @@ -32,6 +50,7 @@ --group-by-columns id1,id2 \\ --window-size 10 \\ --num-queries 50 \\ + --ts-format iso \\ --timestamps-file ./my_timestamps.txt \\ --output-prefix ./queries/h2o """ @@ -45,8 +64,6 @@ from typing import List, Optional -SAMPLE_SIZE = 10_000 # rows to read for timestamp auto-detection - def _parse_timestamp(value: str) -> Optional[datetime]: """Try to parse a timestamp string in common formats.""" @@ -73,17 +90,16 @@ def _parse_timestamp(value: str) -> Optional[datetime]: return 
None -def _read_timestamps_from_json( +def _scan_ts_range_json( file_path: str, ts_column: str, compressed: bool -) -> List[datetime]: - """Read up to SAMPLE_SIZE timestamps from a JSON-lines file.""" - timestamps = [] +) -> tuple: + """Scan a JSON-lines file and return (min_ts, max_ts, count).""" + min_ts = max_ts = None + count = 0 opener = gzip.open if compressed else open mode = "rt" if compressed else "r" with opener(file_path, mode) as f: - for i, line in enumerate(f): - if i >= SAMPLE_SIZE: - break + for line in f: line = line.strip() if not line: continue @@ -93,17 +109,23 @@ def _read_timestamps_from_json( if val is not None: ts = _parse_timestamp(val) if ts: - timestamps.append(ts) + count += 1 + if min_ts is None or ts < min_ts: + min_ts = ts + if max_ts is None or ts > max_ts: + max_ts = ts except (json.JSONDecodeError, KeyError): continue - return timestamps + return min_ts, max_ts, count -def _read_timestamps_from_csv(file_path: str, ts_column: str) -> List[datetime]: - """Read up to SAMPLE_SIZE timestamps from a CSV file.""" +def _scan_ts_range_csv( + file_path: str, ts_column: str +) -> tuple: + """Scan a CSV file and return (min_ts, max_ts, count).""" import csv - - timestamps = [] + min_ts = max_ts = None + count = 0 with open(file_path, "r", newline="") as f: reader = csv.DictReader(f) if ts_column not in (reader.fieldnames or []): @@ -111,37 +133,40 @@ def _read_timestamps_from_csv(file_path: str, ts_column: str) -> List[datetime]: f"WARNING: Column '{ts_column}' not found in CSV. 
" f"Available: {reader.fieldnames}" ) - return [] - for i, row in enumerate(reader): - if i >= SAMPLE_SIZE: - break + return None, None, 0 + for row in reader: ts = _parse_timestamp(row[ts_column]) if ts: - timestamps.append(ts) - return timestamps - - -def detect_timestamps(data_file: str, data_file_format: str, ts_column: str) -> tuple: - """Return (min_ts, max_ts) from a sample of the data file.""" + count += 1 + if min_ts is None or ts < min_ts: + min_ts = ts + if max_ts is None or ts > max_ts: + max_ts = ts + return min_ts, max_ts, count + + +def detect_timestamps( + data_file: str, data_file_format: str, ts_column: str +) -> tuple: + """Return (min_ts, max_ts) by scanning the entire data file.""" fmt = data_file_format.lower() if fmt in ("json.gz", "jsonl.gz"): - timestamps = _read_timestamps_from_json(data_file, ts_column, compressed=True) + min_ts, max_ts, count = _scan_ts_range_json(data_file, ts_column, compressed=True) elif fmt in ("json", "jsonl"): - timestamps = _read_timestamps_from_json(data_file, ts_column, compressed=False) + min_ts, max_ts, count = _scan_ts_range_json(data_file, ts_column, compressed=False) elif fmt == "csv": - timestamps = _read_timestamps_from_csv(data_file, ts_column) + min_ts, max_ts, count = _scan_ts_range_csv(data_file, ts_column) else: print(f"ERROR: Unsupported data file format: {data_file_format}") sys.exit(1) - if not timestamps: + if min_ts is None: print( - f"ERROR: No '{ts_column}' timestamps found in the first {SAMPLE_SIZE} " - f"rows of {data_file}" + f"ERROR: No '{ts_column}' timestamps found in {data_file}" ) sys.exit(1) - return min(timestamps), max(timestamps) + return min_ts, max_ts def _snap_to_window_boundary(ts: datetime, window_size: int) -> datetime: @@ -197,7 +222,7 @@ def format_ts(ts: datetime, ts_format: str) -> str: return ts.strftime("%Y-%m-%d %H:%M:%S") -def generate_sql_file( +def generate_sql_files( table_name: str, ts_column: str, value_column: str, @@ -205,62 +230,144 @@ def generate_sql_file( 
quantile: float, window_size: int, window_ends: List[datetime], - ts_format: str, + ts_format_asap: str, + ts_format_db: str, window_form: str, output_prefix: str, ): - """Write a single SQL file compatible with both ASAP and ClickHouse.""" + """Write the paired ASAP and ClickHouse SQL files.""" group_by_clause = ", ".join(group_by_columns) - percentile = quantile * 100 - # Strip trailing zero: 95.0 -> 95, 99.5 -> 99.5 - percentile_str = f"{percentile:.1f}".rstrip("0").rstrip(".") - + asap_lines = [] ch_lines = [] - es_lines = [] for i, end_ts in enumerate(window_ends): - end_str = format_ts(end_ts, ts_format) - start_ts = end_ts - timedelta(seconds=window_size) - start_str = format_ts(start_ts, ts_format) + asap_end = format_ts(end_ts, ts_format_asap) + asap_start = format_ts(end_ts - timedelta(seconds=window_size), ts_format_asap) + db_end = format_ts(end_ts, ts_format_db) + db_start = format_ts(end_ts - timedelta(seconds=window_size), ts_format_db) label = f"T{i:03d}" - desc = f"quantile window ending at {end_str}" + desc_asap = f"quantile window ending at {asap_end}" + desc_db = f"quantile window ending at {db_end}" if window_form == "dateadd": - where_clause = f"{ts_column} BETWEEN DATEADD(s, -{window_size}, '{end_str}') AND '{end_str}'" + asap_where = ( + f"{ts_column} BETWEEN DATEADD(s, -{window_size}, '{asap_end}') AND '{asap_end}'" + ) + db_where = ( + f"{ts_column} BETWEEN DATEADD(s, -{window_size}, '{db_end}') AND '{db_end}'" + ) else: - where_clause = f"{ts_column} BETWEEN '{start_str}' AND '{end_str}'" + asap_where = ( + f"{ts_column} BETWEEN '{asap_start}' AND '{asap_end}'" + ) + db_where = ( + f"{ts_column} BETWEEN '{db_start}' AND '{db_end}'" + ) - # Elasticsearch uses DATEADD + CAST form - es_where = ( - f"{ts_column} BETWEEN DATEADD('s', -{window_size}, CAST('{end_str}' AS DATETIME)) " - f"AND CAST('{end_str}' AS DATETIME)" + asap_sql = ( + f"-- {label}: {desc_asap}\n" + f"SELECT QUANTILE({quantile}, {value_column}) FROM {table_name} " + f"WHERE 
{asap_where} GROUP BY {group_by_clause};" ) - ch_sql = ( - f"-- {label}: {desc}\n" + f"-- {label}: {desc_db}\n" f"SELECT quantile({quantile})({value_column}) FROM {table_name} " - f"WHERE {where_clause} GROUP BY {group_by_clause};" + f"WHERE {db_where} GROUP BY {group_by_clause};" ) + asap_lines.append(asap_sql) ch_lines.append(ch_sql) - es_lines.append( - f"-- {label}: {desc}\n" - f"SELECT PERCENTILE({value_column}, {percentile_str}) FROM {table_name} " - f"WHERE {es_where} GROUP BY {group_by_clause};" - ) + asap_file = f"{output_prefix}_asap.sql" ch_file = f"{output_prefix}_clickhouse.sql" - es_file = f"{output_prefix}_elasticsearch.sql" + + Path(asap_file).parent.mkdir(parents=True, exist_ok=True) + + with open(asap_file, "w") as f: + f.write("\n".join(asap_lines) + "\n") with open(ch_file, "w") as f: f.write("\n".join(ch_lines) + "\n") - with open(es_file, "w") as f: - f.write("\n".join(es_lines) + "\n") - print(f"Generated {len(window_ends)} queries:") - print(f" ClickHouse: {ch_file}") - print(f" Elasticsearch: {es_file}") + print(f" ASAP: {asap_file}") + print(f" ClickHouse: {ch_file}") + + +def generate_config_files( + table_name: str, + ts_column: str, + value_column: str, + group_by_columns: List[str], + quantile: float, + window_size: int, + aggregation_id: int, + aggregation_k: int, + output_prefix: str, +): + """Write paired streaming and inference YAML config files.""" + meta_yaml = "[" + ", ".join(group_by_columns) + "]" + group_by_clause = ", ".join(group_by_columns) + + streaming_content = f"""\ +tables: + - name: {table_name} + time_column: {ts_column} + metadata_columns: {meta_yaml} + value_columns: [{value_column}] + +aggregations: + - aggregationId: {aggregation_id} + aggregationType: DatasketchesKLL + aggregationSubType: '' + labels: + grouping: {meta_yaml} + rollup: [] + aggregated: [] + table_name: {table_name} + value_column: {value_column} + parameters: + K: {aggregation_k} + tumblingWindowSize: {window_size} + windowSize: {window_size} + 
windowType: tumbling + spatialFilter: '' +""" + + inference_content = f"""\ +tables: + - name: {table_name} + time_column: {ts_column} + metadata_columns: {meta_yaml} + value_columns: [{value_column}] + +cleanup_policy: + name: read_based + +queries: + - aggregations: + - aggregation_id: {aggregation_id} + read_count_threshold: 999999 + query: |- + SELECT QUANTILE({quantile}, {value_column}) FROM {table_name} + WHERE {ts_column} BETWEEN DATEADD(s, -{window_size}, NOW()) AND NOW() + GROUP BY {group_by_clause}; +""" + + streaming_file = f"{output_prefix}_streaming.yaml" + inference_file = f"{output_prefix}_inference.yaml" + + Path(streaming_file).parent.mkdir(parents=True, exist_ok=True) + + with open(streaming_file, "w") as f: + f.write(streaming_content) + + with open(inference_file, "w") as f: + f.write(inference_content) + + print(f"Generated configs:") + print(f" Streaming: {streaming_file}") + print(f" Inference: {inference_file}") def main(): @@ -272,9 +379,7 @@ def main(): # Table/column config parser.add_argument("--table-name", required=True) parser.add_argument("--ts-column", required=True, help="Timestamp column name") - parser.add_argument( - "--value-column", required=True, help="Column to compute quantile on" - ) + parser.add_argument("--value-column", required=True, help="Column to compute quantile on") parser.add_argument( "--group-by-columns", required=True, @@ -282,15 +387,25 @@ def main(): ) # Query parameters parser.add_argument("--quantile", type=float, default=0.95) + parser.add_argument("--window-size", type=int, default=10, help="Window size in seconds") + parser.add_argument("--num-queries", type=int, default=50) parser.add_argument( - "--window-size", type=int, default=10, help="Window size in seconds" + "--ts-format-asap", + choices=["iso", "datetime"], + default="iso", + help="Timestamp format for ASAP SQL: iso='YYYY-MM-DDTHH:MM:SSZ', datetime='YYYY-MM-DD HH:MM:SS' (default: iso)", + ) + parser.add_argument( + "--ts-format-db", + 
choices=["iso", "datetime"], + default="datetime", + help="Timestamp format for ClickHouse SQL: iso='YYYY-MM-DDTHH:MM:SSZ', datetime='YYYY-MM-DD HH:MM:SS' (default: datetime)", ) - parser.add_argument("--num-queries", type=int, default=50) parser.add_argument( "--ts-format", choices=["iso", "datetime"], - default="iso", - help="Timestamp format in SQL: iso='YYYY-MM-DDTHH:MM:SSZ', datetime='YYYY-MM-DD HH:MM:SS' (default: iso)", + default=None, + help="Set both --ts-format-asap and --ts-format-db to the same value (overrides individual flags)", ) parser.add_argument( "--window-form", @@ -301,7 +416,7 @@ def main(): parser.add_argument( "--output-prefix", required=True, - help="Output file prefix (e.g. ./queries/clickbench → clickbench.sql)", + help="Output file prefix (e.g. ./queries/clickbench → clickbench_asap.sql + clickbench_clickhouse.sql)", ) # Timestamp sources (mutually exclusive) ts_group = parser.add_mutually_exclusive_group(required=True) @@ -333,6 +448,24 @@ def main(): default=None, help="Spacing between window-end timestamps (default: window-size * 3)", ) + # Config generation + parser.add_argument( + "--generate-configs", + action="store_true", + help="Also generate streaming and inference YAML config files", + ) + parser.add_argument( + "--aggregation-id", + type=int, + default=12, + help="Aggregation ID for config files (default: 12)", + ) + parser.add_argument( + "--aggregation-k", + type=int, + default=200, + help="KLL sketch K parameter (default: 200)", + ) args = parser.parse_args() @@ -377,7 +510,10 @@ def main(): f"(stride={stride}s, window={args.window_size}s)" ) - generate_sql_file( + ts_format_asap = args.ts_format if args.ts_format else args.ts_format_asap + ts_format_db = args.ts_format if args.ts_format else args.ts_format_db + + generate_sql_files( table_name=args.table_name, ts_column=args.ts_column, value_column=args.value_column, @@ -385,11 +521,25 @@ def main(): quantile=args.quantile, window_size=args.window_size, 
window_ends=window_ends, - ts_format=args.ts_format, + ts_format_asap=ts_format_asap, + ts_format_db=ts_format_db, window_form=args.window_form, output_prefix=args.output_prefix, ) + if args.generate_configs: + generate_config_files( + table_name=args.table_name, + ts_column=args.ts_column, + value_column=args.value_column, + group_by_columns=group_by_columns, + quantile=args.quantile, + window_size=args.window_size, + aggregation_id=args.aggregation_id, + aggregation_k=args.aggregation_k, + output_prefix=args.output_prefix, + ) + if __name__ == "__main__": main() From 7e1983b2282a52eadbb558b938988cf9e48c5fac Mon Sep 17 00:00:00 2001 From: benjamib112 Date: Sat, 18 Apr 2026 21:06:36 -0400 Subject: [PATCH 07/10] formatting --- .../benchmark/download_dataset.py | 1 - .../benchmark/generate_queries.py | 50 ++++++++----------- 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/asap-tools/execution-utilities/benchmark/download_dataset.py b/asap-tools/execution-utilities/benchmark/download_dataset.py index 750b5502..26ee54d5 100644 --- a/asap-tools/execution-utilities/benchmark/download_dataset.py +++ b/asap-tools/execution-utilities/benchmark/download_dataset.py @@ -16,7 +16,6 @@ import sys import urllib.request - CLICKBENCH_URL = "https://datasets.clickhouse.com/hits_compatible/hits.json.gz" CLICKBENCH_FILENAME = "hits.json.gz" diff --git a/asap-tools/execution-utilities/benchmark/generate_queries.py b/asap-tools/execution-utilities/benchmark/generate_queries.py index 0754a843..462c1bd7 100644 --- a/asap-tools/execution-utilities/benchmark/generate_queries.py +++ b/asap-tools/execution-utilities/benchmark/generate_queries.py @@ -64,7 +64,6 @@ from typing import List, Optional - def _parse_timestamp(value: str) -> Optional[datetime]: """Try to parse a timestamp string in common formats.""" value = str(value).strip() @@ -90,9 +89,7 @@ def _parse_timestamp(value: str) -> Optional[datetime]: return None -def _scan_ts_range_json( - file_path: str, ts_column: 
str, compressed: bool -) -> tuple: +def _scan_ts_range_json(file_path: str, ts_column: str, compressed: bool) -> tuple: """Scan a JSON-lines file and return (min_ts, max_ts, count).""" min_ts = max_ts = None count = 0 @@ -119,11 +116,10 @@ def _scan_ts_range_json( return min_ts, max_ts, count -def _scan_ts_range_csv( - file_path: str, ts_column: str -) -> tuple: +def _scan_ts_range_csv(file_path: str, ts_column: str) -> tuple: """Scan a CSV file and return (min_ts, max_ts, count).""" import csv + min_ts = max_ts = None count = 0 with open(file_path, "r", newline="") as f: @@ -145,15 +141,17 @@ def _scan_ts_range_csv( return min_ts, max_ts, count -def detect_timestamps( - data_file: str, data_file_format: str, ts_column: str -) -> tuple: +def detect_timestamps(data_file: str, data_file_format: str, ts_column: str) -> tuple: """Return (min_ts, max_ts) by scanning the entire data file.""" fmt = data_file_format.lower() if fmt in ("json.gz", "jsonl.gz"): - min_ts, max_ts, count = _scan_ts_range_json(data_file, ts_column, compressed=True) + min_ts, max_ts, count = _scan_ts_range_json( + data_file, ts_column, compressed=True + ) elif fmt in ("json", "jsonl"): - min_ts, max_ts, count = _scan_ts_range_json(data_file, ts_column, compressed=False) + min_ts, max_ts, count = _scan_ts_range_json( + data_file, ts_column, compressed=False + ) elif fmt == "csv": min_ts, max_ts, count = _scan_ts_range_csv(data_file, ts_column) else: @@ -161,9 +159,7 @@ def detect_timestamps( sys.exit(1) if min_ts is None: - print( - f"ERROR: No '{ts_column}' timestamps found in {data_file}" - ) + print(f"ERROR: No '{ts_column}' timestamps found in {data_file}") sys.exit(1) return min_ts, max_ts @@ -250,19 +246,11 @@ def generate_sql_files( desc_db = f"quantile window ending at {db_end}" if window_form == "dateadd": - asap_where = ( - f"{ts_column} BETWEEN DATEADD(s, -{window_size}, '{asap_end}') AND '{asap_end}'" - ) - db_where = ( - f"{ts_column} BETWEEN DATEADD(s, -{window_size}, '{db_end}') AND 
'{db_end}'" - ) + asap_where = f"{ts_column} BETWEEN DATEADD(s, -{window_size}, '{asap_end}') AND '{asap_end}'" + db_where = f"{ts_column} BETWEEN DATEADD(s, -{window_size}, '{db_end}') AND '{db_end}'" else: - asap_where = ( - f"{ts_column} BETWEEN '{asap_start}' AND '{asap_end}'" - ) - db_where = ( - f"{ts_column} BETWEEN '{db_start}' AND '{db_end}'" - ) + asap_where = f"{ts_column} BETWEEN '{asap_start}' AND '{asap_end}'" + db_where = f"{ts_column} BETWEEN '{db_start}' AND '{db_end}'" asap_sql = ( f"-- {label}: {desc_asap}\n" @@ -379,7 +367,9 @@ def main(): # Table/column config parser.add_argument("--table-name", required=True) parser.add_argument("--ts-column", required=True, help="Timestamp column name") - parser.add_argument("--value-column", required=True, help="Column to compute quantile on") + parser.add_argument( + "--value-column", required=True, help="Column to compute quantile on" + ) parser.add_argument( "--group-by-columns", required=True, @@ -387,7 +377,9 @@ def main(): ) # Query parameters parser.add_argument("--quantile", type=float, default=0.95) - parser.add_argument("--window-size", type=int, default=10, help="Window size in seconds") + parser.add_argument( + "--window-size", type=int, default=10, help="Window size in seconds" + ) parser.add_argument("--num-queries", type=int, default=50) parser.add_argument( "--ts-format-asap", From 9b0c7d62ba3f1208b6c319022d039d6bd5e7adbc Mon Sep 17 00:00:00 2001 From: benjamib112 Date: Wed, 22 Apr 2026 09:39:58 -0400 Subject: [PATCH 08/10] added query accuracy comparison between baseline and asap --- .../benchmark/run_benchmark.py | 107 ++++++++++++++++-- 1 file changed, 97 insertions(+), 10 deletions(-) diff --git a/asap-tools/execution-utilities/benchmark/run_benchmark.py b/asap-tools/execution-utilities/benchmark/run_benchmark.py index 696dee0a..5501e7e7 100644 --- a/asap-tools/execution-utilities/benchmark/run_benchmark.py +++ b/asap-tools/execution-utilities/benchmark/run_benchmark.py @@ -178,6 +178,7 
@@ def run_query( return ( latency_ms, None, + 0, f"HTTP {response.status_code}: {response.text[:200]}", ) except requests.Timeout: @@ -300,7 +301,7 @@ def run_benchmark( ) plot_latencies.append(0.0) else: - preview = last_result.replace("\n", " | ")[:200] if last_result else "" + preview = last_result.replace("\n", " | ") if last_result else "" latencies_ok.append(latency_ms) plot_latencies.append(latency_ms) print(f"{latency_ms:.2f}ms ({last_row_count} rows)") @@ -341,18 +342,56 @@ def _plot_single(latencies: List[float], mode: str, out_path: Path): print(f"Plot saved to {out_path}") -def _plot_comparison(asap_csv: Path, baseline_csv: Path, out_path: Path): - """Two-panel comparison plot: per-query bars + speedup bars. +def _parse_result_values(result_full: str) -> List[float]: + """Extract numeric values from a pipe-separated result_full string.""" + if not result_full: + return [] + values = [] + for part in result_full.split(" | "): + part = part.strip() + if not part: + continue + cols = part.split("\t") + try: + values.append(float(cols[-1])) + except (ValueError, IndexError): + continue + return values + + +def _compute_result_error( + baseline_values: List[float], asap_values: List[float] +) -> Optional[float]: + """Mean absolute relative error between two sorted result sets.""" + if not baseline_values or not asap_values: + return None + b = sorted(baseline_values) + a = sorted(asap_values) + n = min(len(b), len(a)) + if n == 0: + return None + b, a = b[:n], a[:n] + errors = [] + for bv, av in zip(b, a): + if bv == 0: + errors.append(0.0 if av == 0 else abs(av)) + else: + errors.append(abs(av - bv) / abs(bv)) + return sum(errors) / len(errors) - Adapted from asap_query_latency/plot_latency.py. 
- """ + +def _plot_comparison(asap_csv: Path, baseline_csv: Path, out_path: Path): + """Three-panel comparison: latency bars, speedup, and result accuracy.""" def _load(path): rows = {} with open(path) as f: for row in csv.DictReader(f): if not row["error"]: - rows[row["query_id"]] = float(row["latency_ms"]) + rows[row["query_id"]] = { + "latency": float(row["latency_ms"]), + "result": row.get("result_full", ""), + } return rows asap = _load(asap_csv) @@ -363,13 +402,28 @@ def _load(path): return x = np.arange(len(qids)) - a_vals = [asap[q] for q in qids] - b_vals = [base[q] for q in qids] + a_vals = [asap[q]["latency"] for q in qids] + b_vals = [base[q]["latency"] for q in qids] speedup = [b / a if a > 0 else 0 for a, b in zip(a_vals, b_vals)] - fig, (ax1, ax2) = plt.subplots( - 2, 1, figsize=(14, 7), gridspec_kw={"height_ratios": [3, 1]} + errors_pct = [] + for q in qids: + b_results = _parse_result_values(base[q]["result"]) + a_results = _parse_result_values(asap[q]["result"]) + err = _compute_result_error(b_results, a_results) + errors_pct.append((err or 0.0) * 100) + + has_accuracy = any(e > 0 for e in errors_pct) + n_panels = 3 if has_accuracy else 2 + ratios = [3, 1, 1.5] if has_accuracy else [3, 1] + + fig, axes = plt.subplots( + n_panels, + 1, + figsize=(14, 4 + 3 * n_panels), + gridspec_kw={"height_ratios": ratios}, ) + ax1, ax2 = axes[0], axes[1] w = 0.4 ax1.bar(x - w / 2, b_vals, w, label="Baseline", color="#f4a460") @@ -398,11 +452,44 @@ def _load(path): ax2.legend(fontsize=8) ax2.set_xlim(-0.6, len(qids) - 0.4) + if has_accuracy: + ax3 = axes[2] + colors = [ + "#d9534f" if e > 10 else "#f0ad4e" if e > 5 else "#5cb85c" + for e in errors_pct + ] + ax3.bar( + x, errors_pct, color=colors, width=0.7, edgecolor="black", linewidth=0.3 + ) + mean_err = np.mean(errors_pct) + ax3.axhline( + mean_err, + color="red", + linewidth=1, + linestyle="--", + label=f"mean {mean_err:.2f}%", + ) + ax3.set_xticks(x) + ax3.set_xticklabels(qids, rotation=90, fontsize=7) + 
ax3.set_ylabel("Relative Error (%)") + ax3.set_title("Result accuracy: ASAP estimate vs baseline exact answer") + ax3.legend(fontsize=8) + ax3.set_xlim(-0.6, len(qids) - 0.4) + plt.tight_layout() plt.savefig(out_path, dpi=150) plt.close() print(f"Comparison plot saved to {out_path}") + if has_accuracy: + s = sorted(errors_pct) + n = len(s) + print( + f"Result error: mean={np.mean(s):.2f}% " + f"p50={s[int(n*0.50)]:.2f}% p95={s[int(n*0.95)]:.2f}% " + f"max={s[-1]:.2f}%" + ) + # --------------------------------------------------------------------------- # Main From 39eb0d4bb522f8465143e8ec196d086a1ece593d Mon Sep 17 00:00:00 2001 From: Kavya Bhat Date: Thu, 30 Apr 2026 11:08:53 -0400 Subject: [PATCH 09/10] Add Elasticsearch instructions to README --- .../execution-utilities/benchmark/README.md | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/asap-tools/execution-utilities/benchmark/README.md b/asap-tools/execution-utilities/benchmark/README.md index a63beded..3f63ffcd 100644 --- a/asap-tools/execution-utilities/benchmark/README.md +++ b/asap-tools/execution-utilities/benchmark/README.md @@ -264,7 +264,67 @@ python run_benchmark.py \ --output-dir ./results \ --output-prefix h2o ``` +--- +## Elasticsearch End-to-End Example using H2O Dataset + +### Step 1-5: +Follow the same instructions from the H2O GroupBy example above. 
+### Step 6 — Launch Arroyo sketch pipeline + +```bash +python export_to_arroyo.py \ + --streaming-config ./configs/h2o_streaming.yaml \ + --source-type file \ + --input-file ./data/h2o_arroyo.json \ + --file-format json \ + --ts-format unix_millis \ + --pipeline-name h2o_pipeline \ + --arroyosketch-dir ~/ASAPQuery/asap-summary-ingest \ + --output-dir ./arroyo_outputs +``` + +### Step 7 — Start QueryEngineRust + +```bash +cd ~/ASAPQuery/asap-query-engine + +./target/release/query_engine_rust \ + --kafka-topic sketch_topic + --input-format json \ + --config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/h2o_inference.yaml \ + --streaming-config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/h2o_streaming.yaml \ + --http-port 8088 --delete-existing-db --log-level DEBUG \ + --output-dir ./output --streaming-engine arroyo \ + --query-language SQL --lock-strategy per-key \ + --prometheus-scrape-interval 1 > /tmp/query_engine.log 2>&1 & +``` + +### Step 8 — Load data into Elasticsearch (baseline) + +```bash +python export_to_database.py + --dataset h2o + --file-path ./data/G1_1e7_1e2_0_0.csv + --es-host localhost + --es-port 9200 + --es-index h2o_groupby + --es-api-key your-api-key + --es-bulk-size 5000 +``` + +### Step 9 — Run benchmark + +```bash +python run_benchmark.py + --mode asap + --asap-sql-file ./queries/h2o_asap.sql + --baseline-sql-file ./queries/h2o_elasticsearch.sql + --elastic-host localhost + --elastic-port 9200 + --elastic-api-key your-api-key + --output-dir ./results --output-prefix h2o +``` --- ## Custom Dataset From ab6465de48a62bd735aae491ffb516ceb4ce4ffa Mon Sep 17 00:00:00 2001 From: Milind Srivastava Date: Wed, 6 May 2026 22:26:01 -0400 Subject: [PATCH 10/10] updated scripts --- .../src/ast_matching/sqlparser_test.rs | 59 +++++++++ .../src/ast_matching/sqlpattern_parser.rs | 5 + .../execution-utilities/benchmark/README.md | 49 ++++---- .../benchmark/generate_queries.py | 117 ++++++------------ 4 files changed, 128 
insertions(+), 102 deletions(-) diff --git a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs index 72b0940b..37a24506 100644 --- a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs +++ b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlparser_test.rs @@ -496,6 +496,65 @@ mod tests { ); } + // ── ClickHouse parametric syntax + explicit BETWEEN timestamps ──────────── + // These verify that a fully ClickHouse-compatible query (no DATEADD, no NOW()) + // is parseable by ASAP: quantile(q)(col) + BETWEEN 'start' AND 'end'. + + #[test] + fn test_clickhouse_explicit_datetime_temporal_quantile() { + check_query( + "SELECT quantile(0.95)(value) FROM cpu_usage WHERE time BETWEEN '2025-10-01 00:00:00' AND '2025-10-01 00:00:10' GROUP BY L1, L2, L3, L4", + vec![QueryType::TemporalQuantile], + None, + ); + } + + #[test] + // ASAP-only: parse_datetime accepts the Z suffix (interprets as UTC), but ClickHouse + // rejects it with TYPE_MISMATCH when comparing against a DateTime column. + // Do not use Z-suffix strings in queries intended for both systems. + fn test_asap_only_iso_z_temporal_quantile() { + check_query( + "SELECT quantile(0.95)(value) FROM cpu_usage WHERE time BETWEEN '2025-10-01T00:00:00Z' AND '2025-10-01T00:00:10Z' GROUP BY L1, L2, L3, L4", + vec![QueryType::TemporalQuantile], + None, + ); + } + + #[test] + // Both ASAP (parse_datetime) and ClickHouse treat ISO-without-Z as local server time. + // They agree only when running in the same timezone; prefer 'YYYY-MM-DD HH:MM:SS' + // (space format) to avoid this implicit dependency. 
+ fn test_iso_no_z_treated_as_local_time_temporal_quantile() { + check_query( + "SELECT quantile(0.95)(value) FROM cpu_usage WHERE time BETWEEN '2025-10-01T00:00:00' AND '2025-10-01T00:00:10' GROUP BY L1, L2, L3, L4", + vec![QueryType::TemporalQuantile], + None, + ); + } + + #[test] + fn test_clickhouse_explicit_datetime_spatial_quantile() { + check_query( + "SELECT quantile(0.95)(value) FROM cpu_usage WHERE time BETWEEN '2025-10-01 00:00:00' AND '2025-10-01 00:00:01' GROUP BY L1", + vec![QueryType::Spatial], + None, + ); + } + + #[test] + fn test_clickhouse_explicit_matches_now_template() { + // A ClickHouse-style query (explicit timestamps, parametric quantile) must + // match a stored DATEADD(NOW()) template of the same shape. + let template = parse_sql_query( + "SELECT quantile(0.95)(value) FROM cpu_usage WHERE time BETWEEN DATEADD(s, -10, NOW()) AND NOW() GROUP BY L1, L2, L3, L4" + ).unwrap(); + let incoming = parse_sql_query( + "SELECT quantile(0.95)(value) FROM cpu_usage WHERE time BETWEEN '2025-10-01 00:00:00' AND '2025-10-01 00:00:10' GROUP BY L1, L2, L3, L4" + ).unwrap(); + assert!(incoming.matches_sql_pattern(&template)); + } + // ── Error cases ────────────────────────────────────────────────────────── #[test] diff --git a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs index 3c833a08..1c145a96 100644 --- a/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs +++ b/asap-common/dependencies/rs/sql_utilities/src/ast_matching/sqlpattern_parser.rs @@ -320,6 +320,11 @@ impl SQLPatternParser { } fn get_timestamp_from_datetime_str(datetime_str: &str) -> Option { + // parse_datetime treats timezone-naive strings (e.g. "2025-10-01 00:00:00", + // "2025-10-01T00:00:00") as local server time, matching ClickHouse's behavior — + // but only when both run in the same timezone. Z-suffix strings (e.g. 
+ // "2025-10-01T00:00:00Z") are interpreted as UTC here but rejected by ClickHouse. + // Use space-format datetime strings ("YYYY-MM-DD HH:MM:SS") for portability. let parsed_datetime = parse_datetime(datetime_str).ok()?; Some(parsed_datetime.timestamp().as_second() as f64) } diff --git a/asap-tools/execution-utilities/benchmark/README.md b/asap-tools/execution-utilities/benchmark/README.md index 3f63ffcd..9ee62f77 100644 --- a/asap-tools/execution-utilities/benchmark/README.md +++ b/asap-tools/execution-utilities/benchmark/README.md @@ -35,6 +35,12 @@ pip3 install --user -r requirements.txt cd ~/ASAPQuery/asap-query-engine && cargo build --release ``` +> **UTC requirement:** Both ASAP and ClickHouse must run in UTC so that bare +> datetime strings (`'YYYY-MM-DD HH:MM:SS'`) are interpreted identically by both +> systems. Set `TZ=UTC` in the environment for ASAP processes and ensure +> ClickHouse's `timezone` config is set to `UTC`. If the two systems run in +> different timezones, queries will target different time windows on each side. 
+ --- ## ClickBench + ClickHouse End-to-End Example @@ -115,8 +121,7 @@ python generate_queries.py \ ``` This writes: -- `queries/clickbench_asap.sql` — ASAP queries (ISO timestamps) -- `queries/clickbench_clickhouse.sql` — ClickHouse queries (datetime timestamps) +- `queries/clickbench.sql` — shared query file for both ASAP and ClickHouse - `queries/clickbench_streaming.yaml` — Arroyo streaming config - `queries/clickbench_inference.yaml` — QueryEngineRust inference config @@ -166,8 +171,8 @@ Verify: `$INSTALL_DIR/clickhouse client --query "SELECT count(*) FROM hits"` ```bash python run_benchmark.py \ --mode both \ - --asap-sql-file ./queries/clickbench_asap.sql \ - --baseline-sql-file ./queries/clickbench_clickhouse.sql \ + --asap-sql-file ./queries/clickbench.sql \ + --baseline-sql-file ./queries/clickbench.sql \ --asap-url "http://localhost:8088/api/v1/query" \ --output-dir ./results \ --output-prefix clickbench @@ -258,8 +263,8 @@ python export_to_database.py \ ```bash python run_benchmark.py \ --mode both \ - --asap-sql-file ./queries/h2o_asap.sql \ - --baseline-sql-file ./queries/h2o_clickhouse.sql \ + --asap-sql-file ./queries/h2o.sql \ + --baseline-sql-file ./queries/h2o.sql \ --asap-url "http://localhost:8088/api/v1/query" \ --output-dir ./results \ --output-prefix h2o @@ -290,7 +295,7 @@ python export_to_arroyo.py \ cd ~/ASAPQuery/asap-query-engine ./target/release/query_engine_rust \ - --kafka-topic sketch_topic + --kafka-topic sketch_topic --input-format json \ --config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/h2o_inference.yaml \ --streaming-config ~/ASAPQuery/asap-tools/execution-utilities/benchmark/configs/h2o_streaming.yaml \ @@ -303,12 +308,12 @@ cd ~/ASAPQuery/asap-query-engine ### Step 8 — Load data into Elasticsearch (baseline) ```bash -python export_to_database.py - --dataset h2o - --file-path ./data/G1_1e7_1e2_0_0.csv - --es-host localhost - --es-port 9200 - --es-index h2o_groupby +python export_to_database.py + --dataset 
h2o + --file-path ./data/G1_1e7_1e2_0_0.csv + --es-host localhost + --es-port 9200 + --es-index h2o_groupby --es-api-key your-api-key --es-bulk-size 5000 ``` @@ -316,12 +321,12 @@ python export_to_database.py ### Step 9 — Run benchmark ```bash -python run_benchmark.py - --mode asap - --asap-sql-file ./queries/h2o_asap.sql - --baseline-sql-file ./queries/h2o_elasticsearch.sql - --elastic-host localhost - --elastic-port 9200 +python run_benchmark.py + --mode asap + --asap-sql-file ./queries/h2o.sql + --baseline-sql-file ./queries/h2o.sql + --elastic-host localhost + --elastic-port 9200 --elastic-api-key your-api-key --output-dir ./results --output-prefix h2o ``` @@ -370,8 +375,8 @@ python export_to_database.py \ # 6. Run benchmark python run_benchmark.py \ --mode both \ - --asap-sql-file ./queries/my_dataset_asap.sql \ - --baseline-sql-file ./queries/my_dataset_clickhouse.sql \ + --asap-sql-file ./queries/my_dataset.sql \ + --baseline-sql-file ./queries/my_dataset.sql \ --asap-url "http://localhost:8088/api/v1/query" \ --output-dir ./results ``` @@ -407,6 +412,6 @@ $INSTALL_DIR/clickhouse client --query "TRUNCATE TABLE hits" | `prepare_data.py` | Convert raw data to Arroyo file source format (RFC3339, string columns) | | `export_to_arroyo.py` | Launch Arroyo sketch pipeline (file or kafka source) | | `export_to_database.py` | Load data into ClickHouse for baseline | -| `generate_queries.py` | Generate paired ASAP + ClickHouse SQL query files and streaming/inference YAML configs | +| `generate_queries.py` | Generate a shared SQL query file (ClickHouse-compatible syntax, used for both ASAP and ClickHouse) and optional streaming/inference YAML configs | | `run_benchmark.py` | Run queries and produce CSV results + plots | | `configs/` | ClickHouse init SQL (CREATE TABLE statements) | diff --git a/asap-tools/execution-utilities/benchmark/generate_queries.py b/asap-tools/execution-utilities/benchmark/generate_queries.py index 462c1bd7..eb1b5d4e 100644 --- 
a/asap-tools/execution-utilities/benchmark/generate_queries.py +++ b/asap-tools/execution-utilities/benchmark/generate_queries.py @@ -1,14 +1,21 @@ #!/usr/bin/env python3 """ -Generate paired ASAP and ClickHouse SQL query files for benchmarking, +Generate ASAP/ClickHouse SQL query files for benchmarking, and optionally generate streaming/inference YAML configs. +Both ASAP and ClickHouse receive identical queries using native ClickHouse syntax: + - quantile(q)(col) parametric aggregate + - 'YYYY-MM-DD HH:MM:SS' datetime timestamps (no Z suffix) + +This works because after PR #166 ASAP's parser accepts ClickHouse parametric syntax, +and both systems interpret bare datetime strings as local server time — which is +unambiguous only when both run in UTC. See README for the UTC requirement. + Each query targets a fixed time window (window-end timestamp) and matches the annotation format `-- T{NNN}: description` expected by run_benchmark.py. Output (always): - {prefix}_asap.sql QUANTILE(q, col) syntax for QueryEngineRust - {prefix}_clickhouse.sql quantile(q)(col) syntax for ClickHouse baseline + {prefix}.sql shared query file for both ASAP and ClickHouse Output (with --generate-configs): {prefix}_streaming.yaml Arroyo streaming config @@ -42,7 +49,7 @@ --data-file-format json.gz \\ --output-prefix ./queries/clickbench - # Override timestamp format for both outputs + # Use a pre-built timestamps file python generate_queries.py \\ --table-name h2o_groupby \\ --ts-column timestamp \\ @@ -50,7 +57,6 @@ --group-by-columns id1,id2 \\ --window-size 10 \\ --num-queries 50 \\ - --ts-format iso \\ --timestamps-file ./my_timestamps.txt \\ --output-prefix ./queries/h2o """ @@ -210,15 +216,7 @@ def generate_window_ends( return ends -def format_ts(ts: datetime, ts_format: str) -> str: - """Format a timestamp for SQL injection.""" - if ts_format == "iso": - return ts.strftime("%Y-%m-%dT%H:%M:%SZ") - else: # datetime - return ts.strftime("%Y-%m-%d %H:%M:%S") - - -def generate_sql_files( 
+def generate_sql_file( table_name: str, ts_column: str, value_column: str, @@ -226,60 +224,42 @@ def generate_sql_files( quantile: float, window_size: int, window_ends: List[datetime], - ts_format_asap: str, - ts_format_db: str, window_form: str, output_prefix: str, ): - """Write the paired ASAP and ClickHouse SQL files.""" + """Write a single SQL file using ClickHouse-compatible syntax. + + Uses quantile(q)(col) and 'YYYY-MM-DD HH:MM:SS' datetime strings. + Both ASAP and ClickHouse accept this format when running in UTC. + """ group_by_clause = ", ".join(group_by_columns) - asap_lines = [] - ch_lines = [] + lines = [] for i, end_ts in enumerate(window_ends): - asap_end = format_ts(end_ts, ts_format_asap) - asap_start = format_ts(end_ts - timedelta(seconds=window_size), ts_format_asap) - db_end = format_ts(end_ts, ts_format_db) - db_start = format_ts(end_ts - timedelta(seconds=window_size), ts_format_db) + end_str = end_ts.strftime("%Y-%m-%d %H:%M:%S") + start_str = (end_ts - timedelta(seconds=window_size)).strftime( + "%Y-%m-%d %H:%M:%S" + ) label = f"T{i:03d}" - desc_asap = f"quantile window ending at {asap_end}" - desc_db = f"quantile window ending at {db_end}" if window_form == "dateadd": - asap_where = f"{ts_column} BETWEEN DATEADD(s, -{window_size}, '{asap_end}') AND '{asap_end}'" - db_where = f"{ts_column} BETWEEN DATEADD(s, -{window_size}, '{db_end}') AND '{db_end}'" + where = f"{ts_column} BETWEEN DATEADD(s, -{window_size}, '{end_str}') AND '{end_str}'" else: - asap_where = f"{ts_column} BETWEEN '{asap_start}' AND '{asap_end}'" - db_where = f"{ts_column} BETWEEN '{db_start}' AND '{db_end}'" + where = f"{ts_column} BETWEEN '{start_str}' AND '{end_str}'" - asap_sql = ( - f"-- {label}: {desc_asap}\n" - f"SELECT QUANTILE({quantile}, {value_column}) FROM {table_name} " - f"WHERE {asap_where} GROUP BY {group_by_clause};" - ) - ch_sql = ( - f"-- {label}: {desc_db}\n" + lines.append( + f"-- {label}: quantile window ending at {end_str}\n" f"SELECT 
quantile({quantile})({value_column}) FROM {table_name} " - f"WHERE {db_where} GROUP BY {group_by_clause};" + f"WHERE {where} GROUP BY {group_by_clause};" ) - asap_lines.append(asap_sql) - ch_lines.append(ch_sql) - - asap_file = f"{output_prefix}_asap.sql" - ch_file = f"{output_prefix}_clickhouse.sql" - - Path(asap_file).parent.mkdir(parents=True, exist_ok=True) + sql_file = f"{output_prefix}.sql" + Path(sql_file).parent.mkdir(parents=True, exist_ok=True) - with open(asap_file, "w") as f: - f.write("\n".join(asap_lines) + "\n") + with open(sql_file, "w") as f: + f.write("\n".join(lines) + "\n") - with open(ch_file, "w") as f: - f.write("\n".join(ch_lines) + "\n") - - print(f"Generated {len(window_ends)} queries:") - print(f" ASAP: {asap_file}") - print(f" ClickHouse: {ch_file}") + print(f"Generated {len(window_ends)} queries → {sql_file}") def generate_config_files( @@ -337,7 +317,7 @@ def generate_config_files( - aggregation_id: {aggregation_id} read_count_threshold: 999999 query: |- - SELECT QUANTILE({quantile}, {value_column}) FROM {table_name} + SELECT quantile({quantile})({value_column}) FROM {table_name} WHERE {ts_column} BETWEEN DATEADD(s, -{window_size}, NOW()) AND NOW() GROUP BY {group_by_clause}; """ @@ -353,14 +333,14 @@ def generate_config_files( with open(inference_file, "w") as f: f.write(inference_content) - print(f"Generated configs:") + print("Generated configs:") print(f" Streaming: {streaming_file}") print(f" Inference: {inference_file}") def main(): parser = argparse.ArgumentParser( - description="Generate paired ASAP + ClickHouse SQL query files", + description="Generate ASAP + ClickHouse SQL query files (shared syntax)", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__, ) @@ -381,24 +361,6 @@ def main(): "--window-size", type=int, default=10, help="Window size in seconds" ) parser.add_argument("--num-queries", type=int, default=50) - parser.add_argument( - "--ts-format-asap", - choices=["iso", "datetime"], - default="iso", - 
help="Timestamp format for ASAP SQL: iso='YYYY-MM-DDTHH:MM:SSZ', datetime='YYYY-MM-DD HH:MM:SS' (default: iso)", - ) - parser.add_argument( - "--ts-format-db", - choices=["iso", "datetime"], - default="datetime", - help="Timestamp format for ClickHouse SQL: iso='YYYY-MM-DDTHH:MM:SSZ', datetime='YYYY-MM-DD HH:MM:SS' (default: datetime)", - ) - parser.add_argument( - "--ts-format", - choices=["iso", "datetime"], - default=None, - help="Set both --ts-format-asap and --ts-format-db to the same value (overrides individual flags)", - ) parser.add_argument( "--window-form", choices=["explicit", "dateadd"], @@ -408,7 +370,7 @@ def main(): parser.add_argument( "--output-prefix", required=True, - help="Output file prefix (e.g. ./queries/clickbench → clickbench_asap.sql + clickbench_clickhouse.sql)", + help="Output file prefix (e.g. ./queries/clickbench → clickbench.sql)", ) # Timestamp sources (mutually exclusive) ts_group = parser.add_mutually_exclusive_group(required=True) @@ -502,10 +464,7 @@ def main(): f"(stride={stride}s, window={args.window_size}s)" ) - ts_format_asap = args.ts_format if args.ts_format else args.ts_format_asap - ts_format_db = args.ts_format if args.ts_format else args.ts_format_db - - generate_sql_files( + generate_sql_file( table_name=args.table_name, ts_column=args.ts_column, value_column=args.value_column, @@ -513,8 +472,6 @@ def main(): quantile=args.quantile, window_size=args.window_size, window_ends=window_ends, - ts_format_asap=ts_format_asap, - ts_format_db=ts_format_db, window_form=args.window_form, output_prefix=args.output_prefix, )