From c985730bdbd48a7fc410fe8191995c4552d6faa1 Mon Sep 17 00:00:00 2001 From: Jacob Marble Date: Tue, 17 Mar 2026 01:01:44 +0000 Subject: [PATCH 1/3] materialize-clickhouse: new connector --- .../materialization-connectors/ClickHouse.md | 92 +++++++++++++++++++ .../Dekaf/clickhouse.md | 2 + .../materialization-connectors/README.md | 3 + 3 files changed, 97 insertions(+) create mode 100644 site/docs/reference/Connectors/materialization-connectors/ClickHouse.md diff --git a/site/docs/reference/Connectors/materialization-connectors/ClickHouse.md b/site/docs/reference/Connectors/materialization-connectors/ClickHouse.md new file mode 100644 index 00000000000..6478668f2ce --- /dev/null +++ b/site/docs/reference/Connectors/materialization-connectors/ClickHouse.md @@ -0,0 +1,92 @@ + + +# ClickHouse + +This connector materializes Estuary collections into tables in a ClickHouse database. + +[ClickHouse](https://clickhouse.com/) is a column-oriented OLAP database designed for real-time analytics. +This connector writes directly to ClickHouse using the native protocol. + +Estuary also provides a [Dekaf-based integration](./Dekaf/clickhouse.md) for users who prefer to ingest via ClickPipes. + +## Prerequisites + +To use this connector, you'll need: + +* A ClickHouse database (self-hosted or ClickHouse Cloud) with a user that has permissions to create tables and write data. +* The connector uses the ClickHouse native protocol (port 9000 by default, not the HTTP interface on port 8123). +* At least one Estuary collection. + +:::tip +If you haven't yet captured your data from its external source, start at the beginning of the [guide to create a dataflow](../../../guides/create-dataflow.md). You'll be referred back to this connector-specific documentation at the appropriate steps. +::: + +## Configuration + +To use this connector, begin with data in one or more Estuary collections. 
+Use the below properties to configure a ClickHouse materialization, which will direct the contents of these collections into ClickHouse tables. + +### Properties + +#### Endpoint + +| Property | Title | Description | Type | Required/Default | +|---|---|---|---|---| +| **`/address`** | Address | Host and port of the database, in the form of `host[:port]`. Port 9000 is used as the default if no specific port is provided. | string | Required | +| **`/credentials`** | Authentication | | object | Required | +| **`/credentials/auth_type`** | Auth Type | Authentication type. Must be `user_password`. | string | Required | +| **`/credentials/username`** | Username | Database username. | string | Required | +| **`/credentials/password`** | Password | Database password. | string | Required | +| **`/database`** | Database | Name of the ClickHouse database to materialize to. | string | Required | +| `/hardDelete` | Hard Delete | If enabled, the connector inserts tombstone rows with `_is_deleted = 1` when source documents are deleted, causing them to be excluded from `FINAL` queries. By default, source deletions are ignored at the destination. | boolean | `false` | + +#### Bindings + +| Property | Title | Description | Type | Required/Default | +|---|---|---|---|---| +| **`/table`** | Table | Name of the database table to materialize to. The connector will create the table if it doesn't already exist. 
| string | Required | + +### Sample + +```yaml +materializations: + ${PREFIX}/${mat_name}: + endpoint: + connector: + config: + address: clickhouse.example.com:9000 + credentials: + auth_type: user_password + username: flow_user + password: secret + database: my_database + image: ghcr.io/estuary/materialize-clickhouse:v1 + bindings: + - resource: + table: my_table + source: ${PREFIX}/${source_collection} +``` + +## ReplacingMergeTree and FINAL + +The connector creates tables using the [ReplacingMergeTree engine](https://clickhouse.com/docs/engines/table-engines/mergetree-family/replacingmergetree). Updated records are actually inserted as duplicates; ClickHouse later deduplicates these as a background process. + +Your queries should use the `FINAL` directive to get deduplicated results, and include the predicate `_is_deleted = 0` to ignore deleted records. + +```sql +SELECT * FROM my_table FINAL WHERE _is_deleted = 0; +``` + +## Hard deletes + +All tables are created with `_version` (UInt64) and `_is_deleted` (UInt8) columns used internally by the `ReplacingMergeTree` engine. + +If you set `hardDelete: true` in the endpoint configuration, the connector inserts a **tombstone row** when a source document is deleted. The tombstone has `_is_deleted = 1`, the same key columns as the original row, and zero values for all other columns. The `ReplacingMergeTree` engine then uses `_is_deleted` to hide these rows from `FINAL` queries, and eventually removes the tombstoned records from the table. + +## Soft deletes not supported + +Source deletions are effectively ignored at the destination. + +## Delta updates not supported + +This connector does not support [delta updates](/concepts/materialization/#delta-updates). Only standard (merge) mode is supported. 
diff --git a/site/docs/reference/Connectors/materialization-connectors/Dekaf/clickhouse.md b/site/docs/reference/Connectors/materialization-connectors/Dekaf/clickhouse.md index 0395cf480f0..ed9bf3f318a 100644 --- a/site/docs/reference/Connectors/materialization-connectors/Dekaf/clickhouse.md +++ b/site/docs/reference/Connectors/materialization-connectors/Dekaf/clickhouse.md @@ -3,6 +3,8 @@ This connector materializes Estuary collections as Kafka-compatible messages that a ClickHouse Kafka consumer can read. [ClickHouse](https://clickhouse.com/) is a real-time analytical database and warehouse. +Estuary also provides a [direct materialization with ClickHouse](../ClickHouse.md). + ## Prerequisites To use this connector, you'll need: diff --git a/site/docs/reference/Connectors/materialization-connectors/README.md b/site/docs/reference/Connectors/materialization-connectors/README.md index fdc80a4861a..083911ea9b1 100644 --- a/site/docs/reference/Connectors/materialization-connectors/README.md +++ b/site/docs/reference/Connectors/materialization-connectors/README.md @@ -59,6 +59,9 @@ In the future, other open-source materialization connectors from third parties c * Bytewax * [Configuration](./Dekaf/bytewax.md) * ClickHouse + * [Configuration](./ClickHouse.md) + * Package - ghcr.io/estuary/materialize-clickhouse:v1 +* ClickHouse (Dekaf) * [Configuration](./Dekaf/clickhouse.md) * CSV Files in GCS * [Configuration](./google-gcs-csv.md) From 580bf0134f5e132fccd30073ea15dd0ada92b0bb Mon Sep 17 00:00:00 2001 From: Jacob Marble Date: Wed, 8 Apr 2026 00:26:10 +0000 Subject: [PATCH 2/3] update per changes to implementation --- .../materialization-connectors/ClickHouse.md | 38 +++++++++++-------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/site/docs/reference/Connectors/materialization-connectors/ClickHouse.md b/site/docs/reference/Connectors/materialization-connectors/ClickHouse.md index 6478668f2ce..8993f0ff278 100644 --- 
a/site/docs/reference/Connectors/materialization-connectors/ClickHouse.md +++ b/site/docs/reference/Connectors/materialization-connectors/ClickHouse.md @@ -5,7 +5,9 @@ This connector materializes Estuary collections into tables in a ClickHouse database. [ClickHouse](https://clickhouse.com/) is a column-oriented OLAP database designed for real-time analytics. -This connector writes directly to ClickHouse using the native protocol. +This connector writes batches directly to ClickHouse using the +[Native protocol](https://clickhouse.com/docs/interfaces/tcp) and +[Native format](https://clickhouse.com/docs/interfaces/formats/Native). Estuary also provides a [Dekaf-based integration](./Dekaf/clickhouse.md) for users who prefer to ingest via ClickPipes. @@ -14,7 +16,7 @@ Estuary also provides a [Dekaf-based integration](./Dekaf/clickhouse.md) for use To use this connector, you'll need: * A ClickHouse database (self-hosted or ClickHouse Cloud) with a user that has permissions to create tables and write data. -* The connector uses the ClickHouse native protocol (port 9000 by default, not the HTTP interface on port 8123). +* The connector uses the ClickHouse native protocol. The default port is **9440** (TLS enabled, the default) or **9000** (TLS disabled). It does not use the HTTP interface on port 8123. * At least one Estuary collection. :::tip @@ -32,19 +34,22 @@ Use the below properties to configure a ClickHouse materialization, which will d | Property | Title | Description | Type | Required/Default | |---|---|---|---|---| -| **`/address`** | Address | Host and port of the database, in the form of `host[:port]`. Port 9000 is used as the default if no specific port is provided. | string | Required | +| **`/address`** | Address | Host and port of the database, in the form of `host[:port]`. Port 9440 is used as the default when SSL is enabled (the default), or 9000 when SSL is disabled. 
| string | Required | | **`/credentials`** | Authentication | | object | Required | | **`/credentials/auth_type`** | Auth Type | Authentication type. Must be `user_password`. | string | Required | | **`/credentials/username`** | Username | Database username. | string | Required | | **`/credentials/password`** | Password | Database password. | string | Required | | **`/database`** | Database | Name of the ClickHouse database to materialize to. | string | Required | -| `/hardDelete` | Hard Delete | If enabled, the connector inserts tombstone rows with `_is_deleted = 1` when source documents are deleted, causing them to be excluded from `FINAL` queries. By default, source deletions are ignored at the destination. | boolean | `false` | +| `/hardDelete` | Hard Delete | If enabled, items deleted in the source will also be deleted from the destination. By default, deletions are tracked via `_meta/op` (soft-delete). | boolean | `false` | +| `/advanced/sslmode` | SSL Mode | Controls the TLS connection behavior. Options: `disable`, `require`, `verify-full`. | string | `verify-full` | +| `/advanced/no_flow_document` | Exclude Flow Document | When enabled, the root document column will not be required for standard updates. | boolean | `false` | #### Bindings | Property | Title | Description | Type | Required/Default | |---|---|---|---|---| | **`/table`** | Table | Name of the database table to materialize to. The connector will create the table if it doesn't already exist. | string | Required | +| `/delta_updates` | Delta Update | Should updates to this table be done via delta updates. 
| boolean | `false` | ### Sample @@ -54,7 +59,7 @@ materializations: endpoint: connector: config: - address: clickhouse.example.com:9000 + address: clickhouse.example.com:9440 credentials: auth_type: user_password username: flow_user @@ -69,24 +74,27 @@ materializations: ## ReplacingMergeTree and FINAL -The connector creates tables using the [ReplacingMergeTree engine](https://clickhouse.com/docs/engines/table-engines/mergetree-family/replacingmergetree). Updated records are actually inserted as duplicates; ClickHouse later deduplicates these as a background process. +In standard (non-delta) mode, the connector creates tables using the [ReplacingMergeTree engine](https://clickhouse.com/docs/engines/table-engines/mergetree-family/replacingmergetree) with `flow_published_at` as the version column. +Updated records are inserted as new rows; ClickHouse deduplicates them in a background process, keeping the row with the highest `flow_published_at` value for each key. -Your queries should use the `FINAL` directive to get deduplicated results, and include the predicate `_is_deleted = 0` to ignore deleted records. +The connector also configures automatic background cleanup merges so that superseded rows and tombstones are eventually removed from disk. + +Your queries should use the `FINAL` directive to get results with duplicate and tombstone rows removed: ```sql -SELECT * FROM my_table FINAL WHERE _is_deleted = 0; +SELECT * FROM my_table FINAL; ``` ## Hard deletes -All tables are created with `_version` (UInt64) and `_is_deleted` (UInt8) columns used internally by the `ReplacingMergeTree` engine. - -If you set `hardDelete: true` in the endpoint configuration, the connector inserts a **tombstone row** when a source document is deleted. The tombstone has `_is_deleted = 1`, the same key columns as the original row, and zero values for all other columns. 
The `ReplacingMergeTree` engine then uses `_is_deleted` to hide these rows from `FINAL` queries, and eventually removes the tombstoned records from the table. +When `hardDelete: true` is set in the endpoint configuration, the connector adds an `_is_deleted` (UInt8) column to each table. +When a source document is deleted, the connector inserts a **tombstone row** with `_is_deleted = 1` and the same key columns as the original row. +The `ReplacingMergeTree` engine uses `_is_deleted` to exclude these rows from `FINAL` queries, and automatic cleanup merges eventually remove the tombstoned records from disk. -## Soft deletes not supported +## Soft deletes -Source deletions are effectively ignored at the destination. +By default (when `hardDelete` is not enabled), source deletions are tracked in the destination via the `_meta/op` column, which indicates whether a row was created, updated, or deleted. The row itself remains in the table. -## Delta updates not supported +## Delta updates -This connector does not support [delta updates](/concepts/materialization/#delta-updates). Only standard (merge) mode is supported. +This connector supports [delta updates](/concepts/materialization/#delta-updates) on a per-binding basis. When `delta_updates` is enabled for a binding, the table uses the `MergeTree` engine instead of `ReplacingMergeTree`. Every store operation is appended as-is with no deduplication — rows accumulate and are never removed. From 014beb05f73b95258c4fe431f8702b787af4ca0e Mon Sep 17 00:00:00 2001 From: Jacob Marble Date: Fri, 10 Apr 2026 21:40:56 +0000 Subject: [PATCH 3/3] materialize-clickhouse: document required permissions Add a Required Permissions section with the specific grants needed: GRANT ALL on the target database, plus SELECT on system.columns, system.parts, and system.tables (not covered by the database grant). Also document optional row-level policies for restricting system table visibility to the target database only. 
---
 .../materialization-connectors/ClickHouse.md | 25 ++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/site/docs/reference/Connectors/materialization-connectors/ClickHouse.md b/site/docs/reference/Connectors/materialization-connectors/ClickHouse.md
index 8993f0ff278..7dd55819ea7 100644
--- a/site/docs/reference/Connectors/materialization-connectors/ClickHouse.md
+++ b/site/docs/reference/Connectors/materialization-connectors/ClickHouse.md
@@ -15,10 +15,33 @@ Estuary also provides a [Dekaf-based integration](./Dekaf/clickhouse.md) for use
 To use this connector, you'll need:
 
-* A ClickHouse database (self-hosted or ClickHouse Cloud) with a user that has permissions to create tables and write data.
+* A ClickHouse database (self-hosted or ClickHouse Cloud) and user.
 * The connector uses the ClickHouse native protocol. The default port is **9440** (TLS enabled, the default) or **9000** (TLS disabled). It does not use the HTTP interface on port 8123.
 * At least one Estuary collection.
 
+### Required Permissions
+
+```sql
+-- Target database access: CREATE TABLE, DROP TABLE, SELECT, INSERT, TRUNCATE, etc.
+GRANT ALL ON <database>.* TO <user>;
+
+-- System table access for metadata discovery and partition management.
+-- These are NOT covered by the database grant above.
+GRANT SELECT ON system.columns TO <user>;
+GRANT SELECT ON system.parts TO <user>;
+GRANT SELECT ON system.tables TO <user>;
+```
+
+Replace `<user>` and `<database>` with your actual user and database names. 
+
+Optionally, you can use row-level policies to restrict the user's system table access to only the target database:
+
+```sql
+CREATE ROW POLICY estuary_tables ON system.tables FOR SELECT USING database = '<database>' TO <user>;
+CREATE ROW POLICY estuary_columns ON system.columns FOR SELECT USING database = '<database>' TO <user>;
+CREATE ROW POLICY estuary_parts ON system.parts FOR SELECT USING database = '<database>' TO <user>;
+```
+
 :::tip
 If you haven't yet captured your data from its external source, start at the beginning of the [guide to create a dataflow](../../../guides/create-dataflow.md). You'll be referred back to this connector-specific documentation at the appropriate steps.
 :::