From 5fc5af3a09a1d41448661cefad3607a3c8f373d9 Mon Sep 17 00:00:00 2001 From: Joe S Date: Tue, 9 Jun 2026 15:04:09 -0700 Subject: [PATCH 1/3] auto-promote large query parameters to form body --- CHANGELOG.md | 3 ++ clickhouse_connect/driver/__init__.py | 12 +++--- clickhouse_connect/driver/asyncclient.py | 12 +++--- clickhouse_connect/driver/binding.py | 20 ++++++++++ clickhouse_connect/driver/httpclient.py | 12 +++--- .../test_form_encode_query.py | 33 +++++++++++++++++ tests/unit_tests/test_driver/test_binding.py | 33 ++++++++++++++++- .../unit_tests/test_driver/test_httpclient.py | 37 +++++++++++++++++++ 8 files changed, 145 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 787be481..ba8cb522 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ ### Improvements - Build and publish Windows ARM64 `win_arm64` wheels for CPython 3.10 through 3.14, including the free-threaded 3.14 build. Closes [#785](https://github.com/ClickHouse/clickhouse-connect/issues/785). +### Bug Fixes +- Large query parameter payloads are now automatically sent as form data in the request body instead of the URL query string. Server-side bind parameters were urlencoded into the request URL, so a large `IN` list or a high-dimensional vector embedding could produce a URL that HTTP intermediaries such as nginx, AWS ALB, and CloudFront reject with HTTP 414. The client now routes parameters to the POST body once their encoded length passes a threshold, which keeps the URL small. Setting `form_encode_query_params=True` still forces form encoding for all queries. This does not change the server's per-value size limit, which is governed by `http_max_field_value_size`. Applies to both sync and async clients. Closes [#740](https://github.com/ClickHouse/clickhouse-connect/issues/740). + ## 1.2.0, 2026-06-08 ### Improvements diff --git a/clickhouse_connect/driver/__init__.py b/clickhouse_connect/driver/__init__.py index 5e920f7b..d0d2c93e 100644 --- a/clickhouse_connect/driver/__init__.py +++ b/clickhouse_connect/driver/__init__.py @@ -184,9 +184,9 @@ def create_client( match the server's column definition which means timezone-aware when the column defines a timezone and naive for bare DateTime columns. :param autogenerate_session_id If set, this will override the 'autogenerate_session_id' common setting. - :param form_encode_query_params If True, query parameters will be sent as form-encoded data in the request body - instead of as URL parameters. This is useful for queries with large parameter sets that might exceed URL length - limits. Only available for query operations (not inserts). Default: False + :param form_encode_query_params If True, always send query parameters as form-encoded data in the request body + instead of as URL parameters. Regardless of this setting, large parameter payloads are automatically sent as form + data to avoid exceeding URL length limits. Only available for query operations (not inserts). Default: False :return: ClickHouse Connect Client instance """ host, username, password, port, database, interface = _parse_connection_params( @@ -320,9 +320,9 @@ async def create_async_client( match the server's column definition which means timezone-aware when the column defines a timezone and naive for bare DateTime columns. :param autogenerate_session_id If set, this will override the 'autogenerate_session_id' common setting. - :param form_encode_query_params If True, query parameters will be sent as form-encoded data in the request body - instead of as URL parameters. This is useful for queries with large parameter sets that might exceed URL length - limits. Only available for query operations (not inserts). Default: False + :param form_encode_query_params If True, always send query parameters as form-encoded data in the request body + instead of as URL parameters. Regardless of this setting, large parameter payloads are automatically sent as form + data to avoid exceeding URL length limits. Only available for query operations (not inserts). Default: False :return: ClickHouse Connect AsyncClient instance """ try: diff --git a/clickhouse_connect/driver/asyncclient.py b/clickhouse_connect/driver/asyncclient.py index eef16ed6..2d0813c4 100644 --- a/clickhouse_connect/driver/asyncclient.py +++ b/clickhouse_connect/driver/asyncclient.py @@ -35,7 +35,7 @@ from clickhouse_connect.datatypes.registry import get_from_name from clickhouse_connect.driver import httputil, options, tzutil from clickhouse_connect.driver.asyncqueue import EOF_SENTINEL, AsyncSyncQueue -from clickhouse_connect.driver.binding import bind_query, quote_identifier +from clickhouse_connect.driver.binding import bind_query, quote_identifier, use_form_encoding from clickhouse_connect.driver.client import Client, _apply_arrow_tz_policy from clickhouse_connect.driver.common import StreamContext, coerce_bool, dict_copy from clickhouse_connect.driver.compression import available_compression @@ -580,13 +580,14 @@ async def _query_with_context(self, context: QueryContext) -> QueryResult: # ty context.block_info = True params.update(self._validate_settings(context.settings)) context.rename_response_column = self._rename_response_column + use_form = use_form_encoding(context.final_query, context.bind_params, self.form_encode_query_params) if not context.is_insert and columns_only_re.search(context.uncommented_query): fmt_json_query = f"{context.final_query}\n FORMAT JSON" fields = {"query": fmt_json_query} fields.update(context.bind_params) - if self.form_encode_query_params: + if use_form: files = {} if context.external_data: params.update(context.external_data.query_params) @@ -645,7 +646,7 @@ def decompress_and_parse_json(): files = None data = None - if self.form_encode_query_params: + if use_form: fields = {"query": final_query} fields.update(context.bind_params) @@ -1125,10 +1126,11 @@ def _prep_raw_query(self, query, parameters, settings, fmt, use_database, extern files = None body = None - if external_data and not self.form_encode_query_params and isinstance(final_query, bytes): + use_form = use_form_encoding(final_query, bind_params, self.form_encode_query_params) + if external_data and not use_form and isinstance(final_query, bytes): raise ProgrammingError("Binary query cannot be placed in URL when using External Data; enable form encoding.") - if self.form_encode_query_params: + if use_form: files = {} files["query"] = (None, final_query if isinstance(final_query, str) else final_query.decode()) for k, v in bind_params.items(): diff --git a/clickhouse_connect/driver/binding.py b/clickhouse_connect/driver/binding.py index 3e26230c..65c47b19 100644 --- a/clickhouse_connect/driver/binding.py +++ b/clickhouse_connect/driver/binding.py @@ -6,6 +6,7 @@ from datetime import date, datetime, timezone, tzinfo from enum import Enum from typing import Any +from urllib.parse import urlencode from clickhouse_connect import common from clickhouse_connect.driver import tzutil @@ -192,6 +193,25 @@ def bind_query( return query, bound_params +# Server-side bind parameters are urlencoded into the request URL. Once the encoded length +# passes this budget the client routes them through multipart form data instead, keeping +# oversized payloads out of the URL where proxies (nginx, ALB, CloudFront) reject them with +# HTTP 414. The threshold leaves ample headroom under common request line limits. +MAX_URL_BIND_PARAM_LENGTH = 4096 + + +def use_form_encoding(query, bind_params: dict[str, str], force_form: bool = False) -> bool: + if force_form: + return True + # Binary binds embed bytes into the query, which the form path cannot round-trip; leave + # those on the default path unless form encoding is explicitly requested. + if isinstance(query, bytes): + return False + if not bind_params: + return False + return len(urlencode(bind_params)) > MAX_URL_BIND_PARAM_LENGTH + + def format_str(value: str): return f"'{escape_str(value)}'" diff --git a/clickhouse_connect/driver/httpclient.py b/clickhouse_connect/driver/httpclient.py index e3737a84..ebffd0b1 100644 --- a/clickhouse_connect/driver/httpclient.py +++ b/clickhouse_connect/driver/httpclient.py @@ -19,7 +19,7 @@ from clickhouse_connect import common from clickhouse_connect.datatypes import registry from clickhouse_connect.datatypes.base import ClickHouseType -from clickhouse_connect.driver.binding import bind_query, quote_identifier +from clickhouse_connect.driver.binding import bind_query, quote_identifier, use_form_encoding from clickhouse_connect.driver.client import Client from clickhouse_connect.driver.common import coerce_bool, coerce_int, dict_add, dict_copy from clickhouse_connect.driver.compression import available_compression @@ -268,10 +268,11 @@ def _query_with_context(self, context: QueryContext) -> QueryResult: context.block_info = True params.update(self._validate_settings(context.settings)) context.rename_response_column = self._rename_response_column + use_form = use_form_encoding(context.final_query, context.bind_params, self.form_encode_query_params) if not context.is_insert and columns_only_re.search(context.uncommented_query): # Mirror normal query behavior for form encoding and external data fmt_json_query = f"{context.final_query}\n FORMAT JSON" - if self.form_encode_query_params: + if use_form: fields = {"query": fmt_json_query} fields.update(context.bind_params) if context.external_data: # Deal with form encoding + external data @@ -311,7 +312,7 @@ def _query_with_context(self, context: QueryContext) -> QueryResult: final_query = self._prep_query(context) fields = {} # Setup additional query parameters and body - if self.form_encode_query_params: + if use_form: body = b"" fields["query"] = final_query fields.update(context.bind_params) @@ -687,11 +688,12 @@ def _prep_raw_query( if use_database and self.database: params["database"] = self.database fields = {} + use_form = use_form_encoding(final_query, bind_params, self.form_encode_query_params) # Setup query body - if external_data and not self.form_encode_query_params and isinstance(final_query, bytes): + if external_data and not use_form and isinstance(final_query, bytes): raise ProgrammingError("Binary query cannot be placed in URL when using External Data; enable form encoding.") # Setup additional query parameters and body - if self.form_encode_query_params: + if use_form: body = b"" fields["query"] = final_query fields.update(bind_params) diff --git a/tests/integration_tests/test_form_encode_query.py b/tests/integration_tests/test_form_encode_query.py index d925d16a..45078497 100644 --- a/tests/integration_tests/test_form_encode_query.py +++ b/tests/integration_tests/test_form_encode_query.py @@ -1,6 +1,7 @@ from collections.abc import Callable from clickhouse_connect.driver import Client +from clickhouse_connect.driver.binding import bind_query, use_form_encoding def test_form_encode_query_basic(client_factory, call, table_context: Callable): @@ -126,3 +127,35 @@ def test_form_encode_schema_probe_query(client_factory, call, table_context: Cal assert result.column_names == ("id", "name", "adjusted_value") assert len(result.column_types) == 3 assert len(result.result_set) == 0 + + +def test_auto_form_encode_large_params(client_factory, call): + """Large parameter payloads are auto-promoted to form data without enabling the flag""" + default_client = client_factory() + assert default_client.form_encode_query_params is False + + ids = list(range(3000)) + query = "SELECT 1 WHERE 0 IN {ids:Array(UInt64)}" + final_query, bind_params = bind_query(query, {"ids": ids}) + # Payload exceeds the URL budget, so the default client must route it through the body + assert use_form_encoding(final_query, bind_params) is True + + result = call( + default_client.query, + "SELECT count() FROM numbers(5000) WHERE number IN {ids:Array(UInt64)}", + parameters={"ids": ids}, + ) + assert result.first_row[0] == 3000 + + +def test_auto_form_encode_raw_query_large_params(client_factory, call): + """raw_query auto-promotes large parameter payloads on a default client""" + default_client = client_factory() + + ids = list(range(3000)) + result = call( + default_client.raw_query, + "SELECT count() FROM numbers(5000) WHERE number IN {ids:Array(UInt64)}", + parameters={"ids": ids}, + ) + assert b"3000" in result diff --git a/tests/unit_tests/test_driver/test_binding.py b/tests/unit_tests/test_driver/test_binding.py index 0664d38a..304769fa 100644 --- a/tests/unit_tests/test_driver/test_binding.py +++ b/tests/unit_tests/test_driver/test_binding.py @@ -1,6 +1,6 @@ import pytest -from clickhouse_connect.driver.binding import quote_identifier +from clickhouse_connect.driver.binding import MAX_URL_BIND_PARAM_LENGTH, quote_identifier, use_form_encoding @pytest.mark.parametrize( @@ -41,3 +41,34 @@ def test_quote_identifier_valid_prequoted_passthrough(identifier): ) def test_quote_identifier_invalid_prequoted_escaped_as_raw(identifier, expected): assert quote_identifier(identifier) == expected + + +def test_use_form_encoding_empty(): + assert use_form_encoding("SELECT 1", {}) is False + assert use_form_encoding("SELECT 1", {}, force_form=True) is True + + +def test_use_form_encoding_force(): + assert use_form_encoding("SELECT {id:UInt32}", {"param_id": "1"}, force_form=True) is True + + +def test_use_form_encoding_small_params_stay_in_url(): + assert use_form_encoding("SELECT 1", {"param_id": "123", "param_name": "abc"}) is False + + +def test_use_form_encoding_large_params_promote(): + big = {"param_big": "x" * (MAX_URL_BIND_PARAM_LENGTH + 1)} + assert use_form_encoding("SELECT {big:String}", big) is True + + +def test_use_form_encoding_total_across_params(): + # Many individually small params whose combined encoded length exceeds the budget + params = {f"param_{i}": "v" * 200 for i in range(40)} + assert use_form_encoding("SELECT 1", params) is True + + +def test_use_form_encoding_binary_query_not_promoted(): + # Binary binds make the query bytes; auto-promotion must not kick in unless forced + big = {"param_big": "x" * (MAX_URL_BIND_PARAM_LENGTH + 1)} + assert use_form_encoding(b"SELECT \xff", big) is False + assert use_form_encoding(b"SELECT \xff", big, force_form=True) is True diff --git a/tests/unit_tests/test_driver/test_httpclient.py b/tests/unit_tests/test_driver/test_httpclient.py index b2c50dad..82bba37d 100644 --- a/tests/unit_tests/test_driver/test_httpclient.py +++ b/tests/unit_tests/test_driver/test_httpclient.py @@ -440,6 +440,25 @@ def test_raw_query_with_form_encode(self, mock_raw_request): assert "param_id" in fields assert "param_id" not in params + @patch.object(HttpClient, "_raw_request") + def test_raw_query_auto_form_encode_large_params(self, mock_raw_request): + """Large bind params auto-promote to form data even with form_encode_query_params=False""" + self.client.form_encode_query_params = False + + mock_response = Mock() + mock_response.data = b"test_result" + mock_raw_request.return_value = mock_response + + query = "SELECT * FROM table WHERE name = {name:String}" + parameters = {"name": "x" * 5000} + + self.client.raw_query(query, parameters=parameters) + + body, params, fields = self.extract_raw_request_params(mock_raw_request) + assert body == b"" + assert "param_name" in fields + assert "param_name" not in params + @patch.object(HttpClient, "_raw_request") def test_raw_query_with_external_data_only(self, mock_raw_request): """Test raw_query with external_data only (no form_encode)""" @@ -671,6 +690,24 @@ def test_query_with_context_form_encode(self, mock_raw_request): assert "param_id" in fields assert "param_id" not in params + @patch.object(HttpClient, "_raw_request") + def test_query_with_context_auto_form_encode_large_params(self, mock_raw_request): + """Large bind params auto-promote to form data even with form_encode_query_params=False""" + self.client.form_encode_query_params = False + + mock_raw_request.return_value = self.setup_mock_raw_request() + self.client._transform = Mock() + self.client._transform.parse_response.return_value = Mock(summary=None) + + context = self.create_mock_query_context(query="SELECT * FROM table WHERE name = 'x'", bind_params={"param_name": "x" * 5000}) + + self.client._query_with_context(context) + + body, params, fields = self.extract_raw_request_params(mock_raw_request) + assert body == b"" + assert "param_name" in fields + assert "param_name" not in params + @patch.object(HttpClient, "_raw_request") def test_query_with_context_external_data(self, mock_raw_request): """Test _query_with_context with external_data only""" From 1860351f88687244ed93876edd57513db466027d Mon Sep 17 00:00:00 2001 From: Joe S Date: Tue, 9 Jun 2026 15:35:35 -0700 Subject: [PATCH 2/3] address pr review comments --- CHANGELOG.md | 2 +- clickhouse_connect/driver/__init__.py | 10 ++++++---- clickhouse_connect/driver/binding.py | 9 +++++++-- tests/unit_tests/test_driver/test_binding.py | 6 ++++++ 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ba8cb522..5289e26e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ - Build and publish Windows ARM64 `win_arm64` wheels for CPython 3.10 through 3.14, including the free-threaded 3.14 build. Closes [#785](https://github.com/ClickHouse/clickhouse-connect/issues/785). ### Bug Fixes -- Large query parameter payloads are now automatically sent as form data in the request body instead of the URL query string. Server-side bind parameters were urlencoded into the request URL, so a large `IN` list or a high-dimensional vector embedding could produce a URL that HTTP intermediaries such as nginx, AWS ALB, and CloudFront reject with HTTP 414. The client now routes parameters to the POST body once their encoded length passes a threshold, which keeps the URL small. Setting `form_encode_query_params=True` still forces form encoding for all queries. This does not change the server's per-value size limit, which is governed by `http_max_field_value_size`. Applies to both sync and async clients. Closes [#740](https://github.com/ClickHouse/clickhouse-connect/issues/740). +- Large query parameter payloads are now automatically sent as form data in the request body instead of the URL query string. Server-side bind parameters were urlencoded into the request URL, so a large `IN` list or a high-dimensional vector embedding could produce a URL that HTTP intermediaries such as nginx, AWS ALB, and CloudFront reject with HTTP 414. The client now routes parameters to the POST body once their encoded length passes a threshold, which keeps the URL small. Setting `form_encode_query_params=True` still forces form encoding for all queries. Queries using binary parameter binds are never promoted automatically and only use form encoding when the flag is set. This does not change the server's per-value size limit, which is governed by `http_max_field_value_size`. Applies to both sync and async clients. Closes [#740](https://github.com/ClickHouse/clickhouse-connect/issues/740). ## 1.2.0, 2026-06-08 diff --git a/clickhouse_connect/driver/__init__.py b/clickhouse_connect/driver/__init__.py index d0d2c93e..811286ad 100644 --- a/clickhouse_connect/driver/__init__.py +++ b/clickhouse_connect/driver/__init__.py @@ -185,8 +185,9 @@ def create_client( for bare DateTime columns. :param autogenerate_session_id If set, this will override the 'autogenerate_session_id' common setting. :param form_encode_query_params If True, always send query parameters as form-encoded data in the request body - instead of as URL parameters. Regardless of this setting, large parameter payloads are automatically sent as form - data to avoid exceeding URL length limits. Only available for query operations (not inserts). Default: False + instead of as URL parameters. When False, large parameter payloads are still automatically sent as form data to + avoid exceeding URL length limits, except for queries using binary parameter binds, which are only form-encoded + when this is True. Only available for query operations (not inserts). Default: False :return: ClickHouse Connect Client instance """ host, username, password, port, database, interface = _parse_connection_params( @@ -321,8 +322,9 @@ async def create_async_client( for bare DateTime columns. :param autogenerate_session_id If set, this will override the 'autogenerate_session_id' common setting. :param form_encode_query_params If True, always send query parameters as form-encoded data in the request body - instead of as URL parameters. Regardless of this setting, large parameter payloads are automatically sent as form - data to avoid exceeding URL length limits. Only available for query operations (not inserts). Default: False + instead of as URL parameters. When False, large parameter payloads are still automatically sent as form data to + avoid exceeding URL length limits, except for queries using binary parameter binds, which are only form-encoded + when this is True. Only available for query operations (not inserts). Default: False :return: ClickHouse Connect AsyncClient instance """ try: diff --git a/clickhouse_connect/driver/binding.py b/clickhouse_connect/driver/binding.py index 65c47b19..beb0860b 100644 --- a/clickhouse_connect/driver/binding.py +++ b/clickhouse_connect/driver/binding.py @@ -6,7 +6,7 @@ from datetime import date, datetime, timezone, tzinfo from enum import Enum from typing import Any -from urllib.parse import urlencode +from urllib.parse import quote, urlencode from clickhouse_connect import common from clickhouse_connect.driver import tzutil @@ -209,7 +209,12 @@ def use_form_encoding(query, bind_params: dict[str, str], force_form: bool = Fal return False if not bind_params: return False - return len(urlencode(bind_params)) > MAX_URL_BIND_PARAM_LENGTH + # Raw length is a lower bound on the encoded length, so large payloads short-circuit + # without materializing the encoded string. + if sum(len(k) + len(str(v)) for k, v in bind_params.items()) > MAX_URL_BIND_PARAM_LENGTH: + return True + # Measure with quote so spaces count as %20, matching the longer of the two client encodings. + return len(urlencode(bind_params, quote_via=quote)) > MAX_URL_BIND_PARAM_LENGTH def format_str(value: str): diff --git a/tests/unit_tests/test_driver/test_binding.py b/tests/unit_tests/test_driver/test_binding.py index 304769fa..3e6bdb1e 100644 --- a/tests/unit_tests/test_driver/test_binding.py +++ b/tests/unit_tests/test_driver/test_binding.py @@ -67,6 +67,12 @@ def test_use_form_encoding_total_across_params(): assert use_form_encoding("SELECT 1", params) is True +def test_use_form_encoding_percent_expansion_promotes(): + # Raw length is under the budget but percent-encoding expands each space to %20 + params = {"param_s": " " * (MAX_URL_BIND_PARAM_LENGTH // 2)} + assert use_form_encoding("SELECT 1", params) is True + + def test_use_form_encoding_binary_query_not_promoted(): # Binary binds make the query bytes; auto-promotion must not kick in unless forced big = {"param_big": "x" * (MAX_URL_BIND_PARAM_LENGTH + 1)} From 2dc3bc745c4735b931c08459ae98aec5a250ae51 Mon Sep 17 00:00:00 2001 From: Joe S Date: Tue, 9 Jun 2026 16:27:54 -0700 Subject: [PATCH 3/3] add type hint --- clickhouse_connect/driver/binding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clickhouse_connect/driver/binding.py b/clickhouse_connect/driver/binding.py index beb0860b..0d619f74 100644 --- a/clickhouse_connect/driver/binding.py +++ b/clickhouse_connect/driver/binding.py @@ -200,7 +200,7 @@ def bind_query( MAX_URL_BIND_PARAM_LENGTH = 4096 -def use_form_encoding(query, bind_params: dict[str, str], force_form: bool = False) -> bool: +def use_form_encoding(query: str | bytes, bind_params: dict[str, str], force_form: bool = False) -> bool: if force_form: return True # Binary binds embed bytes into the query, which the form path cannot round-trip; leave