From 8fb7d0211b5b8a043bd926dee5f0f8f39f16ed93 Mon Sep 17 00:00:00 2001 From: Mircea Lungu Date: Fri, 5 Jun 2026 14:12:16 +0200 Subject: [PATCH 1/2] Strip tracking params (gaa_*, utm_*, ...) from stored URLs Some publishers (e.g. bt.dk via Google Discover / Subscribe with Google) append long signed access tokens (gaa_at, gaa_n, gaa_ts, gaa_sig) to article URLs. These pushed URLs past 255 chars and broke two things: - user_activity_data.value insert 500'd with DataError 1406 on the OPEN POPUP event from the extension. - Url.get_path produced an over-long path that silently failed the len(path) > 255 guard, so the article's URL row was dropped. Add remove_tracking_query_params() and apply it at both sites: clean + clamp the activity value, and clean at Url.get_path so store and lookup stay canonical and consistent. Co-Authored-By: Claude Opus 4.8 (1M context) --- zeeguu/core/model/url.py | 6 ++++ zeeguu/core/model/user_activitiy_data.py | 5 +++ zeeguu/core/test/test_url.py | 5 +++ zeeguu/core/test/test_util_url.py | 33 +++++++++++++++++++ zeeguu/core/util/url.py | 41 ++++++++++++++++++++++++ 5 files changed, 90 insertions(+) create mode 100644 zeeguu/core/test/test_util_url.py create mode 100644 zeeguu/core/util/url.py diff --git a/zeeguu/core/model/url.py b/zeeguu/core/model/url.py index 680c46fa6..87d8b2776 100644 --- a/zeeguu/core/model/url.py +++ b/zeeguu/core/model/url.py @@ -13,6 +13,7 @@ from zeeguu.core.model.db import db from zeeguu.core.model.domain_name import DomainName +from zeeguu.core.util.url import remove_tracking_query_params class Url(db.Model): @@ -78,6 +79,11 @@ def get_domain(cls, url): @classmethod def get_path(cls, url: str): + # Strip tracking cruft (gaa_*, utm_*, ...) before extracting the path, + # so the stored path stays canonical and short. Some publishers append + # long signed access tokens that otherwise overflow the 255-char column. + url = remove_tracking_query_params(url) + protocol_re = "(.*://)?" domain_re = "([^/?]*)" path_re = "(.*)" diff --git a/zeeguu/core/model/user_activitiy_data.py b/zeeguu/core/model/user_activitiy_data.py index 66060a9e6..af91c8553 100644 --- a/zeeguu/core/model/user_activitiy_data.py +++ b/zeeguu/core/model/user_activitiy_data.py @@ -24,6 +24,7 @@ from zeeguu.core.behavioral_modeling import ( find_last_reading_percentage, ) +from zeeguu.core.util.url import remove_tracking_query_params import zeeguu from zeeguu.core.model.db import db @@ -475,6 +476,10 @@ def create_from_post_data(cls, session, data, user): event = data.get("event", "") value = data.get("value", "") + # Strip tracking cruft (gaa_*, utm_*, ...) and clamp to the column size. + # Some publishers append long signed access tokens that overflow the + # 255-char `value` column and 500 the whole activity upload. + value = remove_tracking_query_params(value)[:255] extra_data = data.get("extra_data", "") source_id = data.get("source_id", "") platform = data.get("platform", None) diff --git a/zeeguu/core/test/test_url.py b/zeeguu/core/test/test_url.py index b767baf8e..153390913 100644 --- a/zeeguu/core/test/test_url.py +++ b/zeeguu/core/test/test_url.py @@ -27,6 +27,11 @@ def test_domain_plus_path_must_be_unique(self): self.assertTrue("Duplicate entry" or "IntegrityError" in str(context.exception)) + def test_get_path_strips_tracking_params(self): + # gaa_* tokens would otherwise overflow the 255-char path column + url = "https://www.bt.dk/krimi/x?gaa_sig=AAAA&page=2" + self.assertEqual(Url.get_path(url), "/krimi/x?page=2") + def test_find_or_create_works(self): _url = self.url_rule.url.as_string() diff --git a/zeeguu/core/test/test_util_url.py b/zeeguu/core/test/test_util_url.py new file mode 100644 index 000000000..ee4259e71 --- /dev/null +++ b/zeeguu/core/test/test_util_url.py @@ -0,0 +1,33 @@ +from unittest import TestCase + +from zeeguu.core.util.url import remove_tracking_query_params + + +class RemoveTrackingQueryParamsTest(TestCase): + + # The real-world case that motivated this: bt.dk via Google Discover + # appends gaa_* signed tokens that overflow the 255-char value column. + def test_strips_gaa_tokens(self): + url = ( + "https://www.bt.dk/krimi/rumaenske-tyve-haerger" + "?gaa_at=eafs&gaa_n=AVngi4j3JkJxoD1&gaa_ts=6a1c8a0c&gaa_sig=c9nTTe9K9P" + ) + self.assertEqual( + remove_tracking_query_params(url), + "https://www.bt.dk/krimi/rumaenske-tyve-haerger", + ) + + def test_strips_utm_and_click_ids_keeps_real_params(self): + url = "https://x.dk/a?id=42&utm_source=news&fbclid=abc&gclid=z&page=2" + self.assertEqual( + remove_tracking_query_params(url), + "https://x.dk/a?id=42&page=2", + ) + + def test_leaves_clean_url_untouched(self): + url = "https://x.dk/a?id=42&page=2" + self.assertEqual(remove_tracking_query_params(url), url) + + def test_leaves_non_url_and_empty_untouched(self): + self.assertEqual(remove_tracking_query_params("OPEN POPUP"), "OPEN POPUP") + self.assertEqual(remove_tracking_query_params(""), "") diff --git a/zeeguu/core/util/url.py b/zeeguu/core/util/url.py new file mode 100644 index 000000000..fbb8c94c4 --- /dev/null +++ b/zeeguu/core/util/url.py @@ -0,0 +1,41 @@ +from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse + +""" + Helpers for cleaning up URLs before we store them. + + Some publishers (e.g. bt.dk via Google Discover / "Subscribe with Google") + append long signed access tokens as query params: gaa_at, gaa_n, gaa_ts, + gaa_sig. Together with the usual utm_*/fbclid/gclid analytics cruft these + can push a URL well past 255 chars, which is the size of e.g. the + user_activity_data.value column. +""" + +# Param names (or prefixes) that carry no meaning for us and only bloat the URL. +_TRACKING_PARAM_PREFIXES = ( + "gaa_", # Google Article Access tokens (Subscribe with Google / Discover) + "utm_", # analytics campaign tags +) +_TRACKING_PARAM_NAMES = { + "fbclid", # Facebook click id + "gclid", # Google Ads click id + "_ga", # Google Analytics +} + + +def _is_tracking_param(name: str) -> bool: + return name in _TRACKING_PARAM_NAMES or name.startswith(_TRACKING_PARAM_PREFIXES) + + +def remove_tracking_query_params(url: str) -> str: + """Drop known tracking query params, preserving everything else. + + Leaves non-URL strings untouched. + """ + if not url or "://" not in url: + return url + + parsed = urlparse(url) + kept = [(k, v) for k, v in parse_qsl(parsed.query, keep_blank_values=True) + if not _is_tracking_param(k)] + + return urlunparse(parsed._replace(query=urlencode(kept))) From 275f9acf9b88ce9481b39aff2e2d0e21ffc44550 Mon Sep 17 00:00:00 2001 From: Mircea Lungu Date: Fri, 5 Jun 2026 17:27:36 +0200 Subject: [PATCH 2/2] Make URL cleaning surgical: only strip matched params, never re-encode Code review found the parse_qsl+urlencode round-trip mutated URLs even when no tracking param was present: - embedded articleURL= got percent-mangled and split - ?q=a%20b -> ?q=a+b, valueless ?key -> ?key= - signed image/CDN query values re-encoded -> broken served URLs - every URL with a query string no longer matched its already-stored url.path row -> duplicate Url/Article rows, broken translated-article cache Now operate on the raw query string: drop only the matched key=value segments, leave survivors byte-for-byte, and return the original string untouched when nothing was stripped. Add regression tests. Co-Authored-By: Claude Opus 4.8 (1M context) --- zeeguu/core/test/test_util_url.py | 33 +++++++++++++++++++++++++++++++ zeeguu/core/util/url.py | 24 +++++++++++++++++----- 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/zeeguu/core/test/test_util_url.py b/zeeguu/core/test/test_util_url.py index ee4259e71..2f0cf9c3e 100644 --- a/zeeguu/core/test/test_util_url.py +++ b/zeeguu/core/test/test_util_url.py @@ -31,3 +31,36 @@ def test_leaves_clean_url_untouched(self): def test_leaves_non_url_and_empty_untouched(self): self.assertEqual(remove_tracking_query_params("OPEN POPUP"), "OPEN POPUP") self.assertEqual(remove_tracking_query_params(""), "") + + # The cleaning must be surgical: a URL without tracking params has to come + # back byte-for-byte identical, because the result is used as a DB key + # (url.path) and reconstructed into URLs we serve back. Re-encoding would + # corrupt signed values and miss lookups against already-stored rows. + + def test_does_not_re_encode_when_nothing_stripped(self): + # space stays %20 (not +), valueless param stays valueless + self.assertEqual( + remove_tracking_query_params("https://x.dk/a?q=hello%20world&flag"), + "https://x.dk/a?q=hello%20world&flag", + ) + + def test_does_not_mangle_embedded_articleurl(self): + # an articleURL= wrapper carries an inner URL with its own ?/&/= — these + # must survive verbatim so as_canonical_string()/split('articleURL=') work + url = "https://s.dk/read?articleURL=https://bt.dk/x?foo=1&bar=2" + self.assertEqual(remove_tracking_query_params(url), url) + + def test_preserves_signed_param_encoding_while_stripping(self): + # a load-bearing signed value keeps its %2F/%3D even when utm is removed + self.assertEqual( + remove_tracking_query_params( + "https://i.cdn/x.jpg?sig=ab%2Fcd%3D&utm_source=x" + ), + "https://i.cdn/x.jpg?sig=ab%2Fcd%3D", + ) + + def test_preserves_fragment(self): + self.assertEqual( + remove_tracking_query_params("https://x.dk/a?gaa_sig=Z#frag"), + "https://x.dk/a#frag", + ) diff --git a/zeeguu/core/util/url.py b/zeeguu/core/util/url.py index fbb8c94c4..7033aaa79 100644 --- a/zeeguu/core/util/url.py +++ b/zeeguu/core/util/url.py @@ -1,4 +1,4 @@ -from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse +from urllib.parse import urlparse, urlunparse """ Helpers for cleaning up URLs before we store them. @@ -27,7 +27,15 @@ def _is_tracking_param(name: str) -> bool: def remove_tracking_query_params(url: str) -> str: - """Drop known tracking query params, preserving everything else. + """Drop known tracking query params (gaa_*, utm_*, fbclid, ...). + + Surgical by design: operates on the raw query string and only the + matched ``key=value`` segments are removed. Surviving params keep their + exact original encoding, and a URL with no tracking params is returned + byte-for-byte unchanged. This matters because the result is used as a DB + key (url.path) and is reconstructed into URLs served back to the client + (image/CDN URLs, articleURL=... wrappers) — re-encoding would corrupt + signed values and break lookups against already-stored rows. Leaves non-URL strings untouched. """ @@ -35,7 +43,13 @@ def remove_tracking_query_params(url: str) -> str: return url parsed = urlparse(url) - kept = [(k, v) for k, v in parse_qsl(parsed.query, keep_blank_values=True) - if not _is_tracking_param(k)] + if not parsed.query: + return url + + # Split the *raw* query (no decode) and drop only the tracking segments. + segments = parsed.query.split("&") + kept = [s for s in segments if not _is_tracking_param(s.split("=", 1)[0])] + if len(kept) == len(segments): + return url # nothing stripped — don't touch the original encoding - return urlunparse(parsed._replace(query=urlencode(kept))) + return urlunparse(parsed._replace(query="&".join(kept)))