Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions zeeguu/core/model/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from zeeguu.core.model.db import db

from zeeguu.core.model.domain_name import DomainName
from zeeguu.core.util.url import remove_tracking_query_params


class Url(db.Model):
Expand Down Expand Up @@ -78,6 +79,11 @@ def get_domain(cls, url):

@classmethod
def get_path(cls, url: str):
# Strip tracking cruft (gaa_*, utm_*, ...) before extracting the path,
# so the stored path stays canonical and short. Some publishers append
# long signed access tokens that otherwise overflow the 255-char column.
url = remove_tracking_query_params(url)

protocol_re = "(.*://)?"
domain_re = "([^/?]*)"
path_re = "(.*)"
Expand Down
5 changes: 5 additions & 0 deletions zeeguu/core/model/user_activitiy_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from zeeguu.core.behavioral_modeling import (
find_last_reading_percentage,
)
from zeeguu.core.util.url import remove_tracking_query_params
import zeeguu

from zeeguu.core.model.db import db
Expand Down Expand Up @@ -475,6 +476,10 @@ def create_from_post_data(cls, session, data, user):

event = data.get("event", "")
value = data.get("value", "")
# Strip tracking cruft (gaa_*, utm_*, ...) and clamp to the column size.
# Some publishers append long signed access tokens that overflow the
# 255-char `value` column and 500 the whole activity upload.
value = remove_tracking_query_params(value)[:255]
extra_data = data.get("extra_data", "")
source_id = data.get("source_id", "")
platform = data.get("platform", None)
Expand Down
5 changes: 5 additions & 0 deletions zeeguu/core/test/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ def test_domain_plus_path_must_be_unique(self):

self.assertTrue("Duplicate entry" or "IntegrityError" in str(context.exception))

def test_get_path_strips_tracking_params(self):
    # A gaa_* token in the query must not end up in the extracted path,
    # otherwise the value can outgrow the 255-char path column.
    tracked_url = "https://www.bt.dk/krimi/x?gaa_sig=AAAA&page=2"
    extracted_path = Url.get_path(tracked_url)
    self.assertEqual(extracted_path, "/krimi/x?page=2")

def test_find_or_create_works(self):

_url = self.url_rule.url.as_string()
Expand Down
66 changes: 66 additions & 0 deletions zeeguu/core/test/test_util_url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from unittest import TestCase

from zeeguu.core.util.url import remove_tracking_query_params


class RemoveTrackingQueryParamsTest(TestCase):

    # Motivating real-world input: bt.dk links opened through Google Discover
    # carry gaa_* signed tokens, which pushed the value past 255 characters.
    def test_strips_gaa_tokens(self):
        article = "https://www.bt.dk/krimi/rumaenske-tyve-haerger"
        tokens = "?gaa_at=eafs&gaa_n=AVngi4j3JkJxoD1&gaa_ts=6a1c8a0c&gaa_sig=c9nTTe9K9P"
        cleaned = remove_tracking_query_params(article + tokens)
        self.assertEqual(cleaned, "https://www.bt.dk/krimi/rumaenske-tyve-haerger")

    def test_strips_utm_and_click_ids_keeps_real_params(self):
        noisy = "https://x.dk/a?id=42&utm_source=news&fbclid=abc&gclid=z&page=2"
        cleaned = remove_tracking_query_params(noisy)
        self.assertEqual(cleaned, "https://x.dk/a?id=42&page=2")

    def test_leaves_clean_url_untouched(self):
        clean = "https://x.dk/a?id=42&page=2"
        self.assertEqual(remove_tracking_query_params(clean), clean)

    def test_leaves_non_url_and_empty_untouched(self):
        # Same order as always: the plain-text value first, then the empty one.
        for text in ("OPEN POPUP", ""):
            self.assertEqual(remove_tracking_query_params(text), text)

    # The tests below pin the "surgical" contract: when no tracking param is
    # present the input must come back byte-for-byte identical. The result is
    # used as a DB key (url.path) and rebuilt into URLs that are served back,
    # so any re-encoding would corrupt signed values and miss lookups against
    # rows that are already stored.

    def test_does_not_re_encode_when_nothing_stripped(self):
        # %20 must not turn into "+", and the bare "flag" must stay valueless
        untouched = "https://x.dk/a?q=hello%20world&flag"
        cleaned = remove_tracking_query_params("https://x.dk/a?q=hello%20world&flag")
        self.assertEqual(cleaned, untouched)

    def test_does_not_mangle_embedded_articleurl(self):
        # The articleURL= wrapper holds an inner URL with its own ?/&/= chars;
        # they have to survive verbatim so that as_canonical_string() and
        # split('articleURL=') keep working.
        wrapped = "https://s.dk/read?articleURL=https://bt.dk/x?foo=1&bar=2"
        self.assertEqual(remove_tracking_query_params(wrapped), wrapped)

    def test_preserves_signed_param_encoding_while_stripping(self):
        # Dropping utm_source must leave the signed value's %2F/%3D as they are
        signed_with_utm = "https://i.cdn/x.jpg?sig=ab%2Fcd%3D&utm_source=x"
        cleaned = remove_tracking_query_params(signed_with_utm)
        self.assertEqual(cleaned, "https://i.cdn/x.jpg?sig=ab%2Fcd%3D")

    def test_preserves_fragment(self):
        with_fragment = "https://x.dk/a?gaa_sig=Z#frag"
        cleaned = remove_tracking_query_params(with_fragment)
        self.assertEqual(cleaned, "https://x.dk/a#frag")
55 changes: 55 additions & 0 deletions zeeguu/core/util/url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from urllib.parse import urlparse, urlunparse

"""
Helpers for cleaning up URLs before we store them.

Some publishers (e.g. bt.dk via Google Discover / "Subscribe with Google")
append long signed access tokens as query params: gaa_at, gaa_n, gaa_ts,
gaa_sig. Together with the usual utm_*/fbclid/gclid analytics cruft these
can push a URL well past 255 chars, which is the size of e.g. the
user_activity_data.value column.
"""

# Query-param keys that mean nothing to us and only make the URL longer.
# A key is treated as tracking when it starts with one of the prefixes
# or is exactly one of the listed names.
_TRACKING_PARAM_PREFIXES = (
    # Google Article Access tokens (Subscribe with Google / Discover)
    "gaa_",
    # analytics campaign tags
    "utm_",
)
_TRACKING_PARAM_NAMES = {
    # Facebook click id
    "fbclid",
    # Google Ads click id
    "gclid",
    # Google Analytics
    "_ga",
}


def _is_tracking_param(name: str) -> bool:
    """Return True when *name* is a known tracking key or has a tracking prefix."""
    # str.startswith accepts the whole tuple of prefixes in one call.
    if name.startswith(_TRACKING_PARAM_PREFIXES):
        return True
    return name in _TRACKING_PARAM_NAMES


def remove_tracking_query_params(url: str) -> str:
    """Drop known tracking query params (gaa_*, utm_*, fbclid, ...).

    Surgical by design: operates on the raw query string and only the
    matched ``key=value`` segments are removed. Surviving params keep their
    exact original encoding, and a URL with no tracking params is returned
    byte-for-byte unchanged. This matters because the result is used as a DB
    key (url.path) and is reconstructed into URLs served back to the client
    (image/CDN URLs, articleURL=... wrappers) — re-encoding would corrupt
    signed values and break lookups against already-stored rows.

    Never raises on a malformed URL: this helper is applied to values posted
    by clients, so input that ``urlparse`` rejects is handed back as-is
    instead of failing the whole request.

    Args:
        url: The string to clean. Empty strings and strings without
            ``://`` are treated as non-URLs.

    Returns:
        The input unchanged when it is not a URL, cannot be parsed, has no
        query, or has no tracking params; otherwise the URL reassembled by
        ``urlunparse`` with the tracking segments removed (the ``?`` goes
        away too when no params are left).
    """
    if not url or "://" not in url:
        return url

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed authorities (e.g. an unbalanced
        # "[" in the host). Cleaning is best-effort, so leave such input alone.
        return url
    if not parsed.query:
        return url

    # Split the *raw* query (no decode) and drop only the tracking segments.
    # The key is everything before the first "="; a valueless segment such
    # as "flag" is its own key.
    segments = parsed.query.split("&")
    kept = [s for s in segments if not _is_tracking_param(s.split("=", 1)[0])]
    if len(kept) == len(segments):
        return url  # nothing stripped — don't touch the original encoding

    return urlunparse(parsed._replace(query="&".join(kept)))
Loading