From b7139420575294e1ed9436bdfd02b5737db3361e Mon Sep 17 00:00:00 2001 From: Alek050 Date: Tue, 24 Feb 2026 11:06:11 +0100 Subject: [PATCH 1/5] Added support for BOM in xml feeds --- kloppy/infra/serializers/event/statsperform/parsers/__init__.py | 2 +- kloppy/infra/serializers/tracking/tracab/parsers/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kloppy/infra/serializers/event/statsperform/parsers/__init__.py b/kloppy/infra/serializers/event/statsperform/parsers/__init__.py index 96c6bc29c..4337b1c12 100644 --- a/kloppy/infra/serializers/event/statsperform/parsers/__init__.py +++ b/kloppy/infra/serializers/event/statsperform/parsers/__init__.py @@ -17,7 +17,7 @@ def get_parser( ) -> OptaParser: # infer the data format if not provided if feed_format is None: - if feed.read(1).decode("utf-8")[0] == "<": + if feed.read(10).decode("utf-8").lstrip("\ufeff")[0] == "<": feed_format = "XML" else: feed_format = "JSON" diff --git a/kloppy/infra/serializers/tracking/tracab/parsers/__init__.py b/kloppy/infra/serializers/tracking/tracab/parsers/__init__.py index eeed75585..6b9c6fadf 100644 --- a/kloppy/infra/serializers/tracking/tracab/parsers/__init__.py +++ b/kloppy/infra/serializers/tracking/tracab/parsers/__init__.py @@ -18,7 +18,7 @@ def get_metadata_parser( ) -> TracabMetadataParser: # infer the data format if not provided if feed_format is None: - if feed.read(1).decode("utf-8")[0] == "<": + if feed.read(10).decode("utf-8").lstrip("\ufeff")[0] == "<": feed.seek(0) meta_data = objectify.fromstring(feed.read()) if hasattr(meta_data, "match"): From a0afc238473b532f207e4be5e71f373352b2b535 Mon Sep 17 00:00:00 2001 From: Alek050 Date: Tue, 24 Feb 2026 11:20:22 +0100 Subject: [PATCH 2/5] Changed to 5 bytes instead of 10 --- kloppy/infra/serializers/tracking/tracab/parsers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kloppy/infra/serializers/tracking/tracab/parsers/__init__.py 
b/kloppy/infra/serializers/tracking/tracab/parsers/__init__.py index 6b9c6fadf..ad34c09fd 100644 --- a/kloppy/infra/serializers/tracking/tracab/parsers/__init__.py +++ b/kloppy/infra/serializers/tracking/tracab/parsers/__init__.py @@ -18,7 +18,7 @@ def get_metadata_parser( ) -> TracabMetadataParser: # infer the data format if not provided if feed_format is None: - if feed.read(10).decode("utf-8").lstrip("\ufeff")[0] == "<": + if feed.read(5).decode("utf-8").lstrip("\ufeff")[0] == "<": feed.seek(0) meta_data = objectify.fromstring(feed.read()) if hasattr(meta_data, "match"): From 0a0309ad66ff535c3d1a71ba61784c04054c5347 Mon Sep 17 00:00:00 2001 From: Alek050 Date: Tue, 24 Feb 2026 11:22:37 +0100 Subject: [PATCH 3/5] now also for statsperform --- kloppy/infra/serializers/event/statsperform/parsers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kloppy/infra/serializers/event/statsperform/parsers/__init__.py b/kloppy/infra/serializers/event/statsperform/parsers/__init__.py index 4337b1c12..b5dab333c 100644 --- a/kloppy/infra/serializers/event/statsperform/parsers/__init__.py +++ b/kloppy/infra/serializers/event/statsperform/parsers/__init__.py @@ -17,7 +17,7 @@ def get_parser( ) -> OptaParser: # infer the data format if not provided if feed_format is None: - if feed.read(10).decode("utf-8").lstrip("\ufeff")[0] == "<": + if feed.read(5).decode("utf-8").lstrip("\ufeff")[0] == "<": feed_format = "XML" else: feed_format = "JSON" From ec14b7e3beb5424c5a3426d9b7dd0eb38fd2184d Mon Sep 17 00:00:00 2001 From: Alek050 Date: Tue, 24 Feb 2026 12:57:53 +0100 Subject: [PATCH 4/5] Updated to use utf-8-sig --- kloppy/infra/serializers/event/statsperform/parsers/__init__.py | 2 +- kloppy/infra/serializers/tracking/tracab/parsers/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kloppy/infra/serializers/event/statsperform/parsers/__init__.py b/kloppy/infra/serializers/event/statsperform/parsers/__init__.py index 
b5dab333c..7475d7355 100644 --- a/kloppy/infra/serializers/event/statsperform/parsers/__init__.py +++ b/kloppy/infra/serializers/event/statsperform/parsers/__init__.py @@ -17,7 +17,7 @@ def get_parser( ) -> OptaParser: # infer the data format if not provided if feed_format is None: - if feed.read(5).decode("utf-8").lstrip("\ufeff")[0] == "<": + if feed.read(4).decode("utf-8-sig")[0] == "<": feed_format = "XML" else: feed_format = "JSON" diff --git a/kloppy/infra/serializers/tracking/tracab/parsers/__init__.py b/kloppy/infra/serializers/tracking/tracab/parsers/__init__.py index ad34c09fd..6dddb23b5 100644 --- a/kloppy/infra/serializers/tracking/tracab/parsers/__init__.py +++ b/kloppy/infra/serializers/tracking/tracab/parsers/__init__.py @@ -18,7 +18,7 @@ def get_metadata_parser( ) -> TracabMetadataParser: # infer the data format if not provided if feed_format is None: - if feed.read(5).decode("utf-8").lstrip("\ufeff")[0] == "<": + if feed.read(4).decode("utf-8-sig")[0] == "<": feed.seek(0) meta_data = objectify.fromstring(feed.read()) if hasattr(meta_data, "match"): From 4141a1be28956cfdb97f0a86130087eb5854fbe9 Mon Sep 17 00:00:00 2001 From: Pieter Robberechts Date: Tue, 10 Mar 2026 18:19:43 +0100 Subject: [PATCH 5/5] test: add unit tests --- kloppy/tests/test_statsperform.py | 31 +++++++++++++++++++++++++++++++ kloppy/tests/test_tracab.py | 21 +++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/kloppy/tests/test_statsperform.py b/kloppy/tests/test_statsperform.py index 9510cb641..d36290018 100644 --- a/kloppy/tests/test_statsperform.py +++ b/kloppy/tests/test_statsperform.py @@ -1,4 +1,5 @@ from datetime import datetime, timedelta, timezone +import io from pathlib import Path import pytest @@ -419,3 +420,33 @@ def test_correct_normalized_deserialization( tracking_system="sportvu", coordinates="kloppy", ) + + +class TestStatsPerformXMLWithBOM: + def test_event_xml_with_bom( + self, event_metadata_xml: Path, event_data_xml: Path + ): + with 
open(event_metadata_xml, "rb") as f: + ma1_data = f.read() + with open(event_data_xml, "rb") as f: + ma3_data = f.read() + + dataset = statsperform.load_event( + ma1_data=io.BytesIO(b"\xef\xbb\xbf" + ma1_data), + ma3_data=io.BytesIO(b"\xef\xbb\xbf" + ma3_data), + ) + + assert len(dataset.records) > 0 + + def test_tracking_xml_with_bom( + self, tracking_metadata_xml: Path, tracking_data: Path + ): + with open(tracking_metadata_xml, "rb") as f: + ma1_data = f.read() + + dataset = statsperform.load_tracking( + ma1_data=io.BytesIO(b"\xef\xbb\xbf" + ma1_data), + ma25_data=tracking_data, + ) + + assert len(dataset.records) > 0 diff --git a/kloppy/tests/test_tracab.py b/kloppy/tests/test_tracab.py index 242cf7a2d..42ad21ea4 100644 --- a/kloppy/tests/test_tracab.py +++ b/kloppy/tests/test_tracab.py @@ -1,4 +1,5 @@ from datetime import datetime, timedelta, timezone +import io from pathlib import Path import pytest @@ -392,3 +393,23 @@ def test_correct_deserialization( ) meta_tracking_assertions(dataset) + + +class TestTracabXMLWithBOM: + def test_correct_deserialization( + self, xml_meta_data: Path, dat_raw_data: Path + ): + # Read the original XML and prepend a UTF-8 BOM + with open(xml_meta_data, "rb") as f: + xml_data = f.read() + + bom_xml = b"\xef\xbb\xbf" + xml_data + + dataset = tracab.load( + meta_data=io.BytesIO(bom_xml), + raw_data=dat_raw_data, + coordinates="tracab", + only_alive=False, + ) + + meta_tracking_assertions(dataset)