From 5aea8c993c99a3fe8e10c460a9b6be4e96ee4c1d Mon Sep 17 00:00:00 2001
From: Nico Thomaier <me@semidark.net>
Date: Fri, 17 Apr 2026 15:16:10 +0200
Subject: [PATCH 1/2] feat: add German G2P with text normalization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add misaki/de.py with:
- normalize_text_de(): expands numbers, dates, times, currency,
  abbreviations, ordinals, and years to spelled-out German text
- DEG2P: wraps normalize_text_de() + EspeakG2P(language='de')

Normalizer handles:
- Cardinal numbers (zweiundvierzig), ordinals (dritte)
- Years (neunzehnhundertfünfundachtzig)
- Dates DD.MM.YYYY, times HH:MM
- Currency (Euro, Dollar, Pfund, Yen with cents)
- 30+ German abbreviations (Dr., GmbH, z.B., usw., months)
- German-format numbers (1.234,56 with thousand dots + decimal comma)
- Quote normalization, whitespace cleanup

Add tests/test_de.py with 60 unit tests (CI-safe, no espeak needed)
and 4 integration tests (skipped when phonemizer unavailable).

Add 'de' optional dependency in pyproject.toml.
---
 misaki/de.py     | 304 ++++++++++++++++++++++++++++++++++++++++++
 pyproject.toml   |   1 +
 tests/test_de.py | 338 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 643 insertions(+)
 create mode 100644 misaki/de.py
 create mode 100644 tests/test_de.py

diff --git a/misaki/de.py b/misaki/de.py
new file mode 100644
index 0000000..f80bf2b
--- /dev/null
+++ b/misaki/de.py
@@ -0,0 +1,304 @@
+"""German G2P: text normalization + espeak-ng phonemization.
+
+normalize_text_de() expands numbers, dates, times, currency, and
+abbreviations so espeak-ng receives clean spelled-out text.
+
+DEG2P wraps normalize_text_de() + EspeakG2P for use in KPipeline.
+"""
+
+from typing import Tuple
+import re
+
+# ── cardinal numbers ─────────────────────────────────────────────────────────
+
+_ONES = [
+    "",
+    "ein",
+    "zwei",
+    "drei",
+    "vier",
+    "fünf",
+    "sechs",
+    "sieben",
+    "acht",
+    "neun",
+    "zehn",
+    "elf",
+    "zwölf",
+    "dreizehn",
+    "vierzehn",
+    "fünfzehn",
+    "sechzehn",
+    "siebzehn",
+    "achtzehn",
+    "neunzehn",
+]
+_TENS = [
+    "",
+    "",
+    "zwanzig",
+    "dreißig",
+    "vierzig",
+    "fünfzig",
+    "sechzig",
+    "siebzig",
+    "achtzig",
+    "neunzig",
+]
+
+
+def _int_to_de(n, standalone=True):
+    """Convert an integer to German number words.
+
+    standalone=True ends a number on 1 with "eins" (1, 101, 2001); False keeps the
+    stem "ein". Only the trailing remainder inherits it; leading parts are stems.
+    """
+    if n < 0:
+        return "minus " + _int_to_de(-n, standalone=standalone)
+    if n == 0:
+        return "null"
+    if n == 1:
+        return "eins" if standalone else "ein"
+    if n < 20:
+        return _ONES[n]
+    if n < 100:
+        ones, tens = n % 10, n // 10
+        if ones:
+            return _ONES[ones] + "und" + _TENS[tens]
+        return _TENS[tens]
+    if n < 1_000:
+        h, r = n // 100, n % 100
+        return _ONES[h] + "hundert" + (_int_to_de(r, standalone=standalone) if r else "")
+    if n < 1_000_000:
+        t, r = n // 1_000, n % 1_000
+        prefix = _int_to_de(t, standalone=False)  # yields "ein" for 1: eintausend
+        return prefix + "tausend" + (_int_to_de(r, standalone=standalone) if r else "")
+    if n < 1_000_000_000:
+        m, r = n // 1_000_000, n % 1_000_000
+        word = (
+            "eine Million" if m == 1 else _int_to_de(m, standalone=False) + " Millionen"
+        )
+        return word + (" " + _int_to_de(r, standalone=standalone) if r else "")
+    b, r = n // 1_000_000_000, n % 1_000_000_000
+    word = (
+        "eine Milliarde" if b == 1 else _int_to_de(b, standalone=False) + " Milliarden"
+    )
+    return word + (" " + _int_to_de(r, standalone=standalone) if r else "")
+
+
+# ── ordinals ─────────────────────────────────────────────────────────────────
+
+_ORD_IRREG = {1: "erst", 2: "zweit", 3: "dritt", 7: "siebt", 8: "acht"}
+
+
+def _ordinal_stem_de(n):
+    """Ordinal stem without inflection suffix; the last two digits pick the form (101 -> einhunderterst)."""
+    if n > 0 and n % 100 in _ORD_IRREG:
+        return (_int_to_de(n - n % 100, standalone=False) if n > 99 else "") + _ORD_IRREG[n % 100]
+    return _int_to_de(n, standalone=False) + ("t" if n < 20 or 0 < n % 100 < 20 else "st")
+
+
+# ── years ────────────────────────────────────────────────────────────────────
+
+
+def _year_de(n):
+    """Spell a year the way it is spoken in German.
+
+    1100-1999 are read as hundreds (1901 -> neunzehnhunderteins, ending on "eins"
+    because a year stands alone); other values fall back to the plain cardinal.
+    """
+    if 1100 <= n <= 1999:
+        c, r = n // 100, n % 100
+        return _int_to_de(c, standalone=False) + "hundert" + (_int_to_de(r) if r else "")
+    return _int_to_de(n)
+
+
+# ── month names ──────────────────────────────────────────────────────────────
+
+_MONTHS = [
+    "",
+    "Januar",
+    "Februar",
+    "März",
+    "April",
+    "Mai",
+    "Juni",
+    "Juli",
+    "August",
+    "September",
+    "Oktober",
+    "November",
+    "Dezember",
+]
+
+# ── currency ─────────────────────────────────────────────────────────────────
+
+_CURRENCY = {"€": "Euro", "$": "Dollar", "£": "Pfund", "¥": "Yen"}
+
+
+def _currency_repl(sym, num):
+    """Spell out an amount: ("€", "9,99") -> "neun Euro und neunundneunzig Cent"."""
+    core = num.rstrip(".,")  # the caller's pattern may swallow sentence punctuation
+    try:
+        total = round(float(core.replace(".", "").replace(",", ".")) * 100)
+    except (ValueError, OverflowError):
+        return sym + num
+    units, cents = divmod(total, 100)  # whole cents, so 9,999 carries into the units
+    # standalone=False: before a noun German says "ein Euro" / "ein Cent", not "eins".
+    out = _int_to_de(units, standalone=False) + " " + _CURRENCY.get(sym, sym)
+    out += " und " + _int_to_de(cents, standalone=False) + " Cent" if cents else ""
+    return out + num[len(core):]  # hand the trimmed punctuation back to the text
+
+
+# ── text normalization ───────────────────────────────────────────────────────
+
+
+def normalize_text_de(text):
+    """Normalize German text for TTS: expand numbers, dates, times, currency, abbreviations."""
+    if not text:
+        return text
+
+    # 1. Quotes -> ASCII
+    text = text.replace("\u201e", '"').replace("\u201c", '"')  # German double quotes
+    text = text.replace("\u2018", "'").replace("\u2019", "'")  # single quotes
+    text = text.replace("\u00ab", '"').replace("\u00bb", '"')  # double guillemets
+    text = text.replace("\u2039", '"').replace("\u203a", '"')  # single guillemets
+
+    # 2. Every whitespace char except space and newline (NBSP, tab, CR, ...) -> space
+    text = re.sub(r"[^\S \n]", " ", text)
+
+    # 3. Abbreviations
+    text = re.sub(r"\bDr\.(?=\s)", "Doktor", text)
+    text = re.sub(r"\bProf\.(?=\s)", "Professor", text)
+    text = re.sub(r"\bHr\.(?=\s)", "Herr", text)
+    text = re.sub(r"\bFr\.(?=\s[A-ZÄÖÜ])", "Frau", text)
+    text = re.sub(r"\bDipl\.\s*-?\s*Ing\.", "Diplom-Ingenieur", text)
+    text = re.sub(r"\bStr\.(?=\s)", "Straße", text)
+    text = re.sub(r"\bNr\.(?=\s*\d)", "Nummer", text)
+    text = re.sub(r"\bTel\.(?=\s)", "Telefon", text)
+    text = re.sub(r"\bAbt\.(?=\s)", "Abteilung", text)
+    text = re.sub(r"\bGmbH\b", "Gesellschaft mit beschränkter Haftung", text)
+    text = re.sub(r"\bAG\b(?=[\s,.]|$)", "Aktiengesellschaft", text)
+    text = re.sub(r"\bz\.\s*B\.", "zum Beispiel", text, flags=re.IGNORECASE)
+    text = re.sub(r"\bd\.\s*h\.", "das heißt", text, flags=re.IGNORECASE)
+    text = re.sub(r"\bu\.\s*a\.", "unter anderem", text, flags=re.IGNORECASE)
+    text = re.sub(r"\bbzw\.", "beziehungsweise", text, flags=re.IGNORECASE)
+    text = re.sub(r"\busw\.", "und so weiter", text, flags=re.IGNORECASE)
+    text = re.sub(r"\betc\.", "et cetera", text, flags=re.IGNORECASE)
+    text = re.sub(r"\bca\.", "circa", text, flags=re.IGNORECASE)
+    text = re.sub(r"\bvgl\.", "vergleiche", text, flags=re.IGNORECASE)
+    text = re.sub(r"\binkl\.", "inklusive", text, flags=re.IGNORECASE)
+    text = re.sub(r"\bexkl\.", "exklusive", text, flags=re.IGNORECASE)
+    text = re.sub(r"\bggf\.", "gegebenenfalls", text, flags=re.IGNORECASE)
+    text = re.sub(r"\bi\.\s*d\.\s*R\.", "in der Regel", text, flags=re.IGNORECASE)
+    text = re.sub(r"\bo\.\s*ä\.", "oder ähnliches", text, flags=re.IGNORECASE)
+    text = re.sub(r"\bu\.\s*U\.", "unter Umständen", text, flags=re.IGNORECASE)
+    # Month abbreviations
+    for abbr, full in [
+        ("Jan", "Januar"),
+        ("Feb", "Februar"),
+        ("Mär", "März"),
+        ("Apr", "April"),
+        ("Jun", "Juni"),
+        ("Jul", "Juli"),
+        ("Aug", "August"),
+        ("Sep", "September"),
+        ("Okt", "Oktober"),
+        ("Nov", "November"),
+        ("Dez", "Dezember"),
+    ]:
+        text = re.sub(rf"\b{abbr}\.(?=\s)", full, text)
+
+    # 4. Currency (symbol before or after amount)
+    csym = r"[€$£¥]"
+    text = re.sub(
+        rf"({csym})\s*(\d[\d.,]*)",
+        lambda m: _currency_repl(m.group(1), m.group(2)),
+        text,
+    )
+    text = re.sub(
+        rf"(\d[\d.,]*)\s*({csym})",
+        lambda m: _currency_repl(m.group(2), m.group(1)),
+        text,
+    )
+
+    # 5. Times (HH:MM)
+    def _time_repl(m):
+        h, mi = int(m.group(1)), int(m.group(2))
+        return _int_to_de(h) + " Uhr" + (" " + _int_to_de(mi) if mi else "")
+
+    text = re.sub(r"\b(\d{1,2}):(\d{2})\b", _time_repl, text)
+
+    # 6. Full dates (DD.MM.YYYY)
+    def _date_repl(m):
+        d, mo, y = int(m.group(1)), int(m.group(2)), int(m.group(3))
+        if d < 1 or d > 31 or mo < 1 or mo > 12:
+            return m.group(0)
+        return _ordinal_stem_de(d) + "e " + _MONTHS[mo] + " " + _year_de(y)
+
+    text = re.sub(r"\b(\d{1,2})\.(\d{1,2})\.(\d{4})\b", _date_repl, text)
+
+    # 7. Ordinals mid-sentence ("am 3. Mai"), 1-2 digits, never the tail of a decimal (3,14.)
+    text = re.sub(
+        r"(?<!\d)(?<!\d,)(\d{1,2})\.(?=\s)",
+        lambda m: _ordinal_stem_de(int(m.group(1))) + "e",
+        text,
+    )
+
+    # 8. Standalone years (1100-2099); digits inside 3,1415 or 1234,56 are left for step 9
+    def _year_repl(m):
+        n = int(m.group(1))
+        return _year_de(n) if 1100 <= n <= 2099 else _int_to_de(n)
+
+    text = re.sub(r"(?<!\d[.,])\b(\d{4})\b(?![.,]\d)", _year_repl, text)
+
+    # 9. German-format numbers: 1.234.567 or 1.234,56
+    def _grouped_num_repl(m):
+        # The pattern only admits digits, dots and one comma, so plain string
+        # and int handling suffices; a float round-trip would corrupt long
+        # numbers such as 9.007.199.254.740.993.
+        ip, _, fp = m.group(0).replace(".", "").partition(",")
+        if not fp.strip("0"):
+            # No fraction, or only zeros (1.234,00): read as a whole number.
+            return _int_to_de(int(ip))
+        # Fraction digits are read one by one: 1.234,56 -> "... Komma fünf sechs".
+        return (
+            _int_to_de(int(ip)) + " Komma " + " ".join(_int_to_de(int(d)) for d in fp)
+        )
+
+    text = re.sub(r"\b\d{1,3}(?:\.\d{3})+(?:,\d+)?\b", _grouped_num_repl, text)
+
+    # Decimal comma (3,14)
+    def _decimal_repl(m):
+        ip, fp = m.group(1), m.group(2)
+        return (
+            _int_to_de(int(ip)) + " Komma " + " ".join(_int_to_de(int(d)) for d in fp)
+        )
+
+    text = re.sub(r"\b(\d+),(\d+)\b", _decimal_repl, text)
+
+    # Plain integers
+    text = re.sub(r"\b(\d+)\b", lambda m: _int_to_de(int(m.group(1))), text)
+
+    # 10. Whitespace cleanup
+    text = re.sub(r"[ \t]{2,}", " ", text)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    text = text.strip()
+
+    return text
+
+
+# ── G2P class ────────────────────────────────────────────────────────────────
+
+
+class DEG2P:
+    """German G2P: normalize text with normalize_text_de(), then phonemize it via EspeakG2P."""
+
+    def __init__(self):
+        from .espeak import EspeakG2P  # imported here rather than at module import time
+
+        self.espeak = EspeakG2P(language="de")
+
+    def __call__(self, text: str) -> Tuple[str, None]:
+        text = normalize_text_de(text)
+        return self.espeak(text)
diff --git a/pyproject.toml b/pyproject.toml
index e0ee5aa..29cf13a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,6 +29,7 @@ ja = ["fugashi", "jaconv", "mojimoji", "unidic", "pyopenjtalk"]
 ko = ["jamo", "nltk"]
 zh = ["jieba", "ordered-set", "pypinyin", "cn2an", "pypinyin-dict"]
 vi = ["num2words", "spacy", "spacy-curated-transformers", "underthesea"]
+de = ["phonemizer-fork", "espeakng-loader"]
 he = ["mishkal-hebrew>=0.3.2"]
 
 [build-system]
diff --git a/tests/test_de.py b/tests/test_de.py
new file mode 100644
index 0000000..3f8d4ac
--- /dev/null
+++ b/tests/test_de.py
@@ -0,0 +1,338 @@
+"""Tests for misaki.de — German text normalization and G2P.
+
+Unit tests (no espeak-ng required) test normalize_text_de() and helpers.
+Integration tests (require espeak-ng) test DEG2P end-to-end.
+"""
+
+import pytest
+from misaki.de import _int_to_de, _ordinal_stem_de, _year_de, normalize_text_de
+
+# ── _int_to_de ───────────────────────────────────────────────────────────────
+
+
+class TestIntToDe:
+    def test_zero(self):
+        assert _int_to_de(0) == "null"
+
+    def test_one_standalone(self):
+        assert _int_to_de(1) == "eins"
+
+    def test_one_composition(self):
+        assert _int_to_de(1, standalone=False) == "ein"
+
+    def test_teens(self):
+        assert _int_to_de(11) == "elf"
+        assert _int_to_de(12) == "zwölf"
+        assert _int_to_de(16) == "sechzehn"
+        assert _int_to_de(17) == "siebzehn"
+
+    def test_tens(self):
+        assert _int_to_de(20) == "zwanzig"
+        assert _int_to_de(30) == "dreißig"
+        assert _int_to_de(70) == "siebzig"
+
+    def test_compound(self):
+        assert _int_to_de(21) == "einundzwanzig"
+        assert _int_to_de(42) == "zweiundvierzig"
+        assert _int_to_de(99) == "neunundneunzig"
+
+    def test_hundreds(self):
+        assert _int_to_de(100) == "einhundert"
+        assert _int_to_de(256) == "zweihundertsechsundfünfzig"
+
+    def test_thousands(self):
+        assert _int_to_de(1000) == "eintausend"
+        assert _int_to_de(1234) == "eintausendzweihundertvierunddreißig"
+
+    def test_millions(self):
+        assert _int_to_de(1_000_000) == "eine Million"
+        assert _int_to_de(2_000_000) == "zwei Millionen"
+
+    def test_billions(self):
+        assert _int_to_de(1_000_000_000) == "eine Milliarde"
+
+    def test_negative(self):
+        assert _int_to_de(-5) == "minus fünf"
+
+
+# ── _ordinal_stem_de ─────────────────────────────────────────────────────────
+
+
+class TestOrdinalStemDe:
+    def test_irregular(self):
+        assert _ordinal_stem_de(1) == "erst"
+        assert _ordinal_stem_de(2) == "zweit"
+        assert _ordinal_stem_de(3) == "dritt"
+        assert _ordinal_stem_de(7) == "siebt"
+        assert _ordinal_stem_de(8) == "acht"
+
+    def test_regular_under_20(self):
+        assert _ordinal_stem_de(4) == "viert"
+        assert _ordinal_stem_de(5) == "fünft"
+        assert _ordinal_stem_de(19) == "neunzehnt"
+
+    def test_regular_20_plus(self):
+        assert _ordinal_stem_de(20) == "zwanzigst"
+        assert _ordinal_stem_de(100) == "einhundertst"
+
+
+# ── _year_de ─────────────────────────────────────────────────────────────────
+
+
+class TestYearDe:
+    def test_1900(self):
+        assert _year_de(1900) == "neunzehnhundert"
+
+    def test_1985(self):
+        assert _year_de(1985) == "neunzehnhundertfünfundachtzig"
+
+    def test_1100(self):
+        assert "hundert" in _year_de(1100)
+
+    def test_2024(self):
+        assert _year_de(2024) == "zweitausendvierundzwanzig"
+
+    def test_below_1100(self):
+        # Falls through to cardinal
+        assert _year_de(800) == "achthundert"
+
+
+# ── normalize_text_de ────────────────────────────────────────────────────────
+
+
+class TestNormalize:
+    def test_empty(self):
+        assert normalize_text_de("") == ""
+
+    def test_plain_text(self):
+        r = normalize_text_de("Guten Morgen, wie geht es Ihnen?")
+        assert "Guten Morgen" in r
+
+    def test_umlauts_preserved(self):
+        r = normalize_text_de("Äpfel, Österreich, Überraschung, Größe")
+        assert "Äpfel" in r
+        assert "Österreich" in r
+        assert "Überraschung" in r
+        assert "Größe" in r
+
+
+class TestQuotes:
+    def test_german_quotes(self):
+        r = normalize_text_de("Er sagte: \u201eGuten Morgen.\u201c")
+        assert "\u201e" not in r
+        assert "\u201c" not in r
+
+    def test_guillemets(self):
+        r = normalize_text_de("Das ist \u00abtoll\u00bb.")
+        assert "\u00ab" not in r
+        assert "\u00bb" not in r
+
+
+class TestAbbreviations:
+    def test_doktor(self):
+        assert "Doktor" in normalize_text_de("Dr. Müller")
+
+    def test_professor(self):
+        assert "Professor" in normalize_text_de("Prof. Schmidt hält")
+
+    def test_herr(self):
+        # Hr. should expand to Herr (nominative), not Herrn
+        r = normalize_text_de("Hr. Müller")
+        assert "Herr" in r
+        assert "Herrn" not in r
+
+    def test_strasse_standalone(self):
+        assert "Straße" in normalize_text_de("Str. des Friedens")
+
+    def test_nummer(self):
+        assert "Nummer" in normalize_text_de("Nr. 5 bitte")
+
+    def test_zum_beispiel(self):
+        assert "zum Beispiel" in normalize_text_de("z.B. morgen")
+
+    def test_das_heisst(self):
+        assert "das heißt" in normalize_text_de("d.h. später")
+
+    def test_und_so_weiter(self):
+        assert "und so weiter" in normalize_text_de("Äpfel usw.")
+
+    def test_gmbh(self):
+        assert "Gesellschaft" in normalize_text_de("Muster GmbH")
+
+    def test_ag(self):
+        assert "Aktiengesellschaft" in normalize_text_de("Siemens AG,")
+
+    def test_month_jan(self):
+        assert "Januar" in normalize_text_de("Jan. war kalt")
+
+    def test_month_okt(self):
+        assert "Oktober" in normalize_text_de("Okt. war schön")
+
+
+class TestNumbers:
+    def test_standalone_number(self):
+        r = normalize_text_de("5 Katzen.")
+        assert "fünf" in r
+        assert "5" not in r
+
+    def test_42(self):
+        r = normalize_text_de("42 Leute.")
+        assert "zweiundvierzig" in r
+        assert "42" not in r
+
+    def test_german_thousands(self):
+        r = normalize_text_de("1.000 Menschen.")
+        assert "tausend" in r
+        assert "1.000" not in r
+
+    def test_decimal_comma(self):
+        r = normalize_text_de("36,9 Grad.")
+        assert "Komma" in r
+        assert "36,9" not in r
+
+
+class TestCurrency:
+    def test_euro_before(self):
+        r = normalize_text_de("kostet €10")
+        assert "Euro" in r
+        assert "€" not in r
+
+    def test_euro_after(self):
+        r = normalize_text_de("kostet 10€")
+        assert "Euro" in r
+        assert "€" not in r
+
+    def test_euro_with_cents(self):
+        r = normalize_text_de("€9,99 bitte")
+        assert "Euro" in r
+        assert "Cent" in r
+
+    def test_dollar(self):
+        r = normalize_text_de("$100 Rabatt")
+        assert "Dollar" in r
+        assert "$" not in r
+
+
+class TestTimes:
+    def test_full_hour(self):
+        r = normalize_text_de("Um 14:00 Uhr.")
+        assert "vierzehn Uhr" in r
+        assert "14:00" not in r
+
+    def test_with_minutes(self):
+        assert "acht Uhr dreißig" in normalize_text_de("Um 8:30 Uhr.")
+
+    def test_midnight(self):
+        assert "null Uhr" in normalize_text_de("Um 0:00 Uhr.")
+
+    def test_no_trailing_null(self):
+        r = normalize_text_de("Um 15:00")
+        assert "fünfzehn Uhr" in r
+        assert "null" not in r
+
+
+class TestDates:
+    def test_christmas(self):
+        r = normalize_text_de("Am 24.12.2024.")
+        assert "Dezember" in r
+        assert "24.12.2024" not in r
+
+    def test_new_year(self):
+        r = normalize_text_de("Am 1.1.2000.")
+        assert "erste" in r
+        assert "Januar" in r
+
+    def test_german_unity(self):
+        r = normalize_text_de("Am 3.10.1990.")
+        assert "dritt" in r
+        assert "Oktober" in r
+
+
+class TestOrdinalsMidSentence:
+    def test_ordinal_3(self):
+        assert "dritte" in normalize_text_de("Am 3. Mai")
+
+    def test_ordinal_1(self):
+        assert "erste" in normalize_text_de("Am 1. Mai")
+
+    def test_ordinal_20(self):
+        assert "zwanzigste" in normalize_text_de("Am 20. August")
+
+
+class TestYears:
+    def test_1989_in_text(self):
+        r = normalize_text_de("Im Jahr 1989.")
+        assert "neunzehnhundert" in r
+        assert "1989" not in r
+
+    def test_2024_in_text(self):
+        r = normalize_text_de("Im Jahr 2024.")
+        assert "zweitausend" in r
+
+
+class TestWhitespace:
+    def test_double_spaces(self):
+        assert "  " not in normalize_text_de("Hallo   Welt")
+
+    def test_trimmed(self):
+        r = normalize_text_de("  Hallo Welt  ")
+        assert r == r.strip()
+
+    def test_nbsp(self):
+        assert "\u00a0" not in normalize_text_de("Hallo\u00a0Welt")
+
+
+class TestComplexSentence:
+    def test_mixed(self):
+        t = "Dr. Müller kaufte am 3. Mai 2023 um 14:30 Uhr 3 Pakete für €29,99 bei der Muster GmbH."
+        r = normalize_text_de(t)
+        assert "Doktor" in r
+        assert "Mai" in r
+        assert "vierzehn Uhr dreißig" in r
+        assert "Euro" in r
+        assert "Gesellschaft" in r
+        assert "€" not in r
+        assert "Dr." not in r
+        assert "14:30" not in r
+
+
+# ── integration tests (require espeak-ng) ────────────────────────────────────
+
+try:
+    from misaki.espeak import EspeakG2P
+
+    ESPEAK_AVAILABLE = True
+except (ImportError, OSError):
+    ESPEAK_AVAILABLE = False
+
+
+@pytest.mark.skipif(
+    not ESPEAK_AVAILABLE, reason="espeak-ng or phonemizer not available"
+)
+class TestDEG2PIntegration:
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        from misaki.de import DEG2P
+
+        self.g2p = DEG2P()
+
+    def test_simple(self):
+        ps, tokens = self.g2p("Hallo Welt")
+        assert isinstance(ps, str)
+        assert len(ps) > 0
+        assert tokens is None
+
+    def test_normalized_numbers(self):
+        ps, _ = self.g2p("Es gibt 42 Katzen.")
+        assert isinstance(ps, str)
+        assert len(ps) > 0
+
+    def test_normalized_date(self):
+        ps, _ = self.g2p("Am 24.12.2024 ist Weihnachten.")
+        assert isinstance(ps, str)
+        assert len(ps) > 0
+
+    def test_normalized_currency(self):
+        ps, _ = self.g2p("Das kostet €9,99.")
+        assert isinstance(ps, str)
+        assert len(ps) > 0

From 3e7b63738eb340947f9c5959aa6607f85917edb6 Mon Sep 17 00:00:00 2001
From: Nico Thomaier <me@semidark.net>
Date: Fri, 17 Apr 2026 15:30:35 +0200
Subject: [PATCH 2/2] fix: prevent double 'Uhr' when time precedes 'Uhr' in
 text
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

'14:30 Uhr' was normalized to 'vierzehn Uhr dreißig Uhr'.
Now the regex optionally consumes trailing ' Uhr' after HH:MM.
---
 misaki/de.py     | 2 +-
 tests/test_de.py | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/misaki/de.py b/misaki/de.py
index f80bf2b..80ce437 100644
--- a/misaki/de.py
+++ b/misaki/de.py
@@ -227,7 +227,7 @@ def _time_repl(m):
         h, mi = int(m.group(1)), int(m.group(2))
         return _int_to_de(h) + " Uhr" + (" " + _int_to_de(mi) if mi else "")
 
-    text = re.sub(r"\b(\d{1,2}):(\d{2})\b", _time_repl, text)
+    text = re.sub(r"\b(\d{1,2}):(\d{2})\b(?:\s*Uhr\b)?", _time_repl, text)
 
     # 6. Full dates (DD.MM.YYYY)
     def _date_repl(m):
diff --git a/tests/test_de.py b/tests/test_de.py
index 3f8d4ac..2cf5da2 100644
--- a/tests/test_de.py
+++ b/tests/test_de.py
@@ -230,6 +230,10 @@ def test_no_trailing_null(self):
         assert "fünfzehn Uhr" in r
         assert "null" not in r
 
+    def test_no_double_uhr(self):
+        r = normalize_text_de("Um 14:30 Uhr")
+        assert r.count("Uhr") == 1
+
 
 class TestDates:
     def test_christmas(self):