From 5aea8c993c99a3fe8e10c460a9b6be4e96ee4c1d Mon Sep 17 00:00:00 2001 From: Nico Thomaier Date: Fri, 17 Apr 2026 15:16:10 +0200 Subject: [PATCH 1/2] feat: add German G2P with text normalization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add misaki/de.py with: - normalize_text_de(): expands numbers, dates, times, currency, abbreviations, ordinals, and years to spelled-out German text - DEG2P: wraps normalize_text_de() + EspeakG2P(language='de') Normalizer handles: - Cardinal numbers (zweiundvierzig), ordinals (dritte) - Years (neunzehnhundertfünfundachtzig) - Dates DD.MM.YYYY, times HH:MM - Currency (Euro, Dollar, Pfund, Yen with cents) - 30+ German abbreviations (Dr., GmbH, z.B., usw., months) - German-format numbers (1.234,56 with thousand dots + decimal comma) - Quote normalization, whitespace cleanup Add tests/test_de.py with 60 unit tests (CI-safe, no espeak needed) and 4 integration tests (skipped when phonemizer unavailable). Add 'de' optional dependency in pyproject.toml. --- misaki/de.py | 304 ++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + tests/test_de.py | 338 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 643 insertions(+) create mode 100644 misaki/de.py create mode 100644 tests/test_de.py diff --git a/misaki/de.py b/misaki/de.py new file mode 100644 index 0000000..f80bf2b --- /dev/null +++ b/misaki/de.py @@ -0,0 +1,304 @@ +"""German G2P: text normalization + espeak-ng phonemization. + +normalize_text_de() expands numbers, dates, times, currency, and +abbreviations so espeak-ng receives clean spelled-out text. + +DEG2P wraps normalize_text_de() + EspeakG2P for use in KPipeline. +""" + +from typing import Tuple +import re + +# ── cardinal numbers ───────────────────────────────────────────────────────── + +_ONES = [ + "", + "ein", + "zwei", + "drei", + "vier", + "fünf", + "sechs", + "sieben", + "acht", + "neun", + "zehn", + "elf", + "zwölf", + "dreizehn", + "vierzehn", + "fünfzehn", + "sechzehn", + "siebzehn", + "achtzehn", + "neunzehn", +] +_TENS = [ + "", + "", + "zwanzig", + "dreißig", + "vierzig", + "fünfzig", + "sechzig", + "siebzig", + "achtzig", + "neunzig", +] + + +def _int_to_de(n, standalone=True): + """Convert integer to German words. + + standalone=True returns "eins" for 1, standalone=False returns "ein" + (used in composition: einhundert, eintausend). + """ + if n < 0: + return "minus " + _int_to_de(-n) + if n == 0: + return "null" + if n == 1: + return "eins" if standalone else "ein" + if n < 20: + return _ONES[n] + if n < 100: + ones, tens = n % 10, n // 10 + if ones: + return _ONES[ones] + "und" + _TENS[tens] + return _TENS[tens] + if n < 1_000: + h, r = n // 100, n % 100 + return _ONES[h] + "hundert" + (_int_to_de(r, standalone=False) if r else "") + if n < 1_000_000: + t, r = n // 1_000, n % 1_000 + prefix = _int_to_de(t, standalone=False) if t != 1 else "ein" + return prefix + "tausend" + (_int_to_de(r, standalone=False) if r else "") + if n < 1_000_000_000: + m, r = n // 1_000_000, n % 1_000_000 + word = ( + "eine Million" if m == 1 else _int_to_de(m, standalone=False) + " Millionen" + ) + return word + (" " + _int_to_de(r, standalone=False) if r else "") + b, r = n // 1_000_000_000, n % 1_000_000_000 + word = ( + "eine Milliarde" if b == 1 else _int_to_de(b, standalone=False) + " Milliarden" + ) + return word + (" " + _int_to_de(r, standalone=False) if r else "") + + +# ── ordinals ───────────────────────────────────────────────────────────────── + +_ORD_IRREG = {1: "erst", 2: "zweit", 3: "dritt", 7: "siebt", 8: "acht"} + + +def _ordinal_stem_de(n): + """Ordinal stem without inflection suffix.""" + if n in _ORD_IRREG: + return _ORD_IRREG[n] + return _int_to_de(n, standalone=False) + ("t" if n < 20 else "st") + + +# ── years ──────────────────────────────────────────────────────────────────── + + +def _year_de(n): + """German year pronunciation: 1985 -> neunzehnhundertfünfundachtzig.""" + if 1100 <= n <= 1999: + c, r = n // 100, n % 100 + return ( + _int_to_de(c, standalone=False) + + "hundert" + + (_int_to_de(r, standalone=False) if r else "") + ) + return _int_to_de(n) + + +# ── month names ────────────────────────────────────────────────────────────── + +_MONTHS = [ + "", + "Januar", + "Februar", + "März", + "April", + "Mai", + "Juni", + "Juli", + "August", + "September", + "Oktober", + "November", + "Dezember", +] + +# ── currency ───────────────────────────────────────────────────────────────── + +_CURRENCY = {"€": "Euro", "$": "Dollar", "£": "Pfund", "¥": "Yen"} + + +def _currency_repl(sym, num): + word = _CURRENCY.get(sym, sym) + cleaned = num.replace(".", "").replace(",", ".") + try: + val = float(cleaned) + except ValueError: + return sym + num + euros = int(val) + cents = round((val - euros) * 100) + if cents == 0: + return _int_to_de(euros) + " " + word + return _int_to_de(euros) + " " + word + " und " + _int_to_de(cents) + " Cent" + + +# ── text normalization ─────────────────────────────────────────────────────── + + +def normalize_text_de(text): + """Normalize German text for TTS: expand numbers, dates, times, currency, abbreviations.""" + if not text: + return text + + # 1. Quotes -> ASCII + text = text.replace("\u201e", '"').replace("\u201c", '"') # „ " + text = text.replace("\u2018", "'").replace("\u2019", "'") # ' ' + text = text.replace("\u00ab", '"').replace("\u00bb", '"') # « » + text = text.replace("\u2039", '"').replace("\u203a", '"') # ‹ › + + # 2. Non-breaking whitespace + text = re.sub(r"[^\S \n]", " ", text) + + # 3. Abbreviations + text = re.sub(r"\bDr\.(?=\s)", "Doktor", text) + text = re.sub(r"\bProf\.(?=\s)", "Professor", text) + text = re.sub(r"\bHr\.(?=\s)", "Herr ", text) + text = re.sub(r"\bFr\.(?=\s[A-ZÄÖÜ])", "Frau", text) + text = re.sub(r"\bDipl\.\s*-?\s*Ing\.", "Diplom-Ingenieur", text) + text = re.sub(r"\bStr\.(?=\s)", "Straße", text) + text = re.sub(r"\bNr\.(?=\s*\d)", "Nummer", text) + text = re.sub(r"\bTel\.(?=\s)", "Telefon", text) + text = re.sub(r"\bAbt\.(?=\s)", "Abteilung", text) + text = re.sub(r"\bGmbH\b", "Gesellschaft mit beschränkter Haftung", text) + text = re.sub(r"\bAG\b(?=[\s,.]|$)", "Aktiengesellschaft", text) + text = re.sub(r"\bz\.\s*B\.", "zum Beispiel", text, flags=re.IGNORECASE) + text = re.sub(r"\bd\.\s*h\.", "das heißt", text, flags=re.IGNORECASE) + text = re.sub(r"\bu\.\s*a\.", "unter anderem", text, flags=re.IGNORECASE) + text = re.sub(r"\bbzw\.", "beziehungsweise", text, flags=re.IGNORECASE) + text = re.sub(r"\busw\.", "und so weiter", text, flags=re.IGNORECASE) + text = re.sub(r"\betc\.", "et cetera", text, flags=re.IGNORECASE) + text = re.sub(r"\bca\.", "circa", text, flags=re.IGNORECASE) + text = re.sub(r"\bvgl\.", "vergleiche", text, flags=re.IGNORECASE) + text = re.sub(r"\binkl\.", "inklusive", text, flags=re.IGNORECASE) + text = re.sub(r"\bexkl\.", "exklusive", text, flags=re.IGNORECASE) + text = re.sub(r"\bggf\.", "gegebenenfalls", text, flags=re.IGNORECASE) + text = re.sub(r"\bi\.\s*d\.\s*R\.", "in der Regel", text, flags=re.IGNORECASE) + text = re.sub(r"\bo\.\s*ä\.", "oder ähnliches", text, flags=re.IGNORECASE) + text = re.sub(r"\bu\.\s*U\.", "unter Umständen", text, flags=re.IGNORECASE) + # Month abbreviations + for abbr, full in [ + ("Jan", "Januar"), + ("Feb", "Februar"), + ("Mär", "März"), + ("Apr", "April"), + ("Jun", "Juni"), + ("Jul", "Juli"), + ("Aug", "August"), + ("Sep", "September"), + ("Okt", "Oktober"), + ("Nov", "November"), + ("Dez", "Dezember"), + ]: + text = re.sub(rf"\b{abbr}\.(?=\s)", full, text) + + # 4. Currency (symbol before or after amount) + csym = r"[€$£¥]" + text = re.sub( + rf"({csym})\s*(\d[\d.,]*)", + lambda m: _currency_repl(m.group(1), m.group(2)), + text, + ) + text = re.sub( + rf"(\d[\d.,]*)\s*({csym})", + lambda m: _currency_repl(m.group(2), m.group(1)), + text, + ) + + # 5. Times (HH:MM) + def _time_repl(m): + h, mi = int(m.group(1)), int(m.group(2)) + return _int_to_de(h) + " Uhr" + (" " + _int_to_de(mi) if mi else "") + + text = re.sub(r"\b(\d{1,2}):(\d{2})\b", _time_repl, text) + + # 6. Full dates (DD.MM.YYYY) + def _date_repl(m): + d, mo, y = int(m.group(1)), int(m.group(2)), int(m.group(3)) + if d < 1 or d > 31 or mo < 1 or mo > 12: + return m.group(0) + return _ordinal_stem_de(d) + "e " + _MONTHS[mo] + " " + _year_de(y) + + text = re.sub(r"\b(\d{1,2})\.(\d{1,2})\.(\d{4})\b", _date_repl, text) + + # 7. Ordinals mid-sentence (e.g. "am 3. Mai") -- only 1-2 digit numbers + text = re.sub( + r"(? Tuple[str, None]: + text = normalize_text_de(text) + return self.espeak(text) diff --git a/pyproject.toml b/pyproject.toml index e0ee5aa..29cf13a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ ja = ["fugashi", "jaconv", "mojimoji", "unidic", "pyopenjtalk"] ko = ["jamo", "nltk"] zh = ["jieba", "ordered-set", "pypinyin", "cn2an", "pypinyin-dict"] vi = ["num2words", "spacy", "spacy-curated-transformers", "underthesea"] +de = ["phonemizer-fork", "espeakng-loader"] he = ["mishkal-hebrew>=0.3.2"] [build-system] diff --git a/tests/test_de.py b/tests/test_de.py new file mode 100644 index 0000000..3f8d4ac --- /dev/null +++ b/tests/test_de.py @@ -0,0 +1,338 @@ +"""Tests for misaki.de — German text normalization and G2P. + +Unit tests (no espeak-ng required) test normalize_text_de() and helpers. +Integration tests (require espeak-ng) test DEG2P end-to-end. +""" + +import pytest +from misaki.de import _int_to_de, _ordinal_stem_de, _year_de, normalize_text_de + +# ── _int_to_de ─────────────────────────────────────────────────────────────── + + +class TestIntToDe: + def test_zero(self): + assert _int_to_de(0) == "null" + + def test_one_standalone(self): + assert _int_to_de(1) == "eins" + + def test_one_composition(self): + assert _int_to_de(1, standalone=False) == "ein" + + def test_teens(self): + assert _int_to_de(11) == "elf" + assert _int_to_de(12) == "zwölf" + assert _int_to_de(16) == "sechzehn" + assert _int_to_de(17) == "siebzehn" + + def test_tens(self): + assert _int_to_de(20) == "zwanzig" + assert _int_to_de(30) == "dreißig" + assert _int_to_de(70) == "siebzig" + + def test_compound(self): + assert _int_to_de(21) == "einundzwanzig" + assert _int_to_de(42) == "zweiundvierzig" + assert _int_to_de(99) == "neunundneunzig" + + def test_hundreds(self): + assert _int_to_de(100) == "einhundert" + assert _int_to_de(256) == "zweihundertsechsundfünfzig" + + def test_thousands(self): + assert _int_to_de(1000) == "eintausend" + assert _int_to_de(1234) == "eintausendzweihundertvierunddreißig" + + def test_millions(self): + assert _int_to_de(1_000_000) == "eine Million" + assert _int_to_de(2_000_000) == "zwei Millionen" + + def test_billions(self): + assert _int_to_de(1_000_000_000) == "eine Milliarde" + + def test_negative(self): + assert _int_to_de(-5) == "minus fünf" + + +# ── _ordinal_stem_de ───────────────────────────────────────────────────────── + + +class TestOrdinalStemDe: + def test_irregular(self): + assert _ordinal_stem_de(1) == "erst" + assert _ordinal_stem_de(2) == "zweit" + assert _ordinal_stem_de(3) == "dritt" + assert _ordinal_stem_de(7) == "siebt" + assert _ordinal_stem_de(8) == "acht" + + def test_regular_under_20(self): + assert _ordinal_stem_de(4) == "viert" + assert _ordinal_stem_de(5) == "fünft" + assert _ordinal_stem_de(19) == "neunzehnt" + + def test_regular_20_plus(self): + assert _ordinal_stem_de(20) == "zwanzigst" + assert _ordinal_stem_de(100) == "einhundertst" + + +# ── _year_de ───────────────────────────────────────────────────────────────── + + +class TestYearDe: + def test_1900(self): + assert _year_de(1900) == "neunzehnhundert" + + def test_1985(self): + assert _year_de(1985) == "neunzehnhundertfünfundachtzig" + + def test_1100(self): + assert "hundert" in _year_de(1100) + + def test_2024(self): + assert _year_de(2024) == "zweitausendvierundzwanzig" + + def test_below_1100(self): + # Falls through to cardinal + assert _year_de(800) == "achthundert" + + +# ── normalize_text_de ──────────────────────────────────────────────────────── + + +class TestNormalize: + def test_empty(self): + assert normalize_text_de("") == "" + + def test_plain_text(self): + r = normalize_text_de("Guten Morgen, wie geht es Ihnen?") + assert "Guten Morgen" in r + + def test_umlauts_preserved(self): + r = normalize_text_de("Äpfel, Österreich, Überraschung, Größe") + assert "Äpfel" in r + assert "Österreich" in r + assert "Überraschung" in r + assert "Größe" in r + + +class TestQuotes: + def test_german_quotes(self): + r = normalize_text_de("Er sagte: \u201eGuten Morgen.\u201c") + assert "\u201e" not in r + assert "\u201c" not in r + + def test_guillemets(self): + r = normalize_text_de("Das ist \u00abtoll\u00bb.") + assert "\u00ab" not in r + assert "\u00bb" not in r + + +class TestAbbreviations: + def test_doktor(self): + assert "Doktor" in normalize_text_de("Dr. Müller") + + def test_professor(self): + assert "Professor" in normalize_text_de("Prof. Schmidt hält") + + def test_herr(self): + # Hr. should expand to Herr (nominative), not Herrn + r = normalize_text_de("Hr. Müller") + assert "Herr" in r + assert "Herrn" not in r + + def test_strasse_standalone(self): + assert "Straße" in normalize_text_de("Str. des Friedens") + + def test_nummer(self): + assert "Nummer" in normalize_text_de("Nr. 5 bitte") + + def test_zum_beispiel(self): + assert "zum Beispiel" in normalize_text_de("z.B. morgen") + + def test_das_heisst(self): + assert "das heißt" in normalize_text_de("d.h. später") + + def test_und_so_weiter(self): + assert "und so weiter" in normalize_text_de("Äpfel usw.") + + def test_gmbh(self): + assert "Gesellschaft" in normalize_text_de("Muster GmbH") + + def test_ag(self): + assert "Aktiengesellschaft" in normalize_text_de("Siemens AG,") + + def test_month_jan(self): + assert "Januar" in normalize_text_de("Jan. war kalt") + + def test_month_okt(self): + assert "Oktober" in normalize_text_de("Okt. war schön") + + +class TestNumbers: + def test_standalone_number(self): + r = normalize_text_de("5 Katzen.") + assert "fünf" in r + assert "5" not in r + + def test_42(self): + r = normalize_text_de("42 Leute.") + assert "zweiundvierzig" in r + assert "42" not in r + + def test_german_thousands(self): + r = normalize_text_de("1.000 Menschen.") + assert "tausend" in r + assert "1.000" not in r + + def test_decimal_comma(self): + r = normalize_text_de("36,9 Grad.") + assert "Komma" in r + assert "36,9" not in r + + +class TestCurrency: + def test_euro_before(self): + r = normalize_text_de("kostet €10") + assert "Euro" in r + assert "€" not in r + + def test_euro_after(self): + r = normalize_text_de("kostet 10€") + assert "Euro" in r + assert "€" not in r + + def test_euro_with_cents(self): + r = normalize_text_de("€9,99 bitte") + assert "Euro" in r + assert "Cent" in r + + def test_dollar(self): + r = normalize_text_de("$100 Rabatt") + assert "Dollar" in r + assert "$" not in r + + +class TestTimes: + def test_full_hour(self): + r = normalize_text_de("Um 14:00 Uhr.") + assert "vierzehn Uhr" in r + assert "14:00" not in r + + def test_with_minutes(self): + assert "acht Uhr dreißig" in normalize_text_de("Um 8:30 Uhr.") + + def test_midnight(self): + assert "null Uhr" in normalize_text_de("Um 0:00 Uhr.") + + def test_no_trailing_null(self): + r = normalize_text_de("Um 15:00") + assert "fünfzehn Uhr" in r + assert "null" not in r + + +class TestDates: + def test_christmas(self): + r = normalize_text_de("Am 24.12.2024.") + assert "Dezember" in r + assert "24.12.2024" not in r + + def test_new_year(self): + r = normalize_text_de("Am 1.1.2000.") + assert "erste" in r + assert "Januar" in r + + def test_german_unity(self): + r = normalize_text_de("Am 3.10.1990.") + assert "dritt" in r + assert "Oktober" in r + + +class TestOrdinalsMidSentence: + def test_ordinal_3(self): + assert "dritte" in normalize_text_de("Am 3. Mai") + + def test_ordinal_1(self): + assert "erste" in normalize_text_de("Am 1. Mai") + + def test_ordinal_20(self): + assert "zwanzigste" in normalize_text_de("Am 20. August") + + +class TestYears: + def test_1989_in_text(self): + r = normalize_text_de("Im Jahr 1989.") + assert "neunzehnhundert" in r + assert "1989" not in r + + def test_2024_in_text(self): + r = normalize_text_de("Im Jahr 2024.") + assert "zweitausend" in r + + +class TestWhitespace: + def test_double_spaces(self): + assert " " not in normalize_text_de("Hallo Welt") + + def test_trimmed(self): + r = normalize_text_de(" Hallo Welt ") + assert r == r.strip() + + def test_nbsp(self): + assert "\u00a0" not in normalize_text_de("Hallo\u00a0Welt") + + +class TestComplexSentence: + def test_mixed(self): + t = "Dr. Müller kaufte am 3. Mai 2023 um 14:30 Uhr 3 Pakete für €29,99 bei der Muster GmbH." + r = normalize_text_de(t) + assert "Doktor" in r + assert "Mai" in r + assert "vierzehn Uhr dreißig" in r + assert "Euro" in r + assert "Gesellschaft" in r + assert "€" not in r + assert "Dr." not in r + assert "14:30" not in r + + +# ── integration tests (require espeak-ng) ──────────────────────────────────── + +try: + from misaki.espeak import EspeakG2P + + ESPEAK_AVAILABLE = True +except (ImportError, OSError): + ESPEAK_AVAILABLE = False + + +@pytest.mark.skipif( + not ESPEAK_AVAILABLE, reason="espeak-ng or phonemizer not available" +) +class TestDEG2PIntegration: + @pytest.fixture(autouse=True) + def setup(self): + from misaki.de import DEG2P + + self.g2p = DEG2P() + + def test_simple(self): + ps, tokens = self.g2p("Hallo Welt") + assert isinstance(ps, str) + assert len(ps) > 0 + assert tokens is None + + def test_normalized_numbers(self): + ps, _ = self.g2p("Es gibt 42 Katzen.") + assert isinstance(ps, str) + assert len(ps) > 0 + + def test_normalized_date(self): + ps, _ = self.g2p("Am 24.12.2024 ist Weihnachten.") + assert isinstance(ps, str) + assert len(ps) > 0 + + def test_normalized_currency(self): + ps, _ = self.g2p("Das kostet €9,99.") + assert isinstance(ps, str) + assert len(ps) > 0 From 3e7b63738eb340947f9c5959aa6607f85917edb6 Mon Sep 17 00:00:00 2001 From: Nico Thomaier Date: Fri, 17 Apr 2026 15:30:35 +0200 Subject: [PATCH 2/2] fix: prevent double 'Uhr' when time precedes 'Uhr' in text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit '14:30 Uhr' was normalized to 'vierzehn Uhr dreißig Uhr'. Now the regex optionally consumes trailing ' Uhr' after HH:MM. --- misaki/de.py | 2 +- tests/test_de.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/misaki/de.py b/misaki/de.py index f80bf2b..80ce437 100644 --- a/misaki/de.py +++ b/misaki/de.py @@ -227,7 +227,7 @@ def _time_repl(m): h, mi = int(m.group(1)), int(m.group(2)) return _int_to_de(h) + " Uhr" + (" " + _int_to_de(mi) if mi else "") - text = re.sub(r"\b(\d{1,2}):(\d{2})\b", _time_repl, text) + text = re.sub(r"\b(\d{1,2}):(\d{2})(?:\s*Uhr)?", _time_repl, text) # 6. Full dates (DD.MM.YYYY) def _date_repl(m): diff --git a/tests/test_de.py b/tests/test_de.py index 3f8d4ac..2cf5da2 100644 --- a/tests/test_de.py +++ b/tests/test_de.py @@ -230,6 +230,10 @@ def test_no_trailing_null(self): assert "fünfzehn Uhr" in r assert "null" not in r + def test_no_double_uhr(self): + r = normalize_text_de("Um 14:30 Uhr") + assert r.count("Uhr") == 1 + class TestDates: def test_christmas(self):