From 5ab1ba3ec056a12ffb4962d931ba30f345032951 Mon Sep 17 00:00:00 2001 From: joshwhiton Date: Tue, 30 Dec 2025 21:13:45 +0700 Subject: [PATCH 1/3] feat: Add offline mode support to FallbackNetwork FallbackNetwork.__init__() attempts network access even when TRANSFORMERS_OFFLINE=1 is set, because BartForConditionalGeneration.from_pretrained() does a HEAD request before falling back to cache. This PR: - Adds local_files_only parameter to FallbackNetwork and G2P - Respects TRANSFORMERS_OFFLINE=1 and HF_HUB_OFFLINE=1 env vars when local_files_only is not passed - Enables fully offline operation when models are pre-cached Use case: Air-gapped deployments, offline applications, reducing startup latency when network is slow/unavailable. --- misaki/en.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/misaki/en.py b/misaki/en.py index 222c170..6ac7b80 100644 --- a/misaki/en.py +++ b/misaki/en.py @@ -1,4 +1,5 @@ from . import data +import os from .token import MToken from dataclasses import dataclass, replace from num2words import num2words @@ -495,10 +496,19 @@ def __call__(self, tk, ctx): return None, None class FallbackNetwork: - def __init__(self, british): + def __init__(self, british, local_files_only=None): self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Respect offline env vars if not explicitly set + if local_files_only is None: + local_files_only = ( + os.environ.get("TRANSFORMERS_OFFLINE", "0") == "1" or + os.environ.get("HF_HUB_OFFLINE", "0") == "1" + ) + self.model = BartForConditionalGeneration.from_pretrained( - "PeterReid/graphemes_to_phonemes_en_" + ("gb" if british else "us")) + "PeterReid/graphemes_to_phonemes_en_" + ("gb" if british else "us"), + local_files_only=local_files_only) self.model.to(self.device) self.model.eval() self.grapheme_to_token = {g: i for i, g in enumerate(self.model.config.grapheme_chars)} @@ -519,7 +529,7 @@ def __call__(self, input_token): return (output_text, 1) class G2P: - def __init__(self, version=None, trf=False, 
british=False, fallback=None, unk='❓'): + def __init__(self, version=None, trf=False, british=False, fallback=None, unk='❓', local_files_only=None): self.version = version self.british = british name = f"en_core_web_{'trf' if trf else 'sm'}" @@ -528,7 +538,7 @@ def __init__(self, version=None, trf=False, british=False, fallback=None, unk=' components = ['transformer' if trf else 'tok2vec', 'tagger'] self.nlp = spacy.load(name, enable=components) self.lexicon = Lexicon(british) - self.fallback = fallback if fallback else FallbackNetwork(british) + self.fallback = fallback if fallback else FallbackNetwork(british, local_files_only=local_files_only) self.unk = unk @staticmethod From 62f2c3688587f6d10bf3e01c21c51ed2189e03e7 Mon Sep 17 00:00:00 2001 From: joshwhiton Date: Tue, 30 Dec 2025 21:35:06 +0700 Subject: [PATCH 2/3] Merge community PRs #90 and #79 - PR #90: Restrict spacy<4 to avoid pre-release/yanked versions Fixes Python 3.13 compatibility issues with thinc/blis dependencies - PR #79: Strip whitespace from merged tokens Fixes lexicon lookup failures when multiple spaces appear between words --- misaki/en.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/misaki/en.py b/misaki/en.py index 6ac7b80..4602f40 100644 --- a/misaki/en.py +++ b/misaki/en.py @@ -26,7 +26,7 @@ def merge_tokens(tokens: List[MToken], unk: Optional[str] = None) -> MToken: phonemes += ' ' phonemes += unk if tk.phonemes is None else tk.phonemes return MToken( - text=''.join(tk.text + tk.whitespace for tk in tokens[:-1]) + tokens[-1].text, + text=(''.join(tk.text + tk.whitespace for tk in tokens[:-1]) + tokens[-1].text).strip(), tag=max(tokens, key=lambda tk: sum(1 if c == c.lower() else 2 for c in tk.text)).tag, whitespace=tokens[-1].whitespace, phonemes=phonemes, diff --git a/pyproject.toml b/pyproject.toml index e0ee5aa..30be697 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ ] [project.optional-dependencies] -en 
= ["num2words", "spacy", "spacy-curated-transformers", "phonemizer-fork", "espeakng-loader", "torch", "transformers"] +en = ["num2words", "spacy<4", "spacy-curated-transformers", "phonemizer-fork", "espeakng-loader", "torch", "transformers"] ja = ["fugashi", "jaconv", "mojimoji", "unidic", "pyopenjtalk"] ko = ["jamo", "nltk"] zh = ["jieba", "ordered-set", "pypinyin", "cn2an", "pypinyin-dict"] From 9ecd3d4d12f08c0b01d0a34476ad0ee9d89df357 Mon Sep 17 00:00:00 2001 From: joshwhiton Date: Thu, 15 Jan 2026 13:26:26 +0700 Subject: [PATCH 3/3] Fix VBP pronunciation entries for read, reread, wound MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VBP (verb, non-3rd-person singular present) should use present tense pronunciation, not past tense. For example, "I read books every day" should be pronounced "reed" not "red". Changes: - read VBP: "red" -> "reed" (US: ɹˈɛd -> ɹˈid, GB: ɹˈɛd -> ɹˈiːd) - reread VBP: match DEFAULT pronunciation - wound VBP: "wound" (injury) not "wound" (past of wind) --- misaki/data/gb_gold.json | 6 +++--- misaki/data/us_gold.json | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/misaki/data/gb_gold.json b/misaki/data/gb_gold.json index 8c7de92..f34f4f2 100644 --- a/misaki/data/gb_gold.json +++ b/misaki/data/gb_gold.json @@ -66252,7 +66252,7 @@ "DEFAULT": "ɹˈiːd", "VBD": "ɹˈɛd", "VBN": "ɹˈɛd", - "VBP": "ɹˈɛd" + "VBP": "ɹˈiːd" }, "read's": "ɹˈiːdz", "read-in": "ɹˈiːdɪn", @@ -68045,7 +68045,7 @@ "NOUN": "ɹˈiːɹiːd", "VBD": "ɹiːɹˈɛd", "VBN": "ɹiːɹˈɛd", - "VBP": "ɹiːɹˈɛd" + "VBP": "ɹiːɹˈiːd" }, "reread's": "ɹˈiːɹiːdz", "rereading": "ɹiːɹˈiːdɪŋ", @@ -89069,7 +89069,7 @@ "DEFAULT": "wˈuːnd", "VBD": "wˈWnd", "VBN": "wˈWnd", - "VBP": "wˈWnd" + "VBP": "wˈuːnd" }, "wound's": "wˈuːndz", "wounded": "wˈuːndɪd", diff --git a/misaki/data/us_gold.json b/misaki/data/us_gold.json index 8ef4bdc..381933e 100644 --- a/misaki/data/us_gold.json +++ b/misaki/data/us_gold.json @@ -68102,7 +68102,7 @@ "DEFAULT": "ɹˈid", "VBD": 
"ɹˈɛd", "VBN": "ɹˈɛd", - "VBP": "ɹˈɛd" + "VBP": "ɹˈid" }, "read's": "ɹˈidz", "read-in": "ɹˈidˌɪn", @@ -69997,7 +69997,7 @@ "NOUN": "ɹˈiɹid", "VBD": "ɹiɹˈɛd", "VBN": "ɹiɹˈɛd", - "VBP": "ɹiɹˈɛd" + "VBP": "ɹˌiɹˈid" }, "reread's": "ɹˈiɹidz", "rereading": "ɹˌiɹˈidɪŋ", @@ -91908,7 +91908,7 @@ "DEFAULT": "wˈund", "VBD": "wˈWnd", "VBN": "wˈWnd", - "VBP": "wˈWnd" + "VBP": "wˈund" }, "wound's": "wˈundz", "wounded": "wˈundᵻd",