From 0ae4dbc0b9a27ba1520f9fabbbd989802227eec5 Mon Sep 17 00:00:00 2001 From: umran666 Date: Wed, 10 Jun 2026 20:48:06 +0530 Subject: [PATCH] fix: load Qwen2TokenizerFast for qwen2 model types to prevent character corruption --- src/heretic/model.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/heretic/model.py b/src/heretic/model.py index cb4c103f..c7de0af2 100644 --- a/src/heretic/model.py +++ b/src/heretic/model.py @@ -73,12 +73,33 @@ def __init__(self, settings: Settings): print() print(f"Loading model [bold]{settings.model}[/]...") - self.tokenizer = AutoTokenizer.from_pretrained( + # Load configuration dictionary to verify model type. + # This prevents tokenizers configured with incorrect classes in upstream + # config metadata from generating corrupted/space-stripped tokens. + config_dict, _ = PretrainedConfig.get_config_dict( settings.model, trust_remote_code=settings.trust_remote_code, **self.revision_kwargs, ) + tokenizer_kwargs = { + "trust_remote_code": settings.trust_remote_code, + **self.revision_kwargs, + } + + if config_dict.get("model_type") == "qwen2": + from transformers import Qwen2TokenizerFast # ty:ignore[unresolved-import] + + self.tokenizer = Qwen2TokenizerFast.from_pretrained( + settings.model, + **tokenizer_kwargs, + ) + else: + self.tokenizer = AutoTokenizer.from_pretrained( + settings.model, + **tokenizer_kwargs, + ) + # Multimodal models have a processor we'll want to save. self.processor = None if get_model_class(settings.model) == AutoModelForImageTextToText: