From d605503bc5f37c3fa6459edc9dbcc04cffcacd3e Mon Sep 17 00:00:00 2001
From: scottmonster <dev@scottv.id>
Date: Thu, 14 May 2026 20:09:11 -0500
Subject: [PATCH 01/16] add type support via model.pyi

---
 MANIFEST.in            |   2 +
 pywhispercpp/model.pyi | 148 +++++++++++++++++++++++++++++++++++++++++
 pywhispercpp/py.typed  |   0
 setup.py               |   2 +-
 4 files changed, 151 insertions(+), 1 deletion(-)
 create mode 100644 pywhispercpp/model.pyi
 create mode 100644 pywhispercpp/py.typed

diff --git a/MANIFEST.in b/MANIFEST.in
index 0ac649a..47b5544 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,6 @@
 include README.md LICENSE pybind11/LICENSE version.txt
+include pywhispercpp/model.pyi
+include pywhispercpp/py.typed
 graft pybind11/include
 graft pybind11/tools
 graft src
diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi
new file mode 100644
index 0000000..060372d
--- /dev/null
+++ b/pywhispercpp/model.pyi
@@ -0,0 +1,148 @@
+from __future__ import annotations
+
+# Generated by coverage/generate_pyi.py. Do not edit by hand.
+
+from typing import Any, Callable, Dict, List, Optional, TextIO, Tuple, TypedDict, Union
+
+import numpy as np
+import numpy.typing as npt
+
+AudioArray = npt.NDArray[np.float32]
+AudioInput = Union[str, AudioArray]
+
+
+class GreedyParams(TypedDict):
+    best_of: int
+
+
+class BeamSearchParams(TypedDict):
+    beam_size: int
+    patience: float
+
+
+class Segment:
+    t0: int
+    t1: int
+    text: str
+    probability: float
+
+    def __init__(self, t0: int, t1: int, text: str, probability: float = np.nan)->None: ...
+    def __str__(self)->str: ...
+    def __repr__(self)->str: ...
+
+
+class Model:
+    _new_segment_callback: Optional[Callable[[Segment], None]]
+
+    def __init__(
+        self,
+        model: str = 'tiny',
+        models_dir: Optional[str] = None,
+        params_sampling_strategy: int = 0,
+        redirect_whispercpp_logs_to: Union[bool, TextIO, str, None] = False,
+        use_openvino: bool = False,
+        openvino_model_path: Optional[str] = None,
+        openvino_device: str = 'CPU',
+        openvino_cache_dir: Optional[str] = None,
+        *,
+        n_threads: Optional[int] = None,
+        n_max_text_ctx: int = 16384,
+        offset_ms: int = 0,
+        duration_ms: int = 0,
+        translate: bool = False,
+        no_context: bool = False,
+        single_segment: bool = False,
+        print_special: bool = False,
+        print_progress: bool = True,
+        print_realtime: bool = False,
+        print_timestamps: bool = True,
+        token_timestamps: bool = False,
+        thold_pt: float = 0.01,
+        thold_ptsum: float = 0.01,
+        max_len: int = 0,
+        split_on_word: bool = False,
+        max_tokens: int = 0,
+        audio_ctx: int = 0,
+        initial_prompt: Optional[str] = None,
+        prompt_tokens: Optional[Tuple[Any, ...]] = None,
+        prompt_n_tokens: int = 0,
+        language: str = '',
+        suppress_blank: bool = True,
+        suppress_non_speech_tokens: bool = False,
+        temperature: float = 0.0,
+        max_initial_ts: float = 1.0,
+        length_penalty: float = -1.0,
+        temperature_inc: float = 0.2,
+        entropy_thold: float = 2.4,
+        logprob_thold: float = -1.0,
+        no_speech_thold: float = 0.6,
+        greedy: GreedyParams = {'best_of': -1},
+        beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0},
+        vad: bool = False,
+        vad_model_path: Optional[str] = None,
+    )->None: ...
+
+    def transcribe(
+        self,
+        media: AudioInput,
+        n_processors: Optional[int] = None,
+        new_segment_callback: Optional[Callable[[Segment], None]] = None,
+        *,
+        n_threads: Optional[int] = None,
+        n_max_text_ctx: int = 16384,
+        offset_ms: int = 0,
+        duration_ms: int = 0,
+        translate: bool = False,
+        no_context: bool = False,
+        single_segment: bool = False,
+        print_special: bool = False,
+        print_progress: bool = True,
+        print_realtime: bool = False,
+        print_timestamps: bool = True,
+        token_timestamps: bool = False,
+        thold_pt: float = 0.01,
+        thold_ptsum: float = 0.01,
+        max_len: int = 0,
+        split_on_word: bool = False,
+        max_tokens: int = 0,
+        audio_ctx: int = 0,
+        initial_prompt: Optional[str] = None,
+        prompt_tokens: Optional[Tuple[Any, ...]] = None,
+        prompt_n_tokens: int = 0,
+        language: str = '',
+        suppress_blank: bool = True,
+        suppress_non_speech_tokens: bool = False,
+        temperature: float = 0.0,
+        max_initial_ts: float = 1.0,
+        length_penalty: float = -1.0,
+        temperature_inc: float = 0.2,
+        entropy_thold: float = 2.4,
+        logprob_thold: float = -1.0,
+        no_speech_thold: float = 0.6,
+        greedy: GreedyParams = {'best_of': -1},
+        beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0},
+        extract_probability: bool = False,
+        vad: bool = False,
+        vad_model_path: Optional[str] = None,
+    ) -> List[Segment]: ...
+
+    def get_params(self) -> Dict[str, Any]: ...
+    @staticmethod
+    def get_params_schema() -> Dict[str, Dict[str, Any]]: ...
+    @staticmethod
+    def lang_max_id() -> int: ...
+    def print_timings(self) -> None: ...
+    @staticmethod
+    def system_info() -> Any: ...
+    @staticmethod
+    def available_languages() -> List[str]: ...
+    @staticmethod
+    def _load_audio(media_file_path: str) -> AudioArray: ...
+    def auto_detect_language(
+        self,
+        media: AudioInput,
+        offset_ms: int = 0,
+        n_threads: int = 4,
+    ) -> Tuple[Tuple[str, np.float32], Dict[str, np.float32]]: ...
+    def __del__(self) -> None: ...
+
diff --git a/pywhispercpp/py.typed b/pywhispercpp/py.typed
new file mode 100644
index 0000000..e69de29
diff --git a/setup.py b/setup.py
index d13545f..71a5ce7 100644
--- a/setup.py
+++ b/setup.py
@@ -251,7 +251,7 @@ def get_version() -> str:
     packages=find_packages('.'),
     package_dir={'': '.'},
     include_package_data=True,
-    package_data={'pywhispercpp': []},
+    package_data={'pywhispercpp': ["*.pyi", "py.typed"]},
     long_description_content_type="text/markdown",
     license='MIT',
     entry_points={

From ad5c7ad92258e8f2c220d63710aff73d1475246a Mon Sep 17 00:00:00 2001
From: scottmonster <dev@scottv.id>
Date: Fri, 15 May 2026 22:16:57 -0500
Subject: [PATCH 02/16] begin work on extending api

---
 .gitignore                            |   5 +
 CMakeLists.txt                        |   1 +
 pywhispercpp/constants.py             |  62 ++++-
 pywhispercpp/model.py                 |  89 +++++--
 pywhispercpp/model.pyi                |  42 +++-
 src/main.cpp                          | 346 +++++++++++++++++++++++++-
 tests/test_backwards_compatibility.py | 153 ++++++++++++
 tests/test_model.py                   |  57 +++++
 whsiper_args.txt                      | 252 +++++++++++++++++++
 9 files changed, 974 insertions(+), 33 deletions(-)
 create mode 100644 tests/test_backwards_compatibility.py
 create mode 100644 whsiper_args.txt

diff --git a/.gitignore b/.gitignore
index 1928866..3e25b4f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,11 @@ _generate/
 *env*
 _version.py
 
+coverage
+libggml*
+libwhisper*
+updating
+
 # custom
 .idea
 _docs
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 39c16a8..af94411 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,6 +6,7 @@ add_subdirectory(whisper.cpp)
 
 pybind11_add_module(_pywhispercpp
 	src/main.cpp
+	whisper.cpp/examples/grammar-parser.cpp
 )
 
 target_link_libraries (_pywhispercpp PRIVATE whisper)
diff --git a/pywhispercpp/constants.py b/pywhispercpp/constants.py
index f56a3e9..b742455 100644
--- a/pywhispercpp/constants.py
+++ b/pywhispercpp/constants.py
@@ -95,6 +95,12 @@
             'type': bool,
             'description': "do not use past transcription (if any) as initial prompt for the decoder",
             'options': None,
+            'default': True
+    },
+    'no_timestamps': {
+            'type': bool,
+            'description': "do not generate timestamps",
+            'options': None,
             'default': False
     },
     'single_segment': {
@@ -164,18 +170,42 @@
             'options': None,
             'default': 0
     },
+    'debug_mode': {
+            'type': bool,
+            'description': "enable debug mode in whisper.cpp",
+            'options': None,
+            'default': False
+    },
     'audio_ctx': {
             'type': int,
             'description': "overwrite the audio context size (0 = use default)",
             'options': None,
             'default': 0
     },
+    'tdrz_enable': {
+            'type': bool,
+            'description': "enable tinydiarize speaker turn detection",
+            'options': None,
+            'default': False
+    },
     'initial_prompt': {
             'type': str,
             'description': "Initial prompt, these are prepended to any existing text context from a previous call",
             'options': None,
             'default': None
     },
+    'grammar': {
+            'type': str,
+            'description': "GBNF grammar text or a path to a grammar file",
+            'options': None,
+            'default': None
+    },
+    'grammar_rule': {
+            'type': str,
+            'description': "top-level GBNF grammar rule name",
+            'options': None,
+            'default': 'root'
+    },
     'prompt_tokens': {
             'type': Tuple,
             'description': "tokens to provide to the whisper decoder as initial prompt",
@@ -188,11 +218,23 @@
             'options': None,
             'default': 0
     },
+    'carry_initial_prompt': {
+            'type': bool,
+            'description': "always prepend the initial prompt to each decode window",
+            'options': None,
+            'default': False
+    },
     'language': {
             'type': str,
             'description': 'for auto-detection, set to None, "" or "auto"',
             'options': None,
-            'default': ""
+            'default': "en"
+    },
+    'detect_language': {
+            'type': bool,
+            'description': 'enable automatic language detection during transcription',
+            'options': None,
+            'default': False
     },
     'suppress_blank': {
             'type': bool,
@@ -206,6 +248,12 @@
             'options': None,
             'default': False
     },
+    'suppress_nst': {
+            'type': bool,
+            'description': 'canonical whisper.cpp name for non-speech token suppression',
+            'options': None,
+            'default': False
+    },
     'temperature': {
             'type': float,
             'description': 'initial decoding temperature',
@@ -248,23 +296,29 @@
             'options': None,
             'default': 0.6
     },
+    'grammar_penalty': {
+            'type': float,
+            'description': 'scales down logits of non-grammar tokens',
+            'options': None,
+            'default': 100.0
+    },
     'greedy': {
             'type': dict,
             'description': 'greedy',
             'options': None,
-            'default': {"best_of": -1}
+            'default': {"best_of": 5}
     },
     'beam_search': {
             'type': dict,
             'description': 'beam_search',
             'options': None,
-            'default': {"beam_size": -1, "patience": -1.0}
+            'default': {"beam_size": 5, "patience": -1.0}
     },
     'extract_probability': {
             'type': bool,
             'description': 'calculate the geometric mean of token probabilities for each segment.',
             'options': None,
-            'default': True
+            'default': False
     },
     'vad': {
         'type': bool,
diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py
index 7f0f2a3..e73cb06 100644
--- a/pywhispercpp/model.py
+++ b/pywhispercpp/model.py
@@ -11,7 +11,7 @@
 import sys
 from pathlib import Path
 from time import time
-from typing import Union, Callable, List, TextIO, Tuple, Optional
+from typing import Any, Union, Callable, List, TextIO, Tuple, Optional, Dict
 import _pywhispercpp as pw
 import numpy as np
 import pywhispercpp.utils as utils
@@ -72,13 +72,14 @@ class Model:
 
     def __init__(self,
                  model: str = 'tiny',
-                 models_dir: str = None,
+                 models_dir: Optional[str] = None,
                  params_sampling_strategy: int = 0,
                  redirect_whispercpp_logs_to: Union[bool, TextIO, str, None] = False,
                  use_openvino: bool = False,
-                 openvino_model_path: str = None,
+                 openvino_model_path: Optional[str] = None,
                  openvino_device: str = 'CPU',
-                 openvino_cache_dir: str = None,
+                 openvino_cache_dir: Optional[str] = None,
+                 context_params: Union[Dict[str, Any], Any, None] = None,
                  **params):
         """
         :param model: The name of the model, one of the [AVAILABLE_MODELS](/pywhispercpp/#pywhispercpp.constants.AVAILABLE_MODELS),
@@ -96,6 +97,7 @@ def __init__(self,
         """
         self.model_path = utils.resolve_model_path(model, models_dir)
         self._ctx = None
+        self._context_params = self._resolve_context_params(context_params)
         self._sampling_strategy = pw.whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY if params_sampling_strategy == 0 else \
             pw.whisper_sampling_strategy.WHISPER_SAMPLING_BEAM_SEARCH
         self._params = pw.whisper_full_default_params(self._sampling_strategy)
@@ -112,8 +114,9 @@ def __init__(self,
 
     def transcribe(self,
                    media: Union[str, np.ndarray],
-                   n_processors: int = None,
-                   new_segment_callback: Callable[[Segment], None] = None,
+                   n_processors: Optional[int] = None,
+                   new_segment_callback: Optional[Callable[[Segment], None]] = None,
+                   abort_callback: Optional[Callable[[], bool]] = None,
                    **params) -> List[Segment]:
         """
         Transcribes the media provided as input and returns list of `Segment` objects.
@@ -124,12 +127,13 @@ def transcribe(self,
                              binding to whisper.cpp/whisper_full_parallel
                              > Split the input audio in chunks and process each chunk separately using whisper_full()
         :param new_segment_callback: callback function that will be called when a new segment is generated
+        :param abort_callback: callback function returning True to abort an in-flight transcription early
         :param params: keyword arguments for different whisper.cpp parameters, see ::: constants.PARAMS_SCHEMA
         :param extract_probability: If True, calculates the geometric mean of token probabilities for each segment,
             providing a confidence score interpretable as a probability in [0, 1].
         :return: List of transcription segments
         """
-        if type(media) is np.ndarray:
+        if isinstance(media, np.ndarray):
             audio = media
         else:
             if not Path(media).exists():
@@ -147,6 +151,11 @@ def transcribe(self,
             Model._new_segment_callback = new_segment_callback
             pw.assign_new_segment_callback(self._params, Model.__call_new_segment_callback)
 
+        if abort_callback is None:
+            pw.clear_abort_callback(self._params)
+        else:
+            pw.assign_abort_callback(self._params, abort_callback)
+
         # run inference
         start_time = time()
         logger.info("Transcribing ...")
@@ -191,7 +200,7 @@ def _get_segments(ctx, start: int, end: int, extract_probability: bool = False)
                 else:
                     avg_prob = np.nan
 
-            res.append(Segment(t0, t1, text.strip(), probability=np.float32(avg_prob)))
+            res.append(Segment(t0, t1, text.strip(), probability=float(avg_prob)))
         return res
 
     def get_params(self) -> dict:
@@ -246,7 +255,7 @@ def system_info() -> None:
         return pw.whisper_print_system_info()
 
     @staticmethod
-    def available_languages() -> list[str]:
+    def available_languages() -> List[str]:
         """
         Returns a list of supported language codes
 
@@ -258,6 +267,28 @@ def available_languages() -> list[str]:
             res.append(pw.whisper_lang_str(i))
         return res
 
+    @staticmethod
+    def _resolve_context_params(context_params: Union[Dict[str, Any], Any, None]):
+        if context_params is None:
+            return None
+
+        if isinstance(context_params, dict):
+            resolved = pw.whisper_context_default_params()
+            for key, value in context_params.items():
+                setattr(resolved, key, value)
+            return resolved
+
+        return context_params
+
+    @staticmethod
+    def _normalize_params(kwargs: dict) -> dict:
+        normalized = dict(kwargs)
+
+        if 'suppress_non_speech_tokens' in normalized and 'suppress_nst' not in normalized:
+            normalized['suppress_nst'] = normalized.pop('suppress_non_speech_tokens')
+
+        return normalized
+
     def _init_model(self) -> None:
         """
         Private method to initialize the method from the bindings, it will be called automatically from the __init__
@@ -265,7 +296,10 @@ def _init_model(self) -> None:
         """
         logger.info("Initializing the model ...")
         with utils.redirect_stderr(to=self.redirect_whispercpp_logs_to):
-            self._ctx = pw.whisper_init_from_file(self.model_path)
+            if self._context_params is None:
+                self._ctx = pw.whisper_init_from_file(self.model_path)
+            else:
+                self._ctx = pw.whisper_init_from_file_with_params(self.model_path, self._context_params)
             if self.use_openvino:
                 pw.whisper_ctx_init_openvino_encoder(self._ctx, self.openvino_model_path, self.openvino_device, self.openvino_cache_dir)
 
@@ -277,10 +311,25 @@ def _set_params(self, kwargs: dict) -> None:
         :param kwargs: dict like object for the different params
         :return: None
         """
-        for param in kwargs:
-            setattr(self._params, param, kwargs[param])
+        normalized = self._normalize_params(kwargs)
+        prompt_tokens = normalized.pop('prompt_tokens', None) if 'prompt_tokens' in normalized else None
+        grammar = normalized.pop('grammar', None) if 'grammar' in normalized else None
+        grammar_rule = normalized.pop('grammar_rule', 'root') if 'grammar_rule' in normalized else 'root'
+        grammar_penalty = normalized.get('grammar_penalty', self._params.grammar_penalty)
+
+        for param, value in normalized.items():
+            setattr(self._params, param, value)
+
+        if 'prompt_tokens' in kwargs:
+            self._params.set_prompt_tokens(prompt_tokens)
+
+        if 'grammar' in kwargs:
+            if grammar:
+                self._params.set_grammar(grammar, grammar_rule, grammar_penalty)
+            else:
+                self._params.clear_grammar()
 
-    def _transcribe(self, audio: np.ndarray, n_processors: int = None):
+    def _transcribe(self, audio: np.ndarray, n_processors: Optional[int] = None):
         """
         Private method to call the whisper.cpp/whisper_full function
 
@@ -310,10 +359,11 @@ def __call_new_segment_callback(ctx, n_new, user_data) -> None:
         start = n - n_new
         res = Model._get_segments(ctx, start, n, False)
         for segment in res:
-            Model._new_segment_callback(segment)
+            if Model._new_segment_callback is not None:
+                Model._new_segment_callback(segment)
 
     @staticmethod
-    def _load_audio(media_file_path: str) -> np.array:
+    def _load_audio(media_file_path: str) -> np.ndarray:
         """
          Helper method to return a `np.array` object from a media file
          If the media file is not a WAV file, it will try to convert it using ffmpeg
@@ -369,7 +419,7 @@ def wav_to_np(file_path):
             finally:
                 os.remove(temp_file_path)
 
-    def auto_detect_language(self,  media: Union[str, np.ndarray], offset_ms: int = 0, n_threads: int = 4) -> Tuple[Tuple[str, np.float32], dict[str, np.float32]]:
+    def auto_detect_language(self,  media: Union[str, np.ndarray], offset_ms: int = 0, n_threads: int = 4) -> Tuple[Tuple[str, np.float32], Dict[str, np.float32]]:
         """
         Automatic language detection using whisper.cpp/whisper_pcm_to_mel and whisper.cpp/whisper_lang_auto_detect
 
@@ -378,7 +428,7 @@ def auto_detect_language(self,  media: Union[str, np.ndarray], offset_ms: int =
         :param n_threads: number of threads to use
         :return: ((detected_language, probability), probabilities for all languages)
         """
-        if type(media) is np.ndarray:
+        if isinstance(media, np.ndarray):
             audio = media
         else:
             if not Path(media).exists():
@@ -391,11 +441,12 @@ def auto_detect_language(self,  media: Union[str, np.ndarray], offset_ms: int =
         auto_detect = pw.whisper_lang_auto_detect(self._ctx, offset_ms, n_threads, probs)
         langs = self.available_languages()
         lang_probs = {langs[i]: probs[i] for i in range(lang_count)}
-        return (langs[auto_detect], probs[auto_detect]), lang_probs
+        return (langs[auto_detect], np.float32(probs[auto_detect])), lang_probs
 
     def __del__(self):
         """
         Free up resources
         :return: None
         """
-        pw.whisper_free(self._ctx)
\ No newline at end of file
+        if self._ctx is not None:
+            pw.whisper_free(self._ctx)
\ No newline at end of file
diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi
index 060372d..27e3d52 100644
--- a/pywhispercpp/model.pyi
+++ b/pywhispercpp/model.pyi
@@ -9,6 +9,7 @@ import numpy.typing as npt
 
 AudioArray = npt.NDArray[np.float32]
 AudioInput = Union[str, AudioArray]
+ContextParams = Union[Dict[str, Any], Any]
 
 
 class GreedyParams(TypedDict):
@@ -32,7 +33,12 @@ class Segment:
 
 
 class Model:
+    """
+    Type stub for the ``Model`` class implemented in ``pywhispercpp/model.py``.
+    """
+  
     _new_segment_callback: Optional[Callable[[Segment], None]]
+    
 
     def __init__(
         self,
@@ -44,13 +50,15 @@ class Model:
         openvino_model_path: Optional[str] = None,
         openvino_device: str = 'CPU',
         openvino_cache_dir: Optional[str] = None,
+        context_params: Optional[ContextParams] = None,
         *,
         n_threads: Optional[int] = None,
         n_max_text_ctx: int = 16384,
         offset_ms: int = 0,
         duration_ms: int = 0,
         translate: bool = False,
-        no_context: bool = False,
+        no_context: bool = True,
+        no_timestamps: bool = False,
         single_segment: bool = False,
         print_special: bool = False,
         print_progress: bool = True,
@@ -62,13 +70,20 @@ class Model:
         max_len: int = 0,
         split_on_word: bool = False,
         max_tokens: int = 0,
+        debug_mode: bool = False,
         audio_ctx: int = 0,
+        tdrz_enable: bool = False,
         initial_prompt: Optional[str] = None,
+        grammar: Optional[str] = None,
+        grammar_rule: str = 'root',
         prompt_tokens: Optional[Tuple[Any, ...]] = None,
         prompt_n_tokens: int = 0,
-        language: str = '',
+        carry_initial_prompt: bool = False,
+        language: str = 'en',
+        detect_language: bool = False,
         suppress_blank: bool = True,
         suppress_non_speech_tokens: bool = False,
+        suppress_nst: bool = False,
         temperature: float = 0.0,
         max_initial_ts: float = 1.0,
         length_penalty: float = -1.0,
@@ -76,8 +91,9 @@ class Model:
         entropy_thold: float = 2.4,
         logprob_thold: float = -1.0,
         no_speech_thold: float = 0.6,
-        greedy: GreedyParams = {'best_of': -1},
-        beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0},
+        grammar_penalty: float = 100.0,
+        greedy: GreedyParams = {'best_of': 5},
+        beam_search: BeamSearchParams = {'beam_size': 5, 'patience': -1.0},
         vad: bool = False,
         vad_model_path: Optional[str] = None,
     )->None: ...
@@ -87,13 +103,15 @@ class Model:
         media: AudioInput,
         n_processors: Optional[int] = None,
         new_segment_callback: Optional[Callable[[Segment], None]] = None,
+        abort_callback: Optional[Callable[[], bool]] = None,
         *,
         n_threads: Optional[int] = None,
         n_max_text_ctx: int = 16384,
         offset_ms: int = 0,
         duration_ms: int = 0,
         translate: bool = False,
-        no_context: bool = False,
+        no_context: bool = True,
+        no_timestamps: bool = False,
         single_segment: bool = False,
         print_special: bool = False,
         print_progress: bool = True,
@@ -105,13 +123,20 @@ class Model:
         max_len: int = 0,
         split_on_word: bool = False,
         max_tokens: int = 0,
+        debug_mode: bool = False,
         audio_ctx: int = 0,
+        tdrz_enable: bool = False,
         initial_prompt: Optional[str] = None,
+        grammar: Optional[str] = None,
+        grammar_rule: str = 'root',
         prompt_tokens: Optional[Tuple[Any, ...]] = None,
         prompt_n_tokens: int = 0,
-        language: str = '',
+        carry_initial_prompt: bool = False,
+        language: str = 'en',
+        detect_language: bool = False,
         suppress_blank: bool = True,
         suppress_non_speech_tokens: bool = False,
+        suppress_nst: bool = False,
         temperature: float = 0.0,
         max_initial_ts: float = 1.0,
         length_penalty: float = -1.0,
@@ -119,8 +144,9 @@ class Model:
         entropy_thold: float = 2.4,
         logprob_thold: float = -1.0,
         no_speech_thold: float = 0.6,
-        greedy: GreedyParams = {'best_of': -1},
-        beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0},
+        grammar_penalty: float = 100.0,
+        greedy: GreedyParams = {'best_of': 5},
+        beam_search: BeamSearchParams = {'beam_size': 5, 'patience': -1.0},
         extract_probability: bool = False,
         vad: bool = False,
         vad_model_path: Optional[str] = None,
diff --git a/src/main.cpp b/src/main.cpp
index 48341bb..23150e5 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -16,6 +16,9 @@
 #include <pybind11/numpy.h>
 
 #include "whisper.h"
+#include "../whisper.cpp/examples/grammar-parser.h"
+
+#include <fstream>
 
 
 #define STRINGIFY(x) #x
@@ -32,6 +35,7 @@ using namespace pybind11::literals; // to bring in the `_a` literal
 py::function py_new_segment_callback;
 py::function py_encoder_begin_callback;
 py::function py_logits_filter_callback;
+py::object py_log_callback;
 
 
 // whisper context wrapper, to solve the incomplete type issue
@@ -56,6 +60,34 @@ struct whisper_model_loader_wrapper {
 
 };
 
+struct whisper_context_wrapper whisper_init_from_file_with_params_wrapper(
+        const char * path_model,
+        struct whisper_context_params cparams){
+    struct whisper_context * ctx = whisper_init_from_file_with_params(path_model, cparams);
+    struct whisper_context_wrapper ctw_w;
+    ctw_w.ptr = ctx;
+    return ctw_w;
+}
+
+struct whisper_context_wrapper whisper_init_from_buffer_with_params_wrapper(
+        void * buffer,
+        size_t buffer_size,
+        struct whisper_context_params cparams){
+    struct whisper_context * ctx = whisper_init_from_buffer_with_params(buffer, buffer_size, cparams);
+    struct whisper_context_wrapper ctw_w;
+    ctw_w.ptr = ctx;
+    return ctw_w;
+}
+
+// Calls whisper_init_with_params(loader->ptr, cparams) and returns the context pointer in a whisper_context_wrapper.
+struct whisper_context_wrapper whisper_init_with_params_wrapper(
+        struct whisper_model_loader_wrapper * loader,
+        struct whisper_context_params cparams){
+    struct whisper_context_wrapper ctw_w;
+    ctw_w.ptr = whisper_init_with_params(loader->ptr, cparams);
+    return ctw_w;
+}
+
 struct whisper_context_wrapper whisper_init_from_file_wrapper(const char * path_model){
     struct whisper_context_params cparams = whisper_context_default_params();
     struct whisper_context * ctx = whisper_init_from_file_with_params(path_model, cparams);
@@ -291,6 +323,60 @@ float whisper_full_get_token_p_wrapper(struct whisper_context_wrapper * ctx, int
     return whisper_full_get_token_p(ctx->ptr, i_segment, i_token);
 }
 
+bool whisper_full_get_segment_speaker_turn_next_wrapper(struct whisper_context_wrapper * ctx, int i_segment){
+    return whisper_full_get_segment_speaker_turn_next(ctx->ptr, i_segment);
+}
+
+const char * whisper_model_type_readable_wrapper(struct whisper_context_wrapper * ctx_w){
+    return whisper_model_type_readable(ctx_w->ptr);
+}
+
+int whisper_model_n_vocab_wrapper(struct whisper_context_wrapper * ctx_w){
+    return whisper_model_n_vocab(ctx_w->ptr);
+}
+
+int whisper_model_n_audio_ctx_wrapper(struct whisper_context_wrapper * ctx_w){
+    return whisper_model_n_audio_ctx(ctx_w->ptr);
+}
+
+int whisper_model_n_audio_state_wrapper(struct whisper_context_wrapper * ctx_w){
+    return whisper_model_n_audio_state(ctx_w->ptr);
+}
+
+int whisper_model_n_audio_head_wrapper(struct whisper_context_wrapper * ctx_w){
+    return whisper_model_n_audio_head(ctx_w->ptr);
+}
+
+int whisper_model_n_audio_layer_wrapper(struct whisper_context_wrapper * ctx_w){
+    return whisper_model_n_audio_layer(ctx_w->ptr);
+}
+
+int whisper_model_n_text_ctx_wrapper(struct whisper_context_wrapper * ctx_w){
+    return whisper_model_n_text_ctx(ctx_w->ptr);
+}
+
+int whisper_model_n_text_state_wrapper(struct whisper_context_wrapper * ctx_w){
+    return whisper_model_n_text_state(ctx_w->ptr);
+}
+
+int whisper_model_n_text_head_wrapper(struct whisper_context_wrapper * ctx_w){
+    return whisper_model_n_text_head(ctx_w->ptr);
+}
+
+int whisper_model_n_text_layer_wrapper(struct whisper_context_wrapper * ctx_w){
+    return whisper_model_n_text_layer(ctx_w->ptr);
+}
+
+int whisper_model_n_mels_wrapper(struct whisper_context_wrapper * ctx_w){
+    return whisper_model_n_mels(ctx_w->ptr);
+}
+
+int whisper_model_ftype_wrapper(struct whisper_context_wrapper * ctx_w){
+    return whisper_model_ftype(ctx_w->ptr);
+}
+
+bool _abort_callback(void * user_data);
+
 int whisper_ctx_init_openvino_encoder_wrapper(struct whisper_context_wrapper * ctx, const char * model_path,
                     const char * device,
                     const char * cache_dir){
@@ -301,8 +387,13 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
   std::string initial_prompt_str;
   std::string suppress_regex_str;
   std::string vad_model_path_str;
+    std::string grammar_rule_str;
+    grammar_parser::parse_state grammar_parsed;
+    std::vector<const whisper_grammar_element *> grammar_rules_storage;
+    std::vector<whisper_token> prompt_tokens_storage;
 public:
   py::function py_progress_callback;
+        py::object py_abort_callback;
   WhisperFullParamsWrapper(const whisper_full_params& params = whisper_full_params())
     : whisper_full_params(params),
       initial_prompt_str(params.initial_prompt ? params.initial_prompt : ""),
@@ -312,6 +403,7 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
     initial_prompt = initial_prompt_str.empty() ? nullptr : initial_prompt_str.c_str();
     suppress_regex = suppress_regex_str.empty() ? nullptr : suppress_regex_str.c_str();
     vad_model_path = vad_model_path_str.empty() ? nullptr : vad_model_path_str.c_str();
+    abort_callback_user_data = this;
     // progress callback
     progress_callback_user_data = this;
     progress_callback = [](struct whisper_context* ctx, struct whisper_state* state, int progress, void* user_data) {
@@ -333,11 +425,24 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
       initial_prompt_str(other.initial_prompt_str),
       suppress_regex_str(other.suppress_regex_str),
       vad_model_path_str(other.vad_model_path_str),
-      py_progress_callback(other.py_progress_callback) {
+            grammar_rule_str(other.grammar_rule_str),
+            grammar_parsed(other.grammar_parsed),
+            grammar_rules_storage(grammar_parsed.c_rules()), // rebuilt from this object's grammar_parsed (declared earlier, so already copied); other's pointers would alias other's rule buffers
+            prompt_tokens_storage(other.prompt_tokens_storage),
+        py_progress_callback(other.py_progress_callback),
+        py_abort_callback(other.py_abort_callback) {
     // Reset pointers to new string copies
     initial_prompt = initial_prompt_str.empty() ? nullptr : initial_prompt_str.c_str();
     suppress_regex = suppress_regex_str.empty() ? nullptr : suppress_regex_str.c_str();
     vad_model_path = vad_model_path_str.empty() ? nullptr : vad_model_path_str.c_str();
+        grammar_rules = grammar_rules_storage.empty() ? nullptr : grammar_rules_storage.data();
+        n_grammar_rules = grammar_rules_storage.size();
+        if (!grammar_rule_str.empty() && grammar_parsed.symbol_ids.find(grammar_rule_str) != grammar_parsed.symbol_ids.end()) {
+            i_start_rule = grammar_parsed.symbol_ids.at(grammar_rule_str);
+        }
+        prompt_tokens = prompt_tokens_storage.empty() ? nullptr : prompt_tokens_storage.data();
+        prompt_n_tokens = prompt_tokens_storage.size();
+    abort_callback_user_data = this;
     progress_callback_user_data = this;
     progress_callback = [](struct whisper_context* ctx, struct whisper_state* state, int progress, void* user_data) {
       auto* self = static_cast<WhisperFullParamsWrapper*>(user_data);
@@ -365,6 +470,89 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
     vad_model_path_str = model_path;
     vad_model_path = vad_model_path_str.c_str();
   }
+        void set_abort_callback(py::function callback) { // install `callback` as the abort hook of this params object
+                abort_callback_user_data = this;   // _abort_callback casts user_data back to this wrapper
+                abort_callback = _abort_callback;  // native function that invokes the stored Python callable
+                py_abort_callback = callback;      // the wrapper holds its own reference to the callable
+        }
+        void clear_abort_callback() { // remove any abort hook from this params object
+                abort_callback = nullptr;          // no native abort function registered any more
+                abort_callback_user_data = this;   // user data keeps pointing at this wrapper
+                py_abort_callback = py::none();    // drop the stored Python callable
+        }
+    void clear_grammar() { // Drop any configured grammar and reset the grammar fields of the base struct.
+        grammar_rules = nullptr;          // detach the raw view before its backing storage is emptied
+        n_grammar_rules = 0;
+        i_start_rule = 0;
+        grammar_rules_storage.clear();
+        grammar_parsed = grammar_parser::parse_state();
+        grammar_rule_str.clear();
+    }
+    // Parse a GBNF grammar and store it, with the raw rule pointers of the base struct, in this object.
+    //   grammar_input: GBNF text, or a path to a file holding it (a path that opens is read instead).
+    //   rule_name:     start rule; an empty string selects "root".
+    //   penalty:       stored in grammar_penalty when >= 0; a negative value keeps the current one.
+    // Throws py::value_error when parsing yields no rules or the start rule is unknown. The previous
+    // grammar is cleared up front, so a failed call leaves the object with no grammar at all.
+    void set_grammar(const std::string& grammar_input, const std::string& rule_name = "", float penalty = -1.0f) {
+        clear_grammar();
+        if (grammar_input.empty()) {
+            if (penalty >= 0.0f) grammar_penalty = penalty;
+            return;
+        }
+        std::string grammar_source = grammar_input;
+        std::ifstream grammar_file(grammar_input);
+        if (grammar_file.is_open()) {
+            grammar_source.assign((std::istreambuf_iterator<char>(grammar_file)),
+                                  std::istreambuf_iterator<char>());
+        }
+        // Parse and validate into locals: a rejected grammar must not leave the members half-updated.
+        const grammar_parser::parse_state parsed = grammar_parser::parse(grammar_source.c_str());
+        if (parsed.rules.empty()) {
+            throw py::value_error("Failed to parse grammar input");
+        }
+        const std::string start_rule = rule_name.empty() ? "root" : rule_name;
+        const auto start_it = parsed.symbol_ids.find(start_rule);
+        if (start_it == parsed.symbol_ids.end()) {
+            throw py::value_error("Grammar rule '" + start_rule + "' not found");
+        }
+        grammar_parsed = parsed;
+        grammar_rule_str = start_rule;
+        grammar_rules_storage = grammar_parsed.c_rules(); // built from the member copy, not from the local
+        grammar_rules = grammar_rules_storage.data();
+        n_grammar_rules = grammar_rules_storage.size();
+        i_start_rule = start_it->second;
+        if (penalty >= 0.0f) grammar_penalty = penalty;
+    }
+    // Replace the prompt tokens with a copy of `tokens_obj` kept in C++-owned storage.
+    // None clears the prompt; otherwise tokens_obj must be a sequence of integer token ids.
+    // pybind11 cast failures propagate and leave the previously stored prompt untouched.
+    void set_prompt_tokens(const py::object& tokens_obj) {
+        std::vector<whisper_token> fresh_tokens;
+        if (!tokens_obj.is_none()) {
+            py::sequence tokens = tokens_obj.cast<py::sequence>();
+            fresh_tokens.reserve(tokens.size());
+            for (const auto & token : tokens) {
+                fresh_tokens.push_back(token.cast<whisper_token>());
+            }
+        }
+        // Commit only after every element converted: filling the member in place could
+        // reallocate it and then throw, leaving prompt_tokens pointing at freed memory.
+        prompt_tokens_storage.swap(fresh_tokens);
+        prompt_tokens = prompt_tokens_storage.empty() ? nullptr : prompt_tokens_storage.data();
+        prompt_n_tokens = prompt_tokens_storage.size();
+    }
+    // Return the current prompt tokens as a Python tuple of ints. Tokens set through
+    // set_prompt_tokens come from the owned storage; otherwise the raw pointer/count pair is read.
+    py::tuple get_prompt_tokens() const {
+        const bool owned = !prompt_tokens_storage.empty();
+        const whisper_token * tokens_ptr = owned ? prompt_tokens_storage.data() : prompt_tokens;
+        size_t token_count = owned ? prompt_tokens_storage.size() : static_cast<size_t>(std::max(prompt_n_tokens, 0));
+        if (tokens_ptr == nullptr) token_count = 0; // prompt_n_tokens is writable on its own; never read through a null pointer
+        py::tuple tokens(token_count);
+        for (size_t i = 0; i < token_count; ++i) tokens[i] = py::int_(tokens_ptr[i]);
+        return tokens;
+    }
 };
 WhisperFullParamsWrapper  whisper_full_default_params_wrapper(enum whisper_sampling_strategy strategy) {
     return WhisperFullParamsWrapper(whisper_full_default_params(strategy));
@@ -417,6 +605,57 @@ void assign_logits_filter_callback(struct whisper_full_params *params, py::funct
     py_logits_filter_callback = f;
 }
 
+// Native abort function stored in whisper_full_params::abort_callback; user_data is the owning wrapper.
+// Returns the Python callable's result as bool. NOTE(review): a Python exception propagates out of here.
+bool _abort_callback(void * user_data) {
+    auto * wrapper = static_cast<WhisperFullParamsWrapper *>(user_data);
+    const bool usable = wrapper && wrapper->py_abort_callback && !wrapper->py_abort_callback.is_none();
+    if (!usable) return false; // nothing callable is installed: never request an abort
+    py::gil_scoped_acquire gil; // taken before the Python call below
+    py::function py_cb = wrapper->py_abort_callback.cast<py::function>();
+    py::object verdict = py_cb();
+    return verdict.cast<bool>();
+}
+
+// Module-level helper: install `callback` as the abort callback of `params_base`, or remove the
+// current one when `callback` is None.
+// NOTE(review): the static_cast assumes params_base really is a WhisperFullParamsWrapper — confirm
+// that no plain whisper_full_params instance can reach this function from Python.
+void assign_abort_callback(whisper_full_params *params_base, py::object callback){
+    auto * wrapper = static_cast<WhisperFullParamsWrapper *>(params_base);
+    if (!callback.is_none()) {
+        // The member helper stores the callable, the user data and the native function.
+        wrapper->set_abort_callback(callback.cast<py::function>());
+    } else {
+        wrapper->clear_abort_callback();
+    }
+}
+
+// Module-level helper: remove the abort callback from `params_base`.
+// NOTE(review): assumes params_base is a WhisperFullParamsWrapper — the cast below is unchecked.
+void clear_abort_callback(whisper_full_params *params_base) {
+    auto * wrapper = static_cast<WhisperFullParamsWrapper *>(params_base);
+    wrapper->clear_abort_callback(); // callable -> None, native function -> nullptr, user data -> the wrapper
+}
+
+// Route whisper's log output to a Python callable; None passes null arguments to whisper_log_set instead.
+void whisper_log_set_wrapper(py::object callback) {
+    // Captureless lambda: usable as the plain function pointer whisper_log_set takes. It reads the
+    // callable from the file-level py_log_callback and ignores user_data.
+    auto forward_to_python = [](enum ggml_log_level level, const char * text, void * /*user_data*/) {
+        py::gil_scoped_acquire gil;
+        py::function target = py_log_callback.cast<py::function>();
+        target(py::int_(static_cast<int>(level)), py::str(text ? text : ""));
+    };
+    if (!callback.is_none()) {
+        py_log_callback = callback.cast<py::function>();
+        whisper_log_set(forward_to_python, nullptr);
+        return;
+    }
+    py_log_callback = py::none();
+    whisper_log_set(nullptr, nullptr);
+}
+
 py::dict get_greedy(whisper_full_params * params){
     py::dict d("best_of"_a=params->greedy.best_of);
     return d;
@@ -532,7 +771,34 @@ PYBIND11_MODULE(_pywhispercpp, m) {
     m.attr("WHISPER_HOP_LENGTH") = WHISPER_HOP_LENGTH;
     m.attr("WHISPER_CHUNK_SIZE") = WHISPER_CHUNK_SIZE;
 
+    py::enum_<whisper_alignment_heads_preset>(m, "whisper_alignment_heads_preset")
+        .value("WHISPER_AHEADS_NONE", whisper_alignment_heads_preset::WHISPER_AHEADS_NONE)
+        .value("WHISPER_AHEADS_N_TOP_MOST", whisper_alignment_heads_preset::WHISPER_AHEADS_N_TOP_MOST)
+        .value("WHISPER_AHEADS_CUSTOM", whisper_alignment_heads_preset::WHISPER_AHEADS_CUSTOM)
+        .value("WHISPER_AHEADS_TINY_EN", whisper_alignment_heads_preset::WHISPER_AHEADS_TINY_EN)
+        .value("WHISPER_AHEADS_TINY", whisper_alignment_heads_preset::WHISPER_AHEADS_TINY)
+        .value("WHISPER_AHEADS_BASE_EN", whisper_alignment_heads_preset::WHISPER_AHEADS_BASE_EN)
+        .value("WHISPER_AHEADS_BASE", whisper_alignment_heads_preset::WHISPER_AHEADS_BASE)
+        .value("WHISPER_AHEADS_SMALL_EN", whisper_alignment_heads_preset::WHISPER_AHEADS_SMALL_EN)
+        .value("WHISPER_AHEADS_SMALL", whisper_alignment_heads_preset::WHISPER_AHEADS_SMALL)
+        .value("WHISPER_AHEADS_MEDIUM_EN", whisper_alignment_heads_preset::WHISPER_AHEADS_MEDIUM_EN)
+        .value("WHISPER_AHEADS_MEDIUM", whisper_alignment_heads_preset::WHISPER_AHEADS_MEDIUM)
+        .value("WHISPER_AHEADS_LARGE_V1", whisper_alignment_heads_preset::WHISPER_AHEADS_LARGE_V1)
+        .value("WHISPER_AHEADS_LARGE_V2", whisper_alignment_heads_preset::WHISPER_AHEADS_LARGE_V2)
+        .value("WHISPER_AHEADS_LARGE_V3", whisper_alignment_heads_preset::WHISPER_AHEADS_LARGE_V3)
+        .value("WHISPER_AHEADS_LARGE_V3_TURBO", whisper_alignment_heads_preset::WHISPER_AHEADS_LARGE_V3_TURBO)
+        .export_values();
+
     py::class_<whisper_context_wrapper>(m, "whisper_context");
+        py::class_<whisper_context_params>(m, "whisper_context_params")
+            .def(py::init<>())
+            .def_readwrite("use_gpu", &whisper_context_params::use_gpu)
+            .def_readwrite("flash_attn", &whisper_context_params::flash_attn)
+            .def_readwrite("gpu_device", &whisper_context_params::gpu_device)
+            .def_readwrite("dtw_token_timestamps", &whisper_context_params::dtw_token_timestamps)
+            .def_readwrite("dtw_aheads_preset", &whisper_context_params::dtw_aheads_preset)
+            .def_readwrite("dtw_n_top", &whisper_context_params::dtw_n_top)
+            .def_readwrite("dtw_mem_size", &whisper_context_params::dtw_mem_size);
     py::class_<whisper_token>(m, "whisper_token")
             .def(py::init<>());
     py::class_<whisper_token_data>(m,"whisper_token_data")
@@ -545,20 +811,33 @@ PYBIND11_MODULE(_pywhispercpp, m) {
             .def_readwrite("ptsum", &whisper_token_data::ptsum)
             .def_readwrite("t0", &whisper_token_data::t0)
             .def_readwrite("t1", &whisper_token_data::t1)
+            .def_readwrite("t_dtw", &whisper_token_data::t_dtw)
             .def_readwrite("vlen", &whisper_token_data::vlen);
 
     py::class_<whisper_model_loader_wrapper>(m,"whisper_model_loader")
             .def(py::init<>());
 
+        m.def("whisper_context_default_params", &whisper_context_default_params,
+            "Return the default context parameters used during model initialization.");
+
     DEF_RELEASE_GIL("whisper_init_from_file", &whisper_init_from_file_wrapper, "Various functions for loading a ggml whisper model.\n"
                                                                     "Allocate (almost) all memory needed for the model.\n"
                                                                     "Return NULL on failure");
+        DEF_RELEASE_GIL("whisper_init_from_file_with_params", &whisper_init_from_file_with_params_wrapper, "Various functions for loading a ggml whisper model.\n"
+                                                  "Allocate (almost) all memory needed for the model.\n"
+                                                  "Return NULL on failure");
     DEF_RELEASE_GIL("whisper_init_from_buffer", &whisper_init_from_buffer_wrapper, "Various functions for loading a ggml whisper model.\n"
                                                                         "Allocate (almost) all memory needed for the model.\n"
                                                                         "Return NULL on failure");
+        DEF_RELEASE_GIL("whisper_init_from_buffer_with_params", &whisper_init_from_buffer_with_params_wrapper, "Various functions for loading a ggml whisper model.\n"
+                                                    "Allocate (almost) all memory needed for the model.\n"
+                                                    "Return NULL on failure");
     DEF_RELEASE_GIL("whisper_init", &whisper_init_wrapper, "Various functions for loading a ggml whisper model.\n"
                                                 "Allocate (almost) all memory needed for the model.\n"
                                                 "Return NULL on failure");
+        DEF_RELEASE_GIL("whisper_init_with_params", &whisper_init_with_params_wrapper, "Various functions for loading a ggml whisper model.\n"
+                                    "Allocate (almost) all memory needed for the model.\n"
+                                    "Return NULL on failure");
 
 
     m.def("whisper_free", &whisper_free_wrapper, "Frees all memory allocated by the model.");
@@ -712,6 +991,7 @@ PYBIND11_MODULE(_pywhispercpp, m) {
         .def_readwrite("duration_ms", &WhisperFullParamsWrapper::duration_ms)
         .def_readwrite("translate", &WhisperFullParamsWrapper::translate)
         .def_readwrite("no_context", &WhisperFullParamsWrapper::no_context)
+        .def_readwrite("no_timestamps", &WhisperFullParamsWrapper::no_timestamps)
         .def_readwrite("single_segment", &WhisperFullParamsWrapper::single_segment)
         .def_readwrite("print_special", &WhisperFullParamsWrapper::print_special)
         .def_readwrite("print_progress", &WhisperFullParamsWrapper::print_progress)
@@ -724,7 +1004,9 @@ PYBIND11_MODULE(_pywhispercpp, m) {
         .def_readwrite("max_len", &WhisperFullParamsWrapper::max_len)
         .def_readwrite("split_on_word", &WhisperFullParamsWrapper::split_on_word)
         .def_readwrite("max_tokens", &WhisperFullParamsWrapper::max_tokens)
+        .def_readwrite("debug_mode", &WhisperFullParamsWrapper::debug_mode)
         .def_readwrite("audio_ctx", &WhisperFullParamsWrapper::audio_ctx)
+        .def_readwrite("tdrz_enable", &WhisperFullParamsWrapper::tdrz_enable)
         .def_property("suppress_regex",
             [](WhisperFullParamsWrapper &self) {
                 return py::str(self.suppress_regex ? self.suppress_regex : "");
@@ -740,8 +1022,29 @@ PYBIND11_MODULE(_pywhispercpp, m) {
                 self.set_initial_prompt(initial_prompt);
             }
         )
-        .def_readwrite("prompt_tokens", &WhisperFullParamsWrapper::prompt_tokens)
+        .def_property("prompt_tokens",
+            [](WhisperFullParamsWrapper &self) {
+                return self.get_prompt_tokens();
+            },
+            [](WhisperFullParamsWrapper &self, const py::object &tokens) {
+                self.set_prompt_tokens(tokens);
+            })
+        .def("set_abort_callback",
+             [](WhisperFullParamsWrapper &self, py::object callback) {
+                 if (callback.is_none()) {
+                     self.clear_abort_callback();
+                 } else {
+                     self.set_abort_callback(callback.cast<py::function>());
+                 }
+             },
+             py::arg("callback") = py::none(),
+             "Assign an abort callback that returns True to stop processing.")
+        .def("clear_abort_callback", &WhisperFullParamsWrapper::clear_abort_callback,
+             "Clear any previously assigned abort callback.")
+        .def("set_prompt_tokens", &WhisperFullParamsWrapper::set_prompt_tokens, py::arg("tokens"),
+             "Copy prompt tokens into C++-owned storage and update the raw pointers safely.")
         .def_readwrite("prompt_n_tokens", &WhisperFullParamsWrapper::prompt_n_tokens)
+        .def_readwrite("carry_initial_prompt", &WhisperFullParamsWrapper::carry_initial_prompt)
         .def_property("language",
             [](WhisperFullParamsWrapper &self) {
                 return py::str(self.language);
@@ -754,7 +1057,9 @@ PYBIND11_MODULE(_pywhispercpp, m) {
                     self.language = ""; //defaults to auto-detect
                 }
             })
+        .def_readwrite("detect_language", &WhisperFullParamsWrapper::detect_language)
         .def_readwrite("suppress_blank", &WhisperFullParamsWrapper::suppress_blank)
+        .def_readwrite("suppress_nst", &WhisperFullParamsWrapper::suppress_nst)
         .def_readwrite("temperature", &WhisperFullParamsWrapper::temperature)
         .def_readwrite("max_initial_ts", &WhisperFullParamsWrapper::max_initial_ts)
         .def_readwrite("length_penalty", &WhisperFullParamsWrapper::length_penalty)
@@ -767,9 +1072,15 @@ PYBIND11_MODULE(_pywhispercpp, m) {
                                  [](WhisperFullParamsWrapper &self, py::dict dict) {self.greedy.best_of = dict["best_of"].cast<int>();})
         .def_property("beam_search", [](WhisperFullParamsWrapper &self) {return py::dict("beam_size"_a=self.beam_search.beam_size, "patience"_a=self.beam_search.patience);},
                                 [](WhisperFullParamsWrapper &self, py::dict dict) {self.beam_search.beam_size = dict["beam_size"].cast<int>(); self.beam_search.patience = dict["patience"].cast<float>();})
+           .def("set_grammar", &WhisperFullParamsWrapper::set_grammar,
+               py::arg("grammar"), py::arg("rule_name") = "", py::arg("penalty") = -1.0f,
+               "Parse GBNF grammar text or a grammar file path and store the resulting grammar in C++-owned memory.")
+           .def("clear_grammar", &WhisperFullParamsWrapper::clear_grammar,
+               "Clear any previously configured grammar from the parameter object.")
         .def_readwrite("new_segment_callback_user_data", &WhisperFullParamsWrapper::new_segment_callback_user_data)
         .def_readwrite("encoder_begin_callback_user_data", &WhisperFullParamsWrapper::encoder_begin_callback_user_data)
         .def_readwrite("logits_filter_callback_user_data", &WhisperFullParamsWrapper::logits_filter_callback_user_data)
+                    .def_readwrite("grammar_penalty", &WhisperFullParamsWrapper::grammar_penalty)
         .def_readwrite("vad", &WhisperFullParamsWrapper::vad)
         .def_property("vad_model_path",
         [](WhisperFullParamsWrapper &self) {
@@ -799,6 +1110,8 @@ PYBIND11_MODULE(_pywhispercpp, m) {
     m.def("whisper_full_lang_id", &whisper_full_lang_id_wrapper, "Language id associated with the current context");
     m.def("whisper_full_get_segment_t0", &whisper_full_get_segment_t0_wrapper, "Get the start time of the specified segment");
     m.def("whisper_full_get_segment_t1", &whisper_full_get_segment_t1_wrapper, "Get the end time of the specified segment");
+        m.def("whisper_full_get_segment_speaker_turn_next", &whisper_full_get_segment_speaker_turn_next_wrapper,
+            "Get whether the next segment is predicted as a speaker turn.");
 
     m.def("whisper_full_get_segment_text", &whisper_full_get_segment_text_wrapper, "Get the text of the specified segment");
     m.def("whisper_full_n_tokens", &whisper_full_n_tokens_wrapper, "Get number of tokens in the specified segment.");
@@ -812,6 +1125,18 @@ PYBIND11_MODULE(_pywhispercpp, m) {
     m.def("whisper_full_get_token_p", &whisper_full_get_token_p_wrapper, "Get the probability of the specified token in the specified segment.");
 
     m.def("whisper_ctx_init_openvino_encoder", &whisper_ctx_init_openvino_encoder_wrapper, "Given a context, enable use of OpenVINO for encode inference.");
+    m.def("whisper_model_type_readable", &whisper_model_type_readable_wrapper, "Return the readable model type string.");
+    m.def("whisper_model_n_vocab", &whisper_model_n_vocab_wrapper, "Return the model vocabulary size.");
+    m.def("whisper_model_n_audio_ctx", &whisper_model_n_audio_ctx_wrapper, "Return the audio context size baked into the model.");
+    m.def("whisper_model_n_audio_state", &whisper_model_n_audio_state_wrapper, "Return the number of audio state units in the model.");
+    m.def("whisper_model_n_audio_head", &whisper_model_n_audio_head_wrapper, "Return the number of audio attention heads in the model.");
+    m.def("whisper_model_n_audio_layer", &whisper_model_n_audio_layer_wrapper, "Return the number of audio layers in the model.");
+    m.def("whisper_model_n_text_ctx", &whisper_model_n_text_ctx_wrapper, "Return the text context size baked into the model.");
+    m.def("whisper_model_n_text_state", &whisper_model_n_text_state_wrapper, "Return the number of text state units in the model.");
+    m.def("whisper_model_n_text_head", &whisper_model_n_text_head_wrapper, "Return the number of text attention heads in the model.");
+    m.def("whisper_model_n_text_layer", &whisper_model_n_text_layer_wrapper, "Return the number of text layers in the model.");
+    m.def("whisper_model_n_mels", &whisper_model_n_mels_wrapper, "Return the number of mel bins used by the model.");
+    m.def("whisper_model_ftype", &whisper_model_ftype_wrapper, "Return the model file type identifier.");
 
 
     ////////////////////////////////////////////////////////////////////////////
@@ -832,6 +1157,23 @@ PYBIND11_MODULE(_pywhispercpp, m) {
     m.def("assign_logits_filter_callback", &assign_logits_filter_callback, "Assigns a logits_filter_callback, takes <whisper_full_params> instance and a callable function with the same parameters which are defined in the interface",
             py::arg("params"), py::arg("callback"));
 
+        m.def("assign_abort_callback",
+            [](whisper_full_params * params, py::object callback) {
+                assign_abort_callback(params, callback);
+            },
+            "Assign an abort callback that returns True to stop processing.",
+            py::arg("params"), py::arg("callback") = py::none());
+
+            m.def("clear_abort_callback", &clear_abort_callback, "Clear any previously assigned abort callback.",
+                py::arg("params"));
+
+        m.def("whisper_log_set",
+            [](py::object callback) {
+                whisper_log_set_wrapper(callback);
+            },
+            "Assign a Python log callback or None to restore the default logger.",
+            py::arg("callback") = py::none());
+
     // VAD
     py::class_<whisper_vad_params>(m,"whisper_vad_params")
             .def(py::init<>())
diff --git a/tests/test_backwards_compatibility.py b/tests/test_backwards_compatibility.py
new file mode 100644
index 0000000..4e21cdc
--- /dev/null
+++ b/tests/test_backwards_compatibility.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import gc
+import subprocess
+import sys
+import textwrap
+import unittest
+from pathlib import Path
+from unittest import TestCase
+
+import _pywhispercpp as pw
+
+from pywhispercpp.model import Model, Segment
+
+
+WHISPER_CPP_DIR = Path(__file__).parent.parent / 'whisper.cpp'
+
+
+class TestBackwardsCompatibility(TestCase):
+    audio_file = WHISPER_CPP_DIR / 'samples/jfk.wav'
+    models_dir = str(WHISPER_CPP_DIR / 'models')
+    repo_root = Path(__file__).parent.parent
+
+    def tearDown(self):
+        gc.collect()
+
+    def _create_cpu_model(self):
+        return Model(
+            'tiny',
+            models_dir=self.models_dir,
+            context_params={'use_gpu': False, 'flash_attn': False},
+        )
+
+    def _run_python(self, code: str):
+        result = subprocess.run(
+            [sys.executable, '-c', textwrap.dedent(code)],
+            cwd=self.repo_root,
+            capture_output=True,
+            text=True,
+        )
+        self.assertEqual(
+            result.returncode,
+            0,
+            msg=f"stdout:\n{result.stdout}\n\nstderr:\n{result.stderr}",
+        )
+
+    def test_legacy_model_constructor_still_works(self):
+        self._run_python(
+            f'''
+            from pywhispercpp.model import Model
+
+            model = Model('tiny', models_dir={self.models_dir!r})
+            assert isinstance(model, Model)
+            '''
+        )
+
+    def test_legacy_alias_still_maps_to_suppress_nst(self):
+        self._run_python(
+            f'''
+            from pywhispercpp.model import Model
+
+            model = Model(
+                'tiny',
+                models_dir={self.models_dir!r},
+                context_params={{'use_gpu': False, 'flash_attn': False}},
+            )
+            model._set_params({{'suppress_non_speech_tokens': True}})
+            assert model.get_params()['suppress_nst'] is True
+            '''
+        )
+
+    def test_low_level_prompt_tokens_property_round_trips(self):
+        params = pw.whisper_full_default_params(
+            pw.whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY
+        )
+        params.prompt_tokens = (1, 2, 3)
+        self.assertEqual(tuple(params.prompt_tokens), (1, 2, 3))
+        self.assertEqual(params.prompt_n_tokens, 3)
+
+    def test_context_params_dict_is_additive(self):
+        self._run_python(
+            f'''
+            from pywhispercpp.model import Model
+
+            model = Model(
+                'tiny',
+                models_dir={self.models_dir!r},
+                context_params={{'use_gpu': False, 'flash_attn': False}},
+            )
+            assert isinstance(model, Model)
+            '''
+        )
+
+    def test_existing_new_segment_callback_still_works(self):
+        self._run_python(
+            f'''
+            from pywhispercpp.model import Model, Segment
+
+            seen = []
+            model = Model(
+                'tiny',
+                models_dir={self.models_dir!r},
+                context_params={{'use_gpu': False, 'flash_attn': False}},
+            )
+
+            def on_segment(segment):
+                seen.append(segment)
+
+            segments = model.transcribe({str(self.audio_file)!r}, new_segment_callback=on_segment)
+            assert isinstance(segments, list)
+            assert len(seen) > 0
+            assert all(isinstance(segment, Segment) for segment in seen)
+            '''
+        )
+
+    def test_abort_callback_can_abort_and_then_clear(self):
+        self._run_python(
+            f'''
+            from pywhispercpp.model import Model
+
+            model = Model(
+                'tiny',
+                models_dir={self.models_dir!r},
+                context_params={{'use_gpu': False, 'flash_attn': False}},
+            )
+            callback_calls = []
+
+            def abort_immediately():
+                callback_calls.append(True)
+                return True
+
+            aborted_segments = model.transcribe({str(self.audio_file)!r}, abort_callback=abort_immediately)
+            assert isinstance(aborted_segments, list)
+            assert len(callback_calls) > 0
+
+            normal_segments = model.transcribe({str(self.audio_file)!r})
+            assert isinstance(normal_segments, list)
+            assert len(normal_segments) > 0
+            '''
+        )
+
+    def test_log_callback_can_be_set_and_cleared(self):
+        pw.whisper_log_set(lambda level, text: None)
+        pw.whisper_log_set(None)
+
+    def test_alignment_preset_enum_is_available(self):
+        preset = pw.whisper_alignment_heads_preset.WHISPER_AHEADS_TINY
+        self.assertIsNotNone(preset)
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/tests/test_model.py b/tests/test_model.py
index f38200f..9ee8f65 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -8,6 +8,7 @@
 from pathlib import Path
 from unittest import TestCase
 
+import _pywhispercpp as pw
 from pywhispercpp.model import Model, Segment
 
 if __name__ == '__main__':
@@ -44,6 +45,62 @@ def test_auto_detect_language(self):
         detected_language, probs = self.model.auto_detect_language(str(self.audio_file))
         return self.assertIsInstance(detected_language, tuple) and self.assertEqual(detected_language[0], 'en')
 
+    def test_context_params_dict_init(self):
+        model = Model(
+            "tiny",
+            models_dir=str(WHISPER_CPP_DIR/'models'),
+            context_params={"use_gpu": False, "flash_attn": False},
+        )
+        self.assertIsInstance(model, Model)
+
+    def test_compat_alias_for_non_speech_tokens(self):
+        model = Model(
+            "tiny",
+            models_dir=str(WHISPER_CPP_DIR/'models'),
+            suppress_non_speech_tokens=True,
+        )
+        self.assertTrue(model.get_params()["suppress_nst"])
+
+    def test_prompt_token_helper_exists(self):
+        params = pw.whisper_full_default_params(
+            pw.whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY
+        )
+        params.set_prompt_tokens((1, 2, 3))
+        self.assertEqual(params.prompt_n_tokens, 3)
+
+    def test_grammar_helper_exists(self):
+        params = pw.whisper_full_default_params(
+            pw.whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY
+        )
+        params.set_grammar('root ::= "yes" | "no"', 'root', 42.0)
+        self.assertEqual(params.grammar_penalty, 42.0)
+        params.clear_grammar()
+
+    def test_model_accepts_grammar_param(self):
+        model = Model(
+            "tiny",
+            models_dir=str(WHISPER_CPP_DIR/'models'),
+            grammar='root ::= "yes" | "no"',
+            grammar_rule='root',
+            grammar_penalty=42.0,
+        )
+        self.assertIsInstance(model, Model)
+
+    def test_model_metadata_bindings(self):
+        self.assertIsInstance(pw.whisper_model_type_readable(self.model._ctx), str)
+        self.assertGreater(pw.whisper_model_n_vocab(self.model._ctx), 0)
+        self.assertGreater(pw.whisper_model_n_audio_ctx(self.model._ctx), 0)
+        self.assertGreater(pw.whisper_model_n_text_ctx(self.model._ctx), 0)
+
+    def test_speaker_turn_accessor_smoke(self):
+        self.model.transcribe(str(self.audio_file))
+        segment_count = pw.whisper_full_n_segments(self.model._ctx)
+        self.assertGreater(segment_count, 0)
+        self.assertIsInstance(
+            pw.whisper_full_get_segment_speaker_turn_next(self.model._ctx, 0),
+            bool,
+        )
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/whsiper_args.txt b/whsiper_args.txt
new file mode 100644
index 0000000..35678ec
--- /dev/null
+++ b/whsiper_args.txt
@@ -0,0 +1,252 @@
+
+usage: ./whisper-cli [options] file0 file1 ...
+supported audio formats: flac, mp3, ogg, wav
+
+options:
+  --help                            [default] show this help message and exit
+  --threads N                       [4      ] number of threads to use during computation
+  --processors N                    [1      ] number of processors to use during computation
+  --offset-t N                      [0      ] time offset in milliseconds
+  --offset-n N                      [0      ] segment index offset
+  --duration N                      [0      ] duration of audio to process in milliseconds
+  --max-context N                   [-1     ] maximum number of text context tokens to store
+  --max-len N                       [0      ] maximum segment length in characters
+  --max-tokens N                    [0      ] maximum number of tokens per segment
+  --split-on-word                   [false  ] split on word rather than on token
+  --best-of N                       [5      ] number of best candidates to keep
+  --beam-size N                     [5      ] beam size for beam search
+  --audio-ctx N                     [0      ] audio context size (0 - all)
+  --word-thold N                    [0.01   ] word timestamp probability threshold
+  --entropy-thold N                 [2.40   ] entropy threshold for decoder fail
+  --logprob-thold N                 [-1.00  ] log probability threshold for decoder fail
+  --no-speech-thold N               [0.60   ] no speech threshold
+  --temperature N                   [0.00   ] The sampling temperature, between 0 and 1
+  --temperature-inc N               [0.20   ] The increment of temperature, between 0 and 1
+  --debug-mode                      [false  ] enable debug mode (eg. dump log_mel)
+  --translate                       [false  ] translate from source language to english
+  --diarize                         [false  ] stereo audio diarization
+  --tinydiarize                     [false  ] enable tinydiarize (requires a tdrz model)
+  --no-fallback                     [false  ] do not use temperature fallback while decoding
+  --output-txt                      [false  ] output result in a text file
+  --output-vtt                      [false  ] output result in a vtt file
+  --output-srt                      [false  ] output result in a srt file
+  --output-lrc                      [false  ] output result in a lrc file
+  --output-words                    [false  ] output script for generating karaoke video
+  --font-path                       [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
+  --output-csv                      [false  ] output result in a CSV file
+  --output-json                     [false  ] output result in a JSON file
+  --output-json-full                [false  ] include more information in the JSON file
+  --output-file FNAME               [       ] output file path (without file extension)
+  --no-prints                       [false  ] do not print anything other than the results
+  --print-special                   [false  ] print special tokens
+  --print-colors                    [false  ] print colors
+  --print-confidence                [false  ] print confidence
+  --print-progress                  [false  ] print progress
+  --no-timestamps                   [false  ] do not print timestamps
+  --language LANG                   [en     ] spoken language ('auto' for auto-detect)
+  --detect-language                 [false  ] exit after automatically detecting language
+  --prompt PROMPT                   [       ] initial prompt (max n_text_ctx/2 tokens)
+  --carry-initial-prompt            [false  ] always prepend initial prompt
+  --model FNAME                     [models/ggml-base.en.bin] model path
+  --file FNAME                      [       ] input audio file path
+  --ov-e-device DNAME               [CPU    ] the OpenVINO device used for encode inference
+  --dtw MODEL                       [       ] compute token-level timestamps
+  --log-score                       [false  ] log best decoder scores of tokens
+  --no-gpu                          [false  ] disable GPU
+  --device N                        [0      ] GPU device ID (default: 0)
+  --flash-attn                      [true   ] enable flash attention
+  --no-flash-attn                   [false  ] disable flash attention
+  --suppress-blank                  [true   ] suppress blank outputs
+  --no-suppress-blank               [false  ] disable blank suppression
+  --suppress-nst                    [false  ] suppress non-speech tokens
+  --suppress-regex REGEX            [       ] regular expression matching tokens to suppress
+  --grammar GRAMMAR                 [       ] GBNF grammar to guide decoding
+  --grammar-rule RULE               [       ] top-level GBNF grammar rule name
+  --grammar-penalty N               [100.0  ] scales down logits of nongrammar tokens
+
+Voice Activity Detection (VAD) options:
+  --vad                               [false  ] enable Voice Activity Detection (VAD)
+  --vad-model FNAME                   [       ] VAD model path
+  --vad-threshold N                   [0.50   ] VAD threshold for speech recognition
+  --vad-min-speech-duration-ms N      [250    ] VAD min speech duration (0.0-1.0)
+  --vad-min-silence-duration-ms N     [100    ] VAD min silence duration (to split segments)
+  --vad-max-speech-duration-s N       [FLT_MAX] VAD max speech duration (auto-split longer)
+  --vad-speech-pad-ms N               [30     ] VAD speech padding (extend segments)
+  --vad-samples-overlap N             [0.10   ] VAD samples overlap (seconds between segments)
+
+
+usage: ./whisper-stream [options]
+
+options:
+  --help                            [default] show this help message and exit
+  --threads N                       [4      ] number of threads to use during computation
+  --step N                          [3000   ] audio step size in milliseconds
+  --length N                        [10000  ] audio length in milliseconds
+  --keep N                          [200    ] audio to keep from previous step in ms
+  --capture ID                      [-1     ] capture device ID
+  --max-tokens N                    [32     ] maximum number of tokens per audio chunk
+  --audio-ctx N                     [0      ] audio context size (0 - all)
+  --beam-size N                     [-1     ] beam size for beam search
+  --vad-thold N                     [0.60   ] voice activity detection threshold
+  --freq-thold N                    [100.00 ] high-pass frequency cutoff
+  --translate                       [false  ] translate from source language to english
+  --no-fallback                     [false  ] do not use temperature fallback while decoding
+  --print-special                   [false  ] print special tokens
+  --keep-context                    [false  ] keep context between audio chunks
+  --language LANG                   [en     ] spoken language
+  --model FNAME                     [models/ggml-base.en.bin] model path
+  --file FNAME                      [       ] text output file name
+  --tinydiarize                     [false  ] enable tinydiarize (requires a tdrz model)
+  --save-audio                      [false  ] save the recorded audio to a file
+  --no-gpu                          [false  ] disable GPU inference
+  --flash-attn                      [true   ] enable flash attention during inference
+  --no-flash-attn                   [false  ] disable flash attention during inference
+
+
+usage: ./whisper-server [options] 
+
+options:
+  --help                                 [default] show this help message and exit
+  --threads N                            [4      ] number of threads to use during computation
+  --processors N                         [1      ] number of processors to use during computation
+  --offset-t N                           [0      ] time offset in milliseconds
+  --offset-n N                           [0      ] segment index offset
+  --duration N                           [0      ] duration of audio to process in milliseconds
+  --max-context N                        [-1     ] maximum number of text context tokens to store
+  --max-len N                            [0      ] maximum segment length in characters
+  --split-on-word                        [false  ] split on word rather than on token
+  --best-of N                            [2      ] number of best candidates to keep
+  --beam-size N                          [-1     ] beam size for beam search
+  --audio-ctx N                          [0      ] audio context size (0 - all)
+  --word-thold N                         [0.01   ] word timestamp probability threshold
+  --entropy-thold N                      [2.40   ] entropy threshold for decoder fail
+  --logprob-thold N                      [-1.00  ] log probability threshold for decoder fail
+  --debug-mode                           [false  ] enable debug mode (eg. dump log_mel)
+  --translate                            [false  ] translate from source language to english
+  --diarize                              [false  ] stereo audio diarization
+  --tinydiarize                          [false  ] enable tinydiarize (requires a tdrz model)
+  --no-fallback                          [false  ] do not use temperature fallback while decoding
+  --print-special                        [false  ] print special tokens
+  --print-colors                         [false  ] print colors
+  --print-realtime                       [false  ] print output in realtime
+  --print-progress                       [false  ] print progress
+  --no-timestamps                        [false  ] do not print timestamps
+  --language LANG                        [en     ] spoken language ('auto' for auto-detect)
+  --detect-language                      [false  ] exit after automatically detecting language
+  --prompt PROMPT                        [       ] initial prompt
+  --model FNAME                          [models/ggml-base.en.bin] model path
+  --ov-e-device DNAME                    [CPU    ] the OpenVINO device used for encode inference
+  --dtw MODEL                            [       ] compute token-level timestamps
+  --host HOST                            [127.0.0.1] Hostname/ip-adress for the server
+  --port PORT                            [8080   ] Port number for the server
+  --public PATH                          [examples/server/public] Path to the public folder
+  --request-path PATH                    [       ] Request path for all requests
+  --inference-path PATH                  [/inference] Inference path for all requests
+  --convert                              [false  ] Convert audio to WAV, requires ffmpeg on the server
+  --tmp-dir                              [.      ] Temporary directory for ffmpeg transcoded files
+  --suppress-nst                         [false  ] suppress non-speech tokens
+  --no-speech-thold N                    [0.60   ] no speech threshold
+  --no-gpu                               [false  ] do not use gpu
+  --device N                             [0      ] GPU device ID (default: 0)
+  --flash-attn                           [true   ] enable flash attention
+  --no-flash-attn                        [false  ] disable flash attention
+  --no-language-probabilities            [false  ] exclude language probabilities from verbose_json output
+
+Voice Activity Detection (VAD) options:
+  --vad                               [false  ] enable Voice Activity Detection (VAD)
+  --vad-model FNAME                   [       ] VAD model path
+  --vad-threshold N                   [0.50   ] VAD threshold for speech recognition
+  --vad-min-speech-duration-ms N      [250    ] VAD min speech duration (0.0-1.0)
+  --vad-min-silence-duration-ms N     [100    ] VAD min silence duration (to split segments)
+  --vad-max-speech-duration-s N       [FLT_MAX] VAD max speech duration (auto-split longer)
+  --vad-speech-pad-ms N               [30     ] VAD speech padding (extend segments)
+  --vad-samples-overlap N             [0.10   ] VAD samples overlap (seconds between segments)
+
+
+
+deduped:
+options:
+  --help                            [default] show this help message and exit
+  --threads N                       [4      ] number of threads to use during computation
+  --processors N                    [1      ] number of processors to use during computation
+  --offset-t N                      [0      ] time offset in milliseconds
+  --offset-n N                      [0      ] segment index offset
+  --duration N                      [0      ] duration of audio to process in milliseconds
+  --max-context N                   [-1     ] maximum number of text context tokens to store
+  --max-len N                       [0      ] maximum segment length in characters
+  --max-tokens N                    [0      ] maximum number of tokens per segment
+  --split-on-word                   [false  ] split on word rather than on token
+  --best-of N                       [5      ] number of best candidates to keep
+  --beam-size N                     [5      ] beam size for beam search
+  --audio-ctx N                     [0      ] audio context size (0 - all)
+  --word-thold N                    [0.01   ] word timestamp probability threshold
+  --entropy-thold N                 [2.40   ] entropy threshold for decoder fail
+  --logprob-thold N                 [-1.00  ] log probability threshold for decoder fail
+  --no-speech-thold N               [0.60   ] no speech threshold
+  --temperature N                   [0.00   ] The sampling temperature, between 0 and 1
+  --temperature-inc N               [0.20   ] The increment of temperature, between 0 and 1
+  --debug-mode                      [false  ] enable debug mode (eg. dump log_mel)
+  --translate                       [false  ] translate from source language to english
+  --diarize                         [false  ] stereo audio diarization
+  --tinydiarize                     [false  ] enable tinydiarize (requires a tdrz model)
+  --no-fallback                     [false  ] do not use temperature fallback while decoding
+  --output-txt                      [false  ] output result in a text file
+  --output-vtt                      [false  ] output result in a vtt file
+  --output-srt                      [false  ] output result in a srt file
+  --output-lrc                      [false  ] output result in a lrc file
+  --output-words                    [false  ] output script for generating karaoke video
+  --font-path                       [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
+  --output-csv                      [false  ] output result in a CSV file
+  --output-json                     [false  ] output result in a JSON file
+  --output-json-full                [false  ] include more information in the JSON file
+  --output-file FNAME               [       ] output file path (without file extension)
+  --no-prints                       [false  ] do not print anything other than the results
+  --print-special                   [false  ] print special tokens
+  --print-colors                    [false  ] print colors
+  --print-confidence                [false  ] print confidence
+  --print-progress                  [false  ] print progress
+  --no-timestamps                   [false  ] do not print timestamps
+  --language LANG                   [en     ] spoken language ('auto' for auto-detect)
+  --detect-language                 [false  ] exit after automatically detecting language
+  --prompt PROMPT                   [       ] initial prompt (max n_text_ctx/2 tokens)
+  --carry-initial-prompt            [false  ] always prepend initial prompt
+  --model FNAME                     [models/ggml-base.en.bin] model path
+  --file FNAME                      [       ] input audio file path
+  --ov-e-device DNAME               [CPU    ] the OpenVINO device used for encode inference
+  --dtw MODEL                       [       ] compute token-level timestamps
+  --log-score                       [false  ] log best decoder scores of tokens
+  --no-gpu                          [false  ] disable GPU
+  --device N                        [0      ] GPU device ID (default: 0)
+  --flash-attn                      [true   ] enable flash attention
+  --no-flash-attn                   [false  ] disable flash attention
+  --suppress-blank                  [true   ] suppress blank outputs
+  --no-suppress-blank               [false  ] disable blank suppression
+  --suppress-nst                    [false  ] suppress non-speech tokens
+  --suppress-regex REGEX            [       ] regular expression matching tokens to suppress
+  --grammar GRAMMAR                 [       ] GBNF grammar to guide decoding
+  --grammar-rule RULE               [       ] top-level GBNF grammar rule name
+  --grammar-penalty N               [100.0  ] scales down logits of nongrammar tokens
+  --vad                               [false  ] enable Voice Activity Detection (VAD)
+  --vad-model FNAME                   [       ] VAD model path
+  --vad-threshold N                   [0.50   ] VAD threshold for speech recognition
+  --vad-min-speech-duration-ms N      [250    ] VAD min speech duration (0.0-1.0)
+  --vad-min-silence-duration-ms N     [100    ] VAD min silence duration (to split segments)
+  --vad-max-speech-duration-s N       [FLT_MAX] VAD max speech duration (auto-split longer)
+  --vad-speech-pad-ms N               [30     ] VAD speech padding (extend segments)
+  --vad-samples-overlap N             [0.10   ] VAD samples overlap (seconds between segments)
+  --step N                          [3000   ] audio step size in milliseconds
+  --length N                        [10000  ] audio length in milliseconds
+  --keep N                          [200    ] audio to keep from previous step in ms
+  --capture ID                      [-1     ] capture device ID
+  --vad-thold N                     [0.60   ] voice activity detection threshold
+  --freq-thold N                    [100.00 ] high-pass frequency cutoff
+  --keep-context                    [false  ] keep context between audio chunks
+  --save-audio                      [false  ] save the recorded audio to a file
+  --host HOST                       [127.0.0.1] Hostname/ip-adress for the server
+  --port PORT                       [8080   ] Port number for the server
+  --public PATH                     [examples/server/public] Path to the public folder
+  --request-path PATH               [       ] Request path for all requests
+  --inference-path PATH             [/inference] Inference path for all requests
+  --convert                         [false  ] Convert audio to WAV, requires ffmpeg on the server
+  --tmp-dir                         [.      ] Temporary directory for ffmpeg transcoded files
+  --no-language-probabilities       [false  ] exclude language probabilities from verbose_json output

From 830bf271f8666858c5f00198ebeec8e2011df9b6 Mon Sep 17 00:00:00 2001
From: scottmonster <dev@scottv.id>
Date: Sun, 17 May 2026 17:19:08 -0500
Subject: [PATCH 03/16] add context params and update grammar

---
 pywhispercpp/model.py  | 61 +++++++++++++++++++++++++++---------------
 pywhispercpp/model.pyi | 15 ++++++++---
 src/main.cpp           | 14 ++++++++--
 3 files changed, 64 insertions(+), 26 deletions(-)

diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py
index e73cb06..84cc988 100644
--- a/pywhispercpp/model.py
+++ b/pywhispercpp/model.py
@@ -11,7 +11,7 @@
 import sys
 from pathlib import Path
 from time import time
-from typing import Any, Union, Callable, List, TextIO, Tuple, Optional, Dict
+from typing import Any, Union, Callable, List, TextIO, Tuple, Optional, Dict, TypedDict
 import _pywhispercpp as pw
 import numpy as np
 import pywhispercpp.utils as utils
@@ -29,6 +29,19 @@
 logger = logging.getLogger(__name__)
 
 
+class ContextParams(TypedDict, total=False):
+    use_gpu: bool
+    flash_attn: bool
+    gpu_device: int
+    dtw_token_timestamps: bool
+    dtw_aheads_preset: int
+    dtw_n_top: int
+    dtw_mem_size: int
+
+
+_CONTEXT_PARAM_KEYS = frozenset(ContextParams.__annotations__)
+
+
 class Segment:
     """
     A small class representing a transcription segment
@@ -79,7 +92,7 @@ def __init__(self,
                  openvino_model_path: Optional[str] = None,
                  openvino_device: str = 'CPU',
                  openvino_cache_dir: Optional[str] = None,
-                 context_params: Union[Dict[str, Any], Any, None] = None,
+                 context_params: Optional[ContextParams] = None,
                  **params):
         """
         :param model: The name of the model, one of the [AVAILABLE_MODELS](/pywhispercpp/#pywhispercpp.constants.AVAILABLE_MODELS),
@@ -151,10 +164,7 @@ def transcribe(self,
             Model._new_segment_callback = new_segment_callback
             pw.assign_new_segment_callback(self._params, Model.__call_new_segment_callback)
 
-        if abort_callback is None:
-            pw.clear_abort_callback(self._params)
-        else:
-            pw.assign_abort_callback(self._params, abort_callback)
+        pw.assign_abort_callback(self._params, abort_callback)
 
         # run inference
         start_time = time()
@@ -268,17 +278,23 @@ def available_languages() -> List[str]:
         return res
 
     @staticmethod
-    def _resolve_context_params(context_params: Union[Dict[str, Any], Any, None]):
+    def _resolve_context_params(context_params: Optional[ContextParams]):
         if context_params is None:
             return None
 
-        if isinstance(context_params, dict):
-            resolved = pw.whisper_context_default_params()
-            for key, value in context_params.items():
-                setattr(resolved, key, value)
-            return resolved
+        if not isinstance(context_params, dict):
+            raise TypeError("context_params must be a ContextParams dict or None")
 
-        return context_params
+        unknown_keys = sorted(set(context_params) - _CONTEXT_PARAM_KEYS)
+        if unknown_keys:
+            raise TypeError(
+                f"Unknown context_params keys: {', '.join(unknown_keys)}"
+            )
+
+        resolved = pw.whisper_context_default_params()
+        for key, value in context_params.items():
+            setattr(resolved, key, value)
+        return resolved
 
     @staticmethod
     def _normalize_params(kwargs: dict) -> dict:
@@ -314,7 +330,7 @@ def _set_params(self, kwargs: dict) -> None:
         normalized = self._normalize_params(kwargs)
         prompt_tokens = normalized.pop('prompt_tokens', None) if 'prompt_tokens' in normalized else None
         grammar = normalized.pop('grammar', None) if 'grammar' in normalized else None
-        grammar_rule = normalized.pop('grammar_rule', 'root') if 'grammar_rule' in normalized else 'root'
+        grammar_rule = normalized.pop('grammar_rule', None) if 'grammar_rule' in normalized else None
         grammar_penalty = normalized.get('grammar_penalty', self._params.grammar_penalty)
 
         for param, value in normalized.items():
@@ -324,10 +340,7 @@ def _set_params(self, kwargs: dict) -> None:
             self._params.set_prompt_tokens(prompt_tokens)
 
         if 'grammar' in kwargs:
-            if grammar:
-                self._params.set_grammar(grammar, grammar_rule, grammar_penalty)
-            else:
-                self._params.clear_grammar()
+            self._params.set_grammar(grammar, grammar_rule, grammar_penalty)
 
     def _transcribe(self, audio: np.ndarray, n_processors: Optional[int] = None):
         """
@@ -419,13 +432,13 @@ def wav_to_np(file_path):
             finally:
                 os.remove(temp_file_path)
 
-    def auto_detect_language(self,  media: Union[str, np.ndarray], offset_ms: int = 0, n_threads: int = 4) -> Tuple[Tuple[str, np.float32], Dict[str, np.float32]]:
+    def auto_detect_language(self, media: Union[str, np.ndarray], offset_ms: Optional[int] = None, n_threads: Optional[int] = None) -> Tuple[Tuple[str, np.float32], Dict[str, np.float32]]:
         """
         Automatic language detection using whisper.cpp/whisper_pcm_to_mel and whisper.cpp/whisper_lang_auto_detect
 
         :param media: Media file path or a numpy array
-        :param offset_ms: offset in milliseconds
-        :param n_threads: number of threads to use
+        :param offset_ms: offset in milliseconds, defaults to the model's configured offset
+        :param n_threads: number of threads to use, defaults to the model's configured thread count
         :return: ((detected_language, probability), probabilities for all languages)
         """
         if isinstance(media, np.ndarray):
@@ -435,6 +448,12 @@ def auto_detect_language(self,  media: Union[str, np.ndarray], offset_ms: int =
                 raise FileNotFoundError(media)
             audio = self._load_audio(media)
 
+        if offset_ms is None:
+            offset_ms = self._params.offset_ms
+
+        if n_threads is None:
+            n_threads = self._params.n_threads
+
         pw.whisper_pcm_to_mel(self._ctx, audio, len(audio), n_threads)
         lang_count = self.lang_max_id() + 1
         probs = np.zeros(lang_count, dtype=np.float32)
diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi
index 27e3d52..e397820 100644
--- a/pywhispercpp/model.pyi
+++ b/pywhispercpp/model.pyi
@@ -9,7 +9,16 @@ import numpy.typing as npt
 
 AudioArray = npt.NDArray[np.float32]
 AudioInput = Union[str, AudioArray]
-ContextParams = Union[Dict[str, Any], Any]
+
+
+class ContextParams(TypedDict, total=False):
+    use_gpu: bool
+    flash_attn: bool
+    gpu_device: int
+    dtw_token_timestamps: bool
+    dtw_aheads_preset: int
+    dtw_n_top: int
+    dtw_mem_size: int
 
 
 class GreedyParams(TypedDict):
@@ -167,8 +176,8 @@ class Model:
     def auto_detect_language(
         self,
         media: AudioInput,
-        offset_ms: int = 0,
-        n_threads: int = 4,
+        offset_ms: Optional[int] = None,
+        n_threads: Optional[int] = None,
     ) -> Tuple[Tuple[str, np.float32], Dict[str, np.float32]]: ...
     def __del__(self) -> None: ...
 
diff --git a/src/main.cpp b/src/main.cpp
index 23150e5..bbbaff0 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1072,8 +1072,18 @@ PYBIND11_MODULE(_pywhispercpp, m) {
                                  [](WhisperFullParamsWrapper &self, py::dict dict) {self.greedy.best_of = dict["best_of"].cast<int>();})
         .def_property("beam_search", [](WhisperFullParamsWrapper &self) {return py::dict("beam_size"_a=self.beam_search.beam_size, "patience"_a=self.beam_search.patience);},
                                 [](WhisperFullParamsWrapper &self, py::dict dict) {self.beam_search.beam_size = dict["beam_size"].cast<int>(); self.beam_search.patience = dict["patience"].cast<float>();})
-           .def("set_grammar", &WhisperFullParamsWrapper::set_grammar,
-               py::arg("grammar"), py::arg("rule_name") = "", py::arg("penalty") = -1.0f,
+           .def("set_grammar",
+               [](WhisperFullParamsWrapper &self, py::object grammar, py::object rule_name, float penalty) {
+                   if (grammar.is_none()) {
+                       self.clear_grammar();
+                       return;
+                   }
+
+                   const std::string grammar_input = grammar.cast<std::string>();
+                   const std::string rule_name_str = rule_name.is_none() ? "" : rule_name.cast<std::string>();
+                   self.set_grammar(grammar_input, rule_name_str, penalty);
+               },
+               py::arg("grammar"), py::arg("rule_name") = py::none(), py::arg("penalty") = -1.0f,
                "Parse GBNF grammar text or a grammar file path and store the resulting grammar in C++-owned memory.")
            .def("clear_grammar", &WhisperFullParamsWrapper::clear_grammar,
                "Clear any previously configured grammar from the parameter object.")

From dde0958a00ae33aa748f4a2c1e53f349c00a40f6 Mon Sep 17 00:00:00 2001
From: scottmonster <dev@scottv.id>
Date: Sun, 17 May 2026 18:16:47 -0500
Subject: [PATCH 04/16] update docstrings

---
 pywhispercpp/model.py  | 87 +++++++++++++++++++++++++++++++++---------
 pywhispercpp/model.pyi |  5 +--
 2 files changed, 69 insertions(+), 23 deletions(-)

diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py
index 84cc988..42de2f6 100644
--- a/pywhispercpp/model.py
+++ b/pywhispercpp/model.py
@@ -95,18 +95,67 @@ def __init__(self,
                  context_params: Optional[ContextParams] = None,
                  **params):
         """
-        :param model: The name of the model, one of the [AVAILABLE_MODELS](/pywhispercpp/#pywhispercpp.constants.AVAILABLE_MODELS),
-                        (default to `tiny`), or a direct path to a `ggml` model.
-        :param models_dir: The directory where the models are stored, or where they will be downloaded if they don't
-                            exist, default to [MODELS_DIR](/pywhispercpp/#pywhispercpp.constants.MODELS_DIR) <user_data_dir/pywhsipercpp/models>
-        :param params_sampling_strategy: 0 -> GREEDY, else BEAM_SEARCH
-        :param redirect_whispercpp_logs_to: where to redirect the whisper.cpp logs, default to False (no redirection), accepts str file path, sys.stdout, sys.stderr, or use None to redirect to devnull
-        :param use_openvino: whether to use OpenVINO or not
-        :param openvino_model_path: path to the OpenVINO model
-        :param openvino_device: OpenVINO device, default to CPU
-        :param openvino_cache_dir: OpenVINO cache directory
-        :param params: keyword arguments for different whisper.cpp parameters,
-                        see [PARAMS_SCHEMA](/pywhispercpp/#pywhispercpp.constants.PARAMS_SCHEMA)
+        :param model: model name, default `tiny`, or a direct path to a ggml model file.
+        :param models_dir: directory containing model files; if omitted, uses `MODELS_DIR` unless `model`
+                           is already a direct file path.
+        :param params_sampling_strategy: sampling strategy selector; `0` uses greedy decoding and any
+                                         other value uses beam search.
+        :param redirect_whispercpp_logs_to: log redirection target. Use `False` for no redirection, `None`
+                                            for `/dev/null`, a file path string, or `sys.stdout`/`sys.stderr`.
+        :param use_openvino: whether to initialize the OpenVINO encoder backend.
+        :param openvino_model_path: path to the OpenVINO model directory or files.
+        :param openvino_device: OpenVINO device name, default `CPU`.
+        :param openvino_cache_dir: OpenVINO cache directory.
+        :param context_params: optional whisper context loader params. Accepted keys are `use_gpu`,
+                               `flash_attn`, `gpu_device`, `dtw_token_timestamps`,
+                               `dtw_aheads_preset`, `dtw_n_top`, and `dtw_mem_size`. Omitted keys inherit
+                               from `whisper_context_default_params()`.
+        :param params: decode parameters forwarded to `whisper_full_params`.
+            Supported keys:
+            - `n_threads`: number of inference threads. Default is `min(4, hardware_concurrency())`.
+            - `n_max_text_ctx`: max prompt-text tokens carried into the decoder. Default `16384`.
+            - `offset_ms`: audio start offset in milliseconds. Default `0`.
+            - `duration_ms`: audio duration to process in milliseconds. Default `0`.
+            - `translate`: translate output to English. Default `False`.
+            - `no_context`: disable reuse of past transcription context. Default `True`.
+            - `no_timestamps`: disable timestamp generation. Default `False`.
+            - `single_segment`: force a single output segment. Default `False`.
+            - `print_special`: print special tokens. Default `False`.
+            - `print_progress`: print progress information. Default `True`.
+            - `print_realtime`: print realtime output from whisper.cpp. Default `False`.
+            - `print_timestamps`: print timestamps during realtime output. Default `True`.
+            - `token_timestamps`: enable token-level timestamps. Default `False`.
+            - `thold_pt`: token timestamp probability threshold. Default `0.01`.
+            - `thold_ptsum`: token timestamp sum threshold. Default `0.01`.
+            - `max_len`: max segment length in characters. Default `0`.
+            - `split_on_word`: split on words when `max_len` is used. Default `False`.
+            - `max_tokens`: max tokens per segment. Default `0`.
+            - `debug_mode`: enable whisper.cpp debug mode. Default `False`.
+            - `audio_ctx`: override audio context size. Default `0`.
+            - `tdrz_enable`: enable tinydiarize speaker-turn detection. Default `False`.
+            - `initial_prompt`: initial text prompt prepended before decoding. Default `None`.
+            - `grammar`: GBNF grammar text or path to a grammar file. Default `None`.
+            - `grammar_rule`: top-level grammar rule name. Default `root` when grammar is used.
+            - `prompt_tokens`: explicit prompt token sequence. Default `None`.
+            - `prompt_n_tokens`: number of prompt tokens. Default `0`.
+            - `carry_initial_prompt`: prepend the initial prompt to each decode window. Default `False`.
+            - `language`: language code. Default `en`.
+            - `detect_language`: enable automatic language detection during transcription. Default `False`.
+            - `suppress_blank`: suppress blank outputs. Default `True`.
+            - `suppress_non_speech_tokens`: Python alias for `suppress_nst`. Default `False`.
+            - `suppress_nst`: suppress non-speech tokens. Default `False`.
+            - `temperature`: initial decoding temperature. Default `0.0`.
+            - `max_initial_ts`: maximum initial timestamp. Default `1.0`.
+            - `length_penalty`: length penalty. Default `-1.0`.
+            - `temperature_inc`: fallback temperature increment. Default `0.2`.
+            - `entropy_thold`: entropy threshold. Default `2.4`.
+            - `logprob_thold`: logprob threshold. Default `-1.0`.
+            - `no_speech_thold`: no-speech threshold. Default `0.6`.
+            - `grammar_penalty`: penalty applied to non-grammar tokens. Default `100.0`.
+            - `greedy`: greedy-decoder settings, typically `{"best_of": 5}`.
+            - `beam_search`: beam-search settings, schema default `{"beam_size": 5, "patience": -1.0}`.
+            - `vad`: enable VAD. Default `False`.
+            - `vad_model_path`: path to the VAD model. Default `None`.
         """
         self.model_path = utils.resolve_model_path(model, models_dir)
         self._ctx = None
@@ -136,12 +185,12 @@ def transcribe(self,
         Accepts a media_file path (audio/video) or a raw numpy array.
 
         :param media: Media file path or a numpy array
-        :param n_processors: if not None, it will run the transcription on multiple processes
-                             binding to whisper.cpp/whisper_full_parallel
-                             > Split the input audio in chunks and process each chunk separately using whisper_full()
-        :param new_segment_callback: callback function that will be called when a new segment is generated
+        :param n_processors: number of worker processes for `whisper_full_parallel`. If omitted, runs a
+                     single-process `whisper_full()` decode.
+        :param new_segment_callback: callback invoked for each newly produced `Segment` during decoding.
         :param abort_callback: callback function returning True to abort an in-flight transcription early
-        :param params: keyword arguments for different whisper.cpp parameters, see ::: constants.PARAMS_SCHEMA
+        :param params: keyword arguments for different whisper.cpp parameters; these override the model's
+                       active decode params for this call
         :param extract_probability: If True, calculates the geometric mean of token probabilities for each segment,
             providing a confidence score interpretable as a probability in [0, 1].
         :return: List of transcription segments
@@ -437,8 +486,8 @@ def auto_detect_language(self, media: Union[str, np.ndarray], offset_ms: Optiona
         Automatic language detection using whisper.cpp/whisper_pcm_to_mel and whisper.cpp/whisper_lang_auto_detect
 
         :param media: Media file path or a numpy array
-        :param offset_ms: offset in milliseconds, defaults to the model's configured offset
-        :param n_threads: number of threads to use, defaults to the model's configured thread count
+        :param offset_ms: offset in milliseconds; when omitted, uses the model's current `offset_ms`
+        :param n_threads: number of threads to use; when omitted, uses the model's current `n_threads`
         :return: ((detected_language, probability), probabilities for all languages)
         """
         if isinstance(media, np.ndarray):
diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi
index e397820..7e71af8 100644
--- a/pywhispercpp/model.pyi
+++ b/pywhispercpp/model.pyi
@@ -42,10 +42,7 @@ class Segment:
 
 
 class Model:
-    """
-    docuemnts strings
-    """
-  
+
     _new_segment_callback: Optional[Callable[[Segment], None]]
     
 

From b88c6398e94bb1402713de5991273bdad092ebcb Mon Sep 17 00:00:00 2001
From: scottmonster <dev@scottv.id>
Date: Sun, 17 May 2026 18:38:51 -0500
Subject: [PATCH 05/16] roll back to previous handling for grammar and
 prompt_tokens to maintain compatibility

---
 pywhispercpp/constants.py |  12 ----
 pywhispercpp/model.py     |  10 ----
 pywhispercpp/model.pyi    |   6 --
 src/main.cpp              | 116 +-------------------------------------
 4 files changed, 1 insertion(+), 143 deletions(-)

diff --git a/pywhispercpp/constants.py b/pywhispercpp/constants.py
index b742455..529b28b 100644
--- a/pywhispercpp/constants.py
+++ b/pywhispercpp/constants.py
@@ -194,18 +194,6 @@
             'options': None,
             'default': None
     },
-    'grammar': {
-            'type': str,
-            'description': "GBNF grammar text or a path to a grammar file",
-            'options': None,
-            'default': None
-    },
-    'grammar_rule': {
-            'type': str,
-            'description': "top-level GBNF grammar rule name",
-            'options': None,
-            'default': 'root'
-    },
     'prompt_tokens': {
             'type': Tuple,
             'description': "tokens to provide to the whisper decoder as initial prompt",
diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py
index 42de2f6..611dc9e 100644
--- a/pywhispercpp/model.py
+++ b/pywhispercpp/model.py
@@ -377,20 +377,10 @@ def _set_params(self, kwargs: dict) -> None:
         :return: None
         """
         normalized = self._normalize_params(kwargs)
-        prompt_tokens = normalized.pop('prompt_tokens', None) if 'prompt_tokens' in normalized else None
-        grammar = normalized.pop('grammar', None) if 'grammar' in normalized else None
-        grammar_rule = normalized.pop('grammar_rule', None) if 'grammar_rule' in normalized else None
-        grammar_penalty = normalized.get('grammar_penalty', self._params.grammar_penalty)
 
         for param, value in normalized.items():
             setattr(self._params, param, value)
 
-        if 'prompt_tokens' in kwargs:
-            self._params.set_prompt_tokens(prompt_tokens)
-
-        if 'grammar' in kwargs:
-            self._params.set_grammar(grammar, grammar_rule, grammar_penalty)
-
     def _transcribe(self, audio: np.ndarray, n_processors: Optional[int] = None):
         """
         Private method to call the whisper.cpp/whisper_full function
diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi
index 7e71af8..c548bea 100644
--- a/pywhispercpp/model.pyi
+++ b/pywhispercpp/model.pyi
@@ -80,8 +80,6 @@ class Model:
         audio_ctx: int = 0,
         tdrz_enable: bool = False,
         initial_prompt: Optional[str] = None,
-        grammar: Optional[str] = None,
-        grammar_rule: str = 'root',
         prompt_tokens: Optional[Tuple[Any, ...]] = None,
         prompt_n_tokens: int = 0,
         carry_initial_prompt: bool = False,
@@ -97,7 +95,6 @@ class Model:
         entropy_thold: float = 2.4,
         logprob_thold: float = -1.0,
         no_speech_thold: float = 0.6,
-        grammar_penalty: float = 100.0,
         greedy: GreedyParams = {'best_of': 5},
         beam_search: BeamSearchParams = {'beam_size': 5, 'patience': -1.0},
         vad: bool = False,
@@ -133,8 +130,6 @@ class Model:
         audio_ctx: int = 0,
         tdrz_enable: bool = False,
         initial_prompt: Optional[str] = None,
-        grammar: Optional[str] = None,
-        grammar_rule: str = 'root',
         prompt_tokens: Optional[Tuple[Any, ...]] = None,
         prompt_n_tokens: int = 0,
         carry_initial_prompt: bool = False,
@@ -150,7 +145,6 @@ class Model:
         entropy_thold: float = 2.4,
         logprob_thold: float = -1.0,
         no_speech_thold: float = 0.6,
-        grammar_penalty: float = 100.0,
         greedy: GreedyParams = {'best_of': 5},
         beam_search: BeamSearchParams = {'beam_size': 5, 'patience': -1.0},
         extract_probability: bool = False,
diff --git a/src/main.cpp b/src/main.cpp
index bbbaff0..7197cb7 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -16,9 +16,6 @@
 #include <pybind11/numpy.h>
 
 #include "whisper.h"
-#include "../whisper.cpp/examples/grammar-parser.h"
-
-#include <fstream>
 
 
 #define STRINGIFY(x) #x
@@ -387,10 +384,6 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
   std::string initial_prompt_str;
   std::string suppress_regex_str;
   std::string vad_model_path_str;
-    std::string grammar_rule_str;
-    grammar_parser::parse_state grammar_parsed;
-    std::vector<const whisper_grammar_element *> grammar_rules_storage;
-    std::vector<whisper_token> prompt_tokens_storage;
 public:
   py::function py_progress_callback;
         py::object py_abort_callback;
@@ -425,23 +418,12 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
       initial_prompt_str(other.initial_prompt_str),
       suppress_regex_str(other.suppress_regex_str),
       vad_model_path_str(other.vad_model_path_str),
-            grammar_rule_str(other.grammar_rule_str),
-            grammar_parsed(other.grammar_parsed),
-            grammar_rules_storage(other.grammar_rules_storage),
-            prompt_tokens_storage(other.prompt_tokens_storage),
         py_progress_callback(other.py_progress_callback),
         py_abort_callback(other.py_abort_callback) {
     // Reset pointers to new string copies
     initial_prompt = initial_prompt_str.empty() ? nullptr : initial_prompt_str.c_str();
     suppress_regex = suppress_regex_str.empty() ? nullptr : suppress_regex_str.c_str();
     vad_model_path = vad_model_path_str.empty() ? nullptr : vad_model_path_str.c_str();
-        grammar_rules = grammar_rules_storage.empty() ? nullptr : grammar_rules_storage.data();
-        n_grammar_rules = grammar_rules_storage.size();
-        if (!grammar_rule_str.empty() && grammar_parsed.symbol_ids.find(grammar_rule_str) != grammar_parsed.symbol_ids.end()) {
-            i_start_rule = grammar_parsed.symbol_ids.at(grammar_rule_str);
-        }
-        prompt_tokens = prompt_tokens_storage.empty() ? nullptr : prompt_tokens_storage.data();
-        prompt_n_tokens = prompt_tokens_storage.size();
     abort_callback_user_data = this;
     progress_callback_user_data = this;
     progress_callback = [](struct whisper_context* ctx, struct whisper_state* state, int progress, void* user_data) {
@@ -480,79 +462,6 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
                 abort_callback = nullptr;
                 abort_callback_user_data = this;
         }
-    void clear_grammar() {
-        grammar_rule_str.clear();
-        grammar_parsed = grammar_parser::parse_state();
-        grammar_rules_storage.clear();
-        grammar_rules = nullptr;
-        n_grammar_rules = 0;
-        i_start_rule = 0;
-    }
-    void set_grammar(const std::string& grammar_input, const std::string& rule_name = "", float penalty = -1.0f) {
-        clear_grammar();
-
-        if (grammar_input.empty()) {
-            if (penalty >= 0.0f) {
-                grammar_penalty = penalty;
-            }
-            return;
-        }
-
-        std::string grammar_source = grammar_input;
-        std::ifstream grammar_file(grammar_input);
-        if (grammar_file.is_open()) {
-            grammar_source.assign((std::istreambuf_iterator<char>(grammar_file)),
-                                                        std::istreambuf_iterator<char>());
-        }
-
-        grammar_parsed = grammar_parser::parse(grammar_source.c_str());
-        if (grammar_parsed.rules.empty()) {
-            throw py::value_error("Failed to parse grammar input");
-        }
-
-        grammar_rule_str = rule_name.empty() ? "root" : rule_name;
-        if (grammar_parsed.symbol_ids.find(grammar_rule_str) == grammar_parsed.symbol_ids.end()) {
-            throw py::value_error("Grammar rule '" + grammar_rule_str + "' not found");
-        }
-
-        grammar_rules_storage = grammar_parsed.c_rules();
-        grammar_rules = grammar_rules_storage.data();
-        n_grammar_rules = grammar_rules_storage.size();
-        i_start_rule = grammar_parsed.symbol_ids.at(grammar_rule_str);
-
-        if (penalty >= 0.0f) {
-            grammar_penalty = penalty;
-        }
-    }
-    void set_prompt_tokens(const py::object& tokens_obj) {
-        prompt_tokens_storage.clear();
-
-        if (tokens_obj.is_none()) {
-            prompt_tokens = nullptr;
-            prompt_n_tokens = 0;
-            return;
-        }
-
-        py::sequence tokens = tokens_obj.cast<py::sequence>();
-        prompt_tokens_storage.reserve(tokens.size());
-        for (const auto & token : tokens) {
-            prompt_tokens_storage.push_back(token.cast<whisper_token>());
-        }
-
-        prompt_tokens = prompt_tokens_storage.empty() ? nullptr : prompt_tokens_storage.data();
-        prompt_n_tokens = prompt_tokens_storage.size();
-    }
-    py::tuple get_prompt_tokens() const {
-        const whisper_token * tokens_ptr = prompt_tokens_storage.empty() ? prompt_tokens : prompt_tokens_storage.data();
-        const size_t token_count = prompt_tokens_storage.empty() ? static_cast<size_t>(std::max(prompt_n_tokens, 0)) : prompt_tokens_storage.size();
-
-        py::tuple tokens(token_count);
-        for (size_t i = 0; i < token_count; ++i) {
-            tokens[i] = py::int_(tokens_ptr[i]);
-        }
-
-        return tokens;
-    }
 };
 WhisperFullParamsWrapper  whisper_full_default_params_wrapper(enum whisper_sampling_strategy strategy) {
     return WhisperFullParamsWrapper(whisper_full_default_params(strategy));
@@ -1022,13 +931,7 @@ PYBIND11_MODULE(_pywhispercpp, m) {
                 self.set_initial_prompt(initial_prompt);
             }
         )
-        .def_property("prompt_tokens",
-            [](WhisperFullParamsWrapper &self) {
-                return self.get_prompt_tokens();
-            },
-            [](WhisperFullParamsWrapper &self, const py::object &tokens) {
-                self.set_prompt_tokens(tokens);
-            })
+        .def_readwrite("prompt_tokens", &WhisperFullParamsWrapper::prompt_tokens)
         .def("set_abort_callback",
              [](WhisperFullParamsWrapper &self, py::object callback) {
                  if (callback.is_none()) {
@@ -1041,8 +944,6 @@ PYBIND11_MODULE(_pywhispercpp, m) {
              "Assign an abort callback that returns True to stop processing.")
         .def("clear_abort_callback", &WhisperFullParamsWrapper::clear_abort_callback,
              "Clear any previously assigned abort callback.")
-        .def("set_prompt_tokens", &WhisperFullParamsWrapper::set_prompt_tokens, py::arg("tokens"),
-             "Copy prompt tokens into C++-owned storage and update the raw pointers safely.")
         .def_readwrite("prompt_n_tokens", &WhisperFullParamsWrapper::prompt_n_tokens)
         .def_readwrite("carry_initial_prompt", &WhisperFullParamsWrapper::carry_initial_prompt)
         .def_property("language",
@@ -1072,21 +973,6 @@ PYBIND11_MODULE(_pywhispercpp, m) {
                                  [](WhisperFullParamsWrapper &self, py::dict dict) {self.greedy.best_of = dict["best_of"].cast<int>();})
         .def_property("beam_search", [](WhisperFullParamsWrapper &self) {return py::dict("beam_size"_a=self.beam_search.beam_size, "patience"_a=self.beam_search.patience);},
                                 [](WhisperFullParamsWrapper &self, py::dict dict) {self.beam_search.beam_size = dict["beam_size"].cast<int>(); self.beam_search.patience = dict["patience"].cast<float>();})
-           .def("set_grammar",
-               [](WhisperFullParamsWrapper &self, py::object grammar, py::object rule_name, float penalty) {
-                   if (grammar.is_none()) {
-                       self.clear_grammar();
-                       return;
-                   }
-
-                   const std::string grammar_input = grammar.cast<std::string>();
-                   const std::string rule_name_str = rule_name.is_none() ? "" : rule_name.cast<std::string>();
-                   self.set_grammar(grammar_input, rule_name_str, penalty);
-               },
-               py::arg("grammar"), py::arg("rule_name") = py::none(), py::arg("penalty") = -1.0f,
-               "Parse GBNF grammar text or a grammar file path and store the resulting grammar in C++-owned memory.")
-           .def("clear_grammar", &WhisperFullParamsWrapper::clear_grammar,
-               "Clear any previously configured grammar from the parameter object.")
         .def_readwrite("new_segment_callback_user_data", &WhisperFullParamsWrapper::new_segment_callback_user_data)
         .def_readwrite("encoder_begin_callback_user_data", &WhisperFullParamsWrapper::encoder_begin_callback_user_data)
         .def_readwrite("logits_filter_callback_user_data", &WhisperFullParamsWrapper::logits_filter_callback_user_data)

From 83ba8cccbf97a64bc213ab0076dd50994c106c4a Mon Sep 17 00:00:00 2001
From: scottmonster <87917233+scottmonster@users.noreply.github.com>
Date: Sun, 17 May 2026 20:24:10 -0500
Subject: [PATCH 06/16] update default in model.pyi

---
 pywhispercpp/model.pyi | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi
index c548bea..8b9b323 100644
--- a/pywhispercpp/model.pyi
+++ b/pywhispercpp/model.pyi
@@ -1,7 +1,5 @@
 from __future__ import annotations
 
-# Generated by coverage/generate_pyi.py. Do not edit by hand.
-
 from typing import Any, Callable, Dict, List, Optional, TextIO, Tuple, TypedDict, Union
 
 import numpy as np
@@ -42,9 +40,7 @@ class Segment:
 
 
 class Model:
-
     _new_segment_callback: Optional[Callable[[Segment], None]]
-    
 
     def __init__(
         self,
@@ -56,7 +52,6 @@ class Model:
         openvino_model_path: Optional[str] = None,
         openvino_device: str = 'CPU',
         openvino_cache_dir: Optional[str] = None,
-        context_params: Optional[ContextParams] = None,
         *,
         n_threads: Optional[int] = None,
         n_max_text_ctx: int = 16384,
@@ -95,10 +90,11 @@ class Model:
         entropy_thold: float = 2.4,
         logprob_thold: float = -1.0,
         no_speech_thold: float = 0.6,
-        greedy: GreedyParams = {'best_of': 5},
-        beam_search: BeamSearchParams = {'beam_size': 5, 'patience': -1.0},
+        greedy: GreedyParams = {'best_of': -1},
+        beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0},
         vad: bool = False,
         vad_model_path: Optional[str] = None,
+        **params
     )->None: ...
 
     def transcribe(
@@ -145,11 +141,12 @@ class Model:
         entropy_thold: float = 2.4,
         logprob_thold: float = -1.0,
         no_speech_thold: float = 0.6,
-        greedy: GreedyParams = {'best_of': 5},
-        beam_search: BeamSearchParams = {'beam_size': 5, 'patience': -1.0},
+        greedy: GreedyParams = {'best_of': -1},
+        beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0},
         extract_probability: bool = False,
         vad: bool = False,
         vad_model_path: Optional[str] = None,
+        **params
     ) -> List[Segment]: ...
 
     def get_params(self) -> Dict[str, Any]: ...

From eb7f21ec9b9c6eaccb52ae91abcd3f62cf89c68b Mon Sep 17 00:00:00 2001
From: scottmonster <87917233+scottmonster@users.noreply.github.com>
Date: Tue, 19 May 2026 18:20:46 -0500
Subject: [PATCH 07/16] prompt_tokens + grammar + callbacks

---
 pywhispercpp/model.py  |  43 ++++
 pywhispercpp/model.pyi |   5 +
 src/main.cpp           | 532 +++++++++++++++++++++++++++++++++++------
 3 files changed, 511 insertions(+), 69 deletions(-)

diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py
index 611dc9e..44adaeb 100644
--- a/pywhispercpp/model.py
+++ b/pywhispercpp/model.py
@@ -354,6 +354,43 @@ def _normalize_params(kwargs: dict) -> dict:
 
         return normalized
 
+    def _apply_grammar_params(self, normalized: dict) -> dict:
+        has_grammar = 'grammar' in normalized
+        has_grammar_rule = 'grammar_rule' in normalized
+
+        if not has_grammar:
+            if has_grammar_rule:
+                raise AttributeError('grammar_rule requires grammar')
+            return normalized
+
+        grammar = normalized.pop('grammar')
+        grammar_rule = normalized.pop('grammar_rule', 'root')
+
+        if grammar is None:
+            self._params.clear_grammar()
+            return normalized
+
+        self._params.set_grammar(
+            grammar,
+            grammar_rule,
+            normalized.get('grammar_penalty', self._params.grammar_penalty),
+        )
+        return normalized
+
+    def _apply_prompt_token_params(self, normalized: dict) -> dict:
+        if 'prompt_tokens' not in normalized:
+            return normalized
+
+        prompt_tokens = normalized.pop('prompt_tokens')
+        normalized.pop('prompt_n_tokens', None)
+
+        if prompt_tokens is None:
+            self._params.clear_prompt_tokens()
+        else:
+            self._params.set_prompt_tokens(prompt_tokens)
+
+        return normalized
+
     def _init_model(self) -> None:
         """
         Private method to initialize the method from the bindings, it will be called automatically from the __init__
@@ -378,6 +415,12 @@ def _set_params(self, kwargs: dict) -> None:
         """
         normalized = self._normalize_params(kwargs)
 
+        if 'grammar' in normalized or 'grammar_rule' in normalized:
+            normalized = self._apply_grammar_params(normalized)
+
+        if 'prompt_tokens' in normalized:
+            normalized = self._apply_prompt_token_params(normalized)
+
         for param, value in normalized.items():
             setattr(self._params, param, value)
 
diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi
index 8b9b323..3e0812b 100644
--- a/pywhispercpp/model.pyi
+++ b/pywhispercpp/model.pyi
@@ -75,6 +75,8 @@ class Model:
         audio_ctx: int = 0,
         tdrz_enable: bool = False,
         initial_prompt: Optional[str] = None,
+        grammar: Optional[str] = None,
+        grammar_rule: str = 'root',
         prompt_tokens: Optional[Tuple[Any, ...]] = None,
         prompt_n_tokens: int = 0,
         carry_initial_prompt: bool = False,
@@ -126,6 +128,8 @@ class Model:
         audio_ctx: int = 0,
         tdrz_enable: bool = False,
         initial_prompt: Optional[str] = None,
+        grammar: Optional[str] = None,
+        grammar_rule: str = 'root',
         prompt_tokens: Optional[Tuple[Any, ...]] = None,
         prompt_n_tokens: int = 0,
         carry_initial_prompt: bool = False,
@@ -141,6 +145,7 @@ class Model:
         entropy_thold: float = 2.4,
         logprob_thold: float = -1.0,
         no_speech_thold: float = 0.6,
+        grammar_penalty: float = 100.0,
         greedy: GreedyParams = {'best_of': -1},
         beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0},
         extract_probability: bool = False,
diff --git a/src/main.cpp b/src/main.cpp
index 7197cb7..815a962 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -15,7 +15,11 @@
 #include <pybind11/functional.h>
 #include <pybind11/numpy.h>
 
+#include <fstream>
+#include <iterator>
+
 #include "whisper.h"
+#include "../whisper.cpp/examples/grammar-parser.h"
 
 
 #define STRINGIFY(x) #x
@@ -373,6 +377,15 @@ int whisper_model_ftype_wrapper(struct whisper_context_wrapper * ctx_w){
 }
 
 bool _abort_callback(void * user_data);
+void _new_segment_callback(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
+bool _encoder_begin_callback(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
+void _logits_filter_callback(
+    struct whisper_context * ctx,
+    struct whisper_state * state,
+    const whisper_token_data * tokens,
+    int   n_tokens,
+    float * logits,
+    void * user_data);
 
 int whisper_ctx_init_openvino_encoder_wrapper(struct whisper_context_wrapper * ctx, const char * model_path,
                     const char * device,
@@ -384,61 +397,105 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
   std::string initial_prompt_str;
   std::string suppress_regex_str;
   std::string vad_model_path_str;
+        std::vector<whisper_token> prompt_token_storage;
+    grammar_parser::parse_state grammar_state;
+    std::vector<const whisper_grammar_element *> grammar_rule_ptrs;
+
+    void reset_progress_callback() {
+        progress_callback_user_data = this;
+        progress_callback = [](struct whisper_context* ctx, struct whisper_state* state, int progress, void* user_data) {
+            (void) ctx;
+            (void) state;
+            auto* self = static_cast<WhisperFullParamsWrapper*>(user_data);
+            if (self && self->print_progress) {
+                if (self->py_progress_callback) {
+                    py::gil_scoped_acquire gil;
+                    if (self->py_progress_callback_user_data.is_none()) {
+                        self->py_progress_callback(progress);
+                    } else {
+                        self->py_progress_callback(progress, self->py_progress_callback_user_data);
+                    }
+                } else {
+                    fprintf(stderr, "Progress: %3d%%\n", progress);
+                }
+            }
+        };
+    }
+
+    void sync_grammar_fields() {
+        grammar_rule_ptrs = grammar_state.c_rules();
+        grammar_rules = grammar_rule_ptrs.empty() ? nullptr : grammar_rule_ptrs.data();
+        n_grammar_rules = grammar_rule_ptrs.size();
+    }
+    void sync_prompt_tokens() {
+        prompt_tokens = prompt_token_storage.empty() ? nullptr : prompt_token_storage.data();
+        prompt_n_tokens = prompt_token_storage.size();
+    }
 public:
+    py::function py_new_segment_callback;
+        py::object py_new_segment_callback_user_data;
+        py::function py_encoder_begin_callback;
+        py::object py_encoder_begin_callback_user_data;
   py::function py_progress_callback;
-        py::object py_abort_callback;
+    py::object py_progress_callback_user_data;
+        py::function py_logits_filter_callback;
+        py::object py_logits_filter_callback_user_data;
+    py::object py_abort_callback;
+    py::object py_abort_callback_user_data;
   WhisperFullParamsWrapper(const whisper_full_params& params = whisper_full_params())
     : whisper_full_params(params),
       initial_prompt_str(params.initial_prompt ? params.initial_prompt : ""),
       suppress_regex_str(params.suppress_regex ? params.suppress_regex : ""),
-      vad_model_path_str(params.vad_model_path ? params.vad_model_path : "")
+            vad_model_path_str(params.vad_model_path ? params.vad_model_path : ""),
+                        prompt_token_storage(),
+                        py_new_segment_callback_user_data(py::none()),
+                        py_encoder_begin_callback_user_data(py::none()),
+            py_progress_callback_user_data(py::none()),
+                        py_logits_filter_callback_user_data(py::none()),
+            py_abort_callback(py::none()),
+            py_abort_callback_user_data(py::none())
     {
     initial_prompt = initial_prompt_str.empty() ? nullptr : initial_prompt_str.c_str();
     suppress_regex = suppress_regex_str.empty() ? nullptr : suppress_regex_str.c_str();
     vad_model_path = vad_model_path_str.empty() ? nullptr : vad_model_path_str.c_str();
+        new_segment_callback_user_data = this;
+        encoder_begin_callback_user_data = this;
     abort_callback_user_data = this;
-    // progress callback
-    progress_callback_user_data = this;
-    progress_callback = [](struct whisper_context* ctx, struct whisper_state* state, int progress, void* user_data) {
-      auto* self = static_cast<WhisperFullParamsWrapper*>(user_data);
-      if(self && self->print_progress){
-        if (self->py_progress_callback) {
-          // call the python callback
-          py::gil_scoped_acquire gil;
-          self->py_progress_callback(progress);  // Call Python callback
-        }
-        else {
-          fprintf(stderr, "Progress: %3d%%\n", progress);
-        } // Default message
-      }
-    } ;
+        logits_filter_callback_user_data = this;
+                if (params.prompt_tokens && params.prompt_n_tokens > 0) {
+                        prompt_token_storage.assign(params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens);
+                }
+                sync_prompt_tokens();
+        reset_progress_callback();
   }
   WhisperFullParamsWrapper(const WhisperFullParamsWrapper& other)
     : whisper_full_params(static_cast<whisper_full_params>(other)),  // Copy base struct
       initial_prompt_str(other.initial_prompt_str),
       suppress_regex_str(other.suppress_regex_str),
       vad_model_path_str(other.vad_model_path_str),
-        py_progress_callback(other.py_progress_callback),
-        py_abort_callback(other.py_abort_callback) {
+                        prompt_token_storage(other.prompt_token_storage),
+            grammar_state(other.grammar_state),
+            py_new_segment_callback(other.py_new_segment_callback),
+            py_new_segment_callback_user_data(other.py_new_segment_callback_user_data),
+            py_encoder_begin_callback(other.py_encoder_begin_callback),
+            py_encoder_begin_callback_user_data(other.py_encoder_begin_callback_user_data),
+            py_progress_callback(other.py_progress_callback),
+            py_progress_callback_user_data(other.py_progress_callback_user_data),
+            py_logits_filter_callback(other.py_logits_filter_callback),
+            py_logits_filter_callback_user_data(other.py_logits_filter_callback_user_data),
+            py_abort_callback(other.py_abort_callback),
+            py_abort_callback_user_data(other.py_abort_callback_user_data) {
     // Reset pointers to new string copies
     initial_prompt = initial_prompt_str.empty() ? nullptr : initial_prompt_str.c_str();
     suppress_regex = suppress_regex_str.empty() ? nullptr : suppress_regex_str.c_str();
     vad_model_path = vad_model_path_str.empty() ? nullptr : vad_model_path_str.c_str();
+        new_segment_callback_user_data = this;
+        encoder_begin_callback_user_data = this;
     abort_callback_user_data = this;
-    progress_callback_user_data = this;
-    progress_callback = [](struct whisper_context* ctx, struct whisper_state* state, int progress, void* user_data) {
-      auto* self = static_cast<WhisperFullParamsWrapper*>(user_data);
-      if(self && self->print_progress){
-        if (self->py_progress_callback) {
-          // call the python callback
-          py::gil_scoped_acquire gil;
-          self->py_progress_callback(progress);  // Call Python callback
-        }
-        else {
-          fprintf(stderr, "Progress: %3d%%\n", progress);
-        } // Default message
-      }
-    };
+        logits_filter_callback_user_data = this;
+        sync_prompt_tokens();
+        sync_grammar_fields();
+        reset_progress_callback();
   }
   void set_initial_prompt(const std::string& prompt) {
     initial_prompt_str = prompt;
@@ -452,16 +509,152 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
     vad_model_path_str = model_path;
     vad_model_path = vad_model_path_str.c_str();
   }
-        void set_abort_callback(py::function callback) {
-                py_abort_callback = callback;
-                abort_callback_user_data = this;
-                abort_callback = _abort_callback;
+    py::tuple get_prompt_tokens() const {
+        py::tuple tokens(prompt_token_storage.size());
+        for (size_t index = 0; index < prompt_token_storage.size(); ++index) {
+            tokens[index] = prompt_token_storage[index];
+        }
+        return tokens;
+    }
+    void set_prompt_tokens(const std::vector<whisper_token>& tokens) {
+        prompt_token_storage = tokens;
+        sync_prompt_tokens();
+    }
+    void clear_prompt_tokens() {
+        prompt_token_storage.clear();
+        sync_prompt_tokens();
+    }
+    py::object get_new_segment_callback_user_data() const {
+        return py_new_segment_callback_user_data;
+    }
+    void set_new_segment_callback_user_data(py::object user_data) {
+        py_new_segment_callback_user_data = std::move(user_data);
+        new_segment_callback_user_data = this;
+    }
+    void set_new_segment_callback(py::function callback) {
+        py_new_segment_callback = std::move(callback);
+        new_segment_callback_user_data = this;
+        new_segment_callback = _new_segment_callback;
+    }
+    void clear_new_segment_callback() {
+        py_new_segment_callback = py::function();
+        new_segment_callback = nullptr;
+        new_segment_callback_user_data = this;
+    }
+    py::object get_encoder_begin_callback_user_data() const {
+        return py_encoder_begin_callback_user_data;
+    }
+    void set_encoder_begin_callback_user_data(py::object user_data) {
+        py_encoder_begin_callback_user_data = std::move(user_data);
+        encoder_begin_callback_user_data = this;
+    }
+    void set_encoder_begin_callback(py::function callback) {
+        py_encoder_begin_callback = std::move(callback);
+        encoder_begin_callback_user_data = this;
+        encoder_begin_callback = _encoder_begin_callback;
+    }
+    void clear_encoder_begin_callback() {
+        py_encoder_begin_callback = py::function();
+        encoder_begin_callback = nullptr;
+        encoder_begin_callback_user_data = this;
+    }
+    py::object get_progress_callback_user_data() const {
+        return py_progress_callback_user_data;
+    }
+    void set_progress_callback_user_data(py::object user_data) {
+        py_progress_callback_user_data = std::move(user_data);
+        progress_callback_user_data = this;
+    }
+    void set_progress_callback(py::function callback) {
+        py_progress_callback = callback;
+        reset_progress_callback();
+    }
+    void clear_progress_callback() {
+        py_progress_callback = py::function();
+        reset_progress_callback();
+    }
+    py::object get_logits_filter_callback_user_data() const {
+        return py_logits_filter_callback_user_data;
+    }
+    void set_logits_filter_callback_user_data(py::object user_data) {
+        py_logits_filter_callback_user_data = std::move(user_data);
+        logits_filter_callback_user_data = this;
+    }
+    void set_logits_filter_callback(py::function callback) {
+        py_logits_filter_callback = std::move(callback);
+        logits_filter_callback_user_data = this;
+        logits_filter_callback = _logits_filter_callback;
+    }
+    void clear_logits_filter_callback() {
+        py_logits_filter_callback = py::function();
+        logits_filter_callback = nullptr;
+        logits_filter_callback_user_data = this;
+    }
+    py::object get_abort_callback_user_data() const {
+        return py_abort_callback_user_data;
+    }
+    void set_abort_callback_user_data(py::object user_data) {
+        py_abort_callback_user_data = std::move(user_data);
+        abort_callback_user_data = this;
+    }
+    void set_abort_callback(py::function callback) {
+        py_abort_callback = callback;
+        abort_callback_user_data = this;
+        abort_callback = _abort_callback;
+    }
+    void clear_abort_callback() {
+        py_abort_callback = py::none();
+        abort_callback = nullptr;
+        abort_callback_user_data = this;
+    }
+    void set_grammar(const std::string& grammar, const std::string& start_rule, float penalty) {  // grammar: GBNF text or a path to a grammar file; throws std::runtime_error on failure
+        if (grammar.empty()) {  // an empty grammar string means "disable grammar", but the penalty is still stored
+            clear_grammar();
+            grammar_penalty = penalty;
+            return;
         }
-        void clear_abort_callback() {
-                py_abort_callback = py::none();
-                abort_callback = nullptr;
-                abort_callback_user_data = this;
+
+        std::ifstream grammar_file(grammar);  // try the argument as a file path first
+        std::string grammar_source;
+        if (grammar_file.good()) {
+            grammar_source.assign(
+                    std::istreambuf_iterator<char>(grammar_file),
+                    std::istreambuf_iterator<char>());
+        } else {
+            grammar_source = grammar;  // not a readable file: treat the argument as inline grammar text
+        }
+
+        auto parsed = grammar_parser::parse(grammar_source.c_str());  // NOTE(review): upstream parse() returns an empty state on syntax errors - confirm against the vendored grammar-parser.cpp
+        auto rule_iter = parsed.symbol_ids.find(start_rule);
+        if (rule_iter == parsed.symbol_ids.end()) {  // nothing has been assigned yet, so the previous grammar stays intact on failure
+            throw std::runtime_error(parsed.rules.empty() ? std::string("failed to parse grammar") : "unknown grammar start rule: " + start_rule);
         }
+
+        i_start_rule = rule_iter->second;  // read before the move below: rule_iter points into parsed.symbol_ids
+        grammar_state = std::move(parsed);
+        sync_grammar_fields();  // refresh grammar_rules / n_grammar_rules to point into the new grammar_state
+        grammar_penalty = penalty;
+    }
+    void clear_grammar() {
+        grammar_state = grammar_parser::parse_state();
+        grammar_rule_ptrs.clear();
+        grammar_rules = nullptr;
+        n_grammar_rules = 0;
+        i_start_rule = 0;
+    }
+    py::list get_grammar_rules() const {
+        py::list rules;
+        for (const auto& rule : grammar_state.rules) {
+            py::list elements;
+            for (const auto& element : rule) {
+                elements.append(py::dict(
+                        "type"_a = static_cast<int>(element.type),
+                        "value"_a = element.value));
+            }
+            rules.append(elements);
+        }
+        return rules;
+    }
 };
 WhisperFullParamsWrapper  whisper_full_default_params_wrapper(enum whisper_sampling_strategy strategy) {
     return WhisperFullParamsWrapper(whisper_full_default_params(strategy));
@@ -470,30 +663,72 @@ WhisperFullParamsWrapper  whisper_full_default_params_wrapper(enum whisper_sampl
 // callbacks mechanism
 
 void _new_segment_callback(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data){
+    (void) state;
     struct whisper_context_wrapper ctx_w;
     ctx_w.ptr = ctx;
-    // call the python callback
-    py::gil_scoped_acquire gil;  // Acquire the GIL while in this scope.
-    py_new_segment_callback(ctx_w, n_new, user_data);
+    auto * params = static_cast<WhisperFullParamsWrapper *>(user_data);
+    if (!params || !params->py_new_segment_callback) {
+        return;
+    }
+
+    py::gil_scoped_acquire gil;
+    py::function callback = params->py_new_segment_callback;
+    callback(
+        ctx_w,
+        n_new,
+        params->py_new_segment_callback_user_data.is_none()
+            ? py::none()
+            : params->py_new_segment_callback_user_data);
 };
 
-void assign_new_segment_callback(struct whisper_full_params *params, py::function f){
-    params->new_segment_callback = _new_segment_callback;
-    py_new_segment_callback = f;
+void assign_new_segment_callback(struct whisper_full_params *params_base, py::object callback){
+    auto * params = static_cast<WhisperFullParamsWrapper *>(params_base);
+    if (callback.is_none()) {
+        params->clear_new_segment_callback();
+        return;
+    }
+
+    params->set_new_segment_callback(callback.cast<py::function>());
+}
+
+void clear_new_segment_callback(struct whisper_full_params *params_base) {
+    auto * params = static_cast<WhisperFullParamsWrapper *>(params_base);
+    params->clear_new_segment_callback();
 };
 
 bool _encoder_begin_callback(struct whisper_context * ctx, struct whisper_state * state, void * user_data){
+    (void) state;  // unused; the Python callback only receives the context wrapper and the user data
     struct whisper_context_wrapper ctx_w;
     ctx_w.ptr = ctx;
-    // call the python callback
-    py::object result_py = py_encoder_begin_callback(ctx_w, user_data);
+    auto * params = static_cast<WhisperFullParamsWrapper *>(user_data);  // user_data is the owning params wrapper (set to `this` by the wrapper)
+    if (!params || !params->py_encoder_begin_callback) {
+        return true;  // no Python callback to consult: proceed; whisper treats a false return as "abort the computation"
+    }
+
+    py::gil_scoped_acquire gil;  // hold the GIL for the copy of the py::function and the call into Python
+    py::function callback = params->py_encoder_begin_callback;
+    py::object result_py = callback(
+        ctx_w,
+        params->py_encoder_begin_callback_user_data.is_none()
+            ? py::none()
+            : params->py_encoder_begin_callback_user_data);
     bool res = result_py.cast<bool>();
     return res;
 }
 
-void assign_encoder_begin_callback(struct whisper_full_params *params, py::function f){
-    params->encoder_begin_callback = _encoder_begin_callback;
-    py_encoder_begin_callback = f;
+void assign_encoder_begin_callback(struct whisper_full_params *params_base, py::object callback){
+    auto * params = static_cast<WhisperFullParamsWrapper *>(params_base);
+    if (callback.is_none()) {
+        params->clear_encoder_begin_callback();
+        return;
+    }
+
+    params->set_encoder_begin_callback(callback.cast<py::function>());
+}
+
+void clear_encoder_begin_callback(struct whisper_full_params *params_base) {
+    auto * params = static_cast<WhisperFullParamsWrapper *>(params_base);
+    params->clear_encoder_begin_callback();
 }
 
 void _logits_filter_callback(
@@ -503,15 +738,54 @@ void _logits_filter_callback(
         int   n_tokens,
         float * logits,
         void * user_data){
+    (void) state;
+    (void) tokens;
     struct whisper_context_wrapper ctx_w;
     ctx_w.ptr = ctx;
-    // call the python callback
-    py_logits_filter_callback(ctx_w, n_tokens, logits, user_data);
+    auto * params = static_cast<WhisperFullParamsWrapper *>(user_data);
+    if (!params || !params->py_logits_filter_callback) {
+        return;
+    }
+
+    py::gil_scoped_acquire gil;
+    py::function callback = params->py_logits_filter_callback;
+    callback(
+        ctx_w,
+        n_tokens,
+        logits,
+        params->py_logits_filter_callback_user_data.is_none()
+            ? py::none()
+            : params->py_logits_filter_callback_user_data);
+}
+
+void assign_logits_filter_callback(struct whisper_full_params *params_base, py::object callback){
+    auto * params = static_cast<WhisperFullParamsWrapper *>(params_base);
+    if (callback.is_none()) {
+        params->clear_logits_filter_callback();
+        return;
+    }
+
+    params->set_logits_filter_callback(callback.cast<py::function>());
+}
+
+void clear_logits_filter_callback(struct whisper_full_params *params_base) {
+    auto * params = static_cast<WhisperFullParamsWrapper *>(params_base);
+    params->clear_logits_filter_callback();
 }
 
-void assign_logits_filter_callback(struct whisper_full_params *params, py::function f){
-    params->logits_filter_callback = _logits_filter_callback;
-    py_logits_filter_callback = f;
+void assign_progress_callback(whisper_full_params *params_base, py::object callback) {
+    auto * params = static_cast<WhisperFullParamsWrapper *>(params_base);
+    if (callback.is_none()) {
+        params->clear_progress_callback();
+        return;
+    }
+
+    params->set_progress_callback(callback.cast<py::function>());
+}
+
+void clear_progress_callback(whisper_full_params *params_base) {
+    auto * params = static_cast<WhisperFullParamsWrapper *>(params_base);
+    params->clear_progress_callback();
 }
 
 bool _abort_callback(void * user_data) {
@@ -522,7 +796,9 @@ bool _abort_callback(void * user_data) {
 
     py::gil_scoped_acquire gil;
     py::function callback = params->py_abort_callback.cast<py::function>();
-    py::object result_py = callback();
+    py::object result_py = params->py_abort_callback_user_data.is_none()
+        ? callback()
+        : callback(params->py_abort_callback_user_data);
     return result_py.cast<bool>();
 }
 
@@ -905,6 +1181,18 @@ PYBIND11_MODULE(_pywhispercpp, m) {
         .def_readwrite("print_special", &WhisperFullParamsWrapper::print_special)
         .def_readwrite("print_progress", &WhisperFullParamsWrapper::print_progress)
         .def_readwrite("progress_callback", &WhisperFullParamsWrapper::py_progress_callback)
+        .def("set_progress_callback",
+             [](WhisperFullParamsWrapper &self, py::object callback) {
+                 if (callback.is_none()) {
+                     self.clear_progress_callback();
+                 } else {
+                     self.set_progress_callback(callback.cast<py::function>());
+                 }
+             },
+             py::arg("callback") = py::none(),
+             "Assign a progress callback that receives progress updates.")
+        .def("clear_progress_callback", &WhisperFullParamsWrapper::clear_progress_callback,
+             "Clear any previously assigned progress callback while preserving default progress behavior.")
         .def_readwrite("print_realtime", &WhisperFullParamsWrapper::print_realtime)
         .def_readwrite("print_timestamps", &WhisperFullParamsWrapper::print_timestamps)
         .def_readwrite("token_timestamps", &WhisperFullParamsWrapper::token_timestamps)
@@ -931,7 +1219,46 @@ PYBIND11_MODULE(_pywhispercpp, m) {
                 self.set_initial_prompt(initial_prompt);
             }
         )
-        .def_readwrite("prompt_tokens", &WhisperFullParamsWrapper::prompt_tokens)
+        .def_property("prompt_tokens",
+            [](WhisperFullParamsWrapper &self) {
+                return self.get_prompt_tokens();
+            },
+            [](WhisperFullParamsWrapper &self, py::object tokens) {
+                if (tokens.is_none()) {
+                    self.clear_prompt_tokens();
+                } else {
+                    self.set_prompt_tokens(tokens.cast<std::vector<whisper_token>>());
+                }
+            })
+        .def("set_prompt_tokens", &WhisperFullParamsWrapper::set_prompt_tokens,
+             py::arg("tokens"),
+             "Assign prompt tokens from a Python sequence.")
+        .def("clear_prompt_tokens", &WhisperFullParamsWrapper::clear_prompt_tokens,
+             "Clear any previously assigned prompt tokens.")
+        .def("set_new_segment_callback",
+             [](WhisperFullParamsWrapper &self, py::object callback) {
+                 if (callback.is_none()) {
+                     self.clear_new_segment_callback();
+                 } else {
+                     self.set_new_segment_callback(callback.cast<py::function>());
+                 }
+             },
+             py::arg("callback") = py::none(),
+             "Assign a new-segment callback.")
+        .def("clear_new_segment_callback", &WhisperFullParamsWrapper::clear_new_segment_callback,
+             "Clear any previously assigned new-segment callback.")
+        .def("set_encoder_begin_callback",
+             [](WhisperFullParamsWrapper &self, py::object callback) {
+                 if (callback.is_none()) {
+                     self.clear_encoder_begin_callback();
+                 } else {
+                     self.set_encoder_begin_callback(callback.cast<py::function>());
+                 }
+             },
+             py::arg("callback") = py::none(),
+             "Assign an encoder-begin callback.")
+        .def("clear_encoder_begin_callback", &WhisperFullParamsWrapper::clear_encoder_begin_callback,
+             "Clear any previously assigned encoder-begin callback.")
         .def("set_abort_callback",
              [](WhisperFullParamsWrapper &self, py::object callback) {
                  if (callback.is_none()) {
@@ -973,10 +1300,42 @@ PYBIND11_MODULE(_pywhispercpp, m) {
                                  [](WhisperFullParamsWrapper &self, py::dict dict) {self.greedy.best_of = dict["best_of"].cast<int>();})
         .def_property("beam_search", [](WhisperFullParamsWrapper &self) {return py::dict("beam_size"_a=self.beam_search.beam_size, "patience"_a=self.beam_search.patience);},
                                 [](WhisperFullParamsWrapper &self, py::dict dict) {self.beam_search.beam_size = dict["beam_size"].cast<int>(); self.beam_search.patience = dict["patience"].cast<float>();})
-        .def_readwrite("new_segment_callback_user_data", &WhisperFullParamsWrapper::new_segment_callback_user_data)
-        .def_readwrite("encoder_begin_callback_user_data", &WhisperFullParamsWrapper::encoder_begin_callback_user_data)
-        .def_readwrite("logits_filter_callback_user_data", &WhisperFullParamsWrapper::logits_filter_callback_user_data)
-                    .def_readwrite("grammar_penalty", &WhisperFullParamsWrapper::grammar_penalty)
+        .def_property("new_segment_callback_user_data",
+            &WhisperFullParamsWrapper::get_new_segment_callback_user_data,
+            &WhisperFullParamsWrapper::set_new_segment_callback_user_data)
+        .def_property("progress_callback_user_data",
+            &WhisperFullParamsWrapper::get_progress_callback_user_data,
+            &WhisperFullParamsWrapper::set_progress_callback_user_data)
+        .def_property("encoder_begin_callback_user_data",
+            &WhisperFullParamsWrapper::get_encoder_begin_callback_user_data,
+            &WhisperFullParamsWrapper::set_encoder_begin_callback_user_data)
+        .def_property("abort_callback_user_data",
+            &WhisperFullParamsWrapper::get_abort_callback_user_data,
+            &WhisperFullParamsWrapper::set_abort_callback_user_data)
+        .def_property("logits_filter_callback_user_data",
+            &WhisperFullParamsWrapper::get_logits_filter_callback_user_data,
+            &WhisperFullParamsWrapper::set_logits_filter_callback_user_data)
+        .def_property_readonly("grammar_rules", &WhisperFullParamsWrapper::get_grammar_rules)
+        .def_property_readonly("n_grammar_rules", [](const WhisperFullParamsWrapper &self) { return self.n_grammar_rules; })
+        .def_property_readonly("i_start_rule", [](const WhisperFullParamsWrapper &self) { return self.i_start_rule; })
+        .def("set_grammar", &WhisperFullParamsWrapper::set_grammar,
+            py::arg("grammar"), py::arg("start_rule") = "root", py::arg("penalty") = 100.0f,
+            "Parse grammar text or a grammar file path and assign it to the params.")
+        .def("set_logits_filter_callback",
+             [](WhisperFullParamsWrapper &self, py::object callback) {
+                 if (callback.is_none()) {
+                     self.clear_logits_filter_callback();
+                 } else {
+                     self.set_logits_filter_callback(callback.cast<py::function>());
+                 }
+             },
+             py::arg("callback") = py::none(),
+             "Assign a logits-filter callback.")
+        .def("clear_logits_filter_callback", &WhisperFullParamsWrapper::clear_logits_filter_callback,
+             "Clear any previously assigned logits-filter callback.")
+        .def("clear_grammar", &WhisperFullParamsWrapper::clear_grammar,
+            "Clear any previously assigned grammar.")
+        .def_readwrite("grammar_penalty", &WhisperFullParamsWrapper::grammar_penalty)
         .def_readwrite("vad", &WhisperFullParamsWrapper::vad)
         .def_property("vad_model_path",
         [](WhisperFullParamsWrapper &self) {
@@ -1044,14 +1403,49 @@ PYBIND11_MODULE(_pywhispercpp, m) {
     // Helper mechanism to set callbacks from python
     // The only difference from the C-Style API
 
-    m.def("assign_new_segment_callback", &assign_new_segment_callback, "Assigns a new_segment_callback, takes <whisper_full_params> instance and a callable function with the same parameters which are defined in the interface",
-        py::arg("params"), py::arg("callback"));
+    m.def("assign_new_segment_callback",
+        [](whisper_full_params * params, py::object callback) {
+            assign_new_segment_callback(params, callback);
+        },
+        "Assign a new-segment callback.",
+        py::arg("params"), py::arg("callback") = py::none());
 
-    m.def("assign_encoder_begin_callback", &assign_encoder_begin_callback, "Assigns an encoder_begin_callback, takes <whisper_full_params> instance and a callable function with the same parameters which are defined in the interface",
-            py::arg("params"), py::arg("callback"));
+    m.def("clear_new_segment_callback", &clear_new_segment_callback,
+        "Clear any previously assigned new-segment callback.",
+        py::arg("params"));
+
+    m.def("assign_encoder_begin_callback",
+            [](whisper_full_params * params, py::object callback) {
+                assign_encoder_begin_callback(params, callback);
+            },
+            "Assign an encoder-begin callback.",
+            py::arg("params"), py::arg("callback") = py::none());
+
+    m.def("clear_encoder_begin_callback", &clear_encoder_begin_callback,
+            "Clear any previously assigned encoder-begin callback.",
+            py::arg("params"));
+
+    m.def("assign_logits_filter_callback",
+            [](whisper_full_params * params, py::object callback) {
+                assign_logits_filter_callback(params, callback);
+            },
+            "Assign a logits-filter callback.",
+            py::arg("params"), py::arg("callback") = py::none());
+
+    m.def("clear_logits_filter_callback", &clear_logits_filter_callback,
+            "Clear any previously assigned logits-filter callback.",
+            py::arg("params"));
+
+    m.def("assign_progress_callback",
+        [](whisper_full_params * params, py::object callback) {
+            assign_progress_callback(params, callback);
+        },
+        "Assign a progress callback that receives progress updates.",
+        py::arg("params"), py::arg("callback") = py::none());
 
-    m.def("assign_logits_filter_callback", &assign_logits_filter_callback, "Assigns a logits_filter_callback, takes <whisper_full_params> instance and a callable function with the same parameters which are defined in the interface",
-            py::arg("params"), py::arg("callback"));
+    m.def("clear_progress_callback", &clear_progress_callback,
+        "Clear any previously assigned progress callback while preserving default progress behavior.",
+        py::arg("params"));
 
         m.def("assign_abort_callback",
             [](whisper_full_params * params, py::object callback) {

From 66a46d458787f6338cae4692e4105a1c8e3a6b83 Mon Sep 17 00:00:00 2001
From: scottmonster <87917233+scottmonster@users.noreply.github.com>
Date: Tue, 19 May 2026 19:06:37 -0500
Subject: [PATCH 08/16] begin callback normalization

---
 src/main.cpp | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index 815a962..a55d3ba 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -33,9 +33,6 @@ namespace py = pybind11;
 using namespace pybind11::literals; // to bring in the `_a` literal
 
 
-py::function py_new_segment_callback;
-py::function py_encoder_begin_callback;
-py::function py_logits_filter_callback;
 py::object py_log_callback;
 
 
@@ -440,7 +437,7 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
     py::object py_progress_callback_user_data;
         py::function py_logits_filter_callback;
         py::object py_logits_filter_callback_user_data;
-    py::object py_abort_callback;
+    py::function py_abort_callback;
     py::object py_abort_callback_user_data;
   WhisperFullParamsWrapper(const whisper_full_params& params = whisper_full_params())
     : whisper_full_params(params),
@@ -452,7 +449,7 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
                         py_encoder_begin_callback_user_data(py::none()),
             py_progress_callback_user_data(py::none()),
                         py_logits_filter_callback_user_data(py::none()),
-            py_abort_callback(py::none()),
+            py_abort_callback(),
             py_abort_callback_user_data(py::none())
     {
     initial_prompt = initial_prompt_str.empty() ? nullptr : initial_prompt_str.c_str();
@@ -598,12 +595,12 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
         abort_callback_user_data = this;
     }
     void set_abort_callback(py::function callback) {
-        py_abort_callback = callback;
+        py_abort_callback = std::move(callback);
         abort_callback_user_data = this;
         abort_callback = _abort_callback;
     }
     void clear_abort_callback() {
-        py_abort_callback = py::none();
+        py_abort_callback = py::function();
         abort_callback = nullptr;
         abort_callback_user_data = this;
     }
@@ -790,12 +787,12 @@ void clear_progress_callback(whisper_full_params *params_base) {
 
 bool _abort_callback(void * user_data) {
     auto * params = static_cast<WhisperFullParamsWrapper *>(user_data);
-    if (!params || !params->py_abort_callback || params->py_abort_callback.is_none()) {
+    if (!params || !params->py_abort_callback) {
         return false;
     }
 
     py::gil_scoped_acquire gil;
-    py::function callback = params->py_abort_callback.cast<py::function>();
+    py::function callback = params->py_abort_callback;
     py::object result_py = params->py_abort_callback_user_data.is_none()
         ? callback()
         : callback(params->py_abort_callback_user_data);
@@ -805,22 +802,16 @@ bool _abort_callback(void * user_data) {
 void assign_abort_callback(whisper_full_params *params_base, py::object callback){
     auto * params = static_cast<WhisperFullParamsWrapper *>(params_base);
     if (callback.is_none()) {
-        params->py_abort_callback = py::none();
-        params->abort_callback = nullptr;
-        params->abort_callback_user_data = params;
+        params->clear_abort_callback();
         return;
     }
 
-    params->py_abort_callback = callback.cast<py::function>();
-    params->abort_callback_user_data = params;
-    params->abort_callback = _abort_callback;
+    params->set_abort_callback(callback.cast<py::function>());
 }
 
 void clear_abort_callback(whisper_full_params *params_base) {
     auto * params = static_cast<WhisperFullParamsWrapper *>(params_base);
-    params->py_abort_callback = py::none();
-    params->abort_callback = nullptr;
-    params->abort_callback_user_data = params;
+    params->clear_abort_callback();
 }
 
 void whisper_log_set_wrapper(py::object callback) {

From d3e68a810360850601850e266ad9800b801788c3 Mon Sep 17 00:00:00 2001
From: scottmonster <87917233+scottmonster@users.noreply.github.com>
Date: Tue, 19 May 2026 21:13:19 -0500
Subject: [PATCH 09/16] finish callback normalization

---
 .gitignore                            |   9 +-
 pywhispercpp/model.py                 |  26 +++--
 pywhispercpp/model.pyi                |   6 +-
 src/main.cpp                          |  51 +++++----
 tests/test_backwards_compatibility.py | 153 --------------------------
 5 files changed, 54 insertions(+), 191 deletions(-)
 delete mode 100644 tests/test_backwards_compatibility.py

diff --git a/.gitignore b/.gitignore
index 3e25b4f..d28e8f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,12 +7,15 @@ _generate/
 *.py[cod]
 *.egg-info
 *env*
+# install -e artifacts
 _version.py
-
-coverage
 libggml*
 libwhisper*
-updating
+
+# ignore downloaded source code... really this is just for quickly checking previous versions
+pywhispercpp-*.*
+
+
 
 # custom
 .idea
diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py
index 44adaeb..26701fd 100644
--- a/pywhispercpp/model.py
+++ b/pywhispercpp/model.py
@@ -81,7 +81,7 @@ class Model:
     ```
     """
 
-    _new_segment_callback = None
+    
 
     def __init__(self,
                  model: str = 'tiny',
@@ -144,6 +144,7 @@ def __init__(self,
             - `suppress_blank`: suppress blank outputs. Default `True`.
             - `suppress_non_speech_tokens`: Python alias for `suppress_nst`. Default `False`.
             - `suppress_nst`: suppress non-speech tokens. Default `False`.
+            - `suppress_regex`: regex pattern used to suppress matching text during decoding. Default `''`.
             - `temperature`: initial decoding temperature. Default `0.0`.
             - `max_initial_ts`: maximum initial timestamp. Default `1.0`.
             - `length_penalty`: length penalty. Default `-1.0`.
@@ -153,7 +154,7 @@ def __init__(self,
             - `no_speech_thold`: no-speech threshold. Default `0.6`.
             - `grammar_penalty`: penalty applied to non-grammar tokens. Default `100.0`.
             - `greedy`: greedy-decoder settings, typically `{"best_of": 5}`.
-            - `beam_search`: beam-search settings, schema default `{"beam_size": 5, "patience": -1.0}`.
+            - `beam_search`: beam-search settings, schema default `{"beam_size": -1, "patience": -1.0}`.
             - `vad`: enable VAD. Default `False`.
             - `vad_model_path`: path to the VAD model. Default `None`.
         """
@@ -171,6 +172,8 @@ def __init__(self,
         self.openvino_model_path = openvino_model_path
         self.openvino_device = openvino_device
         self.openvino_cache_dir = openvino_cache_dir
+        # TODO: maybe set up default callbacks for segments and abort globally and/or per model instance?
+        self._new_segment_callback = None
         # init the model
         self._init_model()
 
@@ -208,10 +211,13 @@ def transcribe(self,
         # update params if any
         self._set_params(params)
 
-        # setting up callback
-        if new_segment_callback:
-            Model._new_segment_callback = new_segment_callback
-            pw.assign_new_segment_callback(self._params, Model.__call_new_segment_callback)
+        # Set up the callback. Assign unconditionally so self._new_segment_callback is reset to None
+        # when new_segment_callback is None, since it is now stored on the instance, not on the Model class.
+        self._new_segment_callback = new_segment_callback
+        pw.assign_new_segment_callback(
+            self._params,
+            self.__call_new_segment_callback if new_segment_callback is not None else None,
+        )
 
         pw.assign_abort_callback(self._params, abort_callback)
 
@@ -441,8 +447,8 @@ def _transcribe(self, audio: np.ndarray, n_processors: Optional[int] = None):
         res = Model._get_segments(self._ctx, 0, n, self.extract_probability)
         return res
 
-    @staticmethod
-    def __call_new_segment_callback(ctx, n_new, user_data) -> None:
+    
+    def __call_new_segment_callback(self, ctx, n_new, user_data=None) -> None:
         """
         Internal new_segment_callback, it just calls the user's callback with the `Segment` object
         :param ctx: whisper.cpp ctx param
@@ -454,8 +460,8 @@ def __call_new_segment_callback(ctx, n_new, user_data) -> None:
         start = n - n_new
         res = Model._get_segments(ctx, start, n, False)
         for segment in res:
-            if Model._new_segment_callback is not None:
-                Model._new_segment_callback(segment)
+            if self._new_segment_callback is not None:
+                self._new_segment_callback(segment)
 
     @staticmethod
     def _load_audio(media_file_path: str) -> np.ndarray:
diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi
index 3e0812b..936c6ac 100644
--- a/pywhispercpp/model.pyi
+++ b/pywhispercpp/model.pyi
@@ -85,6 +85,7 @@ class Model:
         suppress_blank: bool = True,
         suppress_non_speech_tokens: bool = False,
         suppress_nst: bool = False,
+        suppress_regex: str = '',
         temperature: float = 0.0,
         max_initial_ts: float = 1.0,
         length_penalty: float = -1.0,
@@ -92,7 +93,7 @@ class Model:
         entropy_thold: float = 2.4,
         logprob_thold: float = -1.0,
         no_speech_thold: float = 0.6,
-        greedy: GreedyParams = {'best_of': -1},
+        greedy: GreedyParams = {'best_of': 5},
         beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0},
         vad: bool = False,
         vad_model_path: Optional[str] = None,
@@ -138,6 +139,7 @@ class Model:
         suppress_blank: bool = True,
         suppress_non_speech_tokens: bool = False,
         suppress_nst: bool = False,
+        suppress_regex: str = '',
         temperature: float = 0.0,
         max_initial_ts: float = 1.0,
         length_penalty: float = -1.0,
@@ -146,7 +148,7 @@ class Model:
         logprob_thold: float = -1.0,
         no_speech_thold: float = 0.6,
         grammar_penalty: float = 100.0,
-        greedy: GreedyParams = {'best_of': -1},
+        greedy: GreedyParams = {'best_of': 5},
         beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0},
         extract_probability: bool = False,
         vad: bool = False,
diff --git a/src/main.cpp b/src/main.cpp
index a55d3ba..c76d8be 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -32,6 +32,10 @@
 namespace py = pybind11;
 using namespace pybind11::literals; // to bring in the `_a` literal
 
+inline bool has_python_user_data(const py::object & obj) {
+    return obj.ptr() != nullptr && obj.ptr() != Py_None;
+}
+
 
 py::object py_log_callback;
 
@@ -407,7 +411,7 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
             if (self && self->print_progress) {
                 if (self->py_progress_callback) {
                     py::gil_scoped_acquire gil;
-                    if (self->py_progress_callback_user_data.is_none()) {
+                    if (!has_python_user_data(self->py_progress_callback_user_data)) {
                         self->py_progress_callback(progress);
                     } else {
                         self->py_progress_callback(progress, self->py_progress_callback_user_data);
@@ -563,7 +567,7 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
         progress_callback_user_data = this;
     }
     void set_progress_callback(py::function callback) {
-        py_progress_callback = callback;
+        py_progress_callback = std::move(callback);
         reset_progress_callback();
     }
     void clear_progress_callback() {
@@ -670,12 +674,11 @@ void _new_segment_callback(struct whisper_context * ctx, struct whisper_state *
 
     py::gil_scoped_acquire gil;
     py::function callback = params->py_new_segment_callback;
-    callback(
-        ctx_w,
-        n_new,
-        params->py_new_segment_callback_user_data.is_none()
-            ? py::none()
-            : params->py_new_segment_callback_user_data);
+    if (!has_python_user_data(params->py_new_segment_callback_user_data)) {
+        callback(ctx_w, n_new);
+    } else {
+        callback(ctx_w, n_new, params->py_new_segment_callback_user_data);
+    }
 };
 
 void assign_new_segment_callback(struct whisper_full_params *params_base, py::object callback){
@@ -704,11 +707,12 @@ bool _encoder_begin_callback(struct whisper_context * ctx, struct whisper_state
 
     py::gil_scoped_acquire gil;
     py::function callback = params->py_encoder_begin_callback;
-    py::object result_py = callback(
-        ctx_w,
-        params->py_encoder_begin_callback_user_data.is_none()
-            ? py::none()
-            : params->py_encoder_begin_callback_user_data);
+    py::object result_py;
+    if (!has_python_user_data(params->py_encoder_begin_callback_user_data)) {
+        result_py = callback(ctx_w);
+    } else {
+        result_py = callback(ctx_w, params->py_encoder_begin_callback_user_data);
+    }
     bool res = result_py.cast<bool>();
     return res;
 }
@@ -746,13 +750,11 @@ void _logits_filter_callback(
 
     py::gil_scoped_acquire gil;
     py::function callback = params->py_logits_filter_callback;
-    callback(
-        ctx_w,
-        n_tokens,
-        logits,
-        params->py_logits_filter_callback_user_data.is_none()
-            ? py::none()
-            : params->py_logits_filter_callback_user_data);
+    if (!has_python_user_data(params->py_logits_filter_callback_user_data)) {
+        callback(ctx_w, n_tokens, logits);
+    } else {
+        callback(ctx_w, n_tokens, logits, params->py_logits_filter_callback_user_data);
+    }
 }
 
 void assign_logits_filter_callback(struct whisper_full_params *params_base, py::object callback){
@@ -793,9 +795,12 @@ bool _abort_callback(void * user_data) {
 
     py::gil_scoped_acquire gil;
     py::function callback = params->py_abort_callback;
-    py::object result_py = params->py_abort_callback_user_data.is_none()
-        ? callback()
-        : callback(params->py_abort_callback_user_data);
+    py::object result_py;
+    if (!has_python_user_data(params->py_abort_callback_user_data)) {
+        result_py = callback();
+    } else {
+        result_py = callback(params->py_abort_callback_user_data);
+    }
     return result_py.cast<bool>();
 }
 
diff --git a/tests/test_backwards_compatibility.py b/tests/test_backwards_compatibility.py
deleted file mode 100644
index 4e21cdc..0000000
--- a/tests/test_backwards_compatibility.py
+++ /dev/null
@@ -1,153 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import gc
-import subprocess
-import sys
-import textwrap
-import unittest
-from pathlib import Path
-from unittest import TestCase
-
-import _pywhispercpp as pw
-
-from pywhispercpp.model import Model, Segment
-
-
-WHISPER_CPP_DIR = Path(__file__).parent.parent / 'whisper.cpp'
-
-
-class TestBackwardsCompatibility(TestCase):
-    audio_file = WHISPER_CPP_DIR / 'samples/jfk.wav'
-    models_dir = str(WHISPER_CPP_DIR / 'models')
-    repo_root = Path(__file__).parent.parent
-
-    def tearDown(self):
-        gc.collect()
-
-    def _create_cpu_model(self):
-        return Model(
-            'tiny',
-            models_dir=self.models_dir,
-            context_params={'use_gpu': False, 'flash_attn': False},
-        )
-
-    def _run_python(self, code: str):
-        result = subprocess.run(
-            [sys.executable, '-c', textwrap.dedent(code)],
-            cwd=self.repo_root,
-            capture_output=True,
-            text=True,
-        )
-        self.assertEqual(
-            result.returncode,
-            0,
-            msg=f"stdout:\n{result.stdout}\n\nstderr:\n{result.stderr}",
-        )
-
-    def test_legacy_model_constructor_still_works(self):
-        self._run_python(
-            f'''
-            from pywhispercpp.model import Model
-
-            model = Model('tiny', models_dir={self.models_dir!r})
-            assert isinstance(model, Model)
-            '''
-        )
-
-    def test_legacy_alias_still_maps_to_suppress_nst(self):
-        self._run_python(
-            f'''
-            from pywhispercpp.model import Model
-
-            model = Model(
-                'tiny',
-                models_dir={self.models_dir!r},
-                context_params={{'use_gpu': False, 'flash_attn': False}},
-            )
-            model._set_params({{'suppress_non_speech_tokens': True}})
-            assert model.get_params()['suppress_nst'] is True
-            '''
-        )
-
-    def test_low_level_prompt_tokens_property_round_trips(self):
-        params = pw.whisper_full_default_params(
-            pw.whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY
-        )
-        params.prompt_tokens = (1, 2, 3)
-        self.assertEqual(tuple(params.prompt_tokens), (1, 2, 3))
-        self.assertEqual(params.prompt_n_tokens, 3)
-
-    def test_context_params_dict_is_additive(self):
-        self._run_python(
-            f'''
-            from pywhispercpp.model import Model
-
-            model = Model(
-                'tiny',
-                models_dir={self.models_dir!r},
-                context_params={{'use_gpu': False, 'flash_attn': False}},
-            )
-            assert isinstance(model, Model)
-            '''
-        )
-
-    def test_existing_new_segment_callback_still_works(self):
-        self._run_python(
-            f'''
-            from pywhispercpp.model import Model, Segment
-
-            seen = []
-            model = Model(
-                'tiny',
-                models_dir={self.models_dir!r},
-                context_params={{'use_gpu': False, 'flash_attn': False}},
-            )
-
-            def on_segment(segment):
-                seen.append(segment)
-
-            segments = model.transcribe({str(self.audio_file)!r}, new_segment_callback=on_segment)
-            assert isinstance(segments, list)
-            assert len(seen) > 0
-            assert all(isinstance(segment, Segment) for segment in seen)
-            '''
-        )
-
-    def test_abort_callback_can_abort_and_then_clear(self):
-        self._run_python(
-            f'''
-            from pywhispercpp.model import Model
-
-            model = Model(
-                'tiny',
-                models_dir={self.models_dir!r},
-                context_params={{'use_gpu': False, 'flash_attn': False}},
-            )
-            callback_calls = []
-
-            def abort_immediately():
-                callback_calls.append(True)
-                return True
-
-            aborted_segments = model.transcribe({str(self.audio_file)!r}, abort_callback=abort_immediately)
-            assert isinstance(aborted_segments, list)
-            assert len(callback_calls) > 0
-
-            normal_segments = model.transcribe({str(self.audio_file)!r})
-            assert isinstance(normal_segments, list)
-            assert len(normal_segments) > 0
-            '''
-        )
-
-    def test_log_callback_can_be_set_and_cleared(self):
-        pw.whisper_log_set(lambda level, text: None)
-        pw.whisper_log_set(None)
-
-    def test_alignment_preset_enum_is_available(self):
-        preset = pw.whisper_alignment_heads_preset.WHISPER_AHEADS_TINY
-        self.assertIsNotNone(preset)
-
-
-if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file

From e71e375ddbd29c938d69a7b2ee32ae32b666c45e Mon Sep 17 00:00:00 2001
From: scottmonster <87917233+scottmonster@users.noreply.github.com>
Date: Tue, 19 May 2026 21:55:28 -0500
Subject: [PATCH 10/16] remove whisper_args.txt

---
 whsiper_args.txt | 252 -----------------------------------------------
 1 file changed, 252 deletions(-)
 delete mode 100644 whsiper_args.txt

diff --git a/whsiper_args.txt b/whsiper_args.txt
deleted file mode 100644
index 35678ec..0000000
--- a/whsiper_args.txt
+++ /dev/null
@@ -1,252 +0,0 @@
-
-usage: ./whisper-cli [options] file0 file1 ...
-supported audio formats: flac, mp3, ogg, wav
-
-options:
-  --help                            [default] show this help message and exit
-  --threads N                       [4      ] number of threads to use during computation
-  --processors N                    [1      ] number of processors to use during computation
-  --offset-t N                      [0      ] time offset in milliseconds
-  --offset-n N                      [0      ] segment index offset
-  --duration N                      [0      ] duration of audio to process in milliseconds
-  --max-context N                   [-1     ] maximum number of text context tokens to store
-  --max-len N                       [0      ] maximum segment length in characters
-  --max-tokens N                    [0      ] maximum number of tokens per segment
-  --split-on-word                   [false  ] split on word rather than on token
-  --best-of N                       [5      ] number of best candidates to keep
-  --beam-size N                     [5      ] beam size for beam search
-  --audio-ctx N                     [0      ] audio context size (0 - all)
-  --word-thold N                    [0.01   ] word timestamp probability threshold
-  --entropy-thold N                 [2.40   ] entropy threshold for decoder fail
-  --logprob-thold N                 [-1.00  ] log probability threshold for decoder fail
-  --no-speech-thold N               [0.60   ] no speech threshold
-  --temperature N                   [0.00   ] The sampling temperature, between 0 and 1
-  --temperature-inc N               [0.20   ] The increment of temperature, between 0 and 1
-  --debug-mode                      [false  ] enable debug mode (eg. dump log_mel)
-  --translate                       [false  ] translate from source language to english
-  --diarize                         [false  ] stereo audio diarization
-  --tinydiarize                     [false  ] enable tinydiarize (requires a tdrz model)
-  --no-fallback                     [false  ] do not use temperature fallback while decoding
-  --output-txt                      [false  ] output result in a text file
-  --output-vtt                      [false  ] output result in a vtt file
-  --output-srt                      [false  ] output result in a srt file
-  --output-lrc                      [false  ] output result in a lrc file
-  --output-words                    [false  ] output script for generating karaoke video
-  --font-path                       [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
-  --output-csv                      [false  ] output result in a CSV file
-  --output-json                     [false  ] output result in a JSON file
-  --output-json-full                [false  ] include more information in the JSON file
-  --output-file FNAME               [       ] output file path (without file extension)
-  --no-prints                       [false  ] do not print anything other than the results
-  --print-special                   [false  ] print special tokens
-  --print-colors                    [false  ] print colors
-  --print-confidence                [false  ] print confidence
-  --print-progress                  [false  ] print progress
-  --no-timestamps                   [false  ] do not print timestamps
-  --language LANG                   [en     ] spoken language ('auto' for auto-detect)
-  --detect-language                 [false  ] exit after automatically detecting language
-  --prompt PROMPT                   [       ] initial prompt (max n_text_ctx/2 tokens)
-  --carry-initial-prompt            [false  ] always prepend initial prompt
-  --model FNAME                     [models/ggml-base.en.bin] model path
-  --file FNAME                      [       ] input audio file path
-  --ov-e-device DNAME               [CPU    ] the OpenVINO device used for encode inference
-  --dtw MODEL                       [       ] compute token-level timestamps
-  --log-score                       [false  ] log best decoder scores of tokens
-  --no-gpu                          [false  ] disable GPU
-  --device N                        [0      ] GPU device ID (default: 0)
-  --flash-attn                      [true   ] enable flash attention
-  --no-flash-attn                   [false  ] disable flash attention
-  --suppress-blank                  [true   ] suppress blank outputs
-  --no-suppress-blank               [false  ] disable blank suppression
-  --suppress-nst                    [false  ] suppress non-speech tokens
-  --suppress-regex REGEX            [       ] regular expression matching tokens to suppress
-  --grammar GRAMMAR                 [       ] GBNF grammar to guide decoding
-  --grammar-rule RULE               [       ] top-level GBNF grammar rule name
-  --grammar-penalty N               [100.0  ] scales down logits of nongrammar tokens
-
-Voice Activity Detection (VAD) options:
-  --vad                               [false  ] enable Voice Activity Detection (VAD)
-  --vad-model FNAME                   [       ] VAD model path
-  --vad-threshold N                   [0.50   ] VAD threshold for speech recognition
-  --vad-min-speech-duration-ms N      [250    ] VAD min speech duration (0.0-1.0)
-  --vad-min-silence-duration-ms N     [100    ] VAD min silence duration (to split segments)
-  --vad-max-speech-duration-s N       [FLT_MAX] VAD max speech duration (auto-split longer)
-  --vad-speech-pad-ms N               [30     ] VAD speech padding (extend segments)
-  --vad-samples-overlap N             [0.10   ] VAD samples overlap (seconds between segments)
-
-
-usage: ./whisper-stream [options]
-
-options:
-  --help                            [default] show this help message and exit
-  --threads N                       [4      ] number of threads to use during computation
-  --step N                          [3000   ] audio step size in milliseconds
-  --length N                        [10000  ] audio length in milliseconds
-  --keep N                          [200    ] audio to keep from previous step in ms
-  --capture ID                      [-1     ] capture device ID
-  --max-tokens N                    [32     ] maximum number of tokens per audio chunk
-  --audio-ctx N                     [0      ] audio context size (0 - all)
-  --beam-size N                     [-1     ] beam size for beam search
-  --vad-thold N                     [0.60   ] voice activity detection threshold
-  --freq-thold N                    [100.00 ] high-pass frequency cutoff
-  --translate                       [false  ] translate from source language to english
-  --no-fallback                     [false  ] do not use temperature fallback while decoding
-  --print-special                   [false  ] print special tokens
-  --keep-context                    [false  ] keep context between audio chunks
-  --language LANG                   [en     ] spoken language
-  --model FNAME                     [models/ggml-base.en.bin] model path
-  --file FNAME                      [       ] text output file name
-  --tinydiarize                     [false  ] enable tinydiarize (requires a tdrz model)
-  --save-audio                      [false  ] save the recorded audio to a file
-  --no-gpu                          [false  ] disable GPU inference
-  --flash-attn                      [true   ] enable flash attention during inference
-  --no-flash-attn                   [false  ] disable flash attention during inference
-
-
-usage: ./whisper-server [options] 
-
-options:
-  --help                                 [default] show this help message and exit
-  --threads N                            [4      ] number of threads to use during computation
-  --processors N                         [1      ] number of processors to use during computation
-  --offset-t N                           [0      ] time offset in milliseconds
-  --offset-n N                           [0      ] segment index offset
-  --duration N                           [0      ] duration of audio to process in milliseconds
-  --max-context N                        [-1     ] maximum number of text context tokens to store
-  --max-len N                            [0      ] maximum segment length in characters
-  --split-on-word                        [false  ] split on word rather than on token
-  --best-of N                            [2      ] number of best candidates to keep
-  --beam-size N                          [-1     ] beam size for beam search
-  --audio-ctx N                          [0      ] audio context size (0 - all)
-  --word-thold N                         [0.01   ] word timestamp probability threshold
-  --entropy-thold N                      [2.40   ] entropy threshold for decoder fail
-  --logprob-thold N                      [-1.00  ] log probability threshold for decoder fail
-  --debug-mode                           [false  ] enable debug mode (eg. dump log_mel)
-  --translate                            [false  ] translate from source language to english
-  --diarize                              [false  ] stereo audio diarization
-  --tinydiarize                          [false  ] enable tinydiarize (requires a tdrz model)
-  --no-fallback                          [false  ] do not use temperature fallback while decoding
-  --print-special                        [false  ] print special tokens
-  --print-colors                         [false  ] print colors
-  --print-realtime                       [false  ] print output in realtime
-  --print-progress                       [false  ] print progress
-  --no-timestamps                        [false  ] do not print timestamps
-  --language LANG                        [en     ] spoken language ('auto' for auto-detect)
-  --detect-language                      [false  ] exit after automatically detecting language
-  --prompt PROMPT                        [       ] initial prompt
-  --model FNAME                          [models/ggml-base.en.bin] model path
-  --ov-e-device DNAME                    [CPU    ] the OpenVINO device used for encode inference
-  --dtw MODEL                            [       ] compute token-level timestamps
-  --host HOST                            [127.0.0.1] Hostname/ip-adress for the server
-  --port PORT                            [8080   ] Port number for the server
-  --public PATH                          [examples/server/public] Path to the public folder
-  --request-path PATH                    [       ] Request path for all requests
-  --inference-path PATH                  [/inference] Inference path for all requests
-  --convert                              [false  ] Convert audio to WAV, requires ffmpeg on the server
-  --tmp-dir                              [.      ] Temporary directory for ffmpeg transcoded files
-  --suppress-nst                         [false  ] suppress non-speech tokens
-  --no-speech-thold N                    [0.60   ] no speech threshold
-  --no-gpu                               [false  ] do not use gpu
-  --device N                             [0      ] GPU device ID (default: 0)
-  --flash-attn                           [true   ] enable flash attention
-  --no-flash-attn                        [false  ] disable flash attention
-  --no-language-probabilities            [false  ] exclude language probabilities from verbose_json output
-
-Voice Activity Detection (VAD) options:
-  --vad                               [false  ] enable Voice Activity Detection (VAD)
-  --vad-model FNAME                   [       ] VAD model path
-  --vad-threshold N                   [0.50   ] VAD threshold for speech recognition
-  --vad-min-speech-duration-ms N      [250    ] VAD min speech duration (0.0-1.0)
-  --vad-min-silence-duration-ms N     [100    ] VAD min silence duration (to split segments)
-  --vad-max-speech-duration-s N       [FLT_MAX] VAD max speech duration (auto-split longer)
-  --vad-speech-pad-ms N               [30     ] VAD speech padding (extend segments)
-  --vad-samples-overlap N             [0.10   ] VAD samples overlap (seconds between segments)
-
-
-
-deduped:
-options:
-  --help                            [default] show this help message and exit
-  --threads N                       [4      ] number of threads to use during computation
-  --processors N                    [1      ] number of processors to use during computation
-  --offset-t N                      [0      ] time offset in milliseconds
-  --offset-n N                      [0      ] segment index offset
-  --duration N                      [0      ] duration of audio to process in milliseconds
-  --max-context N                   [-1     ] maximum number of text context tokens to store
-  --max-len N                       [0      ] maximum segment length in characters
-  --max-tokens N                    [0      ] maximum number of tokens per segment
-  --split-on-word                   [false  ] split on word rather than on token
-  --best-of N                       [5      ] number of best candidates to keep
-  --beam-size N                     [5      ] beam size for beam search
-  --audio-ctx N                     [0      ] audio context size (0 - all)
-  --word-thold N                    [0.01   ] word timestamp probability threshold
-  --entropy-thold N                 [2.40   ] entropy threshold for decoder fail
-  --logprob-thold N                 [-1.00  ] log probability threshold for decoder fail
-  --no-speech-thold N               [0.60   ] no speech threshold
-  --temperature N                   [0.00   ] The sampling temperature, between 0 and 1
-  --temperature-inc N               [0.20   ] The increment of temperature, between 0 and 1
-  --debug-mode                      [false  ] enable debug mode (eg. dump log_mel)
-  --translate                       [false  ] translate from source language to english
-  --diarize                         [false  ] stereo audio diarization
-  --tinydiarize                     [false  ] enable tinydiarize (requires a tdrz model)
-  --no-fallback                     [false  ] do not use temperature fallback while decoding
-  --output-txt                      [false  ] output result in a text file
-  --output-vtt                      [false  ] output result in a vtt file
-  --output-srt                      [false  ] output result in a srt file
-  --output-lrc                      [false  ] output result in a lrc file
-  --output-words                    [false  ] output script for generating karaoke video
-  --font-path                       [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
-  --output-csv                      [false  ] output result in a CSV file
-  --output-json                     [false  ] output result in a JSON file
-  --output-json-full                [false  ] include more information in the JSON file
-  --output-file FNAME               [       ] output file path (without file extension)
-  --no-prints                       [false  ] do not print anything other than the results
-  --print-special                   [false  ] print special tokens
-  --print-colors                    [false  ] print colors
-  --print-confidence                [false  ] print confidence
-  --print-progress                  [false  ] print progress
-  --no-timestamps                   [false  ] do not print timestamps
-  --language LANG                   [en     ] spoken language ('auto' for auto-detect)
-  --detect-language                 [false  ] exit after automatically detecting language
-  --prompt PROMPT                   [       ] initial prompt (max n_text_ctx/2 tokens)
-  --carry-initial-prompt            [false  ] always prepend initial prompt
-  --model FNAME                     [models/ggml-base.en.bin] model path
-  --file FNAME                      [       ] input audio file path
-  --ov-e-device DNAME               [CPU    ] the OpenVINO device used for encode inference
-  --dtw MODEL                       [       ] compute token-level timestamps
-  --log-score                       [false  ] log best decoder scores of tokens
-  --no-gpu                          [false  ] disable GPU
-  --device N                        [0      ] GPU device ID (default: 0)
-  --flash-attn                      [true   ] enable flash attention
-  --no-flash-attn                   [false  ] disable flash attention
-  --suppress-blank                  [true   ] suppress blank outputs
-  --no-suppress-blank               [false  ] disable blank suppression
-  --suppress-nst                    [false  ] suppress non-speech tokens
-  --suppress-regex REGEX            [       ] regular expression matching tokens to suppress
-  --grammar GRAMMAR                 [       ] GBNF grammar to guide decoding
-  --grammar-rule RULE               [       ] top-level GBNF grammar rule name
-  --grammar-penalty N               [100.0  ] scales down logits of nongrammar tokens
-  --vad                               [false  ] enable Voice Activity Detection (VAD)
-  --vad-model FNAME                   [       ] VAD model path
-  --vad-threshold N                   [0.50   ] VAD threshold for speech recognition
-  --vad-min-speech-duration-ms N      [250    ] VAD min speech duration (0.0-1.0)
-  --vad-min-silence-duration-ms N     [100    ] VAD min silence duration (to split segments)
-  --vad-max-speech-duration-s N       [FLT_MAX] VAD max speech duration (auto-split longer)
-  --vad-speech-pad-ms N               [30     ] VAD speech padding (extend segments)
-  --vad-samples-overlap N             [0.10   ] VAD samples overlap (seconds between segments)
-  --step N                          [3000   ] audio step size in milliseconds
-  --length N                        [10000  ] audio length in milliseconds
-  --keep N                          [200    ] audio to keep from previous step in ms
-  --capture ID                      [-1     ] capture device ID
-  --vad-thold N                     [0.60   ] voice activity detection threshold
-  --freq-thold N                    [100.00 ] high-pass frequency cutoff
-  --keep-context                    [false  ] keep context between audio chunks
-  --save-audio                      [false  ] save the recorded audio to a file
-  --host HOST                       [127.0.0.1] Hostname/ip-adress for the server
-  --port PORT                       [8080   ] Port number for the server
-  --public PATH                     [examples/server/public] Path to the public folder
-  --request-path PATH               [       ] Request path for all requests
-  --inference-path PATH             [/inference] Inference path for all requests
-  --convert                         [false  ] Convert audio to WAV, requires ffmpeg on the server
-  --tmp-dir                         [.      ] Temporary directory for ffmpeg transcoded files
-  --no-language-probabilities       [false  ] exclude language probabilities from verbose_json output

From 8cf43341f5f5e54d391450a3e40b8dfc46b5944d Mon Sep 17 00:00:00 2001
From: scottmonster <87917233+scottmonster@users.noreply.github.com>
Date: Wed, 20 May 2026 19:50:11 -0500
Subject: [PATCH 11/16] remove grammar support

remove grammar, grammar_rule, etc. and related logic and bindings
since it is not in whisper.cpp and would require:
whisper.cpp/examples/grammar-parser.cpp
---
 CMakeLists.txt            |  1 -
 pywhispercpp/constants.py |  6 ---
 pywhispercpp/model.py     | 40 +++-----------------
 pywhispercpp/model.pyi    |  5 ---
 src/main.cpp              | 80 ++-------------------------------------
 tests/test_model.py       | 18 ---------
 6 files changed, 9 insertions(+), 141 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index af94411..39c16a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,6 @@ add_subdirectory(whisper.cpp)
 
 pybind11_add_module(_pywhispercpp
 	src/main.cpp
-	whisper.cpp/examples/grammar-parser.cpp
 )
 
 target_link_libraries (_pywhispercpp PRIVATE whisper)
diff --git a/pywhispercpp/constants.py b/pywhispercpp/constants.py
index 529b28b..85018a8 100644
--- a/pywhispercpp/constants.py
+++ b/pywhispercpp/constants.py
@@ -284,12 +284,6 @@
             'options': None,
             'default': 0.6
     },
-    'grammar_penalty': {
-            'type': float,
-            'description': 'scales down logits of non-grammar tokens',
-            'options': None,
-            'default': 100.0
-    },
     'greedy': {
             'type': dict,
             'description': 'greedy',
diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py
index 26701fd..d11260b 100644
--- a/pywhispercpp/model.py
+++ b/pywhispercpp/model.py
@@ -6,20 +6,21 @@
 [whisper.cpp](https://github.com/ggerganov/whisper.cpp) API.
 """
 import importlib.metadata
+import subprocess
+import os
 import logging
 import shutil
 import sys
+import tempfile
+import wave
 from pathlib import Path
 from time import time
 from typing import Any, Union, Callable, List, TextIO, Tuple, Optional, Dict, TypedDict
+
 import _pywhispercpp as pw
 import numpy as np
-import pywhispercpp.utils as utils
 import pywhispercpp.constants as constants
-import subprocess
-import os
-import tempfile
-import wave
+import pywhispercpp.utils as utils
 
 __author__ = "absadiki"
 __copyright__ = "Copyright 2023, "
@@ -134,8 +135,6 @@ def __init__(self,
             - `audio_ctx`: override audio context size. Default `0`.
             - `tdrz_enable`: enable tinydiarize speaker-turn detection. Default `False`.
             - `initial_prompt`: initial text prompt prepended before decoding. Default `None`.
-            - `grammar`: GBNF grammar text or path to a grammar file. Default `None`.
-            - `grammar_rule`: top-level grammar rule name. Default `root` when grammar is used.
             - `prompt_tokens`: explicit prompt token sequence. Default `None`.
             - `prompt_n_tokens`: number of prompt tokens. Default `0`.
             - `carry_initial_prompt`: prepend the initial prompt to each decode window. Default `False`.
@@ -152,7 +151,6 @@ def __init__(self,
             - `entropy_thold`: entropy threshold. Default `2.4`.
             - `logprob_thold`: logprob threshold. Default `-1.0`.
             - `no_speech_thold`: no-speech threshold. Default `0.6`.
-            - `grammar_penalty`: penalty applied to non-grammar tokens. Default `100.0`.
             - `greedy`: greedy-decoder settings, typically `{"best_of": 5}`.
             - `beam_search`: beam-search settings, schema default `{"beam_size": -1, "patience": -1.0}`.
             - `vad`: enable VAD. Default `False`.
@@ -360,29 +358,6 @@ def _normalize_params(kwargs: dict) -> dict:
 
         return normalized
 
-    def _apply_grammar_params(self, normalized: dict) -> dict:
-        has_grammar = 'grammar' in normalized
-        has_grammar_rule = 'grammar_rule' in normalized
-
-        if not has_grammar:
-            if has_grammar_rule:
-                raise AttributeError('grammar_rule requires grammar')
-            return normalized
-
-        grammar = normalized.pop('grammar')
-        grammar_rule = normalized.pop('grammar_rule', 'root')
-
-        if grammar is None:
-            self._params.clear_grammar()
-            return normalized
-
-        self._params.set_grammar(
-            grammar,
-            grammar_rule,
-            normalized.get('grammar_penalty', self._params.grammar_penalty),
-        )
-        return normalized
-
     def _apply_prompt_token_params(self, normalized: dict) -> dict:
         if 'prompt_tokens' not in normalized:
             return normalized
@@ -421,9 +396,6 @@ def _set_params(self, kwargs: dict) -> None:
         """
         normalized = self._normalize_params(kwargs)
 
-        if 'grammar' in normalized or 'grammar_rule' in normalized:
-            normalized = self._apply_grammar_params(normalized)
-
         if 'prompt_tokens' in normalized:
             normalized = self._apply_prompt_token_params(normalized)
 
diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi
index 936c6ac..3f2852b 100644
--- a/pywhispercpp/model.pyi
+++ b/pywhispercpp/model.pyi
@@ -75,8 +75,6 @@ class Model:
         audio_ctx: int = 0,
         tdrz_enable: bool = False,
         initial_prompt: Optional[str] = None,
-        grammar: Optional[str] = None,
-        grammar_rule: str = 'root',
         prompt_tokens: Optional[Tuple[Any, ...]] = None,
         prompt_n_tokens: int = 0,
         carry_initial_prompt: bool = False,
@@ -129,8 +127,6 @@ class Model:
         audio_ctx: int = 0,
         tdrz_enable: bool = False,
         initial_prompt: Optional[str] = None,
-        grammar: Optional[str] = None,
-        grammar_rule: str = 'root',
         prompt_tokens: Optional[Tuple[Any, ...]] = None,
         prompt_n_tokens: int = 0,
         carry_initial_prompt: bool = False,
@@ -147,7 +143,6 @@ class Model:
         entropy_thold: float = 2.4,
         logprob_thold: float = -1.0,
         no_speech_thold: float = 0.6,
-        grammar_penalty: float = 100.0,
         greedy: GreedyParams = {'best_of': 5},
         beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0},
         extract_probability: bool = False,
diff --git a/src/main.cpp b/src/main.cpp
index c76d8be..35d74a2 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -15,11 +15,7 @@
 #include <pybind11/functional.h>
 #include <pybind11/numpy.h>
 
-#include <fstream>
-#include <iterator>
-
 #include "whisper.h"
-#include "../whisper.cpp/examples/grammar-parser.h"
 
 
 #define STRINGIFY(x) #x
@@ -398,9 +394,7 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
   std::string initial_prompt_str;
   std::string suppress_regex_str;
   std::string vad_model_path_str;
-        std::vector<whisper_token> prompt_token_storage;
-    grammar_parser::parse_state grammar_state;
-    std::vector<const whisper_grammar_element *> grammar_rule_ptrs;
+    std::vector<whisper_token> prompt_token_storage;
 
     void reset_progress_callback() {
         progress_callback_user_data = this;
@@ -423,11 +417,6 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
         };
     }
 
-    void sync_grammar_fields() {
-        grammar_rule_ptrs = grammar_state.c_rules();
-        grammar_rules = grammar_rule_ptrs.empty() ? nullptr : grammar_rule_ptrs.data();
-        n_grammar_rules = grammar_rule_ptrs.size();
-    }
     void sync_prompt_tokens() {
         prompt_tokens = prompt_token_storage.empty() ? nullptr : prompt_token_storage.data();
         prompt_n_tokens = prompt_token_storage.size();
@@ -474,8 +463,7 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
       initial_prompt_str(other.initial_prompt_str),
       suppress_regex_str(other.suppress_regex_str),
       vad_model_path_str(other.vad_model_path_str),
-                        prompt_token_storage(other.prompt_token_storage),
-            grammar_state(other.grammar_state),
+            prompt_token_storage(other.prompt_token_storage),
             py_new_segment_callback(other.py_new_segment_callback),
             py_new_segment_callback_user_data(other.py_new_segment_callback_user_data),
             py_encoder_begin_callback(other.py_encoder_begin_callback),
@@ -495,7 +483,6 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
     abort_callback_user_data = this;
         logits_filter_callback_user_data = this;
         sync_prompt_tokens();
-        sync_grammar_fields();
         reset_progress_callback();
   }
   void set_initial_prompt(const std::string& prompt) {
@@ -608,54 +595,6 @@ struct WhisperFullParamsWrapper : public whisper_full_params {
         abort_callback = nullptr;
         abort_callback_user_data = this;
     }
-    void set_grammar(const std::string& grammar, const std::string& start_rule, float penalty) {
-        if (grammar.empty()) {
-            clear_grammar();
-            grammar_penalty = penalty;
-            return;
-        }
-
-        std::ifstream grammar_file(grammar);
-        std::string grammar_source;
-        if (grammar_file.good()) {
-            grammar_source.assign(
-                    std::istreambuf_iterator<char>(grammar_file),
-                    std::istreambuf_iterator<char>());
-        } else {
-            grammar_source = grammar;
-        }
-
-        auto parsed = grammar_parser::parse(grammar_source.c_str());
-        auto rule_iter = parsed.symbol_ids.find(start_rule);
-        if (rule_iter == parsed.symbol_ids.end()) {
-            throw std::runtime_error("unknown grammar start rule: " + start_rule);
-        }
-
-        grammar_state = std::move(parsed);
-        sync_grammar_fields();
-        i_start_rule = rule_iter->second;
-        grammar_penalty = penalty;
-    }
-    void clear_grammar() {
-        grammar_state = grammar_parser::parse_state();
-        grammar_rule_ptrs.clear();
-        grammar_rules = nullptr;
-        n_grammar_rules = 0;
-        i_start_rule = 0;
-    }
-    py::list get_grammar_rules() const {
-        py::list rules;
-        for (const auto& rule : grammar_state.rules) {
-            py::list elements;
-            for (const auto& element : rule) {
-                elements.append(py::dict(
-                        "type"_a = static_cast<int>(element.type),
-                        "value"_a = element.value));
-            }
-            rules.append(elements);
-        }
-        return rules;
-    }
 };
 WhisperFullParamsWrapper  whisper_full_default_params_wrapper(enum whisper_sampling_strategy strategy) {
     return WhisperFullParamsWrapper(whisper_full_default_params(strategy));
@@ -1154,11 +1093,7 @@ PYBIND11_MODULE(_pywhispercpp, m) {
                 << "progress_callback=" << (self.progress_callback ? "(function pointer)" : "None") << ", "
                 << "encoder_begin_callback=" << (self.encoder_begin_callback ? "(function pointer)" : "None") << ", "
                 << "abort_callback=" << (self.abort_callback ? "(function pointer)" : "None") << ", "
-                << "logits_filter_callback=" << (self.logits_filter_callback ? "(function pointer)" : "None") << ", "
-                << "grammar_rules=" << (self.grammar_rules ? "(whisper_grammar_element **)" : "None") << ", "
-                << "n_grammar_rules=" << self.n_grammar_rules << ", "
-                << "i_start_rule=" << self.i_start_rule << ", "
-                << "grammar_penalty=" << self.grammar_penalty
+                << "logits_filter_callback=" << (self.logits_filter_callback ? "(function pointer)" : "None")
                 << ")";
             return oss.str();
         });
@@ -1311,12 +1246,6 @@ PYBIND11_MODULE(_pywhispercpp, m) {
         .def_property("logits_filter_callback_user_data",
             &WhisperFullParamsWrapper::get_logits_filter_callback_user_data,
             &WhisperFullParamsWrapper::set_logits_filter_callback_user_data)
-        .def_property_readonly("grammar_rules", &WhisperFullParamsWrapper::get_grammar_rules)
-        .def_property_readonly("n_grammar_rules", [](const WhisperFullParamsWrapper &self) { return self.n_grammar_rules; })
-        .def_property_readonly("i_start_rule", [](const WhisperFullParamsWrapper &self) { return self.i_start_rule; })
-        .def("set_grammar", &WhisperFullParamsWrapper::set_grammar,
-            py::arg("grammar"), py::arg("start_rule") = "root", py::arg("penalty") = 100.0f,
-            "Parse grammar text or a grammar file path and assign it to the params.")
         .def("set_logits_filter_callback",
              [](WhisperFullParamsWrapper &self, py::object callback) {
                  if (callback.is_none()) {
@@ -1329,9 +1258,6 @@ PYBIND11_MODULE(_pywhispercpp, m) {
              "Assign a logits-filter callback.")
         .def("clear_logits_filter_callback", &WhisperFullParamsWrapper::clear_logits_filter_callback,
              "Clear any previously assigned logits-filter callback.")
-        .def("clear_grammar", &WhisperFullParamsWrapper::clear_grammar,
-            "Clear any previously assigned grammar.")
-        .def_readwrite("grammar_penalty", &WhisperFullParamsWrapper::grammar_penalty)
         .def_readwrite("vad", &WhisperFullParamsWrapper::vad)
         .def_property("vad_model_path",
         [](WhisperFullParamsWrapper &self) {
diff --git a/tests/test_model.py b/tests/test_model.py
index 9ee8f65..b68f8a6 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -68,24 +68,6 @@ def test_prompt_token_helper_exists(self):
         params.set_prompt_tokens((1, 2, 3))
         self.assertEqual(params.prompt_n_tokens, 3)
 
-    def test_grammar_helper_exists(self):
-        params = pw.whisper_full_default_params(
-            pw.whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY
-        )
-        params.set_grammar('root ::= "yes" | "no"', 'root', 42.0)
-        self.assertEqual(params.grammar_penalty, 42.0)
-        params.clear_grammar()
-
-    def test_model_accepts_grammar_param(self):
-        model = Model(
-            "tiny",
-            models_dir=str(WHISPER_CPP_DIR/'models'),
-            grammar='root ::= "yes" | "no"',
-            grammar_rule='root',
-            grammar_penalty=42.0,
-        )
-        self.assertIsInstance(model, Model)
-
     def test_model_metadata_bindings(self):
         self.assertIsInstance(pw.whisper_model_type_readable(self.model._ctx), str)
         self.assertGreater(pw.whisper_model_n_vocab(self.model._ctx), 0)

From 3bbf251aa1f9893daa30b3c475f4a653d4834b47 Mon Sep 17 00:00:00 2001
From: scottmonster <87917233+scottmonster@users.noreply.github.com>
Date: Wed, 20 May 2026 20:55:46 -0500
Subject: [PATCH 12/16] misc. stub and docstring fixes

---
 pywhispercpp/constants.py |  8 +++++++-
 pywhispercpp/model.py     | 12 +++++++-----
 pywhispercpp/model.pyi    |  9 +++++----
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/pywhispercpp/constants.py b/pywhispercpp/constants.py
index 85018a8..4a6d1cd 100644
--- a/pywhispercpp/constants.py
+++ b/pywhispercpp/constants.py
@@ -242,6 +242,12 @@
             'options': None,
             'default': False
     },
+    'suppress_regex': {
+            'type': str,
+            'description': 'regex pattern used to suppress matching text during decoding',
+            'options': None,
+            'default': ''
+    },
     'temperature': {
             'type': float,
             'description': 'initial decoding temperature',
@@ -294,7 +300,7 @@
             'type': dict,
             'description': 'beam_search',
             'options': None,
-            'default': {"beam_size": 5, "patience": -1.0}
+            'default': {"beam_size": -1, "patience": -1.0}
     },
     'extract_probability': {
             'type': bool,
diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py
index d11260b..3864ed9 100644
--- a/pywhispercpp/model.py
+++ b/pywhispercpp/model.py
@@ -111,7 +111,8 @@ def __init__(self,
                                `flash_attn`, `gpu_device`, `dtw_token_timestamps`,
                                `dtw_aheads_preset`, `dtw_n_top`, and `dtw_mem_size`. Omitted keys inherit
                                from `whisper_context_default_params()`.
-        :param params: decode parameters forwarded to `whisper_full_params`.
+        :param params: keyword-only decode parameters matching the public API documented in `model.pyi`.
+            These values are forwarded to `whisper_full_params` and remain active for future calls.
             Supported keys:
             - `n_threads`: number of inference threads. Default is `min(4, hardware_concurrency())`.
             - `n_max_text_ctx`: max prompt-text tokens carried into the decoder. Default `16384`.
@@ -152,7 +153,7 @@ def __init__(self,
             - `logprob_thold`: logprob threshold. Default `-1.0`.
             - `no_speech_thold`: no-speech threshold. Default `0.6`.
             - `greedy`: greedy-decoder settings, typically `{"best_of": 5}`.
-            - `beam_search`: beam-search settings, schema default `{"beam_size": -1, "patience": -1.0}`.
+            - `beam_search`: beam-search settings. Default `{"beam_size": -1, "patience": -1.0}`.
             - `vad`: enable VAD. Default `False`.
             - `vad_model_path`: path to the VAD model. Default `None`.
         """
@@ -189,11 +190,12 @@ def transcribe(self,
         :param n_processors: number of worker processes for `whisper_full_parallel`. If omitted, runs a
                      single-process `whisper_full()` decode.
         :param new_segment_callback: callback invoked for each newly produced `Segment` during decoding.
-        :param abort_callback: callback function returning True to abort an in-flight transcription early
-        :param params: keyword arguments for different whisper.cpp parameters; these override the model's
-                       active decode params for this call
+        :param abort_callback: callback function returning True to abort an in-flight transcription early.
         :param extract_probability: If True, calculates the geometric mean of token probabilities for each segment,
             providing a confidence score interpretable as a probability in [0, 1].
+        :param params: additional keyword-only decode parameters matching the public API documented in
+            `model.pyi`, with the same supported keys and defaults as `Model.__init__`.
+            Any overrides applied here remain active for future calls.
         :return: List of transcription segments
         """
         if isinstance(media, np.ndarray):
diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi
index 3f2852b..c2075fd 100644
--- a/pywhispercpp/model.pyi
+++ b/pywhispercpp/model.pyi
@@ -1,13 +1,12 @@
 from __future__ import annotations
 
-from typing import Any, Callable, Dict, List, Optional, TextIO, Tuple, TypedDict, Union
+from typing import Any, Callable, Dict, List, Optional, TextIO, Tuple, TypedDict, TypeAlias, Union
 
 import numpy as np
 import numpy.typing as npt
 
-AudioArray = npt.NDArray[np.float32]
-AudioInput = Union[str, AudioArray]
-
+AudioArray: TypeAlias = npt.NDArray[np.float32]
+AudioInput: TypeAlias = Union[str, AudioArray]
 
 class ContextParams(TypedDict, total=False):
     use_gpu: bool
@@ -40,6 +39,7 @@ class Segment:
 
 
 class Model:
+    model_path: str
     _new_segment_callback: Optional[Callable[[Segment], None]]
 
     def __init__(
@@ -52,6 +52,7 @@ class Model:
         openvino_model_path: Optional[str] = None,
         openvino_device: str = 'CPU',
         openvino_cache_dir: Optional[str] = None,
+        context_params: Optional[ContextParams] = None,
         *,
         n_threads: Optional[int] = None,
         n_max_text_ctx: int = 16384,

From 4b469e370a8f724b32a37f3220ece0ebfd586ff2 Mon Sep 17 00:00:00 2001
From: scottmonster <87917233+scottmonster@users.noreply.github.com>
Date: Wed, 20 May 2026 21:51:55 -0500
Subject: [PATCH 13/16] Pin whisper.cpp to v1.8.4

---
 whisper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whisper.cpp b/whisper.cpp
index 4979e04..9386f23 160000
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -1 +1 @@
-Subproject commit 4979e04f5dcaccb36057e059bbaed8a2f5288315
+Subproject commit 9386f239401074690479731c1e41683fbbeac557

From c2eaf8cfe29987d5440410fdc46236d251ccefe3 Mon Sep 17 00:00:00 2001
From: scottmonster <87917233+scottmonster@users.noreply.github.com>
Date: Wed, 20 May 2026 23:56:49 -0500
Subject: [PATCH 14/16] remove WHISPER_DEPRECATED (whisper.h) functions

---
 README.md             | 107 ++++++++++++++++++++++++++----------------
 pywhispercpp/model.py |  10 ++--
 src/main.cpp          |  34 --------------
 tests/test_c_api.py   |   5 +-
 4 files changed, 75 insertions(+), 81 deletions(-)

diff --git a/README.md b/README.md
index ccda1c6..86ba9c0 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
 # pywhispercpp
+
 Python bindings for [whisper.cpp](https://github.com/ggerganov/whisper.cpp) with a simple Pythonic API on top of it.
 
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
@@ -7,38 +8,49 @@ Python bindings for [whisper.cpp](https://github.com/ggerganov/whisper.cpp) with
 [![Downloads](https://static.pepy.tech/badge/pywhispercpp)](https://pepy.tech/project/pywhispercpp)
 
 # Table of contents
+
 <!-- TOC -->
-* [Installation](#installation)
-    * [From source](#from-source)
-    * [Pre-built wheels](#pre-built-wheels)
-    * [NVIDIA GPU support](#nvidia-gpu-support)
-    * [CoreML support](#coreml-support)
-    * [Vulkan support](#vulkan-support)
-* [Quick start](#quick-start)
-* [Examples](#examples)
-  * [CLI](#cli)
-  * [GUI](#gui)
-  * [Assistant](#assistant)
-* [Advanced usage](#advanced-usage)
-* [Discussions and contributions](#discussions-and-contributions)
-* [License](#license)
+
+- [pywhispercpp](#pywhispercpp)
+- [Table of contents](#table-of-contents)
+- [Installation](#installation)
+    - [From source](#from-source)
+    - [Pre-built wheels](#pre-built-wheels)
+    - [NVIDIA GPU support](#nvidia-gpu-support)
+    - [CoreML support](#coreml-support)
+    - [Vulkan support](#vulkan-support)
+    - [OpenBLAS support](#openblas-support)
+    - [OpenVINO support](#openvino-support)
+- [Quick start](#quick-start)
+- [Examples](#examples)
+  - [CLI](#cli)
+  - [GUI](#gui)
+  - [Assistant](#assistant)
+- [Advanced usage](#advanced-usage)
+- [Discussions and contributions](#discussions-and-contributions)
+- [License](#license)
 <!-- TOC -->
 
 # Installation
 
 ### From source
-* For the best performance, you need to install the package from source:
+
+- For the best performance, you need to install the package from source:
+
 ```shell
 pip install git+https://github.com/absadiki/pywhispercpp
 ```
+
 ### Pre-built wheels
-* Otherwise, Basic Pre-built CPU wheels are available on PYPI
+
+- Otherwise, basic pre-built CPU wheels are available on PyPI
 
 ```shell
 pip install pywhispercpp # or pywhispercpp[examples] to install the extra dependencies needed for the examples
 ```
 
-[Optional] To transcribe files other than wav, you need to install ffmpeg:  
+[Optional] To transcribe files other than wav, you need to install ffmpeg:
+
 ```shell
 # on Ubuntu or Debian
 sudo apt update && sudo apt install ffmpeg
@@ -57,11 +69,13 @@ scoop install ffmpeg
 ```
 
 ### NVIDIA GPU support
+
 To Install the package with CUDA support, make sure you have [cuda](https://developer.nvidia.com/cuda-downloads) installed and use `GGML_CUDA=1`:
 
 ```shell
 GGML_CUDA=1 pip install git+https://github.com/absadiki/pywhispercpp
 ```
+
 ### CoreML support
 
 Install the package with `WHISPER_COREML=1`:
@@ -81,6 +95,7 @@ GGML_VULKAN=1 pip install git+https://github.com/absadiki/pywhispercpp
 ### OpenBLAS support
 
 If OpenBLAS is installed, you can use `GGML_BLAS=1`. The other flags ensure you're installing fresh with the correct flags, and printing output for sanity checking.
+
 ```shell
 GGML_BLAS=1 pip install git+https://github.com/absadiki/pywhispercpp --no-cache --force-reinstall -v
 ```
@@ -90,16 +105,15 @@ GGML_BLAS=1 pip install git+https://github.com/absadiki/pywhispercpp --no-cache
 Follow the the steps to download correct OpenVINO package (https://github.com/ggerganov/whisper.cpp?tab=readme-ov-file#openvino-support).
 
 Then init the OpenVINO environment and build.
+
 ```
-source ~/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh 
+source ~/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh
 WHISPER_OPENVINO=1 pip install git+https://github.com/absadiki/pywhispercpp --no-cache --force-reinstall
 ```
 
 Note that the toolkit for Ubuntu22 works on Ubuntu24
 
-
-** __Feel free to update this list and submit a PR if you tested the package on other backends.__
-
+\*\* **Feel free to update this list and submit a PR if you tested the package on other backends.**
 
 # Quick start
 
@@ -121,21 +135,22 @@ model = Model('base.en', print_realtime=False, print_progress=False)
 segments = model.transcribe('file.mp3', new_segment_callback=print)
 ```
 
-
-* The model will be downloaded automatically, or you can use the path to a local model.
-* You can pass any `whisper.cpp` [parameter](https://absadiki.github.io/pywhispercpp/#pywhispercpp.constants.PARAMS_SCHEMA) as a keyword argument to the `Model` class or to the `transcribe` function.
-* Check the [Model](https://absadiki.github.io/pywhispercpp/#pywhispercpp.model.Model) class documentation for more details.
+- The model will be downloaded automatically, or you can use the path to a local model.
+- You can pass any `whisper.cpp` [parameter](https://absadiki.github.io/pywhispercpp/#pywhispercpp.constants.PARAMS_SCHEMA) as a keyword argument to the `Model` class or to the `transcribe` function.
+- Check the [Model](https://absadiki.github.io/pywhispercpp/#pywhispercpp.model.Model) class documentation for more details.
 
 # Examples
 
 ## CLI
-Just a straightforward example Command Line Interface. 
+
+Just a straightforward example Command Line Interface.
 You can use it as follows:
 
 ```shell
 pwcpp file.wav -m base --output-srt --print_realtime true
 ```
-Run ```pwcpp --help``` to get the help message
+
+Run `pwcpp --help` to get the help message
 
 ```shell
 usage: pwcpp [-h] [-m MODEL] [--version] [--processors PROCESSORS] [-otxt] [-ovtt] [-osrt] [-ocsv] [--strategy STRATEGY]
@@ -229,13 +244,17 @@ options:
 ```
 
 ## GUI
-If you prefer a Graphical User Interface, you can use the `pwcpp-gui` command which will launch A simple graphical interface built with PyQt5. 
-* First you need to install the GUI dependencies:
+
+If you prefer a Graphical User Interface, you can use the `pwcpp-gui` command which will launch a simple graphical interface built with PyQt5.
+
+- First you need to install the GUI dependencies:
+
 ```bash
 pip install pywhispercpp[gui]
 ```
 
-* Then you can run the GUI with:
+- Then you can run the GUI with:
+
 ```bash
 pwcpp-gui
 ```
@@ -248,23 +267,25 @@ The GUI provides a user-friendly way to:
 - View and export transcription results
 
 ## Assistant
-This is a simple example showcasing the use of `pywhispercpp` to create an assistant like example. 
-The idea is to use a Voice Activity Detector (VAD) to detect speech (in this example, we used webrtcvad), and when some speech is detected, we run the transcription. 
+
+This is a simple example showcasing the use of `pywhispercpp` to create an assistant like example.
+The idea is to use a Voice Activity Detector (VAD) to detect speech (in this example, we used webrtcvad), and when some speech is detected, we run the transcription.
 It is inspired from the [whisper.cpp/examples/command](https://github.com/ggerganov/whisper.cpp/tree/master/examples/command) example.
 
-You can check the source code [here](https://github.com/absadiki/pywhispercpp/blob/main/pywhispercpp/examples/assistant.py) 
+You can check the source code [here](https://github.com/absadiki/pywhispercpp/blob/main/pywhispercpp/examples/assistant.py)
 or you can use the class directly to create your own assistant:
 
-
 ```python
 from pywhispercpp.examples.assistant import Assistant
 
 my_assistant = Assistant(commands_callback=print, n_threads=8)
 my_assistant.start()
 ```
+
 Here, we set the `commands_callback` to a simple print function, so the commands will just get printed on the screen.
 
 You can also run this example from the command line.
+
 ```shell
 $ pwcpp-assistant --help
 
@@ -281,25 +302,31 @@ options:
   -bd BLOCK_DURATION, --block_duration BLOCK_DURATION
                         minimum time audio updates in ms, default to 30
 ```
--------------
 
-* Check the [examples folder](https://github.com/absadiki/pywhispercpp/tree/main/pywhispercpp/examples) for more examples.
+---
+
+- Check the [examples folder](https://github.com/absadiki/pywhispercpp/tree/main/pywhispercpp/examples) for more examples.
 
 # Advanced usage
-* First check the [API documentation](https://absadiki.github.io/pywhispercpp/) for more advanced usage.
-* If you are a more experienced user, you can access the exposed C-APIs directly from the binding module `_pywhispercpp`.
+
+- First check the [API documentation](https://absadiki.github.io/pywhispercpp/) for more advanced usage.
+- If you are a more experienced user, you can access the exposed C-APIs directly from the binding module `_pywhispercpp`.
 
 ```python
 import _pywhispercpp as pwcpp
 
-ctx = pwcpp.whisper_init_from_file('path/to/ggml/model')
+ctx = pwcpp.whisper_init_from_file_with_params(
+  'path/to/ggml/model',
+  pwcpp.whisper_context_default_params(),
+)
 ```
 
 # Discussions and contributions
+
 If you find any bug, please open an [issue](https://github.com/absadiki/pywhispercpp/issues).
 
 If you have any feedback, or you want to share how you are using this project, feel free to use the [Discussions](https://github.com/absadiki/pywhispercpp/discussions) and open a new topic.
 
 # License
 
-This project is licensed under the same license as [whisper.cpp](https://github.com/ggerganov/whisper.cpp/blob/master/LICENSE) (MIT  [License](./LICENSE)).
+This project is licensed under the same license as [whisper.cpp](https://github.com/ggerganov/whisper.cpp/blob/master/LICENSE) (MIT [License](./LICENSE)).
diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py
index 3864ed9..4833fb1 100644
--- a/pywhispercpp/model.py
+++ b/pywhispercpp/model.py
@@ -334,8 +334,10 @@ def available_languages() -> List[str]:
 
     @staticmethod
     def _resolve_context_params(context_params: Optional[ContextParams]):
+        resolved = pw.whisper_context_default_params()
+
         if context_params is None:
-            return None
+            return resolved
 
         if not isinstance(context_params, dict):
             raise TypeError("context_params must be a ContextParams dict or None")
@@ -346,7 +348,6 @@ def _resolve_context_params(context_params: Optional[ContextParams]):
                 f"Unknown context_params keys: {', '.join(unknown_keys)}"
             )
 
-        resolved = pw.whisper_context_default_params()
         for key, value in context_params.items():
             setattr(resolved, key, value)
         return resolved
@@ -381,10 +382,7 @@ def _init_model(self) -> None:
         """
         logger.info("Initializing the model ...")
         with utils.redirect_stderr(to=self.redirect_whispercpp_logs_to):
-            if self._context_params is None:
-                self._ctx = pw.whisper_init_from_file(self.model_path)
-            else:
-                self._ctx = pw.whisper_init_from_file_with_params(self.model_path, self._context_params)
+            self._ctx = pw.whisper_init_from_file_with_params(self.model_path, self._context_params)
             if self.use_openvino:
                 pw.whisper_ctx_init_openvino_encoder(self._ctx, self.openvino_model_path, self.openvino_device, self.openvino_cache_dir)
 
diff --git a/src/main.cpp b/src/main.cpp
index 35d74a2..6bc3c00 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -86,30 +86,6 @@ struct whisper_context_wrapper whisper_init_with_params_wrapper(
     return ctw_w;
 };
 
-struct whisper_context_wrapper whisper_init_from_file_wrapper(const char * path_model){
-    struct whisper_context_params cparams = whisper_context_default_params();
-    struct whisper_context * ctx = whisper_init_from_file_with_params(path_model, cparams);
-    struct whisper_context_wrapper ctw_w;
-    ctw_w.ptr = ctx;
-    return ctw_w;
-}
-
-struct whisper_context_wrapper whisper_init_from_buffer_wrapper(void * buffer, size_t buffer_size){
-    struct whisper_context_params cparams = whisper_context_default_params();
-    struct whisper_context * ctx = whisper_init_from_buffer_with_params(buffer, buffer_size, cparams);
-    struct whisper_context_wrapper ctw_w;
-    ctw_w.ptr = ctx;
-    return ctw_w;
-}
-
-struct whisper_context_wrapper whisper_init_wrapper(struct whisper_model_loader_wrapper * loader){
-    struct whisper_context_params cparams = whisper_context_default_params();
-    struct whisper_context * ctx = whisper_init_with_params(loader->ptr, cparams);
-    struct whisper_context_wrapper ctw_w;
-    ctw_w.ptr = ctx;
-    return ctw_w;
-};
-
 void whisper_free_wrapper(struct whisper_context_wrapper * ctx_w){
     whisper_free(ctx_w->ptr);
 };
@@ -939,22 +915,12 @@ PYBIND11_MODULE(_pywhispercpp, m) {
 
         m.def("whisper_context_default_params", &whisper_context_default_params,
             "Return the default context parameters used during model initialization.");
-
-    DEF_RELEASE_GIL("whisper_init_from_file", &whisper_init_from_file_wrapper, "Various functions for loading a ggml whisper model.\n"
-                                                                    "Allocate (almost) all memory needed for the model.\n"
-                                                                    "Return NULL on failure");
         DEF_RELEASE_GIL("whisper_init_from_file_with_params", &whisper_init_from_file_with_params_wrapper, "Various functions for loading a ggml whisper model.\n"
                                                   "Allocate (almost) all memory needed for the model.\n"
                                                   "Return NULL on failure");
-    DEF_RELEASE_GIL("whisper_init_from_buffer", &whisper_init_from_buffer_wrapper, "Various functions for loading a ggml whisper model.\n"
-                                                                        "Allocate (almost) all memory needed for the model.\n"
-                                                                        "Return NULL on failure");
         DEF_RELEASE_GIL("whisper_init_from_buffer_with_params", &whisper_init_from_buffer_with_params_wrapper, "Various functions for loading a ggml whisper model.\n"
                                                     "Allocate (almost) all memory needed for the model.\n"
                                                     "Return NULL on failure");
-    DEF_RELEASE_GIL("whisper_init", &whisper_init_wrapper, "Various functions for loading a ggml whisper model.\n"
-                                                "Allocate (almost) all memory needed for the model.\n"
-                                                "Return NULL on failure");
         DEF_RELEASE_GIL("whisper_init_with_params", &whisper_init_with_params_wrapper, "Various functions for loading a ggml whisper model.\n"
                                     "Allocate (almost) all memory needed for the model.\n"
                                     "Return NULL on failure");
diff --git a/tests/test_c_api.py b/tests/test_c_api.py
index b415395..6138c1c 100644
--- a/tests/test_c_api.py
+++ b/tests/test_c_api.py
@@ -11,7 +11,10 @@ class TestCAPI(TestCase):
     model_file = './whisper.cpp/models/for-tests-ggml-tiny.en.bin'
 
     def test_whisper_init_from_file(self):
-        ctx = pw.whisper_init_from_file(self.model_file)
+        ctx = pw.whisper_init_from_file_with_params(
+            self.model_file,
+            pw.whisper_context_default_params(),
+        )
         self.assertIsInstance(ctx, pw.whisper_context)
 
     def test_whisper_lang_str(self):

From c80acd91c08c8bb514d60da6d346e1d9c156efe9 Mon Sep 17 00:00:00 2001
From: scottmonster <87917233+scottmonster@users.noreply.github.com>
Date: Sat, 23 May 2026 19:24:01 -0500
Subject: [PATCH 15/16] revert language to empty string

revert language back to "" instead of "en" so that whisper will
auto-detect language
---
 pywhispercpp/constants.py | 2 +-
 pywhispercpp/model.py     | 2 +-
 pywhispercpp/model.pyi    | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pywhispercpp/constants.py b/pywhispercpp/constants.py
index 4a6d1cd..bfe582f 100644
--- a/pywhispercpp/constants.py
+++ b/pywhispercpp/constants.py
@@ -216,7 +216,7 @@
             'type': str,
             'description': 'for auto-detection, set to None, "" or "auto"',
             'options': None,
-            'default': "en"
+            'default': ""
     },
     'detect_language': {
             'type': bool,
diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py
index 4833fb1..453e152 100644
--- a/pywhispercpp/model.py
+++ b/pywhispercpp/model.py
@@ -139,7 +139,7 @@ def __init__(self,
             - `prompt_tokens`: explicit prompt token sequence. Default `None`.
             - `prompt_n_tokens`: number of prompt tokens. Default `0`.
             - `carry_initial_prompt`: prepend the initial prompt to each decode window. Default `False`.
-            - `language`: language code. Default `en`.
+            - `language`: language code. Default `""` (empty string, auto-detect).
             - `detect_language`: enable automatic language detection during transcription. Default `False`.
             - `suppress_blank`: suppress blank outputs. Default `True`.
             - `suppress_non_speech_tokens`: Python alias for `suppress_nst`. Default `False`.
diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi
index c2075fd..35cb735 100644
--- a/pywhispercpp/model.pyi
+++ b/pywhispercpp/model.pyi
@@ -79,7 +79,7 @@ class Model:
         prompt_tokens: Optional[Tuple[Any, ...]] = None,
         prompt_n_tokens: int = 0,
         carry_initial_prompt: bool = False,
-        language: str = 'en',
+        language: str = '',
         detect_language: bool = False,
         suppress_blank: bool = True,
         suppress_non_speech_tokens: bool = False,
@@ -131,7 +131,7 @@ class Model:
         prompt_tokens: Optional[Tuple[Any, ...]] = None,
         prompt_n_tokens: int = 0,
         carry_initial_prompt: bool = False,
-        language: str = 'en',
+        language: str = '',
         detect_language: bool = False,
         suppress_blank: bool = True,
         suppress_non_speech_tokens: bool = False,

From 490c545f808f2dacc9d1871e44fee4288a22eddb Mon Sep 17 00:00:00 2001
From: scottmonster <87917233+scottmonster@users.noreply.github.com>
Date: Sat, 23 May 2026 20:00:43 -0500
Subject: [PATCH 16/16] update readme

---
 README.md | 102 +++++++++++++++++++++---------------------------------
 1 file changed, 39 insertions(+), 63 deletions(-)

diff --git a/README.md b/README.md
index 86ba9c0..018f18e 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
 # pywhispercpp
-
 Python bindings for [whisper.cpp](https://github.com/ggerganov/whisper.cpp) with a simple Pythonic API on top of it.
 
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
@@ -8,49 +7,38 @@ Python bindings for [whisper.cpp](https://github.com/ggerganov/whisper.cpp) with
 [![Downloads](https://static.pepy.tech/badge/pywhispercpp)](https://pepy.tech/project/pywhispercpp)
 
 # Table of contents
-
 <!-- TOC -->
-
-- [pywhispercpp](#pywhispercpp)
-- [Table of contents](#table-of-contents)
-- [Installation](#installation)
-    - [From source](#from-source)
-    - [Pre-built wheels](#pre-built-wheels)
-    - [NVIDIA GPU support](#nvidia-gpu-support)
-    - [CoreML support](#coreml-support)
-    - [Vulkan support](#vulkan-support)
-    - [OpenBLAS support](#openblas-support)
-    - [OpenVINO support](#openvino-support)
-- [Quick start](#quick-start)
-- [Examples](#examples)
-  - [CLI](#cli)
-  - [GUI](#gui)
-  - [Assistant](#assistant)
-- [Advanced usage](#advanced-usage)
-- [Discussions and contributions](#discussions-and-contributions)
-- [License](#license)
+* [Installation](#installation)
+    * [From source](#from-source)
+    * [Pre-built wheels](#pre-built-wheels)
+    * [NVIDIA GPU support](#nvidia-gpu-support)
+    * [CoreML support](#coreml-support)
+    * [Vulkan support](#vulkan-support)
+* [Quick start](#quick-start)
+* [Examples](#examples)
+  * [CLI](#cli)
+  * [GUI](#gui)
+  * [Assistant](#assistant)
+* [Advanced usage](#advanced-usage)
+* [Discussions and contributions](#discussions-and-contributions)
+* [License](#license)
 <!-- TOC -->
 
 # Installation
 
 ### From source
-
-- For the best performance, you need to install the package from source:
-
+* For the best performance, you need to install the package from source:
 ```shell
 pip install git+https://github.com/absadiki/pywhispercpp
 ```
-
 ### Pre-built wheels
-
-- Otherwise, Basic Pre-built CPU wheels are available on PYPI
+* Otherwise, Basic Pre-built CPU wheels are available on PYPI
 
 ```shell
 pip install pywhispercpp # or pywhispercpp[examples] to install the extra dependencies needed for the examples
 ```
 
-[Optional] To transcribe files other than wav, you need to install ffmpeg:
-
+[Optional] To transcribe files other than wav, you need to install ffmpeg:  
 ```shell
 # on Ubuntu or Debian
 sudo apt update && sudo apt install ffmpeg
@@ -69,13 +57,11 @@ scoop install ffmpeg
 ```
 
 ### NVIDIA GPU support
-
 To Install the package with CUDA support, make sure you have [cuda](https://developer.nvidia.com/cuda-downloads) installed and use `GGML_CUDA=1`:
 
 ```shell
 GGML_CUDA=1 pip install git+https://github.com/absadiki/pywhispercpp
 ```
-
 ### CoreML support
 
 Install the package with `WHISPER_COREML=1`:
@@ -95,7 +81,6 @@ GGML_VULKAN=1 pip install git+https://github.com/absadiki/pywhispercpp
 ### OpenBLAS support
 
 If OpenBLAS is installed, you can use `GGML_BLAS=1`. The other flags ensure you're installing fresh with the correct flags, and printing output for sanity checking.
-
 ```shell
 GGML_BLAS=1 pip install git+https://github.com/absadiki/pywhispercpp --no-cache --force-reinstall -v
 ```
@@ -105,15 +90,16 @@ GGML_BLAS=1 pip install git+https://github.com/absadiki/pywhispercpp --no-cache
 Follow the the steps to download correct OpenVINO package (https://github.com/ggerganov/whisper.cpp?tab=readme-ov-file#openvino-support).
 
 Then init the OpenVINO environment and build.
-
 ```
-source ~/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh
+source ~/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh 
 WHISPER_OPENVINO=1 pip install git+https://github.com/absadiki/pywhispercpp --no-cache --force-reinstall
 ```
 
 Note that the toolkit for Ubuntu22 works on Ubuntu24
 
-\*\* **Feel free to update this list and submit a PR if you tested the package on other backends.**
+
+** __Feel free to update this list and submit a PR if you tested the package on other backends.__
+
 
 # Quick start
 
@@ -135,22 +121,21 @@ model = Model('base.en', print_realtime=False, print_progress=False)
 segments = model.transcribe('file.mp3', new_segment_callback=print)
 ```
 
-- The model will be downloaded automatically, or you can use the path to a local model.
-- You can pass any `whisper.cpp` [parameter](https://absadiki.github.io/pywhispercpp/#pywhispercpp.constants.PARAMS_SCHEMA) as a keyword argument to the `Model` class or to the `transcribe` function.
-- Check the [Model](https://absadiki.github.io/pywhispercpp/#pywhispercpp.model.Model) class documentation for more details.
+
+* The model will be downloaded automatically, or you can use the path to a local model.
+* You can pass any `whisper.cpp` [parameter](https://absadiki.github.io/pywhispercpp/#pywhispercpp.constants.PARAMS_SCHEMA) as a keyword argument to the `Model` class or to the `transcribe` function.
+* Check the [Model](https://absadiki.github.io/pywhispercpp/#pywhispercpp.model.Model) class documentation for more details.
 
 # Examples
 
 ## CLI
-
-Just a straightforward example Command Line Interface.
+Just a straightforward example Command Line Interface. 
 You can use it as follows:
 
 ```shell
 pwcpp file.wav -m base --output-srt --print_realtime true
 ```
-
-Run `pwcpp --help` to get the help message
+Run ```pwcpp --help``` to get the help message
 
 ```shell
 usage: pwcpp [-h] [-m MODEL] [--version] [--processors PROCESSORS] [-otxt] [-ovtt] [-osrt] [-ocsv] [--strategy STRATEGY]
@@ -244,17 +229,13 @@ options:
 ```
 
 ## GUI
-
-If you prefer a Graphical User Interface, you can use the `pwcpp-gui` command which will launch A simple graphical interface built with PyQt5.
-
-- First you need to install the GUI dependencies:
-
+If you prefer a Graphical User Interface, you can use the `pwcpp-gui` command which will launch a simple graphical interface built with PyQt5.
+* First you need to install the GUI dependencies:
 ```bash
 pip install pywhispercpp[gui]
 ```
 
-- Then you can run the GUI with:
-
+* Then you can run the GUI with:
 ```bash
 pwcpp-gui
 ```
@@ -267,25 +248,23 @@ The GUI provides a user-friendly way to:
 - View and export transcription results
 
 ## Assistant
-
-This is a simple example showcasing the use of `pywhispercpp` to create an assistant like example.
-The idea is to use a Voice Activity Detector (VAD) to detect speech (in this example, we used webrtcvad), and when some speech is detected, we run the transcription.
+This is a simple example showcasing the use of `pywhispercpp` to create an assistant-like example.
+The idea is to use a Voice Activity Detector (VAD) to detect speech (in this example, we used webrtcvad), and when some speech is detected, we run the transcription. 
 It is inspired from the [whisper.cpp/examples/command](https://github.com/ggerganov/whisper.cpp/tree/master/examples/command) example.
 
-You can check the source code [here](https://github.com/absadiki/pywhispercpp/blob/main/pywhispercpp/examples/assistant.py)
+You can check the source code [here](https://github.com/absadiki/pywhispercpp/blob/main/pywhispercpp/examples/assistant.py) 
 or you can use the class directly to create your own assistant:
 
+
 ```python
 from pywhispercpp.examples.assistant import Assistant
 
 my_assistant = Assistant(commands_callback=print, n_threads=8)
 my_assistant.start()
 ```
-
 Here, we set the `commands_callback` to a simple print function, so the commands will just get printed on the screen.
 
 You can also run this example from the command line.
-
 ```shell
 $ pwcpp-assistant --help
 
@@ -302,15 +281,13 @@ options:
   -bd BLOCK_DURATION, --block_duration BLOCK_DURATION
                         minimum time audio updates in ms, default to 30
 ```
+-------------
 
----
-
-- Check the [examples folder](https://github.com/absadiki/pywhispercpp/tree/main/pywhispercpp/examples) for more examples.
+* Check the [examples folder](https://github.com/absadiki/pywhispercpp/tree/main/pywhispercpp/examples) for more examples.
 
 # Advanced usage
-
-- First check the [API documentation](https://absadiki.github.io/pywhispercpp/) for more advanced usage.
-- If you are a more experienced user, you can access the exposed C-APIs directly from the binding module `_pywhispercpp`.
+* First check the [API documentation](https://absadiki.github.io/pywhispercpp/) for more advanced usage.
+* If you are a more experienced user, you can access the exposed C-APIs directly from the binding module `_pywhispercpp`.
 
 ```python
 import _pywhispercpp as pwcpp
@@ -322,11 +299,10 @@ ctx = pwcpp.whisper_init_from_file_with_params(
 ```
 
 # Discussions and contributions
-
 If you find any bug, please open an [issue](https://github.com/absadiki/pywhispercpp/issues).
 
 If you have any feedback, or you want to share how you are using this project, feel free to use the [Discussions](https://github.com/absadiki/pywhispercpp/discussions) and open a new topic.
 
 # License
 
-This project is licensed under the same license as [whisper.cpp](https://github.com/ggerganov/whisper.cpp/blob/master/LICENSE) (MIT [License](./LICENSE)).
+This project is licensed under the same license as [whisper.cpp](https://github.com/ggerganov/whisper.cpp/blob/master/LICENSE) (MIT  [License](./LICENSE)).
\ No newline at end of file