From d605503bc5f37c3fa6459edc9dbcc04cffcacd3e Mon Sep 17 00:00:00 2001 From: scottmonster Date: Thu, 14 May 2026 20:09:11 -0500 Subject: [PATCH 01/16] add type support via model.pyi --- MANIFEST.in | 2 + pywhispercpp/model.pyi | 148 +++++++++++++++++++++++++++++++++++++++++ pywhispercpp/py.typed | 0 setup.py | 2 +- 4 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 pywhispercpp/model.pyi create mode 100644 pywhispercpp/py.typed diff --git a/MANIFEST.in b/MANIFEST.in index 0ac649a..47b5544 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,6 @@ include README.md LICENSE pybind11/LICENSE version.txt +include pywhispercpp/model.pyi +include pywhispercpp/py.typed graft pybind11/include graft pybind11/tools graft src diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi new file mode 100644 index 0000000..060372d --- /dev/null +++ b/pywhispercpp/model.pyi @@ -0,0 +1,148 @@ +from __future__ import annotations + +# Generated by coverage/generate_pyi.py. Do not edit by hand. + +from typing import Any, Callable, Dict, List, Optional, TextIO, Tuple, TypedDict, Union + +import numpy as np +import numpy.typing as npt + +AudioArray = npt.NDArray[np.float32] +AudioInput = Union[str, AudioArray] + + +class GreedyParams(TypedDict): + best_of: int + + +class BeamSearchParams(TypedDict): + beam_size: int + patience: float + + +class Segment: + t0: int + t1: int + text: str + probability: float + + def __init__(self, t0: int, t1: int, text: str, probability: float = np.nan)->None: ... + def __str__(self)->str: ... + def __repr__(self)->str: ... 
+ + +class Model: + _new_segment_callback: Optional[Callable[[Segment], None]] + + def __init__( + self, + model: str = 'tiny', + models_dir: Optional[str] = None, + params_sampling_strategy: int = 0, + redirect_whispercpp_logs_to: Union[bool, TextIO, str, None] = False, + use_openvino: bool = False, + openvino_model_path: Optional[str] = None, + openvino_device: str = 'CPU', + openvino_cache_dir: Optional[str] = None, + *, + n_threads: Optional[int] = None, + n_max_text_ctx: int = 16384, + offset_ms: int = 0, + duration_ms: int = 0, + translate: bool = False, + no_context: bool = False, + single_segment: bool = False, + print_special: bool = False, + print_progress: bool = True, + print_realtime: bool = False, + print_timestamps: bool = True, + token_timestamps: bool = False, + thold_pt: float = 0.01, + thold_ptsum: float = 0.01, + max_len: int = 0, + split_on_word: bool = False, + max_tokens: int = 0, + audio_ctx: int = 0, + initial_prompt: Optional[str] = None, + prompt_tokens: Optional[Tuple[Any, ...]] = None, + prompt_n_tokens: int = 0, + language: str = '', + suppress_blank: bool = True, + suppress_non_speech_tokens: bool = False, + temperature: float = 0.0, + max_initial_ts: float = 1.0, + length_penalty: float = -1.0, + temperature_inc: float = 0.2, + entropy_thold: float = 2.4, + logprob_thold: float = -1.0, + no_speech_thold: float = 0.6, + greedy: GreedyParams = {'best_of': -1}, + beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0}, + vad: bool = False, + vad_model_path: Optional[str] = None, + )->None: ... 
+ + def transcribe( + self, + media: AudioInput, + n_processors: Optional[int] = None, + new_segment_callback: Optional[Callable[[Segment], None]] = None, + *, + n_threads: Optional[int] = None, + n_max_text_ctx: int = 16384, + offset_ms: int = 0, + duration_ms: int = 0, + translate: bool = False, + no_context: bool = False, + single_segment: bool = False, + print_special: bool = False, + print_progress: bool = True, + print_realtime: bool = False, + print_timestamps: bool = True, + token_timestamps: bool = False, + thold_pt: float = 0.01, + thold_ptsum: float = 0.01, + max_len: int = 0, + split_on_word: bool = False, + max_tokens: int = 0, + audio_ctx: int = 0, + initial_prompt: Optional[str] = None, + prompt_tokens: Optional[Tuple[Any, ...]] = None, + prompt_n_tokens: int = 0, + language: str = '', + suppress_blank: bool = True, + suppress_non_speech_tokens: bool = False, + temperature: float = 0.0, + max_initial_ts: float = 1.0, + length_penalty: float = -1.0, + temperature_inc: float = 0.2, + entropy_thold: float = 2.4, + logprob_thold: float = -1.0, + no_speech_thold: float = 0.6, + greedy: GreedyParams = {'best_of': -1}, + beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0}, + extract_probability: bool = False, + vad: bool = False, + vad_model_path: Optional[str] = None, + ) -> List[Segment]: ... + + def get_params(self) -> Dict[str, Any]: ... + @staticmethod + def get_params_schema() -> Dict[str, Dict[str, Any]]: ... + @staticmethod + def lang_max_id() -> int: ... + def print_timings(self) -> None: ... + @staticmethod + def system_info() -> Any: ... + @staticmethod + def available_languages() -> List[str]: ... + @staticmethod + def _load_audio(media_file_path: str) -> AudioArray: ... + def auto_detect_language( + self, + media: AudioInput, + offset_ms: int = 0, + n_threads: int = 4, + ) -> Tuple[Tuple[str, np.float32], Dict[str, np.float32]]: ... + def __del__(self) -> None: ... 
+ diff --git a/pywhispercpp/py.typed b/pywhispercpp/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/setup.py b/setup.py index d13545f..71a5ce7 100644 --- a/setup.py +++ b/setup.py @@ -251,7 +251,7 @@ def get_version() -> str: packages=find_packages('.'), package_dir={'': '.'}, include_package_data=True, - package_data={'pywhispercpp': []}, + package_data={'pywhispercpp': ["*.pyi", "py.typed"]}, long_description_content_type="text/markdown", license='MIT', entry_points={ From ad5c7ad92258e8f2c220d63710aff73d1475246a Mon Sep 17 00:00:00 2001 From: scottmonster Date: Fri, 15 May 2026 22:16:57 -0500 Subject: [PATCH 02/16] begin work on extending api --- .gitignore | 5 + CMakeLists.txt | 1 + pywhispercpp/constants.py | 62 ++++- pywhispercpp/model.py | 89 +++++-- pywhispercpp/model.pyi | 42 +++- src/main.cpp | 346 +++++++++++++++++++++++++- tests/test_backwards_compatibility.py | 153 ++++++++++++ tests/test_model.py | 57 +++++ whsiper_args.txt | 252 +++++++++++++++++++ 9 files changed, 974 insertions(+), 33 deletions(-) create mode 100644 tests/test_backwards_compatibility.py create mode 100644 whsiper_args.txt diff --git a/.gitignore b/.gitignore index 1928866..3e25b4f 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,11 @@ _generate/ *env* _version.py +coverage +libggml* +libwhisper* +updating + # custom .idea _docs diff --git a/CMakeLists.txt b/CMakeLists.txt index 39c16a8..af94411 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,7 @@ add_subdirectory(whisper.cpp) pybind11_add_module(_pywhispercpp src/main.cpp + whisper.cpp/examples/grammar-parser.cpp ) target_link_libraries (_pywhispercpp PRIVATE whisper) diff --git a/pywhispercpp/constants.py b/pywhispercpp/constants.py index f56a3e9..b742455 100644 --- a/pywhispercpp/constants.py +++ b/pywhispercpp/constants.py @@ -95,6 +95,12 @@ 'type': bool, 'description': "do not use past transcription (if any) as initial prompt for the decoder", 'options': None, + 'default': True + }, + 
'no_timestamps': { + 'type': bool, + 'description': "do not generate timestamps", + 'options': None, 'default': False }, 'single_segment': { @@ -164,18 +170,42 @@ 'options': None, 'default': 0 }, + 'debug_mode': { + 'type': bool, + 'description': "enable debug mode in whisper.cpp", + 'options': None, + 'default': False + }, 'audio_ctx': { 'type': int, 'description': "overwrite the audio context size (0 = use default)", 'options': None, 'default': 0 }, + 'tdrz_enable': { + 'type': bool, + 'description': "enable tinydiarize speaker turn detection", + 'options': None, + 'default': False + }, 'initial_prompt': { 'type': str, 'description': "Initial prompt, these are prepended to any existing text context from a previous call", 'options': None, 'default': None }, + 'grammar': { + 'type': str, + 'description': "GBNF grammar text or a path to a grammar file", + 'options': None, + 'default': None + }, + 'grammar_rule': { + 'type': str, + 'description': "top-level GBNF grammar rule name", + 'options': None, + 'default': 'root' + }, 'prompt_tokens': { 'type': Tuple, 'description': "tokens to provide to the whisper decoder as initial prompt", @@ -188,11 +218,23 @@ 'options': None, 'default': 0 }, + 'carry_initial_prompt': { + 'type': bool, + 'description': "always prepend the initial prompt to each decode window", + 'options': None, + 'default': False + }, 'language': { 'type': str, 'description': 'for auto-detection, set to None, "" or "auto"', 'options': None, - 'default': "" + 'default': "en" + }, + 'detect_language': { + 'type': bool, + 'description': 'enable automatic language detection during transcription', + 'options': None, + 'default': False }, 'suppress_blank': { 'type': bool, @@ -206,6 +248,12 @@ 'options': None, 'default': False }, + 'suppress_nst': { + 'type': bool, + 'description': 'canonical whisper.cpp name for non-speech token suppression', + 'options': None, + 'default': False + }, 'temperature': { 'type': float, 'description': 'initial decoding 
temperature', @@ -248,23 +296,29 @@ 'options': None, 'default': 0.6 }, + 'grammar_penalty': { + 'type': float, + 'description': 'scales down logits of non-grammar tokens', + 'options': None, + 'default': 100.0 + }, 'greedy': { 'type': dict, 'description': 'greedy', 'options': None, - 'default': {"best_of": -1} + 'default': {"best_of": 5} }, 'beam_search': { 'type': dict, 'description': 'beam_search', 'options': None, - 'default': {"beam_size": -1, "patience": -1.0} + 'default': {"beam_size": 5, "patience": -1.0} }, 'extract_probability': { 'type': bool, 'description': 'calculate the geometric mean of token probabilities for each segment.', 'options': None, - 'default': True + 'default': False }, 'vad': { 'type': bool, diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py index 7f0f2a3..e73cb06 100644 --- a/pywhispercpp/model.py +++ b/pywhispercpp/model.py @@ -11,7 +11,7 @@ import sys from pathlib import Path from time import time -from typing import Union, Callable, List, TextIO, Tuple, Optional +from typing import Any, Union, Callable, List, TextIO, Tuple, Optional, Dict import _pywhispercpp as pw import numpy as np import pywhispercpp.utils as utils @@ -72,13 +72,14 @@ class Model: def __init__(self, model: str = 'tiny', - models_dir: str = None, + models_dir: Optional[str] = None, params_sampling_strategy: int = 0, redirect_whispercpp_logs_to: Union[bool, TextIO, str, None] = False, use_openvino: bool = False, - openvino_model_path: str = None, + openvino_model_path: Optional[str] = None, openvino_device: str = 'CPU', - openvino_cache_dir: str = None, + openvino_cache_dir: Optional[str] = None, + context_params: Union[Dict[str, Any], Any, None] = None, **params): """ :param model: The name of the model, one of the [AVAILABLE_MODELS](/pywhispercpp/#pywhispercpp.constants.AVAILABLE_MODELS), @@ -96,6 +97,7 @@ def __init__(self, """ self.model_path = utils.resolve_model_path(model, models_dir) self._ctx = None + self._context_params = 
self._resolve_context_params(context_params) self._sampling_strategy = pw.whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY if params_sampling_strategy == 0 else \ pw.whisper_sampling_strategy.WHISPER_SAMPLING_BEAM_SEARCH self._params = pw.whisper_full_default_params(self._sampling_strategy) @@ -112,8 +114,9 @@ def __init__(self, def transcribe(self, media: Union[str, np.ndarray], - n_processors: int = None, - new_segment_callback: Callable[[Segment], None] = None, + n_processors: Optional[int] = None, + new_segment_callback: Optional[Callable[[Segment], None]] = None, + abort_callback: Optional[Callable[[], bool]] = None, **params) -> List[Segment]: """ Transcribes the media provided as input and returns list of `Segment` objects. @@ -124,12 +127,13 @@ def transcribe(self, binding to whisper.cpp/whisper_full_parallel > Split the input audio in chunks and process each chunk separately using whisper_full() :param new_segment_callback: callback function that will be called when a new segment is generated + :param abort_callback: callback function returning True to abort an in-flight transcription early :param params: keyword arguments for different whisper.cpp parameters, see ::: constants.PARAMS_SCHEMA :param extract_probability: If True, calculates the geometric mean of token probabilities for each segment, providing a confidence score interpretable as a probability in [0, 1]. 
:return: List of transcription segments """ - if type(media) is np.ndarray: + if isinstance(media, np.ndarray): audio = media else: if not Path(media).exists(): @@ -147,6 +151,11 @@ def transcribe(self, Model._new_segment_callback = new_segment_callback pw.assign_new_segment_callback(self._params, Model.__call_new_segment_callback) + if abort_callback is None: + pw.clear_abort_callback(self._params) + else: + pw.assign_abort_callback(self._params, abort_callback) + # run inference start_time = time() logger.info("Transcribing ...") @@ -191,7 +200,7 @@ def _get_segments(ctx, start: int, end: int, extract_probability: bool = False) else: avg_prob = np.nan - res.append(Segment(t0, t1, text.strip(), probability=np.float32(avg_prob))) + res.append(Segment(t0, t1, text.strip(), probability=float(avg_prob))) return res def get_params(self) -> dict: @@ -246,7 +255,7 @@ def system_info() -> None: return pw.whisper_print_system_info() @staticmethod - def available_languages() -> list[str]: + def available_languages() -> List[str]: """ Returns a list of supported language codes @@ -258,6 +267,28 @@ def available_languages() -> list[str]: res.append(pw.whisper_lang_str(i)) return res + @staticmethod + def _resolve_context_params(context_params: Union[Dict[str, Any], Any, None]): + if context_params is None: + return None + + if isinstance(context_params, dict): + resolved = pw.whisper_context_default_params() + for key, value in context_params.items(): + setattr(resolved, key, value) + return resolved + + return context_params + + @staticmethod + def _normalize_params(kwargs: dict) -> dict: + normalized = dict(kwargs) + + if 'suppress_non_speech_tokens' in normalized and 'suppress_nst' not in normalized: + normalized['suppress_nst'] = normalized.pop('suppress_non_speech_tokens') + + return normalized + def _init_model(self) -> None: """ Private method to initialize the method from the bindings, it will be called automatically from the __init__ @@ -265,7 +296,10 @@ def 
_init_model(self) -> None: """ logger.info("Initializing the model ...") with utils.redirect_stderr(to=self.redirect_whispercpp_logs_to): - self._ctx = pw.whisper_init_from_file(self.model_path) + if self._context_params is None: + self._ctx = pw.whisper_init_from_file(self.model_path) + else: + self._ctx = pw.whisper_init_from_file_with_params(self.model_path, self._context_params) if self.use_openvino: pw.whisper_ctx_init_openvino_encoder(self._ctx, self.openvino_model_path, self.openvino_device, self.openvino_cache_dir) @@ -277,10 +311,25 @@ def _set_params(self, kwargs: dict) -> None: :param kwargs: dict like object for the different params :return: None """ - for param in kwargs: - setattr(self._params, param, kwargs[param]) + normalized = self._normalize_params(kwargs) + prompt_tokens = normalized.pop('prompt_tokens', None) if 'prompt_tokens' in normalized else None + grammar = normalized.pop('grammar', None) if 'grammar' in normalized else None + grammar_rule = normalized.pop('grammar_rule', 'root') if 'grammar_rule' in normalized else 'root' + grammar_penalty = normalized.get('grammar_penalty', self._params.grammar_penalty) + + for param, value in normalized.items(): + setattr(self._params, param, value) + + if 'prompt_tokens' in kwargs: + self._params.set_prompt_tokens(prompt_tokens) + + if 'grammar' in kwargs: + if grammar: + self._params.set_grammar(grammar, grammar_rule, grammar_penalty) + else: + self._params.clear_grammar() - def _transcribe(self, audio: np.ndarray, n_processors: int = None): + def _transcribe(self, audio: np.ndarray, n_processors: Optional[int] = None): """ Private method to call the whisper.cpp/whisper_full function @@ -310,10 +359,11 @@ def __call_new_segment_callback(ctx, n_new, user_data) -> None: start = n - n_new res = Model._get_segments(ctx, start, n, False) for segment in res: - Model._new_segment_callback(segment) + if Model._new_segment_callback is not None: + Model._new_segment_callback(segment) @staticmethod - def 
_load_audio(media_file_path: str) -> np.array: + def _load_audio(media_file_path: str) -> np.ndarray: """ Helper method to return a `np.array` object from a media file If the media file is not a WAV file, it will try to convert it using ffmpeg @@ -369,7 +419,7 @@ def wav_to_np(file_path): finally: os.remove(temp_file_path) - def auto_detect_language(self, media: Union[str, np.ndarray], offset_ms: int = 0, n_threads: int = 4) -> Tuple[Tuple[str, np.float32], dict[str, np.float32]]: + def auto_detect_language(self, media: Union[str, np.ndarray], offset_ms: int = 0, n_threads: int = 4) -> Tuple[Tuple[str, np.float32], Dict[str, np.float32]]: """ Automatic language detection using whisper.cpp/whisper_pcm_to_mel and whisper.cpp/whisper_lang_auto_detect @@ -378,7 +428,7 @@ def auto_detect_language(self, media: Union[str, np.ndarray], offset_ms: int = :param n_threads: number of threads to use :return: ((detected_language, probability), probabilities for all languages) """ - if type(media) is np.ndarray: + if isinstance(media, np.ndarray): audio = media else: if not Path(media).exists(): @@ -391,11 +441,12 @@ def auto_detect_language(self, media: Union[str, np.ndarray], offset_ms: int = auto_detect = pw.whisper_lang_auto_detect(self._ctx, offset_ms, n_threads, probs) langs = self.available_languages() lang_probs = {langs[i]: probs[i] for i in range(lang_count)} - return (langs[auto_detect], probs[auto_detect]), lang_probs + return (langs[auto_detect], np.float32(probs[auto_detect])), lang_probs def __del__(self): """ Free up resources :return: None """ - pw.whisper_free(self._ctx) \ No newline at end of file + if self._ctx is not None: + pw.whisper_free(self._ctx) \ No newline at end of file diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi index 060372d..27e3d52 100644 --- a/pywhispercpp/model.pyi +++ b/pywhispercpp/model.pyi @@ -9,6 +9,7 @@ import numpy.typing as npt AudioArray = npt.NDArray[np.float32] AudioInput = Union[str, AudioArray] +ContextParams = 
Union[Dict[str, Any], Any] class GreedyParams(TypedDict): @@ -32,7 +33,12 @@ class Segment: class Model: + """ + docuemnts strings + """ + _new_segment_callback: Optional[Callable[[Segment], None]] + def __init__( self, @@ -44,13 +50,15 @@ class Model: openvino_model_path: Optional[str] = None, openvino_device: str = 'CPU', openvino_cache_dir: Optional[str] = None, + context_params: Optional[ContextParams] = None, *, n_threads: Optional[int] = None, n_max_text_ctx: int = 16384, offset_ms: int = 0, duration_ms: int = 0, translate: bool = False, - no_context: bool = False, + no_context: bool = True, + no_timestamps: bool = False, single_segment: bool = False, print_special: bool = False, print_progress: bool = True, @@ -62,13 +70,20 @@ class Model: max_len: int = 0, split_on_word: bool = False, max_tokens: int = 0, + debug_mode: bool = False, audio_ctx: int = 0, + tdrz_enable: bool = False, initial_prompt: Optional[str] = None, + grammar: Optional[str] = None, + grammar_rule: str = 'root', prompt_tokens: Optional[Tuple[Any, ...]] = None, prompt_n_tokens: int = 0, - language: str = '', + carry_initial_prompt: bool = False, + language: str = 'en', + detect_language: bool = False, suppress_blank: bool = True, suppress_non_speech_tokens: bool = False, + suppress_nst: bool = False, temperature: float = 0.0, max_initial_ts: float = 1.0, length_penalty: float = -1.0, @@ -76,8 +91,9 @@ class Model: entropy_thold: float = 2.4, logprob_thold: float = -1.0, no_speech_thold: float = 0.6, - greedy: GreedyParams = {'best_of': -1}, - beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0}, + grammar_penalty: float = 100.0, + greedy: GreedyParams = {'best_of': 5}, + beam_search: BeamSearchParams = {'beam_size': 5, 'patience': -1.0}, vad: bool = False, vad_model_path: Optional[str] = None, )->None: ... 
@@ -87,13 +103,15 @@ class Model: media: AudioInput, n_processors: Optional[int] = None, new_segment_callback: Optional[Callable[[Segment], None]] = None, + abort_callback: Optional[Callable[[], bool]] = None, *, n_threads: Optional[int] = None, n_max_text_ctx: int = 16384, offset_ms: int = 0, duration_ms: int = 0, translate: bool = False, - no_context: bool = False, + no_context: bool = True, + no_timestamps: bool = False, single_segment: bool = False, print_special: bool = False, print_progress: bool = True, @@ -105,13 +123,20 @@ class Model: max_len: int = 0, split_on_word: bool = False, max_tokens: int = 0, + debug_mode: bool = False, audio_ctx: int = 0, + tdrz_enable: bool = False, initial_prompt: Optional[str] = None, + grammar: Optional[str] = None, + grammar_rule: str = 'root', prompt_tokens: Optional[Tuple[Any, ...]] = None, prompt_n_tokens: int = 0, - language: str = '', + carry_initial_prompt: bool = False, + language: str = 'en', + detect_language: bool = False, suppress_blank: bool = True, suppress_non_speech_tokens: bool = False, + suppress_nst: bool = False, temperature: float = 0.0, max_initial_ts: float = 1.0, length_penalty: float = -1.0, @@ -119,8 +144,9 @@ class Model: entropy_thold: float = 2.4, logprob_thold: float = -1.0, no_speech_thold: float = 0.6, - greedy: GreedyParams = {'best_of': -1}, - beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0}, + grammar_penalty: float = 100.0, + greedy: GreedyParams = {'best_of': 5}, + beam_search: BeamSearchParams = {'beam_size': 5, 'patience': -1.0}, extract_probability: bool = False, vad: bool = False, vad_model_path: Optional[str] = None, diff --git a/src/main.cpp b/src/main.cpp index 48341bb..23150e5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,6 +16,9 @@ #include #include "whisper.h" +#include "../whisper.cpp/examples/grammar-parser.h" + +#include #define STRINGIFY(x) #x @@ -32,6 +35,7 @@ using namespace pybind11::literals; // to bring in the `_a` literal py::function 
py_new_segment_callback; py::function py_encoder_begin_callback; py::function py_logits_filter_callback; +py::object py_log_callback; // whisper context wrapper, to solve the incomplete type issue @@ -56,6 +60,34 @@ struct whisper_model_loader_wrapper { }; +struct whisper_context_wrapper whisper_init_from_file_with_params_wrapper( + const char * path_model, + struct whisper_context_params cparams){ + struct whisper_context * ctx = whisper_init_from_file_with_params(path_model, cparams); + struct whisper_context_wrapper ctw_w; + ctw_w.ptr = ctx; + return ctw_w; +} + +struct whisper_context_wrapper whisper_init_from_buffer_with_params_wrapper( + void * buffer, + size_t buffer_size, + struct whisper_context_params cparams){ + struct whisper_context * ctx = whisper_init_from_buffer_with_params(buffer, buffer_size, cparams); + struct whisper_context_wrapper ctw_w; + ctw_w.ptr = ctx; + return ctw_w; +} + +struct whisper_context_wrapper whisper_init_with_params_wrapper( + struct whisper_model_loader_wrapper * loader, + struct whisper_context_params cparams){ + struct whisper_context * ctx = whisper_init_with_params(loader->ptr, cparams); + struct whisper_context_wrapper ctw_w; + ctw_w.ptr = ctx; + return ctw_w; +}; + struct whisper_context_wrapper whisper_init_from_file_wrapper(const char * path_model){ struct whisper_context_params cparams = whisper_context_default_params(); struct whisper_context * ctx = whisper_init_from_file_with_params(path_model, cparams); @@ -291,6 +323,60 @@ float whisper_full_get_token_p_wrapper(struct whisper_context_wrapper * ctx, int return whisper_full_get_token_p(ctx->ptr, i_segment, i_token); } +bool whisper_full_get_segment_speaker_turn_next_wrapper(struct whisper_context_wrapper * ctx, int i_segment){ + return whisper_full_get_segment_speaker_turn_next(ctx->ptr, i_segment); +} + +const char * whisper_model_type_readable_wrapper(struct whisper_context_wrapper * ctx_w){ + return whisper_model_type_readable(ctx_w->ptr); +} + +int 
whisper_model_n_vocab_wrapper(struct whisper_context_wrapper * ctx_w){ + return whisper_model_n_vocab(ctx_w->ptr); +} + +int whisper_model_n_audio_ctx_wrapper(struct whisper_context_wrapper * ctx_w){ + return whisper_model_n_audio_ctx(ctx_w->ptr); +} + +int whisper_model_n_audio_state_wrapper(struct whisper_context_wrapper * ctx_w){ + return whisper_model_n_audio_state(ctx_w->ptr); +} + +int whisper_model_n_audio_head_wrapper(struct whisper_context_wrapper * ctx_w){ + return whisper_model_n_audio_head(ctx_w->ptr); +} + +int whisper_model_n_audio_layer_wrapper(struct whisper_context_wrapper * ctx_w){ + return whisper_model_n_audio_layer(ctx_w->ptr); +} + +int whisper_model_n_text_ctx_wrapper(struct whisper_context_wrapper * ctx_w){ + return whisper_model_n_text_ctx(ctx_w->ptr); +} + +int whisper_model_n_text_state_wrapper(struct whisper_context_wrapper * ctx_w){ + return whisper_model_n_text_state(ctx_w->ptr); +} + +int whisper_model_n_text_head_wrapper(struct whisper_context_wrapper * ctx_w){ + return whisper_model_n_text_head(ctx_w->ptr); +} + +int whisper_model_n_text_layer_wrapper(struct whisper_context_wrapper * ctx_w){ + return whisper_model_n_text_layer(ctx_w->ptr); +} + +int whisper_model_n_mels_wrapper(struct whisper_context_wrapper * ctx_w){ + return whisper_model_n_mels(ctx_w->ptr); +} + +int whisper_model_ftype_wrapper(struct whisper_context_wrapper * ctx_w){ + return whisper_model_ftype(ctx_w->ptr); +} + +bool _abort_callback(void * user_data); + int whisper_ctx_init_openvino_encoder_wrapper(struct whisper_context_wrapper * ctx, const char * model_path, const char * device, const char * cache_dir){ @@ -301,8 +387,13 @@ struct WhisperFullParamsWrapper : public whisper_full_params { std::string initial_prompt_str; std::string suppress_regex_str; std::string vad_model_path_str; + std::string grammar_rule_str; + grammar_parser::parse_state grammar_parsed; + std::vector grammar_rules_storage; + std::vector prompt_tokens_storage; public: py::function 
py_progress_callback; + py::object py_abort_callback; WhisperFullParamsWrapper(const whisper_full_params& params = whisper_full_params()) : whisper_full_params(params), initial_prompt_str(params.initial_prompt ? params.initial_prompt : ""), @@ -312,6 +403,7 @@ struct WhisperFullParamsWrapper : public whisper_full_params { initial_prompt = initial_prompt_str.empty() ? nullptr : initial_prompt_str.c_str(); suppress_regex = suppress_regex_str.empty() ? nullptr : suppress_regex_str.c_str(); vad_model_path = vad_model_path_str.empty() ? nullptr : vad_model_path_str.c_str(); + abort_callback_user_data = this; // progress callback progress_callback_user_data = this; progress_callback = [](struct whisper_context* ctx, struct whisper_state* state, int progress, void* user_data) { @@ -333,11 +425,24 @@ struct WhisperFullParamsWrapper : public whisper_full_params { initial_prompt_str(other.initial_prompt_str), suppress_regex_str(other.suppress_regex_str), vad_model_path_str(other.vad_model_path_str), - py_progress_callback(other.py_progress_callback) { + grammar_rule_str(other.grammar_rule_str), + grammar_parsed(other.grammar_parsed), + grammar_rules_storage(other.grammar_rules_storage), + prompt_tokens_storage(other.prompt_tokens_storage), + py_progress_callback(other.py_progress_callback), + py_abort_callback(other.py_abort_callback) { // Reset pointers to new string copies initial_prompt = initial_prompt_str.empty() ? nullptr : initial_prompt_str.c_str(); suppress_regex = suppress_regex_str.empty() ? nullptr : suppress_regex_str.c_str(); vad_model_path = vad_model_path_str.empty() ? nullptr : vad_model_path_str.c_str(); + grammar_rules = grammar_rules_storage.empty() ? 
nullptr : grammar_rules_storage.data(); + n_grammar_rules = grammar_rules_storage.size(); + if (!grammar_rule_str.empty() && grammar_parsed.symbol_ids.find(grammar_rule_str) != grammar_parsed.symbol_ids.end()) { + i_start_rule = grammar_parsed.symbol_ids.at(grammar_rule_str); + } + prompt_tokens = prompt_tokens_storage.empty() ? nullptr : prompt_tokens_storage.data(); + prompt_n_tokens = prompt_tokens_storage.size(); + abort_callback_user_data = this; progress_callback_user_data = this; progress_callback = [](struct whisper_context* ctx, struct whisper_state* state, int progress, void* user_data) { auto* self = static_cast(user_data); @@ -365,6 +470,89 @@ struct WhisperFullParamsWrapper : public whisper_full_params { vad_model_path_str = model_path; vad_model_path = vad_model_path_str.c_str(); } + void set_abort_callback(py::function callback) { + py_abort_callback = callback; + abort_callback_user_data = this; + abort_callback = _abort_callback; + } + void clear_abort_callback() { + py_abort_callback = py::none(); + abort_callback = nullptr; + abort_callback_user_data = this; + } + void clear_grammar() { + grammar_rule_str.clear(); + grammar_parsed = grammar_parser::parse_state(); + grammar_rules_storage.clear(); + grammar_rules = nullptr; + n_grammar_rules = 0; + i_start_rule = 0; + } + void set_grammar(const std::string& grammar_input, const std::string& rule_name = "", float penalty = -1.0f) { + clear_grammar(); + + if (grammar_input.empty()) { + if (penalty >= 0.0f) { + grammar_penalty = penalty; + } + return; + } + + std::string grammar_source = grammar_input; + std::ifstream grammar_file(grammar_input); + if (grammar_file.is_open()) { + grammar_source.assign((std::istreambuf_iterator(grammar_file)), + std::istreambuf_iterator()); + } + + grammar_parsed = grammar_parser::parse(grammar_source.c_str()); + if (grammar_parsed.rules.empty()) { + throw py::value_error("Failed to parse grammar input"); + } + + grammar_rule_str = rule_name.empty() ? 
"root" : rule_name; + if (grammar_parsed.symbol_ids.find(grammar_rule_str) == grammar_parsed.symbol_ids.end()) { + throw py::value_error("Grammar rule '" + grammar_rule_str + "' not found"); + } + + grammar_rules_storage = grammar_parsed.c_rules(); + grammar_rules = grammar_rules_storage.data(); + n_grammar_rules = grammar_rules_storage.size(); + i_start_rule = grammar_parsed.symbol_ids.at(grammar_rule_str); + + if (penalty >= 0.0f) { + grammar_penalty = penalty; + } + } + void set_prompt_tokens(const py::object& tokens_obj) { + prompt_tokens_storage.clear(); + + if (tokens_obj.is_none()) { + prompt_tokens = nullptr; + prompt_n_tokens = 0; + return; + } + + py::sequence tokens = tokens_obj.cast(); + prompt_tokens_storage.reserve(tokens.size()); + for (const auto & token : tokens) { + prompt_tokens_storage.push_back(token.cast()); + } + + prompt_tokens = prompt_tokens_storage.empty() ? nullptr : prompt_tokens_storage.data(); + prompt_n_tokens = prompt_tokens_storage.size(); + } + py::tuple get_prompt_tokens() const { + const whisper_token * tokens_ptr = prompt_tokens_storage.empty() ? prompt_tokens : prompt_tokens_storage.data(); + const size_t token_count = prompt_tokens_storage.empty() ? 
static_cast(std::max(prompt_n_tokens, 0)) : prompt_tokens_storage.size(); + + py::tuple tokens(token_count); + for (size_t i = 0; i < token_count; ++i) { + tokens[i] = py::int_(tokens_ptr[i]); + } + + return tokens; + } }; WhisperFullParamsWrapper whisper_full_default_params_wrapper(enum whisper_sampling_strategy strategy) { return WhisperFullParamsWrapper(whisper_full_default_params(strategy)); @@ -417,6 +605,57 @@ void assign_logits_filter_callback(struct whisper_full_params *params, py::funct py_logits_filter_callback = f; } +bool _abort_callback(void * user_data) { + auto * params = static_cast(user_data); + if (!params || !params->py_abort_callback || params->py_abort_callback.is_none()) { + return false; + } + + py::gil_scoped_acquire gil; + py::function callback = params->py_abort_callback.cast(); + py::object result_py = callback(); + return result_py.cast(); +} + +void assign_abort_callback(whisper_full_params *params_base, py::object callback){ + auto * params = static_cast(params_base); + if (callback.is_none()) { + params->py_abort_callback = py::none(); + params->abort_callback = nullptr; + params->abort_callback_user_data = params; + return; + } + + params->py_abort_callback = callback.cast(); + params->abort_callback_user_data = params; + params->abort_callback = _abort_callback; +} + +void clear_abort_callback(whisper_full_params *params_base) { + auto * params = static_cast(params_base); + params->py_abort_callback = py::none(); + params->abort_callback = nullptr; + params->abort_callback_user_data = params; +} + +void whisper_log_set_wrapper(py::object callback) { + if (callback.is_none()) { + py_log_callback = py::none(); + whisper_log_set(nullptr, nullptr); + return; + } + + py_log_callback = callback.cast(); + whisper_log_set( + [](enum ggml_log_level level, const char * text, void * user_data) { + (void) user_data; + py::gil_scoped_acquire gil; + py::function log_callback = py_log_callback.cast(); + log_callback(py::int_(static_cast(level)), 
py::str(text ? text : "")); + }, + nullptr); +} + py::dict get_greedy(whisper_full_params * params){ py::dict d("best_of"_a=params->greedy.best_of); return d; @@ -532,7 +771,34 @@ PYBIND11_MODULE(_pywhispercpp, m) { m.attr("WHISPER_HOP_LENGTH") = WHISPER_HOP_LENGTH; m.attr("WHISPER_CHUNK_SIZE") = WHISPER_CHUNK_SIZE; + py::enum_(m, "whisper_alignment_heads_preset") + .value("WHISPER_AHEADS_NONE", whisper_alignment_heads_preset::WHISPER_AHEADS_NONE) + .value("WHISPER_AHEADS_N_TOP_MOST", whisper_alignment_heads_preset::WHISPER_AHEADS_N_TOP_MOST) + .value("WHISPER_AHEADS_CUSTOM", whisper_alignment_heads_preset::WHISPER_AHEADS_CUSTOM) + .value("WHISPER_AHEADS_TINY_EN", whisper_alignment_heads_preset::WHISPER_AHEADS_TINY_EN) + .value("WHISPER_AHEADS_TINY", whisper_alignment_heads_preset::WHISPER_AHEADS_TINY) + .value("WHISPER_AHEADS_BASE_EN", whisper_alignment_heads_preset::WHISPER_AHEADS_BASE_EN) + .value("WHISPER_AHEADS_BASE", whisper_alignment_heads_preset::WHISPER_AHEADS_BASE) + .value("WHISPER_AHEADS_SMALL_EN", whisper_alignment_heads_preset::WHISPER_AHEADS_SMALL_EN) + .value("WHISPER_AHEADS_SMALL", whisper_alignment_heads_preset::WHISPER_AHEADS_SMALL) + .value("WHISPER_AHEADS_MEDIUM_EN", whisper_alignment_heads_preset::WHISPER_AHEADS_MEDIUM_EN) + .value("WHISPER_AHEADS_MEDIUM", whisper_alignment_heads_preset::WHISPER_AHEADS_MEDIUM) + .value("WHISPER_AHEADS_LARGE_V1", whisper_alignment_heads_preset::WHISPER_AHEADS_LARGE_V1) + .value("WHISPER_AHEADS_LARGE_V2", whisper_alignment_heads_preset::WHISPER_AHEADS_LARGE_V2) + .value("WHISPER_AHEADS_LARGE_V3", whisper_alignment_heads_preset::WHISPER_AHEADS_LARGE_V3) + .value("WHISPER_AHEADS_LARGE_V3_TURBO", whisper_alignment_heads_preset::WHISPER_AHEADS_LARGE_V3_TURBO) + .export_values(); + py::class_(m, "whisper_context"); + py::class_(m, "whisper_context_params") + .def(py::init<>()) + .def_readwrite("use_gpu", &whisper_context_params::use_gpu) + .def_readwrite("flash_attn", &whisper_context_params::flash_attn) + 
.def_readwrite("gpu_device", &whisper_context_params::gpu_device) + .def_readwrite("dtw_token_timestamps", &whisper_context_params::dtw_token_timestamps) + .def_readwrite("dtw_aheads_preset", &whisper_context_params::dtw_aheads_preset) + .def_readwrite("dtw_n_top", &whisper_context_params::dtw_n_top) + .def_readwrite("dtw_mem_size", &whisper_context_params::dtw_mem_size); py::class_(m, "whisper_token") .def(py::init<>()); py::class_(m,"whisper_token_data") @@ -545,20 +811,33 @@ PYBIND11_MODULE(_pywhispercpp, m) { .def_readwrite("ptsum", &whisper_token_data::ptsum) .def_readwrite("t0", &whisper_token_data::t0) .def_readwrite("t1", &whisper_token_data::t1) + .def_readwrite("t_dtw", &whisper_token_data::t_dtw) .def_readwrite("vlen", &whisper_token_data::vlen); py::class_(m,"whisper_model_loader") .def(py::init<>()); + m.def("whisper_context_default_params", &whisper_context_default_params, + "Return the default context parameters used during model initialization."); + DEF_RELEASE_GIL("whisper_init_from_file", &whisper_init_from_file_wrapper, "Various functions for loading a ggml whisper model.\n" "Allocate (almost) all memory needed for the model.\n" "Return NULL on failure"); + DEF_RELEASE_GIL("whisper_init_from_file_with_params", &whisper_init_from_file_with_params_wrapper, "Various functions for loading a ggml whisper model.\n" + "Allocate (almost) all memory needed for the model.\n" + "Return NULL on failure"); DEF_RELEASE_GIL("whisper_init_from_buffer", &whisper_init_from_buffer_wrapper, "Various functions for loading a ggml whisper model.\n" "Allocate (almost) all memory needed for the model.\n" "Return NULL on failure"); + DEF_RELEASE_GIL("whisper_init_from_buffer_with_params", &whisper_init_from_buffer_with_params_wrapper, "Various functions for loading a ggml whisper model.\n" + "Allocate (almost) all memory needed for the model.\n" + "Return NULL on failure"); DEF_RELEASE_GIL("whisper_init", &whisper_init_wrapper, "Various functions for loading a ggml 
whisper model.\n" "Allocate (almost) all memory needed for the model.\n" "Return NULL on failure"); + DEF_RELEASE_GIL("whisper_init_with_params", &whisper_init_with_params_wrapper, "Various functions for loading a ggml whisper model.\n" + "Allocate (almost) all memory needed for the model.\n" + "Return NULL on failure"); m.def("whisper_free", &whisper_free_wrapper, "Frees all memory allocated by the model."); @@ -712,6 +991,7 @@ PYBIND11_MODULE(_pywhispercpp, m) { .def_readwrite("duration_ms", &WhisperFullParamsWrapper::duration_ms) .def_readwrite("translate", &WhisperFullParamsWrapper::translate) .def_readwrite("no_context", &WhisperFullParamsWrapper::no_context) + .def_readwrite("no_timestamps", &WhisperFullParamsWrapper::no_timestamps) .def_readwrite("single_segment", &WhisperFullParamsWrapper::single_segment) .def_readwrite("print_special", &WhisperFullParamsWrapper::print_special) .def_readwrite("print_progress", &WhisperFullParamsWrapper::print_progress) @@ -724,7 +1004,9 @@ PYBIND11_MODULE(_pywhispercpp, m) { .def_readwrite("max_len", &WhisperFullParamsWrapper::max_len) .def_readwrite("split_on_word", &WhisperFullParamsWrapper::split_on_word) .def_readwrite("max_tokens", &WhisperFullParamsWrapper::max_tokens) + .def_readwrite("debug_mode", &WhisperFullParamsWrapper::debug_mode) .def_readwrite("audio_ctx", &WhisperFullParamsWrapper::audio_ctx) + .def_readwrite("tdrz_enable", &WhisperFullParamsWrapper::tdrz_enable) .def_property("suppress_regex", [](WhisperFullParamsWrapper &self) { return py::str(self.suppress_regex ? 
self.suppress_regex : ""); @@ -740,8 +1022,29 @@ PYBIND11_MODULE(_pywhispercpp, m) { self.set_initial_prompt(initial_prompt); } ) - .def_readwrite("prompt_tokens", &WhisperFullParamsWrapper::prompt_tokens) + .def_property("prompt_tokens", + [](WhisperFullParamsWrapper &self) { + return self.get_prompt_tokens(); + }, + [](WhisperFullParamsWrapper &self, const py::object &tokens) { + self.set_prompt_tokens(tokens); + }) + .def("set_abort_callback", + [](WhisperFullParamsWrapper &self, py::object callback) { + if (callback.is_none()) { + self.clear_abort_callback(); + } else { + self.set_abort_callback(callback.cast()); + } + }, + py::arg("callback") = py::none(), + "Assign an abort callback that returns True to stop processing.") + .def("clear_abort_callback", &WhisperFullParamsWrapper::clear_abort_callback, + "Clear any previously assigned abort callback.") + .def("set_prompt_tokens", &WhisperFullParamsWrapper::set_prompt_tokens, py::arg("tokens"), + "Copy prompt tokens into C++-owned storage and update the raw pointers safely.") .def_readwrite("prompt_n_tokens", &WhisperFullParamsWrapper::prompt_n_tokens) + .def_readwrite("carry_initial_prompt", &WhisperFullParamsWrapper::carry_initial_prompt) .def_property("language", [](WhisperFullParamsWrapper &self) { return py::str(self.language); @@ -754,7 +1057,9 @@ PYBIND11_MODULE(_pywhispercpp, m) { self.language = ""; //defaults to auto-detect } }) + .def_readwrite("detect_language", &WhisperFullParamsWrapper::detect_language) .def_readwrite("suppress_blank", &WhisperFullParamsWrapper::suppress_blank) + .def_readwrite("suppress_nst", &WhisperFullParamsWrapper::suppress_nst) .def_readwrite("temperature", &WhisperFullParamsWrapper::temperature) .def_readwrite("max_initial_ts", &WhisperFullParamsWrapper::max_initial_ts) .def_readwrite("length_penalty", &WhisperFullParamsWrapper::length_penalty) @@ -767,9 +1072,15 @@ PYBIND11_MODULE(_pywhispercpp, m) { [](WhisperFullParamsWrapper &self, py::dict dict) {self.greedy.best_of = 
dict["best_of"].cast();}) .def_property("beam_search", [](WhisperFullParamsWrapper &self) {return py::dict("beam_size"_a=self.beam_search.beam_size, "patience"_a=self.beam_search.patience);}, [](WhisperFullParamsWrapper &self, py::dict dict) {self.beam_search.beam_size = dict["beam_size"].cast(); self.beam_search.patience = dict["patience"].cast();}) + .def("set_grammar", &WhisperFullParamsWrapper::set_grammar, + py::arg("grammar"), py::arg("rule_name") = "", py::arg("penalty") = -1.0f, + "Parse GBNF grammar text or a grammar file path and store the resulting grammar in C++-owned memory.") + .def("clear_grammar", &WhisperFullParamsWrapper::clear_grammar, + "Clear any previously configured grammar from the parameter object.") .def_readwrite("new_segment_callback_user_data", &WhisperFullParamsWrapper::new_segment_callback_user_data) .def_readwrite("encoder_begin_callback_user_data", &WhisperFullParamsWrapper::encoder_begin_callback_user_data) .def_readwrite("logits_filter_callback_user_data", &WhisperFullParamsWrapper::logits_filter_callback_user_data) + .def_readwrite("grammar_penalty", &WhisperFullParamsWrapper::grammar_penalty) .def_readwrite("vad", &WhisperFullParamsWrapper::vad) .def_property("vad_model_path", [](WhisperFullParamsWrapper &self) { @@ -799,6 +1110,8 @@ PYBIND11_MODULE(_pywhispercpp, m) { m.def("whisper_full_lang_id", &whisper_full_lang_id_wrapper, "Language id associated with the current context"); m.def("whisper_full_get_segment_t0", &whisper_full_get_segment_t0_wrapper, "Get the start time of the specified segment"); m.def("whisper_full_get_segment_t1", &whisper_full_get_segment_t1_wrapper, "Get the end time of the specified segment"); + m.def("whisper_full_get_segment_speaker_turn_next", &whisper_full_get_segment_speaker_turn_next_wrapper, + "Get whether the next segment is predicted as a speaker turn."); m.def("whisper_full_get_segment_text", &whisper_full_get_segment_text_wrapper, "Get the text of the specified segment"); 
m.def("whisper_full_n_tokens", &whisper_full_n_tokens_wrapper, "Get number of tokens in the specified segment."); @@ -812,6 +1125,18 @@ PYBIND11_MODULE(_pywhispercpp, m) { m.def("whisper_full_get_token_p", &whisper_full_get_token_p_wrapper, "Get the probability of the specified token in the specified segment."); m.def("whisper_ctx_init_openvino_encoder", &whisper_ctx_init_openvino_encoder_wrapper, "Given a context, enable use of OpenVINO for encode inference."); + m.def("whisper_model_type_readable", &whisper_model_type_readable_wrapper, "Return the readable model type string."); + m.def("whisper_model_n_vocab", &whisper_model_n_vocab_wrapper, "Return the model vocabulary size."); + m.def("whisper_model_n_audio_ctx", &whisper_model_n_audio_ctx_wrapper, "Return the audio context size baked into the model."); + m.def("whisper_model_n_audio_state", &whisper_model_n_audio_state_wrapper, "Return the number of audio state units in the model."); + m.def("whisper_model_n_audio_head", &whisper_model_n_audio_head_wrapper, "Return the number of audio attention heads in the model."); + m.def("whisper_model_n_audio_layer", &whisper_model_n_audio_layer_wrapper, "Return the number of audio layers in the model."); + m.def("whisper_model_n_text_ctx", &whisper_model_n_text_ctx_wrapper, "Return the text context size baked into the model."); + m.def("whisper_model_n_text_state", &whisper_model_n_text_state_wrapper, "Return the number of text state units in the model."); + m.def("whisper_model_n_text_head", &whisper_model_n_text_head_wrapper, "Return the number of text attention heads in the model."); + m.def("whisper_model_n_text_layer", &whisper_model_n_text_layer_wrapper, "Return the number of text layers in the model."); + m.def("whisper_model_n_mels", &whisper_model_n_mels_wrapper, "Return the number of mel bins used by the model."); + m.def("whisper_model_ftype", &whisper_model_ftype_wrapper, "Return the model file type identifier."); 
//////////////////////////////////////////////////////////////////////////// @@ -832,6 +1157,23 @@ PYBIND11_MODULE(_pywhispercpp, m) { m.def("assign_logits_filter_callback", &assign_logits_filter_callback, "Assigns a logits_filter_callback, takes instance and a callable function with the same parameters which are defined in the interface", py::arg("params"), py::arg("callback")); + m.def("assign_abort_callback", + [](whisper_full_params * params, py::object callback) { + assign_abort_callback(params, callback); + }, + "Assign an abort callback that returns True to stop processing.", + py::arg("params"), py::arg("callback") = py::none()); + + m.def("clear_abort_callback", &clear_abort_callback, "Clear any previously assigned abort callback.", + py::arg("params")); + + m.def("whisper_log_set", + [](py::object callback) { + whisper_log_set_wrapper(callback); + }, + "Assign a Python log callback or None to restore the default logger.", + py::arg("callback") = py::none()); + // VAD py::class_(m,"whisper_vad_params") .def(py::init<>()) diff --git a/tests/test_backwards_compatibility.py b/tests/test_backwards_compatibility.py new file mode 100644 index 0000000..4e21cdc --- /dev/null +++ b/tests/test_backwards_compatibility.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import gc +import subprocess +import sys +import textwrap +import unittest +from pathlib import Path +from unittest import TestCase + +import _pywhispercpp as pw + +from pywhispercpp.model import Model, Segment + + +WHISPER_CPP_DIR = Path(__file__).parent.parent / 'whisper.cpp' + + +class TestBackwardsCompatibility(TestCase): + audio_file = WHISPER_CPP_DIR / 'samples/jfk.wav' + models_dir = str(WHISPER_CPP_DIR / 'models') + repo_root = Path(__file__).parent.parent + + def tearDown(self): + gc.collect() + + def _create_cpu_model(self): + return Model( + 'tiny', + models_dir=self.models_dir, + context_params={'use_gpu': False, 'flash_attn': False}, + ) + + def _run_python(self, code: 
str): + result = subprocess.run( + [sys.executable, '-c', textwrap.dedent(code)], + cwd=self.repo_root, + capture_output=True, + text=True, + ) + self.assertEqual( + result.returncode, + 0, + msg=f"stdout:\n{result.stdout}\n\nstderr:\n{result.stderr}", + ) + + def test_legacy_model_constructor_still_works(self): + self._run_python( + f''' + from pywhispercpp.model import Model + + model = Model('tiny', models_dir={self.models_dir!r}) + assert isinstance(model, Model) + ''' + ) + + def test_legacy_alias_still_maps_to_suppress_nst(self): + self._run_python( + f''' + from pywhispercpp.model import Model + + model = Model( + 'tiny', + models_dir={self.models_dir!r}, + context_params={{'use_gpu': False, 'flash_attn': False}}, + ) + model._set_params({{'suppress_non_speech_tokens': True}}) + assert model.get_params()['suppress_nst'] is True + ''' + ) + + def test_low_level_prompt_tokens_property_round_trips(self): + params = pw.whisper_full_default_params( + pw.whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY + ) + params.prompt_tokens = (1, 2, 3) + self.assertEqual(tuple(params.prompt_tokens), (1, 2, 3)) + self.assertEqual(params.prompt_n_tokens, 3) + + def test_context_params_dict_is_additive(self): + self._run_python( + f''' + from pywhispercpp.model import Model + + model = Model( + 'tiny', + models_dir={self.models_dir!r}, + context_params={{'use_gpu': False, 'flash_attn': False}}, + ) + assert isinstance(model, Model) + ''' + ) + + def test_existing_new_segment_callback_still_works(self): + self._run_python( + f''' + from pywhispercpp.model import Model, Segment + + seen = [] + model = Model( + 'tiny', + models_dir={self.models_dir!r}, + context_params={{'use_gpu': False, 'flash_attn': False}}, + ) + + def on_segment(segment): + seen.append(segment) + + segments = model.transcribe({str(self.audio_file)!r}, new_segment_callback=on_segment) + assert isinstance(segments, list) + assert len(seen) > 0 + assert all(isinstance(segment, Segment) for segment in seen) + ''' 
+ ) + + def test_abort_callback_can_abort_and_then_clear(self): + self._run_python( + f''' + from pywhispercpp.model import Model + + model = Model( + 'tiny', + models_dir={self.models_dir!r}, + context_params={{'use_gpu': False, 'flash_attn': False}}, + ) + callback_calls = [] + + def abort_immediately(): + callback_calls.append(True) + return True + + aborted_segments = model.transcribe({str(self.audio_file)!r}, abort_callback=abort_immediately) + assert isinstance(aborted_segments, list) + assert len(callback_calls) > 0 + + normal_segments = model.transcribe({str(self.audio_file)!r}) + assert isinstance(normal_segments, list) + assert len(normal_segments) > 0 + ''' + ) + + def test_log_callback_can_be_set_and_cleared(self): + pw.whisper_log_set(lambda level, text: None) + pw.whisper_log_set(None) + + def test_alignment_preset_enum_is_available(self): + preset = pw.whisper_alignment_heads_preset.WHISPER_AHEADS_TINY + self.assertIsNotNone(preset) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/test_model.py b/tests/test_model.py index f38200f..9ee8f65 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -8,6 +8,7 @@ from pathlib import Path from unittest import TestCase +import _pywhispercpp as pw from pywhispercpp.model import Model, Segment if __name__ == '__main__': @@ -44,6 +45,62 @@ def test_auto_detect_language(self): detected_language, probs = self.model.auto_detect_language(str(self.audio_file)) return self.assertIsInstance(detected_language, tuple) and self.assertEqual(detected_language[0], 'en') + def test_context_params_dict_init(self): + model = Model( + "tiny", + models_dir=str(WHISPER_CPP_DIR/'models'), + context_params={"use_gpu": False, "flash_attn": False}, + ) + self.assertIsInstance(model, Model) + + def test_compat_alias_for_non_speech_tokens(self): + model = Model( + "tiny", + models_dir=str(WHISPER_CPP_DIR/'models'), + suppress_non_speech_tokens=True, + ) + 
self.assertTrue(model.get_params()["suppress_nst"]) + + def test_prompt_token_helper_exists(self): + params = pw.whisper_full_default_params( + pw.whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY + ) + params.set_prompt_tokens((1, 2, 3)) + self.assertEqual(params.prompt_n_tokens, 3) + + def test_grammar_helper_exists(self): + params = pw.whisper_full_default_params( + pw.whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY + ) + params.set_grammar('root ::= "yes" | "no"', 'root', 42.0) + self.assertEqual(params.grammar_penalty, 42.0) + params.clear_grammar() + + def test_model_accepts_grammar_param(self): + model = Model( + "tiny", + models_dir=str(WHISPER_CPP_DIR/'models'), + grammar='root ::= "yes" | "no"', + grammar_rule='root', + grammar_penalty=42.0, + ) + self.assertIsInstance(model, Model) + + def test_model_metadata_bindings(self): + self.assertIsInstance(pw.whisper_model_type_readable(self.model._ctx), str) + self.assertGreater(pw.whisper_model_n_vocab(self.model._ctx), 0) + self.assertGreater(pw.whisper_model_n_audio_ctx(self.model._ctx), 0) + self.assertGreater(pw.whisper_model_n_text_ctx(self.model._ctx), 0) + + def test_speaker_turn_accessor_smoke(self): + self.model.transcribe(str(self.audio_file)) + segment_count = pw.whisper_full_n_segments(self.model._ctx) + self.assertGreater(segment_count, 0) + self.assertIsInstance( + pw.whisper_full_get_segment_speaker_turn_next(self.model._ctx, 0), + bool, + ) + if __name__ == '__main__': unittest.main() diff --git a/whsiper_args.txt b/whsiper_args.txt new file mode 100644 index 0000000..35678ec --- /dev/null +++ b/whsiper_args.txt @@ -0,0 +1,252 @@ + +usage: ./whisper-cli [options] file0 file1 ... 
+supported audio formats: flac, mp3, ogg, wav + +options: + --help [default] show this help message and exit + --threads N [4 ] number of threads to use during computation + --processors N [1 ] number of processors to use during computation + --offset-t N [0 ] time offset in milliseconds + --offset-n N [0 ] segment index offset + --duration N [0 ] duration of audio to process in milliseconds + --max-context N [-1 ] maximum number of text context tokens to store + --max-len N [0 ] maximum segment length in characters + --max-tokens N [0 ] maximum number of tokens per segment + --split-on-word [false ] split on word rather than on token + --best-of N [5 ] number of best candidates to keep + --beam-size N [5 ] beam size for beam search + --audio-ctx N [0 ] audio context size (0 - all) + --word-thold N [0.01 ] word timestamp probability threshold + --entropy-thold N [2.40 ] entropy threshold for decoder fail + --logprob-thold N [-1.00 ] log probability threshold for decoder fail + --no-speech-thold N [0.60 ] no speech threshold + --temperature N [0.00 ] The sampling temperature, between 0 and 1 + --temperature-inc N [0.20 ] The increment of temperature, between 0 and 1 + --debug-mode [false ] enable debug mode (eg. 
dump log_mel) + --translate [false ] translate from source language to english + --diarize [false ] stereo audio diarization + --tinydiarize [false ] enable tinydiarize (requires a tdrz model) + --no-fallback [false ] do not use temperature fallback while decoding + --output-txt [false ] output result in a text file + --output-vtt [false ] output result in a vtt file + --output-srt [false ] output result in a srt file + --output-lrc [false ] output result in a lrc file + --output-words [false ] output script for generating karaoke video + --font-path [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video + --output-csv [false ] output result in a CSV file + --output-json [false ] output result in a JSON file + --output-json-full [false ] include more information in the JSON file + --output-file FNAME [ ] output file path (without file extension) + --no-prints [false ] do not print anything other than the results + --print-special [false ] print special tokens + --print-colors [false ] print colors + --print-confidence [false ] print confidence + --print-progress [false ] print progress + --no-timestamps [false ] do not print timestamps + --language LANG [en ] spoken language ('auto' for auto-detect) + --detect-language [false ] exit after automatically detecting language + --prompt PROMPT [ ] initial prompt (max n_text_ctx/2 tokens) + --carry-initial-prompt [false ] always prepend initial prompt + --model FNAME [models/ggml-base.en.bin] model path + --file FNAME [ ] input audio file path + --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference + --dtw MODEL [ ] compute token-level timestamps + --log-score [false ] log best decoder scores of tokens + --no-gpu [false ] disable GPU + --device N [0 ] GPU device ID (default: 0) + --flash-attn [true ] enable flash attention + --no-flash-attn [false ] disable flash attention + --suppress-blank [true ] suppress blank outputs + --no-suppress-blank [false ] 
disable blank suppression + --suppress-nst [false ] suppress non-speech tokens + --suppress-regex REGEX [ ] regular expression matching tokens to suppress + --grammar GRAMMAR [ ] GBNF grammar to guide decoding + --grammar-rule RULE [ ] top-level GBNF grammar rule name + --grammar-penalty N [100.0 ] scales down logits of nongrammar tokens + +Voice Activity Detection (VAD) options: + --vad [false ] enable Voice Activity Detection (VAD) + --vad-model FNAME [ ] VAD model path + --vad-threshold N [0.50 ] VAD threshold for speech recognition + --vad-min-speech-duration-ms N [250 ] VAD min speech duration (0.0-1.0) + --vad-min-silence-duration-ms N [100 ] VAD min silence duration (to split segments) + --vad-max-speech-duration-s N [FLT_MAX] VAD max speech duration (auto-split longer) + --vad-speech-pad-ms N [30 ] VAD speech padding (extend segments) + --vad-samples-overlap N [0.10 ] VAD samples overlap (seconds between segments) + + +usage: ./whisper-stream [options] + +options: + --help [default] show this help message and exit + --threads N [4 ] number of threads to use during computation + --step N [3000 ] audio step size in milliseconds + --length N [10000 ] audio length in milliseconds + --keep N [200 ] audio to keep from previous step in ms + --capture ID [-1 ] capture device ID + --max-tokens N [32 ] maximum number of tokens per audio chunk + --audio-ctx N [0 ] audio context size (0 - all) + --beam-size N [-1 ] beam size for beam search + --vad-thold N [0.60 ] voice activity detection threshold + --freq-thold N [100.00 ] high-pass frequency cutoff + --translate [false ] translate from source language to english + --no-fallback [false ] do not use temperature fallback while decoding + --print-special [false ] print special tokens + --keep-context [false ] keep context between audio chunks + --language LANG [en ] spoken language + --model FNAME [models/ggml-base.en.bin] model path + --file FNAME [ ] text output file name + --tinydiarize [false ] enable tinydiarize 
(requires a tdrz model) + --save-audio [false ] save the recorded audio to a file + --no-gpu [false ] disable GPU inference + --flash-attn [true ] enable flash attention during inference + --no-flash-attn [false ] disable flash attention during inference + + +usage: ./whisper-server [options] + +options: + --help [default] show this help message and exit + --threads N [4 ] number of threads to use during computation + --processors N [1 ] number of processors to use during computation + --offset-t N [0 ] time offset in milliseconds + --offset-n N [0 ] segment index offset + --duration N [0 ] duration of audio to process in milliseconds + --max-context N [-1 ] maximum number of text context tokens to store + --max-len N [0 ] maximum segment length in characters + --split-on-word [false ] split on word rather than on token + --best-of N [2 ] number of best candidates to keep + --beam-size N [-1 ] beam size for beam search + --audio-ctx N [0 ] audio context size (0 - all) + --word-thold N [0.01 ] word timestamp probability threshold + --entropy-thold N [2.40 ] entropy threshold for decoder fail + --logprob-thold N [-1.00 ] log probability threshold for decoder fail + --debug-mode [false ] enable debug mode (eg. 
dump log_mel) + --translate [false ] translate from source language to english + --diarize [false ] stereo audio diarization + --tinydiarize [false ] enable tinydiarize (requires a tdrz model) + --no-fallback [false ] do not use temperature fallback while decoding + --print-special [false ] print special tokens + --print-colors [false ] print colors + --print-realtime [false ] print output in realtime + --print-progress [false ] print progress + --no-timestamps [false ] do not print timestamps + --language LANG [en ] spoken language ('auto' for auto-detect) + --detect-language [false ] exit after automatically detecting language + --prompt PROMPT [ ] initial prompt + --model FNAME [models/ggml-base.en.bin] model path + --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference + --dtw MODEL [ ] compute token-level timestamps + --host HOST [127.0.0.1] Hostname/ip-adress for the server + --port PORT [8080 ] Port number for the server + --public PATH [examples/server/public] Path to the public folder + --request-path PATH [ ] Request path for all requests + --inference-path PATH [/inference] Inference path for all requests + --convert [false ] Convert audio to WAV, requires ffmpeg on the server + --tmp-dir [. 
] Temporary directory for ffmpeg transcoded files + --suppress-nst [false ] suppress non-speech tokens + --no-speech-thold N [0.60 ] no speech threshold + --no-gpu [false ] do not use gpu + --device N [0 ] GPU device ID (default: 0) + --flash-attn [true ] enable flash attention + --no-flash-attn [false ] disable flash attention + --no-language-probabilities [false ] exclude language probabilities from verbose_json output + +Voice Activity Detection (VAD) options: + --vad [false ] enable Voice Activity Detection (VAD) + --vad-model FNAME [ ] VAD model path + --vad-threshold N [0.50 ] VAD threshold for speech recognition + --vad-min-speech-duration-ms N [250 ] VAD min speech duration (0.0-1.0) + --vad-min-silence-duration-ms N [100 ] VAD min silence duration (to split segments) + --vad-max-speech-duration-s N [FLT_MAX] VAD max speech duration (auto-split longer) + --vad-speech-pad-ms N [30 ] VAD speech padding (extend segments) + --vad-samples-overlap N [0.10 ] VAD samples overlap (seconds between segments) + + + +deduped: +options: + --help [default] show this help message and exit + --threads N [4 ] number of threads to use during computation + --processors N [1 ] number of processors to use during computation + --offset-t N [0 ] time offset in milliseconds + --offset-n N [0 ] segment index offset + --duration N [0 ] duration of audio to process in milliseconds + --max-context N [-1 ] maximum number of text context tokens to store + --max-len N [0 ] maximum segment length in characters + --max-tokens N [0 ] maximum number of tokens per segment + --split-on-word [false ] split on word rather than on token + --best-of N [5 ] number of best candidates to keep + --beam-size N [5 ] beam size for beam search + --audio-ctx N [0 ] audio context size (0 - all) + --word-thold N [0.01 ] word timestamp probability threshold + --entropy-thold N [2.40 ] entropy threshold for decoder fail + --logprob-thold N [-1.00 ] log probability threshold for decoder fail + --no-speech-thold 
N [0.60 ] no speech threshold + --temperature N [0.00 ] The sampling temperature, between 0 and 1 + --temperature-inc N [0.20 ] The increment of temperature, between 0 and 1 + --debug-mode [false ] enable debug mode (eg. dump log_mel) + --translate [false ] translate from source language to english + --diarize [false ] stereo audio diarization + --tinydiarize [false ] enable tinydiarize (requires a tdrz model) + --no-fallback [false ] do not use temperature fallback while decoding + --output-txt [false ] output result in a text file + --output-vtt [false ] output result in a vtt file + --output-srt [false ] output result in a srt file + --output-lrc [false ] output result in a lrc file + --output-words [false ] output script for generating karaoke video + --font-path [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video + --output-csv [false ] output result in a CSV file + --output-json [false ] output result in a JSON file + --output-json-full [false ] include more information in the JSON file + --output-file FNAME [ ] output file path (without file extension) + --no-prints [false ] do not print anything other than the results + --print-special [false ] print special tokens + --print-colors [false ] print colors + --print-confidence [false ] print confidence + --print-progress [false ] print progress + --no-timestamps [false ] do not print timestamps + --language LANG [en ] spoken language ('auto' for auto-detect) + --detect-language [false ] exit after automatically detecting language + --prompt PROMPT [ ] initial prompt (max n_text_ctx/2 tokens) + --carry-initial-prompt [false ] always prepend initial prompt + --model FNAME [models/ggml-base.en.bin] model path + --file FNAME [ ] input audio file path + --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference + --dtw MODEL [ ] compute token-level timestamps + --log-score [false ] log best decoder scores of tokens + --no-gpu [false ] disable GPU + 
--device N [0 ] GPU device ID (default: 0) + --flash-attn [true ] enable flash attention + --no-flash-attn [false ] disable flash attention + --suppress-blank [true ] suppress blank outputs + --no-suppress-blank [false ] disable blank suppression + --suppress-nst [false ] suppress non-speech tokens + --suppress-regex REGEX [ ] regular expression matching tokens to suppress + --grammar GRAMMAR [ ] GBNF grammar to guide decoding + --grammar-rule RULE [ ] top-level GBNF grammar rule name + --grammar-penalty N [100.0 ] scales down logits of nongrammar tokens + --vad [false ] enable Voice Activity Detection (VAD) + --vad-model FNAME [ ] VAD model path + --vad-threshold N [0.50 ] VAD threshold for speech recognition + --vad-min-speech-duration-ms N [250 ] VAD min speech duration (0.0-1.0) + --vad-min-silence-duration-ms N [100 ] VAD min silence duration (to split segments) + --vad-max-speech-duration-s N [FLT_MAX] VAD max speech duration (auto-split longer) + --vad-speech-pad-ms N [30 ] VAD speech padding (extend segments) + --vad-samples-overlap N [0.10 ] VAD samples overlap (seconds between segments) + --step N [3000 ] audio step size in milliseconds + --length N [10000 ] audio length in milliseconds + --keep N [200 ] audio to keep from previous step in ms + --capture ID [-1 ] capture device ID + --vad-thold N [0.60 ] voice activity detection threshold + --freq-thold N [100.00 ] high-pass frequency cutoff + --keep-context [false ] keep context between audio chunks + --save-audio [false ] save the recorded audio to a file + --host HOST [127.0.0.1] Hostname/ip-adress for the server + --port PORT [8080 ] Port number for the server + --public PATH [examples/server/public] Path to the public folder + --request-path PATH [ ] Request path for all requests + --inference-path PATH [/inference] Inference path for all requests + --convert [false ] Convert audio to WAV, requires ffmpeg on the server + --tmp-dir [. 
] Temporary directory for ffmpeg transcoded files + --no-language-probabilities [false ] exclude language probabilities from verbose_json output From 830bf271f8666858c5f00198ebeec8e2011df9b6 Mon Sep 17 00:00:00 2001 From: scottmonster Date: Sun, 17 May 2026 17:19:08 -0500 Subject: [PATCH 03/16] add context params and update grammar --- pywhispercpp/model.py | 61 +++++++++++++++++++++++++++--------------- pywhispercpp/model.pyi | 15 ++++++++--- src/main.cpp | 14 ++++++++-- 3 files changed, 64 insertions(+), 26 deletions(-) diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py index e73cb06..84cc988 100644 --- a/pywhispercpp/model.py +++ b/pywhispercpp/model.py @@ -11,7 +11,7 @@ import sys from pathlib import Path from time import time -from typing import Any, Union, Callable, List, TextIO, Tuple, Optional, Dict +from typing import Any, Union, Callable, List, TextIO, Tuple, Optional, Dict, TypedDict import _pywhispercpp as pw import numpy as np import pywhispercpp.utils as utils @@ -29,6 +29,19 @@ logger = logging.getLogger(__name__) +class ContextParams(TypedDict, total=False): + use_gpu: bool + flash_attn: bool + gpu_device: int + dtw_token_timestamps: bool + dtw_aheads_preset: int + dtw_n_top: int + dtw_mem_size: int + + +_CONTEXT_PARAM_KEYS = frozenset(ContextParams.__annotations__) + + class Segment: """ A small class representing a transcription segment @@ -79,7 +92,7 @@ def __init__(self, openvino_model_path: Optional[str] = None, openvino_device: str = 'CPU', openvino_cache_dir: Optional[str] = None, - context_params: Union[Dict[str, Any], Any, None] = None, + context_params: Optional[ContextParams] = None, **params): """ :param model: The name of the model, one of the [AVAILABLE_MODELS](/pywhispercpp/#pywhispercpp.constants.AVAILABLE_MODELS), @@ -151,10 +164,7 @@ def transcribe(self, Model._new_segment_callback = new_segment_callback pw.assign_new_segment_callback(self._params, Model.__call_new_segment_callback) - if abort_callback is None: -
pw.clear_abort_callback(self._params) - else: - pw.assign_abort_callback(self._params, abort_callback) + pw.assign_abort_callback(self._params, abort_callback) # run inference start_time = time() @@ -268,17 +278,23 @@ def available_languages() -> List[str]: return res @staticmethod - def _resolve_context_params(context_params: Union[Dict[str, Any], Any, None]): + def _resolve_context_params(context_params: Optional[ContextParams]): if context_params is None: return None - if isinstance(context_params, dict): - resolved = pw.whisper_context_default_params() - for key, value in context_params.items(): - setattr(resolved, key, value) - return resolved + if not isinstance(context_params, dict): + raise TypeError("context_params must be a ContextParams dict or None") - return context_params + unknown_keys = sorted(set(context_params) - _CONTEXT_PARAM_KEYS) + if unknown_keys: + raise TypeError( + f"Unknown context_params keys: {', '.join(unknown_keys)}" + ) + + resolved = pw.whisper_context_default_params() + for key, value in context_params.items(): + setattr(resolved, key, value) + return resolved @staticmethod def _normalize_params(kwargs: dict) -> dict: @@ -314,7 +330,7 @@ def _set_params(self, kwargs: dict) -> None: normalized = self._normalize_params(kwargs) prompt_tokens = normalized.pop('prompt_tokens', None) if 'prompt_tokens' in normalized else None grammar = normalized.pop('grammar', None) if 'grammar' in normalized else None - grammar_rule = normalized.pop('grammar_rule', 'root') if 'grammar_rule' in normalized else 'root' + grammar_rule = normalized.pop('grammar_rule', None) if 'grammar_rule' in normalized else None grammar_penalty = normalized.get('grammar_penalty', self._params.grammar_penalty) for param, value in normalized.items(): @@ -324,10 +340,7 @@ def _set_params(self, kwargs: dict) -> None: self._params.set_prompt_tokens(prompt_tokens) if 'grammar' in kwargs: - if grammar: - self._params.set_grammar(grammar, grammar_rule, grammar_penalty) - else: - 
self._params.clear_grammar() + self._params.set_grammar(grammar, grammar_rule, grammar_penalty) def _transcribe(self, audio: np.ndarray, n_processors: Optional[int] = None): """ @@ -419,13 +432,13 @@ def wav_to_np(file_path): finally: os.remove(temp_file_path) - def auto_detect_language(self, media: Union[str, np.ndarray], offset_ms: int = 0, n_threads: int = 4) -> Tuple[Tuple[str, np.float32], Dict[str, np.float32]]: + def auto_detect_language(self, media: Union[str, np.ndarray], offset_ms: Optional[int] = None, n_threads: Optional[int] = None) -> Tuple[Tuple[str, np.float32], Dict[str, np.float32]]: """ Automatic language detection using whisper.cpp/whisper_pcm_to_mel and whisper.cpp/whisper_lang_auto_detect :param media: Media file path or a numpy array - :param offset_ms: offset in milliseconds - :param n_threads: number of threads to use + :param offset_ms: offset in milliseconds, defaults to the model's configured offset + :param n_threads: number of threads to use, defaults to the model's configured thread count :return: ((detected_language, probability), probabilities for all languages) """ if isinstance(media, np.ndarray): @@ -435,6 +448,12 @@ def auto_detect_language(self, media: Union[str, np.ndarray], offset_ms: int = raise FileNotFoundError(media) audio = self._load_audio(media) + if offset_ms is None: + offset_ms = self._params.offset_ms + + if n_threads is None: + n_threads = self._params.n_threads + pw.whisper_pcm_to_mel(self._ctx, audio, len(audio), n_threads) lang_count = self.lang_max_id() + 1 probs = np.zeros(lang_count, dtype=np.float32) diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi index 27e3d52..e397820 100644 --- a/pywhispercpp/model.pyi +++ b/pywhispercpp/model.pyi @@ -9,7 +9,16 @@ import numpy.typing as npt AudioArray = npt.NDArray[np.float32] AudioInput = Union[str, AudioArray] -ContextParams = Union[Dict[str, Any], Any] + + +class ContextParams(TypedDict, total=False): + use_gpu: bool + flash_attn: bool + gpu_device: int + 
dtw_token_timestamps: bool + dtw_aheads_preset: int + dtw_n_top: int + dtw_mem_size: int class GreedyParams(TypedDict): @@ -167,8 +176,8 @@ class Model: def auto_detect_language( self, media: AudioInput, - offset_ms: int = 0, - n_threads: int = 4, + offset_ms: Optional[int] = None, + n_threads: Optional[int] = None, ) -> Tuple[Tuple[str, np.float32], Dict[str, np.float32]]: ... def __del__(self) -> None: ... diff --git a/src/main.cpp b/src/main.cpp index 23150e5..bbbaff0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1072,8 +1072,18 @@ PYBIND11_MODULE(_pywhispercpp, m) { [](WhisperFullParamsWrapper &self, py::dict dict) {self.greedy.best_of = dict["best_of"].cast();}) .def_property("beam_search", [](WhisperFullParamsWrapper &self) {return py::dict("beam_size"_a=self.beam_search.beam_size, "patience"_a=self.beam_search.patience);}, [](WhisperFullParamsWrapper &self, py::dict dict) {self.beam_search.beam_size = dict["beam_size"].cast(); self.beam_search.patience = dict["patience"].cast();}) - .def("set_grammar", &WhisperFullParamsWrapper::set_grammar, - py::arg("grammar"), py::arg("rule_name") = "", py::arg("penalty") = -1.0f, + .def("set_grammar", + [](WhisperFullParamsWrapper &self, py::object grammar, py::object rule_name, float penalty) { + if (grammar.is_none()) { + self.clear_grammar(); + return; + } + + const std::string grammar_input = grammar.cast(); + const std::string rule_name_str = rule_name.is_none() ? 
"" : rule_name.cast(); + self.set_grammar(grammar_input, rule_name_str, penalty); + }, + py::arg("grammar"), py::arg("rule_name") = py::none(), py::arg("penalty") = -1.0f, "Parse GBNF grammar text or a grammar file path and store the resulting grammar in C++-owned memory.") .def("clear_grammar", &WhisperFullParamsWrapper::clear_grammar, "Clear any previously configured grammar from the parameter object.") From dde0958a00ae33aa748f4a2c1e53f349c00a40f6 Mon Sep 17 00:00:00 2001 From: scottmonster Date: Sun, 17 May 2026 18:16:47 -0500 Subject: [PATCH 04/16] update docstrings --- pywhispercpp/model.py | 87 +++++++++++++++++++++++++++++++++--------- pywhispercpp/model.pyi | 5 +-- 2 files changed, 69 insertions(+), 23 deletions(-) diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py index 84cc988..42de2f6 100644 --- a/pywhispercpp/model.py +++ b/pywhispercpp/model.py @@ -95,18 +95,67 @@ def __init__(self, context_params: Optional[ContextParams] = None, **params): """ - :param model: The name of the model, one of the [AVAILABLE_MODELS](/pywhispercpp/#pywhispercpp.constants.AVAILABLE_MODELS), - (default to `tiny`), or a direct path to a `ggml` model.
- :param models_dir: The directory where the models are stored, or where they will be downloaded if they don't - exist, default to [MODELS_DIR](/pywhispercpp/#pywhispercpp.constants.MODELS_DIR) - :param params_sampling_strategy: 0 -> GREEDY, else BEAM_SEARCH - :param redirect_whispercpp_logs_to: where to redirect the whisper.cpp logs, default to False (no redirection), accepts str file path, sys.stdout, sys.stderr, or use None to redirect to devnull - :param use_openvino: whether to use OpenVINO or not - :param openvino_model_path: path to the OpenVINO model - :param openvino_device: OpenVINO device, default to CPU - :param openvino_cache_dir: OpenVINO cache directory - :param params: keyword arguments for different whisper.cpp parameters, - see [PARAMS_SCHEMA](/pywhispercpp/#pywhispercpp.constants.PARAMS_SCHEMA) + :param model: model name, default `tiny`, or a direct path to a ggml model file. + :param models_dir: directory containing model files; if omitted, uses `MODELS_DIR` unless `model` + is already a direct file path. + :param params_sampling_strategy: sampling strategy selector; `0` uses greedy decoding and any + other value uses beam search. + :param redirect_whispercpp_logs_to: log redirection target. Use `False` for no redirection, `None` + for `/dev/null`, a file path string, or `sys.stdout`/`sys.stderr`. + :param use_openvino: whether to initialize the OpenVINO encoder backend. + :param openvino_model_path: path to the OpenVINO model directory or files. + :param openvino_device: OpenVINO device name, default `CPU`. + :param openvino_cache_dir: OpenVINO cache directory. + :param context_params: optional whisper context loader params. Accepted keys are `use_gpu`, + `flash_attn`, `gpu_device`, `dtw_token_timestamps`, + `dtw_aheads_preset`, `dtw_n_top`, and `dtw_mem_size`. Omitted keys inherit + from `whisper_context_default_params()`. + :param params: decode parameters forwarded to `whisper_full_params`. 
+ Supported keys: + - `n_threads`: number of inference threads. Default is `min(4, hardware_concurrency())`. + - `n_max_text_ctx`: max prompt-text tokens carried into the decoder. Default `16384`. + - `offset_ms`: audio start offset in milliseconds. Default `0`. + - `duration_ms`: audio duration to process in milliseconds. Default `0`. + - `translate`: translate output to English. Default `False`. + - `no_context`: disable reuse of past transcription context. Default `True`. + - `no_timestamps`: disable timestamp generation. Default `False`. + - `single_segment`: force a single output segment. Default `False`. + - `print_special`: print special tokens. Default `False`. + - `print_progress`: print progress information. Default `True`. + - `print_realtime`: print realtime output from whisper.cpp. Default `False`. + - `print_timestamps`: print timestamps during realtime output. Default `True`. + - `token_timestamps`: enable token-level timestamps. Default `False`. + - `thold_pt`: token timestamp probability threshold. Default `0.01`. + - `thold_ptsum`: token timestamp sum threshold. Default `0.01`. + - `max_len`: max segment length in characters. Default `0`. + - `split_on_word`: split on words when `max_len` is used. Default `False`. + - `max_tokens`: max tokens per segment. Default `0`. + - `debug_mode`: enable whisper.cpp debug mode. Default `False`. + - `audio_ctx`: override audio context size. Default `0`. + - `tdrz_enable`: enable tinydiarize speaker-turn detection. Default `False`. + - `initial_prompt`: initial text prompt prepended before decoding. Default `None`. + - `grammar`: GBNF grammar text or path to a grammar file. Default `None`. + - `grammar_rule`: top-level grammar rule name. Default `root` when grammar is used. + - `prompt_tokens`: explicit prompt token sequence. Default `None`. + - `prompt_n_tokens`: number of prompt tokens. Default `0`. + - `carry_initial_prompt`: prepend the initial prompt to each decode window. Default `False`. 
+ - `language`: language code. Default `en`. + - `detect_language`: enable automatic language detection during transcription. Default `False`. + - `suppress_blank`: suppress blank outputs. Default `True`. + - `suppress_non_speech_tokens`: Python alias for `suppress_nst`. Default `False`. + - `suppress_nst`: suppress non-speech tokens. Default `False`. + - `temperature`: initial decoding temperature. Default `0.0`. + - `max_initial_ts`: maximum initial timestamp. Default `1.0`. + - `length_penalty`: length penalty. Default `-1.0`. + - `temperature_inc`: fallback temperature increment. Default `0.2`. + - `entropy_thold`: entropy threshold. Default `2.4`. + - `logprob_thold`: logprob threshold. Default `-1.0`. + - `no_speech_thold`: no-speech threshold. Default `0.6`. + - `grammar_penalty`: penalty applied to non-grammar tokens. Default `100.0`. + - `greedy`: greedy-decoder settings, typically `{"best_of": 5}`. + - `beam_search`: beam-search settings, schema default `{"beam_size": 5, "patience": -1.0}`. + - `vad`: enable VAD. Default `False`. + - `vad_model_path`: path to the VAD model. Default `None`. """ self.model_path = utils.resolve_model_path(model, models_dir) self._ctx = None @@ -136,12 +185,12 @@ def transcribe(self, Accepts a media_file path (audio/video) or a raw numpy array. :param media: Media file path or a numpy array - :param n_processors: if not None, it will run the transcription on multiple processes - binding to whisper.cpp/whisper_full_parallel - > Split the input audio in chunks and process each chunk separately using whisper_full() - :param new_segment_callback: callback function that will be called when a new segment is generated + :param n_processors: number of worker processes for `whisper_full_parallel`. If omitted, runs a + single-process `whisper_full()` decode. + :param new_segment_callback: callback invoked for each newly produced `Segment` during decoding. 
:param abort_callback: callback function returning True to abort an in-flight transcription early - :param params: keyword arguments for different whisper.cpp parameters, see ::: constants.PARAMS_SCHEMA + :param params: keyword arguments for different whisper.cpp parameters; these override the model's + active decode params for this call :param extract_probability: If True, calculates the geometric mean of token probabilities for each segment, providing a confidence score interpretable as a probability in [0, 1]. :return: List of transcription segments @@ -437,8 +486,8 @@ def auto_detect_language(self, media: Union[str, np.ndarray], offset_ms: Optiona Automatic language detection using whisper.cpp/whisper_pcm_to_mel and whisper.cpp/whisper_lang_auto_detect :param media: Media file path or a numpy array - :param offset_ms: offset in milliseconds, defaults to the model's configured offset - :param n_threads: number of threads to use, defaults to the model's configured thread count + :param offset_ms: offset in milliseconds; when omitted, uses the model's current `offset_ms` + :param n_threads: number of threads to use; when omitted, uses the model's current `n_threads` :return: ((detected_language, probability), probabilities for all languages) """ if isinstance(media, np.ndarray): diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi index e397820..7e71af8 100644 --- a/pywhispercpp/model.pyi +++ b/pywhispercpp/model.pyi @@ -42,10 +42,7 @@ class Segment: class Model: - """ - docuemnts strings - """ - + _new_segment_callback: Optional[Callable[[Segment], None]] From b88c6398e94bb1402713de5991273bdad092ebcb Mon Sep 17 00:00:00 2001 From: scottmonster Date: Sun, 17 May 2026 18:38:51 -0500 Subject: [PATCH 05/16] roll back to previous handling for grammar and prompt_tokens to maintain compatibility --- pywhispercpp/constants.py | 12 ---- pywhispercpp/model.py | 10 ---- pywhispercpp/model.pyi | 6 -- src/main.cpp | 116 +------------------------------------- 4 files 
changed, 1 insertion(+), 143 deletions(-) diff --git a/pywhispercpp/constants.py b/pywhispercpp/constants.py index b742455..529b28b 100644 --- a/pywhispercpp/constants.py +++ b/pywhispercpp/constants.py @@ -194,18 +194,6 @@ 'options': None, 'default': None }, - 'grammar': { - 'type': str, - 'description': "GBNF grammar text or a path to a grammar file", - 'options': None, - 'default': None - }, - 'grammar_rule': { - 'type': str, - 'description': "top-level GBNF grammar rule name", - 'options': None, - 'default': 'root' - }, 'prompt_tokens': { 'type': Tuple, 'description': "tokens to provide to the whisper decoder as initial prompt", diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py index 42de2f6..611dc9e 100644 --- a/pywhispercpp/model.py +++ b/pywhispercpp/model.py @@ -377,20 +377,10 @@ def _set_params(self, kwargs: dict) -> None: :return: None """ normalized = self._normalize_params(kwargs) - prompt_tokens = normalized.pop('prompt_tokens', None) if 'prompt_tokens' in normalized else None - grammar = normalized.pop('grammar', None) if 'grammar' in normalized else None - grammar_rule = normalized.pop('grammar_rule', None) if 'grammar_rule' in normalized else None - grammar_penalty = normalized.get('grammar_penalty', self._params.grammar_penalty) for param, value in normalized.items(): setattr(self._params, param, value) - if 'prompt_tokens' in kwargs: - self._params.set_prompt_tokens(prompt_tokens) - - if 'grammar' in kwargs: - self._params.set_grammar(grammar, grammar_rule, grammar_penalty) - def _transcribe(self, audio: np.ndarray, n_processors: Optional[int] = None): """ Private method to call the whisper.cpp/whisper_full function diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi index 7e71af8..c548bea 100644 --- a/pywhispercpp/model.pyi +++ b/pywhispercpp/model.pyi @@ -80,8 +80,6 @@ class Model: audio_ctx: int = 0, tdrz_enable: bool = False, initial_prompt: Optional[str] = None, - grammar: Optional[str] = None, - grammar_rule: str = 'root', 
prompt_tokens: Optional[Tuple[Any, ...]] = None, prompt_n_tokens: int = 0, carry_initial_prompt: bool = False, @@ -97,7 +95,6 @@ class Model: entropy_thold: float = 2.4, logprob_thold: float = -1.0, no_speech_thold: float = 0.6, - grammar_penalty: float = 100.0, greedy: GreedyParams = {'best_of': 5}, beam_search: BeamSearchParams = {'beam_size': 5, 'patience': -1.0}, vad: bool = False, @@ -133,8 +130,6 @@ class Model: audio_ctx: int = 0, tdrz_enable: bool = False, initial_prompt: Optional[str] = None, - grammar: Optional[str] = None, - grammar_rule: str = 'root', prompt_tokens: Optional[Tuple[Any, ...]] = None, prompt_n_tokens: int = 0, carry_initial_prompt: bool = False, @@ -150,7 +145,6 @@ class Model: entropy_thold: float = 2.4, logprob_thold: float = -1.0, no_speech_thold: float = 0.6, - grammar_penalty: float = 100.0, greedy: GreedyParams = {'best_of': 5}, beam_search: BeamSearchParams = {'beam_size': 5, 'patience': -1.0}, extract_probability: bool = False, diff --git a/src/main.cpp b/src/main.cpp index bbbaff0..7197cb7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,9 +16,6 @@ #include #include "whisper.h" -#include "../whisper.cpp/examples/grammar-parser.h" - -#include #define STRINGIFY(x) #x @@ -387,10 +384,6 @@ struct WhisperFullParamsWrapper : public whisper_full_params { std::string initial_prompt_str; std::string suppress_regex_str; std::string vad_model_path_str; - std::string grammar_rule_str; - grammar_parser::parse_state grammar_parsed; - std::vector grammar_rules_storage; - std::vector prompt_tokens_storage; public: py::function py_progress_callback; py::object py_abort_callback; @@ -425,23 +418,12 @@ struct WhisperFullParamsWrapper : public whisper_full_params { initial_prompt_str(other.initial_prompt_str), suppress_regex_str(other.suppress_regex_str), vad_model_path_str(other.vad_model_path_str), - grammar_rule_str(other.grammar_rule_str), - grammar_parsed(other.grammar_parsed), - grammar_rules_storage(other.grammar_rules_storage), - 
prompt_tokens_storage(other.prompt_tokens_storage), py_progress_callback(other.py_progress_callback), py_abort_callback(other.py_abort_callback) { // Reset pointers to new string copies initial_prompt = initial_prompt_str.empty() ? nullptr : initial_prompt_str.c_str(); suppress_regex = suppress_regex_str.empty() ? nullptr : suppress_regex_str.c_str(); vad_model_path = vad_model_path_str.empty() ? nullptr : vad_model_path_str.c_str(); - grammar_rules = grammar_rules_storage.empty() ? nullptr : grammar_rules_storage.data(); - n_grammar_rules = grammar_rules_storage.size(); - if (!grammar_rule_str.empty() && grammar_parsed.symbol_ids.find(grammar_rule_str) != grammar_parsed.symbol_ids.end()) { - i_start_rule = grammar_parsed.symbol_ids.at(grammar_rule_str); - } - prompt_tokens = prompt_tokens_storage.empty() ? nullptr : prompt_tokens_storage.data(); - prompt_n_tokens = prompt_tokens_storage.size(); abort_callback_user_data = this; progress_callback_user_data = this; progress_callback = [](struct whisper_context* ctx, struct whisper_state* state, int progress, void* user_data) { @@ -480,79 +462,6 @@ struct WhisperFullParamsWrapper : public whisper_full_params { abort_callback = nullptr; abort_callback_user_data = this; } - void clear_grammar() { - grammar_rule_str.clear(); - grammar_parsed = grammar_parser::parse_state(); - grammar_rules_storage.clear(); - grammar_rules = nullptr; - n_grammar_rules = 0; - i_start_rule = 0; - } - void set_grammar(const std::string& grammar_input, const std::string& rule_name = "", float penalty = -1.0f) { - clear_grammar(); - - if (grammar_input.empty()) { - if (penalty >= 0.0f) { - grammar_penalty = penalty; - } - return; - } - - std::string grammar_source = grammar_input; - std::ifstream grammar_file(grammar_input); - if (grammar_file.is_open()) { - grammar_source.assign((std::istreambuf_iterator(grammar_file)), - std::istreambuf_iterator()); - } - - grammar_parsed = grammar_parser::parse(grammar_source.c_str()); - if 
(grammar_parsed.rules.empty()) { - throw py::value_error("Failed to parse grammar input"); - } - - grammar_rule_str = rule_name.empty() ? "root" : rule_name; - if (grammar_parsed.symbol_ids.find(grammar_rule_str) == grammar_parsed.symbol_ids.end()) { - throw py::value_error("Grammar rule '" + grammar_rule_str + "' not found"); - } - - grammar_rules_storage = grammar_parsed.c_rules(); - grammar_rules = grammar_rules_storage.data(); - n_grammar_rules = grammar_rules_storage.size(); - i_start_rule = grammar_parsed.symbol_ids.at(grammar_rule_str); - - if (penalty >= 0.0f) { - grammar_penalty = penalty; - } - } - void set_prompt_tokens(const py::object& tokens_obj) { - prompt_tokens_storage.clear(); - - if (tokens_obj.is_none()) { - prompt_tokens = nullptr; - prompt_n_tokens = 0; - return; - } - - py::sequence tokens = tokens_obj.cast(); - prompt_tokens_storage.reserve(tokens.size()); - for (const auto & token : tokens) { - prompt_tokens_storage.push_back(token.cast()); - } - - prompt_tokens = prompt_tokens_storage.empty() ? nullptr : prompt_tokens_storage.data(); - prompt_n_tokens = prompt_tokens_storage.size(); - } - py::tuple get_prompt_tokens() const { - const whisper_token * tokens_ptr = prompt_tokens_storage.empty() ? prompt_tokens : prompt_tokens_storage.data(); - const size_t token_count = prompt_tokens_storage.empty() ? 
static_cast(std::max(prompt_n_tokens, 0)) : prompt_tokens_storage.size(); - - py::tuple tokens(token_count); - for (size_t i = 0; i < token_count; ++i) { - tokens[i] = py::int_(tokens_ptr[i]); - } - - return tokens; - } }; WhisperFullParamsWrapper whisper_full_default_params_wrapper(enum whisper_sampling_strategy strategy) { return WhisperFullParamsWrapper(whisper_full_default_params(strategy)); @@ -1022,13 +931,7 @@ PYBIND11_MODULE(_pywhispercpp, m) { self.set_initial_prompt(initial_prompt); } ) - .def_property("prompt_tokens", - [](WhisperFullParamsWrapper &self) { - return self.get_prompt_tokens(); - }, - [](WhisperFullParamsWrapper &self, const py::object &tokens) { - self.set_prompt_tokens(tokens); - }) + .def_readwrite("prompt_tokens", &WhisperFullParamsWrapper::prompt_tokens) .def("set_abort_callback", [](WhisperFullParamsWrapper &self, py::object callback) { if (callback.is_none()) { @@ -1041,8 +944,6 @@ PYBIND11_MODULE(_pywhispercpp, m) { "Assign an abort callback that returns True to stop processing.") .def("clear_abort_callback", &WhisperFullParamsWrapper::clear_abort_callback, "Clear any previously assigned abort callback.") - .def("set_prompt_tokens", &WhisperFullParamsWrapper::set_prompt_tokens, py::arg("tokens"), - "Copy prompt tokens into C++-owned storage and update the raw pointers safely.") .def_readwrite("prompt_n_tokens", &WhisperFullParamsWrapper::prompt_n_tokens) .def_readwrite("carry_initial_prompt", &WhisperFullParamsWrapper::carry_initial_prompt) .def_property("language", @@ -1072,21 +973,6 @@ PYBIND11_MODULE(_pywhispercpp, m) { [](WhisperFullParamsWrapper &self, py::dict dict) {self.greedy.best_of = dict["best_of"].cast();}) .def_property("beam_search", [](WhisperFullParamsWrapper &self) {return py::dict("beam_size"_a=self.beam_search.beam_size, "patience"_a=self.beam_search.patience);}, [](WhisperFullParamsWrapper &self, py::dict dict) {self.beam_search.beam_size = dict["beam_size"].cast(); self.beam_search.patience = 
dict["patience"].cast();}) - .def("set_grammar", - [](WhisperFullParamsWrapper &self, py::object grammar, py::object rule_name, float penalty) { - if (grammar.is_none()) { - self.clear_grammar(); - return; - } - - const std::string grammar_input = grammar.cast(); - const std::string rule_name_str = rule_name.is_none() ? "" : rule_name.cast(); - self.set_grammar(grammar_input, rule_name_str, penalty); - }, - py::arg("grammar"), py::arg("rule_name") = py::none(), py::arg("penalty") = -1.0f, - "Parse GBNF grammar text or a grammar file path and store the resulting grammar in C++-owned memory.") - .def("clear_grammar", &WhisperFullParamsWrapper::clear_grammar, - "Clear any previously configured grammar from the parameter object.") .def_readwrite("new_segment_callback_user_data", &WhisperFullParamsWrapper::new_segment_callback_user_data) .def_readwrite("encoder_begin_callback_user_data", &WhisperFullParamsWrapper::encoder_begin_callback_user_data) .def_readwrite("logits_filter_callback_user_data", &WhisperFullParamsWrapper::logits_filter_callback_user_data) From 83ba8cccbf97a64bc213ab0076dd50994c106c4a Mon Sep 17 00:00:00 2001 From: scottmonster <87917233+scottmonster@users.noreply.github.com> Date: Sun, 17 May 2026 20:24:10 -0500 Subject: [PATCH 06/16] update default in model.pyi --- pywhispercpp/model.pyi | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi index c548bea..8b9b323 100644 --- a/pywhispercpp/model.pyi +++ b/pywhispercpp/model.pyi @@ -1,7 +1,5 @@ from __future__ import annotations -# Generated by coverage/generate_pyi.py. Do not edit by hand. 
- from typing import Any, Callable, Dict, List, Optional, TextIO, Tuple, TypedDict, Union import numpy as np @@ -42,9 +40,7 @@ class Segment: class Model: - _new_segment_callback: Optional[Callable[[Segment], None]] - def __init__( self, @@ -56,7 +52,6 @@ class Model: openvino_model_path: Optional[str] = None, openvino_device: str = 'CPU', openvino_cache_dir: Optional[str] = None, - context_params: Optional[ContextParams] = None, *, n_threads: Optional[int] = None, n_max_text_ctx: int = 16384, @@ -95,10 +90,11 @@ class Model: entropy_thold: float = 2.4, logprob_thold: float = -1.0, no_speech_thold: float = 0.6, - greedy: GreedyParams = {'best_of': 5}, - beam_search: BeamSearchParams = {'beam_size': 5, 'patience': -1.0}, + greedy: GreedyParams = {'best_of': -1}, + beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0}, vad: bool = False, vad_model_path: Optional[str] = None, + **params )->None: ... def transcribe( @@ -145,11 +141,12 @@ class Model: entropy_thold: float = 2.4, logprob_thold: float = -1.0, no_speech_thold: float = 0.6, - greedy: GreedyParams = {'best_of': 5}, - beam_search: BeamSearchParams = {'beam_size': 5, 'patience': -1.0}, + greedy: GreedyParams = {'best_of': -1}, + beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0}, extract_probability: bool = False, vad: bool = False, vad_model_path: Optional[str] = None, + **params ) -> List[Segment]: ... def get_params(self) -> Dict[str, Any]: ... 
From eb7f21ec9b9c6eaccb52ae91abcd3f62cf89c68b Mon Sep 17 00:00:00 2001 From: scottmonster <87917233+scottmonster@users.noreply.github.com> Date: Tue, 19 May 2026 18:20:46 -0500 Subject: [PATCH 07/16] prompt_tokens + grammar + callbacks --- pywhispercpp/model.py | 43 ++++ pywhispercpp/model.pyi | 5 + src/main.cpp | 532 +++++++++++++++++++++++++++++++++++------ 3 files changed, 511 insertions(+), 69 deletions(-) diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py index 611dc9e..44adaeb 100644 --- a/pywhispercpp/model.py +++ b/pywhispercpp/model.py @@ -354,6 +354,43 @@ def _normalize_params(kwargs: dict) -> dict: return normalized + def _apply_grammar_params(self, normalized: dict) -> dict: + has_grammar = 'grammar' in normalized + has_grammar_rule = 'grammar_rule' in normalized + + if not has_grammar: + if has_grammar_rule: + raise AttributeError('grammar_rule requires grammar') + return normalized + + grammar = normalized.pop('grammar') + grammar_rule = normalized.pop('grammar_rule', 'root') + + if grammar is None: + self._params.clear_grammar() + return normalized + + self._params.set_grammar( + grammar, + grammar_rule, + normalized.get('grammar_penalty', self._params.grammar_penalty), + ) + return normalized + + def _apply_prompt_token_params(self, normalized: dict) -> dict: + if 'prompt_tokens' not in normalized: + return normalized + + prompt_tokens = normalized.pop('prompt_tokens') + normalized.pop('prompt_n_tokens', None) + + if prompt_tokens is None: + self._params.clear_prompt_tokens() + else: + self._params.set_prompt_tokens(prompt_tokens) + + return normalized + def _init_model(self) -> None: """ Private method to initialize the method from the bindings, it will be called automatically from the __init__ @@ -378,6 +415,12 @@ def _set_params(self, kwargs: dict) -> None: """ normalized = self._normalize_params(kwargs) + if 'grammar' in normalized or 'grammar_rule' in normalized: + normalized = self._apply_grammar_params(normalized) + + if 
'prompt_tokens' in normalized: + normalized = self._apply_prompt_token_params(normalized) + for param, value in normalized.items(): setattr(self._params, param, value) diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi index 8b9b323..3e0812b 100644 --- a/pywhispercpp/model.pyi +++ b/pywhispercpp/model.pyi @@ -75,6 +75,8 @@ class Model: audio_ctx: int = 0, tdrz_enable: bool = False, initial_prompt: Optional[str] = None, + grammar: Optional[str] = None, + grammar_rule: str = 'root', prompt_tokens: Optional[Tuple[Any, ...]] = None, prompt_n_tokens: int = 0, carry_initial_prompt: bool = False, @@ -126,6 +128,8 @@ class Model: audio_ctx: int = 0, tdrz_enable: bool = False, initial_prompt: Optional[str] = None, + grammar: Optional[str] = None, + grammar_rule: str = 'root', prompt_tokens: Optional[Tuple[Any, ...]] = None, prompt_n_tokens: int = 0, carry_initial_prompt: bool = False, @@ -141,6 +145,7 @@ class Model: entropy_thold: float = 2.4, logprob_thold: float = -1.0, no_speech_thold: float = 0.6, + grammar_penalty: float = 100.0, greedy: GreedyParams = {'best_of': -1}, beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0}, extract_probability: bool = False, diff --git a/src/main.cpp b/src/main.cpp index 7197cb7..815a962 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -15,7 +15,11 @@ #include #include +#include +#include + #include "whisper.h" +#include "../whisper.cpp/examples/grammar-parser.h" #define STRINGIFY(x) #x @@ -373,6 +377,15 @@ int whisper_model_ftype_wrapper(struct whisper_context_wrapper * ctx_w){ } bool _abort_callback(void * user_data); +void _new_segment_callback(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data); +bool _encoder_begin_callback(struct whisper_context * ctx, struct whisper_state * state, void * user_data); +void _logits_filter_callback( + struct whisper_context * ctx, + struct whisper_state * state, + const whisper_token_data * tokens, + int n_tokens, + float * logits, + void 
* user_data); int whisper_ctx_init_openvino_encoder_wrapper(struct whisper_context_wrapper * ctx, const char * model_path, const char * device, @@ -384,61 +397,105 @@ struct WhisperFullParamsWrapper : public whisper_full_params { std::string initial_prompt_str; std::string suppress_regex_str; std::string vad_model_path_str; + std::vector prompt_token_storage; + grammar_parser::parse_state grammar_state; + std::vector grammar_rule_ptrs; + + void reset_progress_callback() { + progress_callback_user_data = this; + progress_callback = [](struct whisper_context* ctx, struct whisper_state* state, int progress, void* user_data) { + (void) ctx; + (void) state; + auto* self = static_cast(user_data); + if (self && self->print_progress) { + if (self->py_progress_callback) { + py::gil_scoped_acquire gil; + if (self->py_progress_callback_user_data.is_none()) { + self->py_progress_callback(progress); + } else { + self->py_progress_callback(progress, self->py_progress_callback_user_data); + } + } else { + fprintf(stderr, "Progress: %3d%%\n", progress); + } + } + }; + } + + void sync_grammar_fields() { + grammar_rule_ptrs = grammar_state.c_rules(); + grammar_rules = grammar_rule_ptrs.empty() ? nullptr : grammar_rule_ptrs.data(); + n_grammar_rules = grammar_rule_ptrs.size(); + } + void sync_prompt_tokens() { + prompt_tokens = prompt_token_storage.empty() ? 
nullptr : prompt_token_storage.data(); + prompt_n_tokens = prompt_token_storage.size(); + } public: + py::function py_new_segment_callback; + py::object py_new_segment_callback_user_data; + py::function py_encoder_begin_callback; + py::object py_encoder_begin_callback_user_data; py::function py_progress_callback; - py::object py_abort_callback; + py::object py_progress_callback_user_data; + py::function py_logits_filter_callback; + py::object py_logits_filter_callback_user_data; + py::object py_abort_callback; + py::object py_abort_callback_user_data; WhisperFullParamsWrapper(const whisper_full_params& params = whisper_full_params()) : whisper_full_params(params), initial_prompt_str(params.initial_prompt ? params.initial_prompt : ""), suppress_regex_str(params.suppress_regex ? params.suppress_regex : ""), - vad_model_path_str(params.vad_model_path ? params.vad_model_path : "") + vad_model_path_str(params.vad_model_path ? params.vad_model_path : ""), + prompt_token_storage(), + py_new_segment_callback_user_data(py::none()), + py_encoder_begin_callback_user_data(py::none()), + py_progress_callback_user_data(py::none()), + py_logits_filter_callback_user_data(py::none()), + py_abort_callback(py::none()), + py_abort_callback_user_data(py::none()) { initial_prompt = initial_prompt_str.empty() ? nullptr : initial_prompt_str.c_str(); suppress_regex = suppress_regex_str.empty() ? nullptr : suppress_regex_str.c_str(); vad_model_path = vad_model_path_str.empty() ? 
nullptr : vad_model_path_str.c_str(); + new_segment_callback_user_data = this; + encoder_begin_callback_user_data = this; abort_callback_user_data = this; - // progress callback - progress_callback_user_data = this; - progress_callback = [](struct whisper_context* ctx, struct whisper_state* state, int progress, void* user_data) { - auto* self = static_cast(user_data); - if(self && self->print_progress){ - if (self->py_progress_callback) { - // call the python callback - py::gil_scoped_acquire gil; - self->py_progress_callback(progress); // Call Python callback - } - else { - fprintf(stderr, "Progress: %3d%%\n", progress); - } // Default message - } - } ; + logits_filter_callback_user_data = this; + if (params.prompt_tokens && params.prompt_n_tokens > 0) { + prompt_token_storage.assign(params.prompt_tokens, params.prompt_tokens + params.prompt_n_tokens); + } + sync_prompt_tokens(); + reset_progress_callback(); } WhisperFullParamsWrapper(const WhisperFullParamsWrapper& other) : whisper_full_params(static_cast(other)), // Copy base struct initial_prompt_str(other.initial_prompt_str), suppress_regex_str(other.suppress_regex_str), vad_model_path_str(other.vad_model_path_str), - py_progress_callback(other.py_progress_callback), - py_abort_callback(other.py_abort_callback) { + prompt_token_storage(other.prompt_token_storage), + grammar_state(other.grammar_state), + py_new_segment_callback(other.py_new_segment_callback), + py_new_segment_callback_user_data(other.py_new_segment_callback_user_data), + py_encoder_begin_callback(other.py_encoder_begin_callback), + py_encoder_begin_callback_user_data(other.py_encoder_begin_callback_user_data), + py_progress_callback(other.py_progress_callback), + py_progress_callback_user_data(other.py_progress_callback_user_data), + py_logits_filter_callback(other.py_logits_filter_callback), + py_logits_filter_callback_user_data(other.py_logits_filter_callback_user_data), + py_abort_callback(other.py_abort_callback), + 
py_abort_callback_user_data(other.py_abort_callback_user_data) { // Reset pointers to new string copies initial_prompt = initial_prompt_str.empty() ? nullptr : initial_prompt_str.c_str(); suppress_regex = suppress_regex_str.empty() ? nullptr : suppress_regex_str.c_str(); vad_model_path = vad_model_path_str.empty() ? nullptr : vad_model_path_str.c_str(); + new_segment_callback_user_data = this; + encoder_begin_callback_user_data = this; abort_callback_user_data = this; - progress_callback_user_data = this; - progress_callback = [](struct whisper_context* ctx, struct whisper_state* state, int progress, void* user_data) { - auto* self = static_cast(user_data); - if(self && self->print_progress){ - if (self->py_progress_callback) { - // call the python callback - py::gil_scoped_acquire gil; - self->py_progress_callback(progress); // Call Python callback - } - else { - fprintf(stderr, "Progress: %3d%%\n", progress); - } // Default message - } - }; + logits_filter_callback_user_data = this; + sync_prompt_tokens(); + sync_grammar_fields(); + reset_progress_callback(); } void set_initial_prompt(const std::string& prompt) { initial_prompt_str = prompt; @@ -452,16 +509,152 @@ struct WhisperFullParamsWrapper : public whisper_full_params { vad_model_path_str = model_path; vad_model_path = vad_model_path_str.c_str(); } - void set_abort_callback(py::function callback) { - py_abort_callback = callback; - abort_callback_user_data = this; - abort_callback = _abort_callback; + py::tuple get_prompt_tokens() const { + py::tuple tokens(prompt_token_storage.size()); + for (size_t index = 0; index < prompt_token_storage.size(); ++index) { + tokens[index] = prompt_token_storage[index]; + } + return tokens; + } + void set_prompt_tokens(const std::vector& tokens) { + prompt_token_storage = tokens; + sync_prompt_tokens(); + } + void clear_prompt_tokens() { + prompt_token_storage.clear(); + sync_prompt_tokens(); + } + py::object get_new_segment_callback_user_data() const { + return 
py_new_segment_callback_user_data; + } + void set_new_segment_callback_user_data(py::object user_data) { + py_new_segment_callback_user_data = std::move(user_data); + new_segment_callback_user_data = this; + } + void set_new_segment_callback(py::function callback) { + py_new_segment_callback = std::move(callback); + new_segment_callback_user_data = this; + new_segment_callback = _new_segment_callback; + } + void clear_new_segment_callback() { + py_new_segment_callback = py::function(); + new_segment_callback = nullptr; + new_segment_callback_user_data = this; + } + py::object get_encoder_begin_callback_user_data() const { + return py_encoder_begin_callback_user_data; + } + void set_encoder_begin_callback_user_data(py::object user_data) { + py_encoder_begin_callback_user_data = std::move(user_data); + encoder_begin_callback_user_data = this; + } + void set_encoder_begin_callback(py::function callback) { + py_encoder_begin_callback = std::move(callback); + encoder_begin_callback_user_data = this; + encoder_begin_callback = _encoder_begin_callback; + } + void clear_encoder_begin_callback() { + py_encoder_begin_callback = py::function(); + encoder_begin_callback = nullptr; + encoder_begin_callback_user_data = this; + } + py::object get_progress_callback_user_data() const { + return py_progress_callback_user_data; + } + void set_progress_callback_user_data(py::object user_data) { + py_progress_callback_user_data = std::move(user_data); + progress_callback_user_data = this; + } + void set_progress_callback(py::function callback) { + py_progress_callback = callback; + reset_progress_callback(); + } + void clear_progress_callback() { + py_progress_callback = py::function(); + reset_progress_callback(); + } + py::object get_logits_filter_callback_user_data() const { + return py_logits_filter_callback_user_data; + } + void set_logits_filter_callback_user_data(py::object user_data) { + py_logits_filter_callback_user_data = std::move(user_data); + 
logits_filter_callback_user_data = this; + } + void set_logits_filter_callback(py::function callback) { + py_logits_filter_callback = std::move(callback); + logits_filter_callback_user_data = this; + logits_filter_callback = _logits_filter_callback; + } + void clear_logits_filter_callback() { + py_logits_filter_callback = py::function(); + logits_filter_callback = nullptr; + logits_filter_callback_user_data = this; + } + py::object get_abort_callback_user_data() const { + return py_abort_callback_user_data; + } + void set_abort_callback_user_data(py::object user_data) { + py_abort_callback_user_data = std::move(user_data); + abort_callback_user_data = this; + } + void set_abort_callback(py::function callback) { + py_abort_callback = callback; + abort_callback_user_data = this; + abort_callback = _abort_callback; + } + void clear_abort_callback() { + py_abort_callback = py::none(); + abort_callback = nullptr; + abort_callback_user_data = this; + } + void set_grammar(const std::string& grammar, const std::string& start_rule, float penalty) { + if (grammar.empty()) { + clear_grammar(); + grammar_penalty = penalty; + return; } - void clear_abort_callback() { - py_abort_callback = py::none(); - abort_callback = nullptr; - abort_callback_user_data = this; + + std::ifstream grammar_file(grammar); + std::string grammar_source; + if (grammar_file.good()) { + grammar_source.assign( + std::istreambuf_iterator(grammar_file), + std::istreambuf_iterator()); + } else { + grammar_source = grammar; + } + + auto parsed = grammar_parser::parse(grammar_source.c_str()); + auto rule_iter = parsed.symbol_ids.find(start_rule); + if (rule_iter == parsed.symbol_ids.end()) { + throw std::runtime_error("unknown grammar start rule: " + start_rule); } + + grammar_state = std::move(parsed); + sync_grammar_fields(); + i_start_rule = rule_iter->second; + grammar_penalty = penalty; + } + void clear_grammar() { + grammar_state = grammar_parser::parse_state(); + grammar_rule_ptrs.clear(); + 
grammar_rules = nullptr; + n_grammar_rules = 0; + i_start_rule = 0; + } + py::list get_grammar_rules() const { + py::list rules; + for (const auto& rule : grammar_state.rules) { + py::list elements; + for (const auto& element : rule) { + elements.append(py::dict( + "type"_a = static_cast(element.type), + "value"_a = element.value)); + } + rules.append(elements); + } + return rules; + } }; WhisperFullParamsWrapper whisper_full_default_params_wrapper(enum whisper_sampling_strategy strategy) { return WhisperFullParamsWrapper(whisper_full_default_params(strategy)); @@ -470,30 +663,72 @@ WhisperFullParamsWrapper whisper_full_default_params_wrapper(enum whisper_sampl // callbacks mechanism void _new_segment_callback(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data){ + (void) state; struct whisper_context_wrapper ctx_w; ctx_w.ptr = ctx; - // call the python callback - py::gil_scoped_acquire gil; // Acquire the GIL while in this scope. - py_new_segment_callback(ctx_w, n_new, user_data); + auto * params = static_cast(user_data); + if (!params || !params->py_new_segment_callback) { + return; + } + + py::gil_scoped_acquire gil; + py::function callback = params->py_new_segment_callback; + callback( + ctx_w, + n_new, + params->py_new_segment_callback_user_data.is_none() + ? 
py::none() + : params->py_new_segment_callback_user_data); }; -void assign_new_segment_callback(struct whisper_full_params *params, py::function f){ - params->new_segment_callback = _new_segment_callback; - py_new_segment_callback = f; +void assign_new_segment_callback(struct whisper_full_params *params_base, py::object callback){ + auto * params = static_cast(params_base); + if (callback.is_none()) { + params->clear_new_segment_callback(); + return; + } + + params->set_new_segment_callback(callback.cast()); +} + +void clear_new_segment_callback(struct whisper_full_params *params_base) { + auto * params = static_cast(params_base); + params->clear_new_segment_callback(); }; bool _encoder_begin_callback(struct whisper_context * ctx, struct whisper_state * state, void * user_data){ + (void) state; struct whisper_context_wrapper ctx_w; ctx_w.ptr = ctx; - // call the python callback - py::object result_py = py_encoder_begin_callback(ctx_w, user_data); + auto * params = static_cast(user_data); + if (!params || !params->py_encoder_begin_callback) { + return false; + } + + py::gil_scoped_acquire gil; + py::function callback = params->py_encoder_begin_callback; + py::object result_py = callback( + ctx_w, + params->py_encoder_begin_callback_user_data.is_none() + ? 
py::none() + : params->py_encoder_begin_callback_user_data); bool res = result_py.cast(); return res; } -void assign_encoder_begin_callback(struct whisper_full_params *params, py::function f){ - params->encoder_begin_callback = _encoder_begin_callback; - py_encoder_begin_callback = f; +void assign_encoder_begin_callback(struct whisper_full_params *params_base, py::object callback){ + auto * params = static_cast(params_base); + if (callback.is_none()) { + params->clear_encoder_begin_callback(); + return; + } + + params->set_encoder_begin_callback(callback.cast()); +} + +void clear_encoder_begin_callback(struct whisper_full_params *params_base) { + auto * params = static_cast(params_base); + params->clear_encoder_begin_callback(); } void _logits_filter_callback( @@ -503,15 +738,54 @@ void _logits_filter_callback( int n_tokens, float * logits, void * user_data){ + (void) state; + (void) tokens; struct whisper_context_wrapper ctx_w; ctx_w.ptr = ctx; - // call the python callback - py_logits_filter_callback(ctx_w, n_tokens, logits, user_data); + auto * params = static_cast(user_data); + if (!params || !params->py_logits_filter_callback) { + return; + } + + py::gil_scoped_acquire gil; + py::function callback = params->py_logits_filter_callback; + callback( + ctx_w, + n_tokens, + logits, + params->py_logits_filter_callback_user_data.is_none() + ? 
py::none() + : params->py_logits_filter_callback_user_data); +} + +void assign_logits_filter_callback(struct whisper_full_params *params_base, py::object callback){ + auto * params = static_cast(params_base); + if (callback.is_none()) { + params->clear_logits_filter_callback(); + return; + } + + params->set_logits_filter_callback(callback.cast()); +} + +void clear_logits_filter_callback(struct whisper_full_params *params_base) { + auto * params = static_cast(params_base); + params->clear_logits_filter_callback(); } -void assign_logits_filter_callback(struct whisper_full_params *params, py::function f){ - params->logits_filter_callback = _logits_filter_callback; - py_logits_filter_callback = f; +void assign_progress_callback(whisper_full_params *params_base, py::object callback) { + auto * params = static_cast(params_base); + if (callback.is_none()) { + params->clear_progress_callback(); + return; + } + + params->set_progress_callback(callback.cast()); +} + +void clear_progress_callback(whisper_full_params *params_base) { + auto * params = static_cast(params_base); + params->clear_progress_callback(); } bool _abort_callback(void * user_data) { @@ -522,7 +796,9 @@ bool _abort_callback(void * user_data) { py::gil_scoped_acquire gil; py::function callback = params->py_abort_callback.cast(); - py::object result_py = callback(); + py::object result_py = params->py_abort_callback_user_data.is_none() + ? 
callback() + : callback(params->py_abort_callback_user_data); return result_py.cast(); } @@ -905,6 +1181,18 @@ PYBIND11_MODULE(_pywhispercpp, m) { .def_readwrite("print_special", &WhisperFullParamsWrapper::print_special) .def_readwrite("print_progress", &WhisperFullParamsWrapper::print_progress) .def_readwrite("progress_callback", &WhisperFullParamsWrapper::py_progress_callback) + .def("set_progress_callback", + [](WhisperFullParamsWrapper &self, py::object callback) { + if (callback.is_none()) { + self.clear_progress_callback(); + } else { + self.set_progress_callback(callback.cast()); + } + }, + py::arg("callback") = py::none(), + "Assign a progress callback that receives progress updates.") + .def("clear_progress_callback", &WhisperFullParamsWrapper::clear_progress_callback, + "Clear any previously assigned progress callback while preserving default progress behavior.") .def_readwrite("print_realtime", &WhisperFullParamsWrapper::print_realtime) .def_readwrite("print_timestamps", &WhisperFullParamsWrapper::print_timestamps) .def_readwrite("token_timestamps", &WhisperFullParamsWrapper::token_timestamps) @@ -931,7 +1219,46 @@ PYBIND11_MODULE(_pywhispercpp, m) { self.set_initial_prompt(initial_prompt); } ) - .def_readwrite("prompt_tokens", &WhisperFullParamsWrapper::prompt_tokens) + .def_property("prompt_tokens", + [](WhisperFullParamsWrapper &self) { + return self.get_prompt_tokens(); + }, + [](WhisperFullParamsWrapper &self, py::object tokens) { + if (tokens.is_none()) { + self.clear_prompt_tokens(); + } else { + self.set_prompt_tokens(tokens.cast>()); + } + }) + .def("set_prompt_tokens", &WhisperFullParamsWrapper::set_prompt_tokens, + py::arg("tokens"), + "Assign prompt tokens from a Python sequence.") + .def("clear_prompt_tokens", &WhisperFullParamsWrapper::clear_prompt_tokens, + "Clear any previously assigned prompt tokens.") + .def("set_new_segment_callback", + [](WhisperFullParamsWrapper &self, py::object callback) { + if (callback.is_none()) { + 
self.clear_new_segment_callback(); + } else { + self.set_new_segment_callback(callback.cast()); + } + }, + py::arg("callback") = py::none(), + "Assign a new-segment callback.") + .def("clear_new_segment_callback", &WhisperFullParamsWrapper::clear_new_segment_callback, + "Clear any previously assigned new-segment callback.") + .def("set_encoder_begin_callback", + [](WhisperFullParamsWrapper &self, py::object callback) { + if (callback.is_none()) { + self.clear_encoder_begin_callback(); + } else { + self.set_encoder_begin_callback(callback.cast()); + } + }, + py::arg("callback") = py::none(), + "Assign an encoder-begin callback.") + .def("clear_encoder_begin_callback", &WhisperFullParamsWrapper::clear_encoder_begin_callback, + "Clear any previously assigned encoder-begin callback.") .def("set_abort_callback", [](WhisperFullParamsWrapper &self, py::object callback) { if (callback.is_none()) { @@ -973,10 +1300,42 @@ PYBIND11_MODULE(_pywhispercpp, m) { [](WhisperFullParamsWrapper &self, py::dict dict) {self.greedy.best_of = dict["best_of"].cast();}) .def_property("beam_search", [](WhisperFullParamsWrapper &self) {return py::dict("beam_size"_a=self.beam_search.beam_size, "patience"_a=self.beam_search.patience);}, [](WhisperFullParamsWrapper &self, py::dict dict) {self.beam_search.beam_size = dict["beam_size"].cast(); self.beam_search.patience = dict["patience"].cast();}) - .def_readwrite("new_segment_callback_user_data", &WhisperFullParamsWrapper::new_segment_callback_user_data) - .def_readwrite("encoder_begin_callback_user_data", &WhisperFullParamsWrapper::encoder_begin_callback_user_data) - .def_readwrite("logits_filter_callback_user_data", &WhisperFullParamsWrapper::logits_filter_callback_user_data) - .def_readwrite("grammar_penalty", &WhisperFullParamsWrapper::grammar_penalty) + .def_property("new_segment_callback_user_data", + &WhisperFullParamsWrapper::get_new_segment_callback_user_data, + &WhisperFullParamsWrapper::set_new_segment_callback_user_data) + 
.def_property("progress_callback_user_data", + &WhisperFullParamsWrapper::get_progress_callback_user_data, + &WhisperFullParamsWrapper::set_progress_callback_user_data) + .def_property("encoder_begin_callback_user_data", + &WhisperFullParamsWrapper::get_encoder_begin_callback_user_data, + &WhisperFullParamsWrapper::set_encoder_begin_callback_user_data) + .def_property("abort_callback_user_data", + &WhisperFullParamsWrapper::get_abort_callback_user_data, + &WhisperFullParamsWrapper::set_abort_callback_user_data) + .def_property("logits_filter_callback_user_data", + &WhisperFullParamsWrapper::get_logits_filter_callback_user_data, + &WhisperFullParamsWrapper::set_logits_filter_callback_user_data) + .def_property_readonly("grammar_rules", &WhisperFullParamsWrapper::get_grammar_rules) + .def_property_readonly("n_grammar_rules", [](const WhisperFullParamsWrapper &self) { return self.n_grammar_rules; }) + .def_property_readonly("i_start_rule", [](const WhisperFullParamsWrapper &self) { return self.i_start_rule; }) + .def("set_grammar", &WhisperFullParamsWrapper::set_grammar, + py::arg("grammar"), py::arg("start_rule") = "root", py::arg("penalty") = 100.0f, + "Parse grammar text or a grammar file path and assign it to the params.") + .def("set_logits_filter_callback", + [](WhisperFullParamsWrapper &self, py::object callback) { + if (callback.is_none()) { + self.clear_logits_filter_callback(); + } else { + self.set_logits_filter_callback(callback.cast()); + } + }, + py::arg("callback") = py::none(), + "Assign a logits-filter callback.") + .def("clear_logits_filter_callback", &WhisperFullParamsWrapper::clear_logits_filter_callback, + "Clear any previously assigned logits-filter callback.") + .def("clear_grammar", &WhisperFullParamsWrapper::clear_grammar, + "Clear any previously assigned grammar.") + .def_readwrite("grammar_penalty", &WhisperFullParamsWrapper::grammar_penalty) .def_readwrite("vad", &WhisperFullParamsWrapper::vad) .def_property("vad_model_path", 
[](WhisperFullParamsWrapper &self) { @@ -1044,14 +1403,49 @@ PYBIND11_MODULE(_pywhispercpp, m) { // Helper mechanism to set callbacks from python // The only difference from the C-Style API - m.def("assign_new_segment_callback", &assign_new_segment_callback, "Assigns a new_segment_callback, takes instance and a callable function with the same parameters which are defined in the interface", - py::arg("params"), py::arg("callback")); + m.def("assign_new_segment_callback", + [](whisper_full_params * params, py::object callback) { + assign_new_segment_callback(params, callback); + }, + "Assign a new-segment callback.", + py::arg("params"), py::arg("callback") = py::none()); - m.def("assign_encoder_begin_callback", &assign_encoder_begin_callback, "Assigns an encoder_begin_callback, takes instance and a callable function with the same parameters which are defined in the interface", - py::arg("params"), py::arg("callback")); + m.def("clear_new_segment_callback", &clear_new_segment_callback, + "Clear any previously assigned new-segment callback.", + py::arg("params")); + + m.def("assign_encoder_begin_callback", + [](whisper_full_params * params, py::object callback) { + assign_encoder_begin_callback(params, callback); + }, + "Assign an encoder-begin callback.", + py::arg("params"), py::arg("callback") = py::none()); + + m.def("clear_encoder_begin_callback", &clear_encoder_begin_callback, + "Clear any previously assigned encoder-begin callback.", + py::arg("params")); + + m.def("assign_logits_filter_callback", + [](whisper_full_params * params, py::object callback) { + assign_logits_filter_callback(params, callback); + }, + "Assign a logits-filter callback.", + py::arg("params"), py::arg("callback") = py::none()); + + m.def("clear_logits_filter_callback", &clear_logits_filter_callback, + "Clear any previously assigned logits-filter callback.", + py::arg("params")); + + m.def("assign_progress_callback", + [](whisper_full_params * params, py::object callback) { + 
assign_progress_callback(params, callback); + }, + "Assign a progress callback that receives progress updates.", + py::arg("params"), py::arg("callback") = py::none()); - m.def("assign_logits_filter_callback", &assign_logits_filter_callback, "Assigns a logits_filter_callback, takes instance and a callable function with the same parameters which are defined in the interface", - py::arg("params"), py::arg("callback")); + m.def("clear_progress_callback", &clear_progress_callback, + "Clear any previously assigned progress callback while preserving default progress behavior.", + py::arg("params")); m.def("assign_abort_callback", [](whisper_full_params * params, py::object callback) { From 66a46d458787f6338cae4692e4105a1c8e3a6b83 Mon Sep 17 00:00:00 2001 From: scottmonster <87917233+scottmonster@users.noreply.github.com> Date: Tue, 19 May 2026 19:06:37 -0500 Subject: [PATCH 08/16] begin callback normalization --- src/main.cpp | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 815a962..a55d3ba 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -33,9 +33,6 @@ namespace py = pybind11; using namespace pybind11::literals; // to bring in the `_a` literal -py::function py_new_segment_callback; -py::function py_encoder_begin_callback; -py::function py_logits_filter_callback; py::object py_log_callback; @@ -440,7 +437,7 @@ struct WhisperFullParamsWrapper : public whisper_full_params { py::object py_progress_callback_user_data; py::function py_logits_filter_callback; py::object py_logits_filter_callback_user_data; - py::object py_abort_callback; + py::function py_abort_callback; py::object py_abort_callback_user_data; WhisperFullParamsWrapper(const whisper_full_params& params = whisper_full_params()) : whisper_full_params(params), @@ -452,7 +449,7 @@ struct WhisperFullParamsWrapper : public whisper_full_params { py_encoder_begin_callback_user_data(py::none()), py_progress_callback_user_data(py::none()), 
py_logits_filter_callback_user_data(py::none()), - py_abort_callback(py::none()), + py_abort_callback(), py_abort_callback_user_data(py::none()) { initial_prompt = initial_prompt_str.empty() ? nullptr : initial_prompt_str.c_str(); @@ -598,12 +595,12 @@ struct WhisperFullParamsWrapper : public whisper_full_params { abort_callback_user_data = this; } void set_abort_callback(py::function callback) { - py_abort_callback = callback; + py_abort_callback = std::move(callback); abort_callback_user_data = this; abort_callback = _abort_callback; } void clear_abort_callback() { - py_abort_callback = py::none(); + py_abort_callback = py::function(); abort_callback = nullptr; abort_callback_user_data = this; } @@ -790,12 +787,12 @@ void clear_progress_callback(whisper_full_params *params_base) { bool _abort_callback(void * user_data) { auto * params = static_cast(user_data); - if (!params || !params->py_abort_callback || params->py_abort_callback.is_none()) { + if (!params || !params->py_abort_callback) { return false; } py::gil_scoped_acquire gil; - py::function callback = params->py_abort_callback.cast(); + py::function callback = params->py_abort_callback; py::object result_py = params->py_abort_callback_user_data.is_none() ? 
callback() : callback(params->py_abort_callback_user_data); @@ -805,22 +802,16 @@ bool _abort_callback(void * user_data) { void assign_abort_callback(whisper_full_params *params_base, py::object callback){ auto * params = static_cast(params_base); if (callback.is_none()) { - params->py_abort_callback = py::none(); - params->abort_callback = nullptr; - params->abort_callback_user_data = params; + params->clear_abort_callback(); return; } - params->py_abort_callback = callback.cast(); - params->abort_callback_user_data = params; - params->abort_callback = _abort_callback; + params->set_abort_callback(callback.cast()); } void clear_abort_callback(whisper_full_params *params_base) { auto * params = static_cast(params_base); - params->py_abort_callback = py::none(); - params->abort_callback = nullptr; - params->abort_callback_user_data = params; + params->clear_abort_callback(); } void whisper_log_set_wrapper(py::object callback) { From d3e68a810360850601850e266ad9800b801788c3 Mon Sep 17 00:00:00 2001 From: scottmonster <87917233+scottmonster@users.noreply.github.com> Date: Tue, 19 May 2026 21:13:19 -0500 Subject: [PATCH 09/16] finish callback normalization --- .gitignore | 9 +- pywhispercpp/model.py | 26 +++-- pywhispercpp/model.pyi | 6 +- src/main.cpp | 51 +++++---- tests/test_backwards_compatibility.py | 153 -------------------------- 5 files changed, 54 insertions(+), 191 deletions(-) delete mode 100644 tests/test_backwards_compatibility.py diff --git a/.gitignore b/.gitignore index 3e25b4f..d28e8f1 100644 --- a/.gitignore +++ b/.gitignore @@ -7,12 +7,15 @@ _generate/ *.py[cod] *.egg-info *env* +# install -e artifacts _version.py - -coverage libggml* libwhisper* -updating + +# ignore downloaded source code... 
really this is just for quickly checking previous versions +pywhispercpp-*.* + + # custom .idea diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py index 44adaeb..26701fd 100644 --- a/pywhispercpp/model.py +++ b/pywhispercpp/model.py @@ -81,7 +81,7 @@ class Model: ``` """ - _new_segment_callback = None + def __init__(self, model: str = 'tiny', @@ -144,6 +144,7 @@ def __init__(self, - `suppress_blank`: suppress blank outputs. Default `True`. - `suppress_non_speech_tokens`: Python alias for `suppress_nst`. Default `False`. - `suppress_nst`: suppress non-speech tokens. Default `False`. + - `suppress_regex`: regex pattern used to suppress matching text during decoding. Default `''`. - `temperature`: initial decoding temperature. Default `0.0`. - `max_initial_ts`: maximum initial timestamp. Default `1.0`. - `length_penalty`: length penalty. Default `-1.0`. @@ -153,7 +154,7 @@ def __init__(self, - `no_speech_thold`: no-speech threshold. Default `0.6`. - `grammar_penalty`: penalty applied to non-grammar tokens. Default `100.0`. - `greedy`: greedy-decoder settings, typically `{"best_of": 5}`. - - `beam_search`: beam-search settings, schema default `{"beam_size": 5, "patience": -1.0}`. + - `beam_search`: beam-search settings, schema default `{"beam_size": -1, "patience": -1.0}`. - `vad`: enable VAD. Default `False`. - `vad_model_path`: path to the VAD model. Default `None`. """ @@ -171,6 +172,8 @@ def __init__(self, self.openvino_model_path = openvino_model_path self.openvino_device = openvino_device self.openvino_cache_dir = openvino_cache_dir + # todo... maybe setup default callbacks for segments and abort globaly and/or per model instance? 
+ self._new_segment_callback = None # init the model self._init_model() @@ -208,10 +211,13 @@ def transcribe(self, # update params if any self._set_params(params) - # setting up callback - if new_segment_callback: - Model._new_segment_callback = new_segment_callback - pw.assign_new_segment_callback(self._params, Model.__call_new_segment_callback) + # setting up callback. make sure self._new_segment_callback = None when new_segment_callback = None. + # since this is no lonmger bound to the Model but on self + self._new_segment_callback = new_segment_callback + pw.assign_new_segment_callback( + self._params, + self.__call_new_segment_callback if new_segment_callback is not None else None, + ) pw.assign_abort_callback(self._params, abort_callback) @@ -441,8 +447,8 @@ def _transcribe(self, audio: np.ndarray, n_processors: Optional[int] = None): res = Model._get_segments(self._ctx, 0, n, self.extract_probability) return res - @staticmethod - def __call_new_segment_callback(ctx, n_new, user_data) -> None: + + def __call_new_segment_callback(self, ctx, n_new, user_data=None) -> None: """ Internal new_segment_callback, it just calls the user's callback with the `Segment` object :param ctx: whisper.cpp ctx param @@ -454,8 +460,8 @@ def __call_new_segment_callback(ctx, n_new, user_data) -> None: start = n - n_new res = Model._get_segments(ctx, start, n, False) for segment in res: - if Model._new_segment_callback is not None: - Model._new_segment_callback(segment) + if self._new_segment_callback is not None: + self._new_segment_callback(segment) @staticmethod def _load_audio(media_file_path: str) -> np.ndarray: diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi index 3e0812b..936c6ac 100644 --- a/pywhispercpp/model.pyi +++ b/pywhispercpp/model.pyi @@ -85,6 +85,7 @@ class Model: suppress_blank: bool = True, suppress_non_speech_tokens: bool = False, suppress_nst: bool = False, + suppress_regex: str = '', temperature: float = 0.0, max_initial_ts: float = 1.0, 
length_penalty: float = -1.0, @@ -92,7 +93,7 @@ class Model: entropy_thold: float = 2.4, logprob_thold: float = -1.0, no_speech_thold: float = 0.6, - greedy: GreedyParams = {'best_of': -1}, + greedy: GreedyParams = {'best_of': 5}, beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0}, vad: bool = False, vad_model_path: Optional[str] = None, @@ -138,6 +139,7 @@ class Model: suppress_blank: bool = True, suppress_non_speech_tokens: bool = False, suppress_nst: bool = False, + suppress_regex: str = '', temperature: float = 0.0, max_initial_ts: float = 1.0, length_penalty: float = -1.0, @@ -146,7 +148,7 @@ class Model: logprob_thold: float = -1.0, no_speech_thold: float = 0.6, grammar_penalty: float = 100.0, - greedy: GreedyParams = {'best_of': -1}, + greedy: GreedyParams = {'best_of': 5}, beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0}, extract_probability: bool = False, vad: bool = False, diff --git a/src/main.cpp b/src/main.cpp index a55d3ba..c76d8be 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -32,6 +32,10 @@ namespace py = pybind11; using namespace pybind11::literals; // to bring in the `_a` literal +inline bool has_python_user_data(const py::object & obj) { + return obj.ptr() != nullptr && obj.ptr() != Py_None; +} + py::object py_log_callback; @@ -407,7 +411,7 @@ struct WhisperFullParamsWrapper : public whisper_full_params { if (self && self->print_progress) { if (self->py_progress_callback) { py::gil_scoped_acquire gil; - if (self->py_progress_callback_user_data.is_none()) { + if (!has_python_user_data(self->py_progress_callback_user_data)) { self->py_progress_callback(progress); } else { self->py_progress_callback(progress, self->py_progress_callback_user_data); @@ -563,7 +567,7 @@ struct WhisperFullParamsWrapper : public whisper_full_params { progress_callback_user_data = this; } void set_progress_callback(py::function callback) { - py_progress_callback = callback; + py_progress_callback = std::move(callback); 
reset_progress_callback(); } void clear_progress_callback() { @@ -670,12 +674,11 @@ void _new_segment_callback(struct whisper_context * ctx, struct whisper_state * py::gil_scoped_acquire gil; py::function callback = params->py_new_segment_callback; - callback( - ctx_w, - n_new, - params->py_new_segment_callback_user_data.is_none() - ? py::none() - : params->py_new_segment_callback_user_data); + if (!has_python_user_data(params->py_new_segment_callback_user_data)) { + callback(ctx_w, n_new); + } else { + callback(ctx_w, n_new, params->py_new_segment_callback_user_data); + } }; void assign_new_segment_callback(struct whisper_full_params *params_base, py::object callback){ @@ -704,11 +707,12 @@ bool _encoder_begin_callback(struct whisper_context * ctx, struct whisper_state py::gil_scoped_acquire gil; py::function callback = params->py_encoder_begin_callback; - py::object result_py = callback( - ctx_w, - params->py_encoder_begin_callback_user_data.is_none() - ? py::none() - : params->py_encoder_begin_callback_user_data); + py::object result_py; + if (!has_python_user_data(params->py_encoder_begin_callback_user_data)) { + result_py = callback(ctx_w); + } else { + result_py = callback(ctx_w, params->py_encoder_begin_callback_user_data); + } bool res = result_py.cast(); return res; } @@ -746,13 +750,11 @@ void _logits_filter_callback( py::gil_scoped_acquire gil; py::function callback = params->py_logits_filter_callback; - callback( - ctx_w, - n_tokens, - logits, - params->py_logits_filter_callback_user_data.is_none() - ? 
py::none() - : params->py_logits_filter_callback_user_data); + if (!has_python_user_data(params->py_logits_filter_callback_user_data)) { + callback(ctx_w, n_tokens, logits); + } else { + callback(ctx_w, n_tokens, logits, params->py_logits_filter_callback_user_data); + } } void assign_logits_filter_callback(struct whisper_full_params *params_base, py::object callback){ @@ -793,9 +795,12 @@ bool _abort_callback(void * user_data) { py::gil_scoped_acquire gil; py::function callback = params->py_abort_callback; - py::object result_py = params->py_abort_callback_user_data.is_none() - ? callback() - : callback(params->py_abort_callback_user_data); + py::object result_py; + if (!has_python_user_data(params->py_abort_callback_user_data)) { + result_py = callback(); + } else { + result_py = callback(params->py_abort_callback_user_data); + } return result_py.cast(); } diff --git a/tests/test_backwards_compatibility.py b/tests/test_backwards_compatibility.py deleted file mode 100644 index 4e21cdc..0000000 --- a/tests/test_backwards_compatibility.py +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import gc -import subprocess -import sys -import textwrap -import unittest -from pathlib import Path -from unittest import TestCase - -import _pywhispercpp as pw - -from pywhispercpp.model import Model, Segment - - -WHISPER_CPP_DIR = Path(__file__).parent.parent / 'whisper.cpp' - - -class TestBackwardsCompatibility(TestCase): - audio_file = WHISPER_CPP_DIR / 'samples/jfk.wav' - models_dir = str(WHISPER_CPP_DIR / 'models') - repo_root = Path(__file__).parent.parent - - def tearDown(self): - gc.collect() - - def _create_cpu_model(self): - return Model( - 'tiny', - models_dir=self.models_dir, - context_params={'use_gpu': False, 'flash_attn': False}, - ) - - def _run_python(self, code: str): - result = subprocess.run( - [sys.executable, '-c', textwrap.dedent(code)], - cwd=self.repo_root, - capture_output=True, - text=True, - ) - self.assertEqual( - 
result.returncode, - 0, - msg=f"stdout:\n{result.stdout}\n\nstderr:\n{result.stderr}", - ) - - def test_legacy_model_constructor_still_works(self): - self._run_python( - f''' - from pywhispercpp.model import Model - - model = Model('tiny', models_dir={self.models_dir!r}) - assert isinstance(model, Model) - ''' - ) - - def test_legacy_alias_still_maps_to_suppress_nst(self): - self._run_python( - f''' - from pywhispercpp.model import Model - - model = Model( - 'tiny', - models_dir={self.models_dir!r}, - context_params={{'use_gpu': False, 'flash_attn': False}}, - ) - model._set_params({{'suppress_non_speech_tokens': True}}) - assert model.get_params()['suppress_nst'] is True - ''' - ) - - def test_low_level_prompt_tokens_property_round_trips(self): - params = pw.whisper_full_default_params( - pw.whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY - ) - params.prompt_tokens = (1, 2, 3) - self.assertEqual(tuple(params.prompt_tokens), (1, 2, 3)) - self.assertEqual(params.prompt_n_tokens, 3) - - def test_context_params_dict_is_additive(self): - self._run_python( - f''' - from pywhispercpp.model import Model - - model = Model( - 'tiny', - models_dir={self.models_dir!r}, - context_params={{'use_gpu': False, 'flash_attn': False}}, - ) - assert isinstance(model, Model) - ''' - ) - - def test_existing_new_segment_callback_still_works(self): - self._run_python( - f''' - from pywhispercpp.model import Model, Segment - - seen = [] - model = Model( - 'tiny', - models_dir={self.models_dir!r}, - context_params={{'use_gpu': False, 'flash_attn': False}}, - ) - - def on_segment(segment): - seen.append(segment) - - segments = model.transcribe({str(self.audio_file)!r}, new_segment_callback=on_segment) - assert isinstance(segments, list) - assert len(seen) > 0 - assert all(isinstance(segment, Segment) for segment in seen) - ''' - ) - - def test_abort_callback_can_abort_and_then_clear(self): - self._run_python( - f''' - from pywhispercpp.model import Model - - model = Model( - 'tiny', - 
models_dir={self.models_dir!r}, - context_params={{'use_gpu': False, 'flash_attn': False}}, - ) - callback_calls = [] - - def abort_immediately(): - callback_calls.append(True) - return True - - aborted_segments = model.transcribe({str(self.audio_file)!r}, abort_callback=abort_immediately) - assert isinstance(aborted_segments, list) - assert len(callback_calls) > 0 - - normal_segments = model.transcribe({str(self.audio_file)!r}) - assert isinstance(normal_segments, list) - assert len(normal_segments) > 0 - ''' - ) - - def test_log_callback_can_be_set_and_cleared(self): - pw.whisper_log_set(lambda level, text: None) - pw.whisper_log_set(None) - - def test_alignment_preset_enum_is_available(self): - preset = pw.whisper_alignment_heads_preset.WHISPER_AHEADS_TINY - self.assertIsNotNone(preset) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file From e71e375ddbd29c938d69a7b2ee32ae32b666c45e Mon Sep 17 00:00:00 2001 From: scottmonster <87917233+scottmonster@users.noreply.github.com> Date: Tue, 19 May 2026 21:55:28 -0500 Subject: [PATCH 10/16] remove whisper_args.txt --- whsiper_args.txt | 252 ----------------------------------------------- 1 file changed, 252 deletions(-) delete mode 100644 whsiper_args.txt diff --git a/whsiper_args.txt b/whsiper_args.txt deleted file mode 100644 index 35678ec..0000000 --- a/whsiper_args.txt +++ /dev/null @@ -1,252 +0,0 @@ - -usage: ./whisper-cli [options] file0 file1 ... 
-supported audio formats: flac, mp3, ogg, wav - -options: - --help [default] show this help message and exit - --threads N [4 ] number of threads to use during computation - --processors N [1 ] number of processors to use during computation - --offset-t N [0 ] time offset in milliseconds - --offset-n N [0 ] segment index offset - --duration N [0 ] duration of audio to process in milliseconds - --max-context N [-1 ] maximum number of text context tokens to store - --max-len N [0 ] maximum segment length in characters - --max-tokens N [0 ] maximum number of tokens per segment - --split-on-word [false ] split on word rather than on token - --best-of N [5 ] number of best candidates to keep - --beam-size N [5 ] beam size for beam search - --audio-ctx N [0 ] audio context size (0 - all) - --word-thold N [0.01 ] word timestamp probability threshold - --entropy-thold N [2.40 ] entropy threshold for decoder fail - --logprob-thold N [-1.00 ] log probability threshold for decoder fail - --no-speech-thold N [0.60 ] no speech threshold - --temperature N [0.00 ] The sampling temperature, between 0 and 1 - --temperature-inc N [0.20 ] The increment of temperature, between 0 and 1 - --debug-mode [false ] enable debug mode (eg. 
dump log_mel) - --translate [false ] translate from source language to english - --diarize [false ] stereo audio diarization - --tinydiarize [false ] enable tinydiarize (requires a tdrz model) - --no-fallback [false ] do not use temperature fallback while decoding - --output-txt [false ] output result in a text file - --output-vtt [false ] output result in a vtt file - --output-srt [false ] output result in a srt file - --output-lrc [false ] output result in a lrc file - --output-words [false ] output script for generating karaoke video - --font-path [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video - --output-csv [false ] output result in a CSV file - --output-json [false ] output result in a JSON file - --output-json-full [false ] include more information in the JSON file - --output-file FNAME [ ] output file path (without file extension) - --no-prints [false ] do not print anything other than the results - --print-special [false ] print special tokens - --print-colors [false ] print colors - --print-confidence [false ] print confidence - --print-progress [false ] print progress - --no-timestamps [false ] do not print timestamps - --language LANG [en ] spoken language ('auto' for auto-detect) - --detect-language [false ] exit after automatically detecting language - --prompt PROMPT [ ] initial prompt (max n_text_ctx/2 tokens) - --carry-initial-prompt [false ] always prepend initial prompt - --model FNAME [models/ggml-base.en.bin] model path - --file FNAME [ ] input audio file path - --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference - --dtw MODEL [ ] compute token-level timestamps - --log-score [false ] log best decoder scores of tokens - --no-gpu [false ] disable GPU - --device N [0 ] GPU device ID (default: 0) - --flash-attn [true ] enable flash attention - --no-flash-attn [false ] disable flash attention - --suppress-blank [true ] suppress blank outputs - --no-suppress-blank [false ] 
disable blank suppression - --suppress-nst [false ] suppress non-speech tokens - --suppress-regex REGEX [ ] regular expression matching tokens to suppress - --grammar GRAMMAR [ ] GBNF grammar to guide decoding - --grammar-rule RULE [ ] top-level GBNF grammar rule name - --grammar-penalty N [100.0 ] scales down logits of nongrammar tokens - -Voice Activity Detection (VAD) options: - --vad [false ] enable Voice Activity Detection (VAD) - --vad-model FNAME [ ] VAD model path - --vad-threshold N [0.50 ] VAD threshold for speech recognition - --vad-min-speech-duration-ms N [250 ] VAD min speech duration (0.0-1.0) - --vad-min-silence-duration-ms N [100 ] VAD min silence duration (to split segments) - --vad-max-speech-duration-s N [FLT_MAX] VAD max speech duration (auto-split longer) - --vad-speech-pad-ms N [30 ] VAD speech padding (extend segments) - --vad-samples-overlap N [0.10 ] VAD samples overlap (seconds between segments) - - -usage: ./whisper-stream [options] - -options: - --help [default] show this help message and exit - --threads N [4 ] number of threads to use during computation - --step N [3000 ] audio step size in milliseconds - --length N [10000 ] audio length in milliseconds - --keep N [200 ] audio to keep from previous step in ms - --capture ID [-1 ] capture device ID - --max-tokens N [32 ] maximum number of tokens per audio chunk - --audio-ctx N [0 ] audio context size (0 - all) - --beam-size N [-1 ] beam size for beam search - --vad-thold N [0.60 ] voice activity detection threshold - --freq-thold N [100.00 ] high-pass frequency cutoff - --translate [false ] translate from source language to english - --no-fallback [false ] do not use temperature fallback while decoding - --print-special [false ] print special tokens - --keep-context [false ] keep context between audio chunks - --language LANG [en ] spoken language - --model FNAME [models/ggml-base.en.bin] model path - --file FNAME [ ] text output file name - --tinydiarize [false ] enable tinydiarize 
(requires a tdrz model) - --save-audio [false ] save the recorded audio to a file - --no-gpu [false ] disable GPU inference - --flash-attn [true ] enable flash attention during inference - --no-flash-attn [false ] disable flash attention during inference - - -usage: ./whisper-server [options] - -options: - --help [default] show this help message and exit - --threads N [4 ] number of threads to use during computation - --processors N [1 ] number of processors to use during computation - --offset-t N [0 ] time offset in milliseconds - --offset-n N [0 ] segment index offset - --duration N [0 ] duration of audio to process in milliseconds - --max-context N [-1 ] maximum number of text context tokens to store - --max-len N [0 ] maximum segment length in characters - --split-on-word [false ] split on word rather than on token - --best-of N [2 ] number of best candidates to keep - --beam-size N [-1 ] beam size for beam search - --audio-ctx N [0 ] audio context size (0 - all) - --word-thold N [0.01 ] word timestamp probability threshold - --entropy-thold N [2.40 ] entropy threshold for decoder fail - --logprob-thold N [-1.00 ] log probability threshold for decoder fail - --debug-mode [false ] enable debug mode (eg. 
dump log_mel) - --translate [false ] translate from source language to english - --diarize [false ] stereo audio diarization - --tinydiarize [false ] enable tinydiarize (requires a tdrz model) - --no-fallback [false ] do not use temperature fallback while decoding - --print-special [false ] print special tokens - --print-colors [false ] print colors - --print-realtime [false ] print output in realtime - --print-progress [false ] print progress - --no-timestamps [false ] do not print timestamps - --language LANG [en ] spoken language ('auto' for auto-detect) - --detect-language [false ] exit after automatically detecting language - --prompt PROMPT [ ] initial prompt - --model FNAME [models/ggml-base.en.bin] model path - --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference - --dtw MODEL [ ] compute token-level timestamps - --host HOST [127.0.0.1] Hostname/ip-adress for the server - --port PORT [8080 ] Port number for the server - --public PATH [examples/server/public] Path to the public folder - --request-path PATH [ ] Request path for all requests - --inference-path PATH [/inference] Inference path for all requests - --convert [false ] Convert audio to WAV, requires ffmpeg on the server - --tmp-dir [. 
] Temporary directory for ffmpeg transcoded files - --suppress-nst [false ] suppress non-speech tokens - --no-speech-thold N [0.60 ] no speech threshold - --no-gpu [false ] do not use gpu - --device N [0 ] GPU device ID (default: 0) - --flash-attn [true ] enable flash attention - --no-flash-attn [false ] disable flash attention - --no-language-probabilities [false ] exclude language probabilities from verbose_json output - -Voice Activity Detection (VAD) options: - --vad [false ] enable Voice Activity Detection (VAD) - --vad-model FNAME [ ] VAD model path - --vad-threshold N [0.50 ] VAD threshold for speech recognition - --vad-min-speech-duration-ms N [250 ] VAD min speech duration (0.0-1.0) - --vad-min-silence-duration-ms N [100 ] VAD min silence duration (to split segments) - --vad-max-speech-duration-s N [FLT_MAX] VAD max speech duration (auto-split longer) - --vad-speech-pad-ms N [30 ] VAD speech padding (extend segments) - --vad-samples-overlap N [0.10 ] VAD samples overlap (seconds between segments) - - - -deduped: -options: - --help [default] show this help message and exit - --threads N [4 ] number of threads to use during computation - --processors N [1 ] number of processors to use during computation - --offset-t N [0 ] time offset in milliseconds - --offset-n N [0 ] segment index offset - --duration N [0 ] duration of audio to process in milliseconds - --max-context N [-1 ] maximum number of text context tokens to store - --max-len N [0 ] maximum segment length in characters - --max-tokens N [0 ] maximum number of tokens per segment - --split-on-word [false ] split on word rather than on token - --best-of N [5 ] number of best candidates to keep - --beam-size N [5 ] beam size for beam search - --audio-ctx N [0 ] audio context size (0 - all) - --word-thold N [0.01 ] word timestamp probability threshold - --entropy-thold N [2.40 ] entropy threshold for decoder fail - --logprob-thold N [-1.00 ] log probability threshold for decoder fail - --no-speech-thold 
N [0.60 ] no speech threshold - --temperature N [0.00 ] The sampling temperature, between 0 and 1 - --temperature-inc N [0.20 ] The increment of temperature, between 0 and 1 - --debug-mode [false ] enable debug mode (eg. dump log_mel) - --translate [false ] translate from source language to english - --diarize [false ] stereo audio diarization - --tinydiarize [false ] enable tinydiarize (requires a tdrz model) - --no-fallback [false ] do not use temperature fallback while decoding - --output-txt [false ] output result in a text file - --output-vtt [false ] output result in a vtt file - --output-srt [false ] output result in a srt file - --output-lrc [false ] output result in a lrc file - --output-words [false ] output script for generating karaoke video - --font-path [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video - --output-csv [false ] output result in a CSV file - --output-json [false ] output result in a JSON file - --output-json-full [false ] include more information in the JSON file - --output-file FNAME [ ] output file path (without file extension) - --no-prints [false ] do not print anything other than the results - --print-special [false ] print special tokens - --print-colors [false ] print colors - --print-confidence [false ] print confidence - --print-progress [false ] print progress - --no-timestamps [false ] do not print timestamps - --language LANG [en ] spoken language ('auto' for auto-detect) - --detect-language [false ] exit after automatically detecting language - --prompt PROMPT [ ] initial prompt (max n_text_ctx/2 tokens) - --carry-initial-prompt [false ] always prepend initial prompt - --model FNAME [models/ggml-base.en.bin] model path - --file FNAME [ ] input audio file path - --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference - --dtw MODEL [ ] compute token-level timestamps - --log-score [false ] log best decoder scores of tokens - --no-gpu [false ] disable GPU - 
--device N [0 ] GPU device ID (default: 0) - --flash-attn [true ] enable flash attention - --no-flash-attn [false ] disable flash attention - --suppress-blank [true ] suppress blank outputs - --no-suppress-blank [false ] disable blank suppression - --suppress-nst [false ] suppress non-speech tokens - --suppress-regex REGEX [ ] regular expression matching tokens to suppress - --grammar GRAMMAR [ ] GBNF grammar to guide decoding - --grammar-rule RULE [ ] top-level GBNF grammar rule name - --grammar-penalty N [100.0 ] scales down logits of nongrammar tokens - --vad [false ] enable Voice Activity Detection (VAD) - --vad-model FNAME [ ] VAD model path - --vad-threshold N [0.50 ] VAD threshold for speech recognition - --vad-min-speech-duration-ms N [250 ] VAD min speech duration (0.0-1.0) - --vad-min-silence-duration-ms N [100 ] VAD min silence duration (to split segments) - --vad-max-speech-duration-s N [FLT_MAX] VAD max speech duration (auto-split longer) - --vad-speech-pad-ms N [30 ] VAD speech padding (extend segments) - --vad-samples-overlap N [0.10 ] VAD samples overlap (seconds between segments) - --step N [3000 ] audio step size in milliseconds - --length N [10000 ] audio length in milliseconds - --keep N [200 ] audio to keep from previous step in ms - --capture ID [-1 ] capture device ID - --vad-thold N [0.60 ] voice activity detection threshold - --freq-thold N [100.00 ] high-pass frequency cutoff - --keep-context [false ] keep context between audio chunks - --save-audio [false ] save the recorded audio to a file - --host HOST [127.0.0.1] Hostname/ip-adress for the server - --port PORT [8080 ] Port number for the server - --public PATH [examples/server/public] Path to the public folder - --request-path PATH [ ] Request path for all requests - --inference-path PATH [/inference] Inference path for all requests - --convert [false ] Convert audio to WAV, requires ffmpeg on the server - --tmp-dir [. 
] Temporary directory for ffmpeg transcoded files - --no-language-probabilities [false ] exclude language probabilities from verbose_json output From 8cf43341f5f5e54d391450a3e40b8dfc46b5944d Mon Sep 17 00:00:00 2001 From: scottmonster <87917233+scottmonster@users.noreply.github.com> Date: Wed, 20 May 2026 19:50:11 -0500 Subject: [PATCH 11/16] remove grammar support remove grammar, grammar_rule, etc. and related logic and bindings since it is not in whisper.cpp and would require: whisper.cpp/examples/grammar-parser.cpp --- CMakeLists.txt | 1 - pywhispercpp/constants.py | 6 --- pywhispercpp/model.py | 40 +++----------------- pywhispercpp/model.pyi | 5 --- src/main.cpp | 80 ++------------------------------------- tests/test_model.py | 18 --------- 6 files changed, 9 insertions(+), 141 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index af94411..39c16a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,6 @@ add_subdirectory(whisper.cpp) pybind11_add_module(_pywhispercpp src/main.cpp - whisper.cpp/examples/grammar-parser.cpp ) target_link_libraries (_pywhispercpp PRIVATE whisper) diff --git a/pywhispercpp/constants.py b/pywhispercpp/constants.py index 529b28b..85018a8 100644 --- a/pywhispercpp/constants.py +++ b/pywhispercpp/constants.py @@ -284,12 +284,6 @@ 'options': None, 'default': 0.6 }, - 'grammar_penalty': { - 'type': float, - 'description': 'scales down logits of non-grammar tokens', - 'options': None, - 'default': 100.0 - }, 'greedy': { 'type': dict, 'description': 'greedy', diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py index 26701fd..d11260b 100644 --- a/pywhispercpp/model.py +++ b/pywhispercpp/model.py @@ -6,20 +6,21 @@ [whisper.cpp](https://github.com/ggerganov/whisper.cpp) API. 
""" import importlib.metadata +import subprocess +import os import logging import shutil import sys +import tempfile +import wave from pathlib import Path from time import time from typing import Any, Union, Callable, List, TextIO, Tuple, Optional, Dict, TypedDict + import _pywhispercpp as pw import numpy as np -import pywhispercpp.utils as utils import pywhispercpp.constants as constants -import subprocess -import os -import tempfile -import wave +import pywhispercpp.utils as utils __author__ = "absadiki" __copyright__ = "Copyright 2023, " @@ -134,8 +135,6 @@ def __init__(self, - `audio_ctx`: override audio context size. Default `0`. - `tdrz_enable`: enable tinydiarize speaker-turn detection. Default `False`. - `initial_prompt`: initial text prompt prepended before decoding. Default `None`. - - `grammar`: GBNF grammar text or path to a grammar file. Default `None`. - - `grammar_rule`: top-level grammar rule name. Default `root` when grammar is used. - `prompt_tokens`: explicit prompt token sequence. Default `None`. - `prompt_n_tokens`: number of prompt tokens. Default `0`. - `carry_initial_prompt`: prepend the initial prompt to each decode window. Default `False`. @@ -152,7 +151,6 @@ def __init__(self, - `entropy_thold`: entropy threshold. Default `2.4`. - `logprob_thold`: logprob threshold. Default `-1.0`. - `no_speech_thold`: no-speech threshold. Default `0.6`. - - `grammar_penalty`: penalty applied to non-grammar tokens. Default `100.0`. - `greedy`: greedy-decoder settings, typically `{"best_of": 5}`. - `beam_search`: beam-search settings, schema default `{"beam_size": -1, "patience": -1.0}`. - `vad`: enable VAD. Default `False`. 
@@ -360,29 +358,6 @@ def _normalize_params(kwargs: dict) -> dict: return normalized - def _apply_grammar_params(self, normalized: dict) -> dict: - has_grammar = 'grammar' in normalized - has_grammar_rule = 'grammar_rule' in normalized - - if not has_grammar: - if has_grammar_rule: - raise AttributeError('grammar_rule requires grammar') - return normalized - - grammar = normalized.pop('grammar') - grammar_rule = normalized.pop('grammar_rule', 'root') - - if grammar is None: - self._params.clear_grammar() - return normalized - - self._params.set_grammar( - grammar, - grammar_rule, - normalized.get('grammar_penalty', self._params.grammar_penalty), - ) - return normalized - def _apply_prompt_token_params(self, normalized: dict) -> dict: if 'prompt_tokens' not in normalized: return normalized @@ -421,9 +396,6 @@ def _set_params(self, kwargs: dict) -> None: """ normalized = self._normalize_params(kwargs) - if 'grammar' in normalized or 'grammar_rule' in normalized: - normalized = self._apply_grammar_params(normalized) - if 'prompt_tokens' in normalized: normalized = self._apply_prompt_token_params(normalized) diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi index 936c6ac..3f2852b 100644 --- a/pywhispercpp/model.pyi +++ b/pywhispercpp/model.pyi @@ -75,8 +75,6 @@ class Model: audio_ctx: int = 0, tdrz_enable: bool = False, initial_prompt: Optional[str] = None, - grammar: Optional[str] = None, - grammar_rule: str = 'root', prompt_tokens: Optional[Tuple[Any, ...]] = None, prompt_n_tokens: int = 0, carry_initial_prompt: bool = False, @@ -129,8 +127,6 @@ class Model: audio_ctx: int = 0, tdrz_enable: bool = False, initial_prompt: Optional[str] = None, - grammar: Optional[str] = None, - grammar_rule: str = 'root', prompt_tokens: Optional[Tuple[Any, ...]] = None, prompt_n_tokens: int = 0, carry_initial_prompt: bool = False, @@ -147,7 +143,6 @@ class Model: entropy_thold: float = 2.4, logprob_thold: float = -1.0, no_speech_thold: float = 0.6, - grammar_penalty: float = 
100.0, greedy: GreedyParams = {'best_of': 5}, beam_search: BeamSearchParams = {'beam_size': -1, 'patience': -1.0}, extract_probability: bool = False, diff --git a/src/main.cpp b/src/main.cpp index c76d8be..35d74a2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -15,11 +15,7 @@ #include #include -#include -#include - #include "whisper.h" -#include "../whisper.cpp/examples/grammar-parser.h" #define STRINGIFY(x) #x @@ -398,9 +394,7 @@ struct WhisperFullParamsWrapper : public whisper_full_params { std::string initial_prompt_str; std::string suppress_regex_str; std::string vad_model_path_str; - std::vector prompt_token_storage; - grammar_parser::parse_state grammar_state; - std::vector grammar_rule_ptrs; + std::vector prompt_token_storage; void reset_progress_callback() { progress_callback_user_data = this; @@ -423,11 +417,6 @@ struct WhisperFullParamsWrapper : public whisper_full_params { }; } - void sync_grammar_fields() { - grammar_rule_ptrs = grammar_state.c_rules(); - grammar_rules = grammar_rule_ptrs.empty() ? nullptr : grammar_rule_ptrs.data(); - n_grammar_rules = grammar_rule_ptrs.size(); - } void sync_prompt_tokens() { prompt_tokens = prompt_token_storage.empty() ? 
nullptr : prompt_token_storage.data(); prompt_n_tokens = prompt_token_storage.size(); @@ -474,8 +463,7 @@ struct WhisperFullParamsWrapper : public whisper_full_params { initial_prompt_str(other.initial_prompt_str), suppress_regex_str(other.suppress_regex_str), vad_model_path_str(other.vad_model_path_str), - prompt_token_storage(other.prompt_token_storage), - grammar_state(other.grammar_state), + prompt_token_storage(other.prompt_token_storage), py_new_segment_callback(other.py_new_segment_callback), py_new_segment_callback_user_data(other.py_new_segment_callback_user_data), py_encoder_begin_callback(other.py_encoder_begin_callback), @@ -495,7 +483,6 @@ struct WhisperFullParamsWrapper : public whisper_full_params { abort_callback_user_data = this; logits_filter_callback_user_data = this; sync_prompt_tokens(); - sync_grammar_fields(); reset_progress_callback(); } void set_initial_prompt(const std::string& prompt) { @@ -608,54 +595,6 @@ struct WhisperFullParamsWrapper : public whisper_full_params { abort_callback = nullptr; abort_callback_user_data = this; } - void set_grammar(const std::string& grammar, const std::string& start_rule, float penalty) { - if (grammar.empty()) { - clear_grammar(); - grammar_penalty = penalty; - return; - } - - std::ifstream grammar_file(grammar); - std::string grammar_source; - if (grammar_file.good()) { - grammar_source.assign( - std::istreambuf_iterator(grammar_file), - std::istreambuf_iterator()); - } else { - grammar_source = grammar; - } - - auto parsed = grammar_parser::parse(grammar_source.c_str()); - auto rule_iter = parsed.symbol_ids.find(start_rule); - if (rule_iter == parsed.symbol_ids.end()) { - throw std::runtime_error("unknown grammar start rule: " + start_rule); - } - - grammar_state = std::move(parsed); - sync_grammar_fields(); - i_start_rule = rule_iter->second; - grammar_penalty = penalty; - } - void clear_grammar() { - grammar_state = grammar_parser::parse_state(); - grammar_rule_ptrs.clear(); - grammar_rules = 
nullptr; - n_grammar_rules = 0; - i_start_rule = 0; - } - py::list get_grammar_rules() const { - py::list rules; - for (const auto& rule : grammar_state.rules) { - py::list elements; - for (const auto& element : rule) { - elements.append(py::dict( - "type"_a = static_cast(element.type), - "value"_a = element.value)); - } - rules.append(elements); - } - return rules; - } }; WhisperFullParamsWrapper whisper_full_default_params_wrapper(enum whisper_sampling_strategy strategy) { return WhisperFullParamsWrapper(whisper_full_default_params(strategy)); @@ -1154,11 +1093,7 @@ PYBIND11_MODULE(_pywhispercpp, m) { << "progress_callback=" << (self.progress_callback ? "(function pointer)" : "None") << ", " << "encoder_begin_callback=" << (self.encoder_begin_callback ? "(function pointer)" : "None") << ", " << "abort_callback=" << (self.abort_callback ? "(function pointer)" : "None") << ", " - << "logits_filter_callback=" << (self.logits_filter_callback ? "(function pointer)" : "None") << ", " - << "grammar_rules=" << (self.grammar_rules ? "(whisper_grammar_element **)" : "None") << ", " - << "n_grammar_rules=" << self.n_grammar_rules << ", " - << "i_start_rule=" << self.i_start_rule << ", " - << "grammar_penalty=" << self.grammar_penalty + << "logits_filter_callback=" << (self.logits_filter_callback ? 
"(function pointer)" : "None") << ")"; return oss.str(); }); @@ -1311,12 +1246,6 @@ PYBIND11_MODULE(_pywhispercpp, m) { .def_property("logits_filter_callback_user_data", &WhisperFullParamsWrapper::get_logits_filter_callback_user_data, &WhisperFullParamsWrapper::set_logits_filter_callback_user_data) - .def_property_readonly("grammar_rules", &WhisperFullParamsWrapper::get_grammar_rules) - .def_property_readonly("n_grammar_rules", [](const WhisperFullParamsWrapper &self) { return self.n_grammar_rules; }) - .def_property_readonly("i_start_rule", [](const WhisperFullParamsWrapper &self) { return self.i_start_rule; }) - .def("set_grammar", &WhisperFullParamsWrapper::set_grammar, - py::arg("grammar"), py::arg("start_rule") = "root", py::arg("penalty") = 100.0f, - "Parse grammar text or a grammar file path and assign it to the params.") .def("set_logits_filter_callback", [](WhisperFullParamsWrapper &self, py::object callback) { if (callback.is_none()) { @@ -1329,9 +1258,6 @@ PYBIND11_MODULE(_pywhispercpp, m) { "Assign a logits-filter callback.") .def("clear_logits_filter_callback", &WhisperFullParamsWrapper::clear_logits_filter_callback, "Clear any previously assigned logits-filter callback.") - .def("clear_grammar", &WhisperFullParamsWrapper::clear_grammar, - "Clear any previously assigned grammar.") - .def_readwrite("grammar_penalty", &WhisperFullParamsWrapper::grammar_penalty) .def_readwrite("vad", &WhisperFullParamsWrapper::vad) .def_property("vad_model_path", [](WhisperFullParamsWrapper &self) { diff --git a/tests/test_model.py b/tests/test_model.py index 9ee8f65..b68f8a6 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -68,24 +68,6 @@ def test_prompt_token_helper_exists(self): params.set_prompt_tokens((1, 2, 3)) self.assertEqual(params.prompt_n_tokens, 3) - def test_grammar_helper_exists(self): - params = pw.whisper_full_default_params( - pw.whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY - ) - params.set_grammar('root ::= "yes" | "no"', 'root', 42.0) 
- self.assertEqual(params.grammar_penalty, 42.0) - params.clear_grammar() - - def test_model_accepts_grammar_param(self): - model = Model( - "tiny", - models_dir=str(WHISPER_CPP_DIR/'models'), - grammar='root ::= "yes" | "no"', - grammar_rule='root', - grammar_penalty=42.0, - ) - self.assertIsInstance(model, Model) - def test_model_metadata_bindings(self): self.assertIsInstance(pw.whisper_model_type_readable(self.model._ctx), str) self.assertGreater(pw.whisper_model_n_vocab(self.model._ctx), 0) From 3bbf251aa1f9893daa30b3c475f4a653d4834b47 Mon Sep 17 00:00:00 2001 From: scottmonster <87917233+scottmonster@users.noreply.github.com> Date: Wed, 20 May 2026 20:55:46 -0500 Subject: [PATCH 12/16] misc. stub and docstring fixes --- pywhispercpp/constants.py | 8 +++++++- pywhispercpp/model.py | 12 +++++++----- pywhispercpp/model.pyi | 9 +++++---- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/pywhispercpp/constants.py b/pywhispercpp/constants.py index 85018a8..4a6d1cd 100644 --- a/pywhispercpp/constants.py +++ b/pywhispercpp/constants.py @@ -242,6 +242,12 @@ 'options': None, 'default': False }, + 'suppress_regex': { + 'type': str, + 'description': 'regex pattern used to suppress matching text during decoding', + 'options': None, + 'default': '' + }, 'temperature': { 'type': float, 'description': 'initial decoding temperature', @@ -294,7 +300,7 @@ 'type': dict, 'description': 'beam_search', 'options': None, - 'default': {"beam_size": 5, "patience": -1.0} + 'default': {"beam_size": -1, "patience": -1.0} }, 'extract_probability': { 'type': bool, diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py index d11260b..3864ed9 100644 --- a/pywhispercpp/model.py +++ b/pywhispercpp/model.py @@ -111,7 +111,8 @@ def __init__(self, `flash_attn`, `gpu_device`, `dtw_token_timestamps`, `dtw_aheads_preset`, `dtw_n_top`, and `dtw_mem_size`. Omitted keys inherit from `whisper_context_default_params()`. 
- :param params: decode parameters forwarded to `whisper_full_params`. + :param params: keyword-only decode parameters matching the public API documented in `model.pyi`. + These values are forwarded to `whisper_full_params` and remain active for future calls. Supported keys: - `n_threads`: number of inference threads. Default is `min(4, hardware_concurrency())`. - `n_max_text_ctx`: max prompt-text tokens carried into the decoder. Default `16384`. @@ -152,7 +153,7 @@ def __init__(self, - `logprob_thold`: logprob threshold. Default `-1.0`. - `no_speech_thold`: no-speech threshold. Default `0.6`. - `greedy`: greedy-decoder settings, typically `{"best_of": 5}`. - - `beam_search`: beam-search settings, schema default `{"beam_size": -1, "patience": -1.0}`. + - `beam_search`: beam-search settings. Default `{"beam_size": -1, "patience": -1.0}`. - `vad`: enable VAD. Default `False`. - `vad_model_path`: path to the VAD model. Default `None`. """ @@ -189,11 +190,12 @@ def transcribe(self, :param n_processors: number of worker processes for `whisper_full_parallel`. If omitted, runs a single-process `whisper_full()` decode. :param new_segment_callback: callback invoked for each newly produced `Segment` during decoding. - :param abort_callback: callback function returning True to abort an in-flight transcription early - :param params: keyword arguments for different whisper.cpp parameters; these override the model's - active decode params for this call + :param abort_callback: callback function returning True to abort an in-flight transcription early. :param extract_probability: If True, calculates the geometric mean of token probabilities for each segment, providing a confidence score interpretable as a probability in [0, 1]. + :param params: additional keyword-only decode parameters matching the public API documented in + `model.pyi`, with the same supported keys and defaults as `Model.__init__`. + Any overrides applied here remain active for future calls. 
:return: List of transcription segments """ if isinstance(media, np.ndarray): diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi index 3f2852b..c2075fd 100644 --- a/pywhispercpp/model.pyi +++ b/pywhispercpp/model.pyi @@ -1,13 +1,12 @@ from __future__ import annotations -from typing import Any, Callable, Dict, List, Optional, TextIO, Tuple, TypedDict, Union +from typing import Any, Callable, Dict, List, Optional, TextIO, Tuple, TypedDict, TypeAlias, Union import numpy as np import numpy.typing as npt -AudioArray = npt.NDArray[np.float32] -AudioInput = Union[str, AudioArray] - +AudioArray: TypeAlias = npt.NDArray[np.float32] +AudioInput: TypeAlias = Union[str, AudioArray] class ContextParams(TypedDict, total=False): use_gpu: bool @@ -40,6 +39,7 @@ class Segment: class Model: + model_path: str _new_segment_callback: Optional[Callable[[Segment], None]] def __init__( @@ -52,6 +52,7 @@ class Model: openvino_model_path: Optional[str] = None, openvino_device: str = 'CPU', openvino_cache_dir: Optional[str] = None, + context_params: Optional[ContextParams] = None, *, n_threads: Optional[int] = None, n_max_text_ctx: int = 16384, From 4b469e370a8f724b32a37f3220ece0ebfd586ff2 Mon Sep 17 00:00:00 2001 From: scottmonster <87917233+scottmonster@users.noreply.github.com> Date: Wed, 20 May 2026 21:51:55 -0500 Subject: [PATCH 13/16] Pin whisper.cpp to v1.8.4 --- whisper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whisper.cpp b/whisper.cpp index 4979e04..9386f23 160000 --- a/whisper.cpp +++ b/whisper.cpp @@ -1 +1 @@ -Subproject commit 4979e04f5dcaccb36057e059bbaed8a2f5288315 +Subproject commit 9386f239401074690479731c1e41683fbbeac557 From c2eaf8cfe29987d5440410fdc46236d251ccefe3 Mon Sep 17 00:00:00 2001 From: scottmonster <87917233+scottmonster@users.noreply.github.com> Date: Wed, 20 May 2026 23:56:49 -0500 Subject: [PATCH 14/16] remove WHISPER_DEPRECATED (whisper.h) functions --- README.md | 107 ++++++++++++++++++++++++++---------------------------
pywhispercpp/model.py | 10 ++-- src/main.cpp | 34 -------------- tests/test_c_api.py | 5 +- 4 files changed, 75 insertions(+), 81 deletions(-) diff --git a/README.md b/README.md index ccda1c6..86ba9c0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # pywhispercpp + Python bindings for [whisper.cpp](https://github.com/ggerganov/whisper.cpp) with a simple Pythonic API on top of it. [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) @@ -7,38 +8,49 @@ Python bindings for [whisper.cpp](https://github.com/ggerganov/whisper.cpp) with [![Downloads](https://static.pepy.tech/badge/pywhispercpp)](https://pepy.tech/project/pywhispercpp) # Table of contents + -* [Installation](#installation) - * [From source](#from-source) - * [Pre-built wheels](#pre-built-wheels) - * [NVIDIA GPU support](#nvidia-gpu-support) - * [CoreML support](#coreml-support) - * [Vulkan support](#vulkan-support) -* [Quick start](#quick-start) -* [Examples](#examples) - * [CLI](#cli) - * [GUI](#gui) - * [Assistant](#assistant) -* [Advanced usage](#advanced-usage) -* [Discussions and contributions](#discussions-and-contributions) -* [License](#license) + +- [pywhispercpp](#pywhispercpp) +- [Table of contents](#table-of-contents) +- [Installation](#installation) + - [From source](#from-source) + - [Pre-built wheels](#pre-built-wheels) + - [NVIDIA GPU support](#nvidia-gpu-support) + - [CoreML support](#coreml-support) + - [Vulkan support](#vulkan-support) + - [OpenBLAS support](#openblas-support) + - [OpenVINO support](#openvino-support) +- [Quick start](#quick-start) +- [Examples](#examples) + - [CLI](#cli) + - [GUI](#gui) + - [Assistant](#assistant) +- [Advanced usage](#advanced-usage) +- [Discussions and contributions](#discussions-and-contributions) +- [License](#license) # Installation ### From source -* For the best performance, you need to install the package from source: + +- For the best performance, you need to install the package from 
source: + ```shell pip install git+https://github.com/absadiki/pywhispercpp ``` + ### Pre-built wheels -* Otherwise, Basic Pre-built CPU wheels are available on PYPI + +- Otherwise, Basic Pre-built CPU wheels are available on PYPI ```shell pip install pywhispercpp # or pywhispercpp[examples] to install the extra dependencies needed for the examples ``` -[Optional] To transcribe files other than wav, you need to install ffmpeg: +[Optional] To transcribe files other than wav, you need to install ffmpeg: + ```shell # on Ubuntu or Debian sudo apt update && sudo apt install ffmpeg @@ -57,11 +69,13 @@ scoop install ffmpeg ``` ### NVIDIA GPU support + To Install the package with CUDA support, make sure you have [cuda](https://developer.nvidia.com/cuda-downloads) installed and use `GGML_CUDA=1`: ```shell GGML_CUDA=1 pip install git+https://github.com/absadiki/pywhispercpp ``` + ### CoreML support Install the package with `WHISPER_COREML=1`: @@ -81,6 +95,7 @@ GGML_VULKAN=1 pip install git+https://github.com/absadiki/pywhispercpp ### OpenBLAS support If OpenBLAS is installed, you can use `GGML_BLAS=1`. The other flags ensure you're installing fresh with the correct flags, and printing output for sanity checking. + ```shell GGML_BLAS=1 pip install git+https://github.com/absadiki/pywhispercpp --no-cache --force-reinstall -v ``` @@ -90,16 +105,15 @@ GGML_BLAS=1 pip install git+https://github.com/absadiki/pywhispercpp --no-cache Follow the the steps to download correct OpenVINO package (https://github.com/ggerganov/whisper.cpp?tab=readme-ov-file#openvino-support). Then init the OpenVINO environment and build. 
+ ``` -source ~/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh +source ~/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh WHISPER_OPENVINO=1 pip install git+https://github.com/absadiki/pywhispercpp --no-cache --force-reinstall ``` Note that the toolkit for Ubuntu22 works on Ubuntu24 - -** __Feel free to update this list and submit a PR if you tested the package on other backends.__ - +\*\* **Feel free to update this list and submit a PR if you tested the package on other backends.** # Quick start @@ -121,21 +135,22 @@ model = Model('base.en', print_realtime=False, print_progress=False) segments = model.transcribe('file.mp3', new_segment_callback=print) ``` - -* The model will be downloaded automatically, or you can use the path to a local model. -* You can pass any `whisper.cpp` [parameter](https://absadiki.github.io/pywhispercpp/#pywhispercpp.constants.PARAMS_SCHEMA) as a keyword argument to the `Model` class or to the `transcribe` function. -* Check the [Model](https://absadiki.github.io/pywhispercpp/#pywhispercpp.model.Model) class documentation for more details. +- The model will be downloaded automatically, or you can use the path to a local model. +- You can pass any `whisper.cpp` [parameter](https://absadiki.github.io/pywhispercpp/#pywhispercpp.constants.PARAMS_SCHEMA) as a keyword argument to the `Model` class or to the `transcribe` function. +- Check the [Model](https://absadiki.github.io/pywhispercpp/#pywhispercpp.model.Model) class documentation for more details. # Examples ## CLI -Just a straightforward example Command Line Interface. + +Just a straightforward example Command Line Interface. 
You can use it as follows: ```shell pwcpp file.wav -m base --output-srt --print_realtime true ``` -Run ```pwcpp --help``` to get the help message + +Run `pwcpp --help` to get the help message ```shell usage: pwcpp [-h] [-m MODEL] [--version] [--processors PROCESSORS] [-otxt] [-ovtt] [-osrt] [-ocsv] [--strategy STRATEGY] @@ -229,13 +244,17 @@ options: ``` ## GUI -If you prefer a Graphical User Interface, you can use the `pwcpp-gui` command which will launch A simple graphical interface built with PyQt5. -* First you need to install the GUI dependencies: + +If you prefer a Graphical User Interface, you can use the `pwcpp-gui` command which will launch A simple graphical interface built with PyQt5. + +- First you need to install the GUI dependencies: + ```bash pip install pywhispercpp[gui] ``` -* Then you can run the GUI with: +- Then you can run the GUI with: + ```bash pwcpp-gui ``` @@ -248,23 +267,25 @@ The GUI provides a user-friendly way to: - View and export transcription results ## Assistant -This is a simple example showcasing the use of `pywhispercpp` to create an assistant like example. -The idea is to use a Voice Activity Detector (VAD) to detect speech (in this example, we used webrtcvad), and when some speech is detected, we run the transcription. + +This is a simple example showcasing the use of `pywhispercpp` to create an assistant like example. +The idea is to use a Voice Activity Detector (VAD) to detect speech (in this example, we used webrtcvad), and when some speech is detected, we run the transcription. It is inspired from the [whisper.cpp/examples/command](https://github.com/ggerganov/whisper.cpp/tree/master/examples/command) example. 
-You can check the source code [here](https://github.com/absadiki/pywhispercpp/blob/main/pywhispercpp/examples/assistant.py) +You can check the source code [here](https://github.com/absadiki/pywhispercpp/blob/main/pywhispercpp/examples/assistant.py) or you can use the class directly to create your own assistant: - ```python from pywhispercpp.examples.assistant import Assistant my_assistant = Assistant(commands_callback=print, n_threads=8) my_assistant.start() ``` + Here, we set the `commands_callback` to a simple print function, so the commands will just get printed on the screen. You can also run this example from the command line. + ```shell $ pwcpp-assistant --help @@ -281,25 +302,31 @@ options: -bd BLOCK_DURATION, --block_duration BLOCK_DURATION minimum time audio updates in ms, default to 30 ``` -------------- -* Check the [examples folder](https://github.com/absadiki/pywhispercpp/tree/main/pywhispercpp/examples) for more examples. +--- + +- Check the [examples folder](https://github.com/absadiki/pywhispercpp/tree/main/pywhispercpp/examples) for more examples. # Advanced usage -* First check the [API documentation](https://absadiki.github.io/pywhispercpp/) for more advanced usage. -* If you are a more experienced user, you can access the exposed C-APIs directly from the binding module `_pywhispercpp`. + +- First check the [API documentation](https://absadiki.github.io/pywhispercpp/) for more advanced usage. +- If you are a more experienced user, you can access the exposed C-APIs directly from the binding module `_pywhispercpp`. ```python import _pywhispercpp as pwcpp -ctx = pwcpp.whisper_init_from_file('path/to/ggml/model') +ctx = pwcpp.whisper_init_from_file_with_params( + 'path/to/ggml/model', + pwcpp.whisper_context_default_params(), +) ``` # Discussions and contributions + If you find any bug, please open an [issue](https://github.com/absadiki/pywhispercpp/issues). 
If you have any feedback, or you want to share how you are using this project, feel free to use the [Discussions](https://github.com/absadiki/pywhispercpp/discussions) and open a new topic. # License -This project is licensed under the same license as [whisper.cpp](https://github.com/ggerganov/whisper.cpp/blob/master/LICENSE) (MIT [License](./LICENSE)). +This project is licensed under the same license as [whisper.cpp](https://github.com/ggerganov/whisper.cpp/blob/master/LICENSE) (MIT [License](./LICENSE)). diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py index 3864ed9..4833fb1 100644 --- a/pywhispercpp/model.py +++ b/pywhispercpp/model.py @@ -334,8 +334,10 @@ def available_languages() -> List[str]: @staticmethod def _resolve_context_params(context_params: Optional[ContextParams]): + resolved = pw.whisper_context_default_params() + if context_params is None: - return None + return resolved if not isinstance(context_params, dict): raise TypeError("context_params must be a ContextParams dict or None") @@ -346,7 +348,6 @@ def _resolve_context_params(context_params: Optional[ContextParams]): f"Unknown context_params keys: {', '.join(unknown_keys)}" ) - resolved = pw.whisper_context_default_params() for key, value in context_params.items(): setattr(resolved, key, value) return resolved @@ -381,10 +382,7 @@ def _init_model(self) -> None: """ logger.info("Initializing the model ...") with utils.redirect_stderr(to=self.redirect_whispercpp_logs_to): - if self._context_params is None: - self._ctx = pw.whisper_init_from_file(self.model_path) - else: - self._ctx = pw.whisper_init_from_file_with_params(self.model_path, self._context_params) + self._ctx = pw.whisper_init_from_file_with_params(self.model_path, self._context_params) if self.use_openvino: pw.whisper_ctx_init_openvino_encoder(self._ctx, self.openvino_model_path, self.openvino_device, self.openvino_cache_dir) diff --git a/src/main.cpp b/src/main.cpp index 35d74a2..6bc3c00 100644 --- a/src/main.cpp +++ 
b/src/main.cpp @@ -86,30 +86,6 @@ struct whisper_context_wrapper whisper_init_with_params_wrapper( return ctw_w; }; -struct whisper_context_wrapper whisper_init_from_file_wrapper(const char * path_model){ - struct whisper_context_params cparams = whisper_context_default_params(); - struct whisper_context * ctx = whisper_init_from_file_with_params(path_model, cparams); - struct whisper_context_wrapper ctw_w; - ctw_w.ptr = ctx; - return ctw_w; -} - -struct whisper_context_wrapper whisper_init_from_buffer_wrapper(void * buffer, size_t buffer_size){ - struct whisper_context_params cparams = whisper_context_default_params(); - struct whisper_context * ctx = whisper_init_from_buffer_with_params(buffer, buffer_size, cparams); - struct whisper_context_wrapper ctw_w; - ctw_w.ptr = ctx; - return ctw_w; -} - -struct whisper_context_wrapper whisper_init_wrapper(struct whisper_model_loader_wrapper * loader){ - struct whisper_context_params cparams = whisper_context_default_params(); - struct whisper_context * ctx = whisper_init_with_params(loader->ptr, cparams); - struct whisper_context_wrapper ctw_w; - ctw_w.ptr = ctx; - return ctw_w; -}; - void whisper_free_wrapper(struct whisper_context_wrapper * ctx_w){ whisper_free(ctx_w->ptr); }; @@ -939,22 +915,12 @@ PYBIND11_MODULE(_pywhispercpp, m) { m.def("whisper_context_default_params", &whisper_context_default_params, "Return the default context parameters used during model initialization."); - - DEF_RELEASE_GIL("whisper_init_from_file", &whisper_init_from_file_wrapper, "Various functions for loading a ggml whisper model.\n" - "Allocate (almost) all memory needed for the model.\n" - "Return NULL on failure"); DEF_RELEASE_GIL("whisper_init_from_file_with_params", &whisper_init_from_file_with_params_wrapper, "Various functions for loading a ggml whisper model.\n" "Allocate (almost) all memory needed for the model.\n" "Return NULL on failure"); - DEF_RELEASE_GIL("whisper_init_from_buffer", &whisper_init_from_buffer_wrapper, "Various 
functions for loading a ggml whisper model.\n" - "Allocate (almost) all memory needed for the model.\n" - "Return NULL on failure"); DEF_RELEASE_GIL("whisper_init_from_buffer_with_params", &whisper_init_from_buffer_with_params_wrapper, "Various functions for loading a ggml whisper model.\n" "Allocate (almost) all memory needed for the model.\n" "Return NULL on failure"); - DEF_RELEASE_GIL("whisper_init", &whisper_init_wrapper, "Various functions for loading a ggml whisper model.\n" - "Allocate (almost) all memory needed for the model.\n" - "Return NULL on failure"); DEF_RELEASE_GIL("whisper_init_with_params", &whisper_init_with_params_wrapper, "Various functions for loading a ggml whisper model.\n" "Allocate (almost) all memory needed for the model.\n" "Return NULL on failure"); diff --git a/tests/test_c_api.py b/tests/test_c_api.py index b415395..6138c1c 100644 --- a/tests/test_c_api.py +++ b/tests/test_c_api.py @@ -11,7 +11,10 @@ class TestCAPI(TestCase): model_file = './whisper.cpp/models/for-tests-ggml-tiny.en.bin' def test_whisper_init_from_file(self): - ctx = pw.whisper_init_from_file(self.model_file) + ctx = pw.whisper_init_from_file_with_params( + self.model_file, + pw.whisper_context_default_params(), + ) self.assertIsInstance(ctx, pw.whisper_context) def test_whisper_lang_str(self): From c80acd91c08c8bb514d60da6d346e1d9c156efe9 Mon Sep 17 00:00:00 2001 From: scottmonster <87917233+scottmonster@users.noreply.github.com> Date: Sat, 23 May 2026 19:24:01 -0500 Subject: [PATCH 15/16] revert language to empty string revert language back to "" instead of "en" so that whisper will auto-detect language --- pywhispercpp/constants.py | 2 +- pywhispercpp/model.py | 2 +- pywhispercpp/model.pyi | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pywhispercpp/constants.py b/pywhispercpp/constants.py index 4a6d1cd..bfe582f 100644 --- a/pywhispercpp/constants.py +++ b/pywhispercpp/constants.py @@ -216,7 +216,7 @@ 'type': str, 'description': 'for 
auto-detection, set to None, "" or "auto"', 'options': None, - 'default': "en" + 'default': "" }, 'detect_language': { 'type': bool, diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py index 4833fb1..453e152 100644 --- a/pywhispercpp/model.py +++ b/pywhispercpp/model.py @@ -139,7 +139,7 @@ def __init__(self, - `prompt_tokens`: explicit prompt token sequence. Default `None`. - `prompt_n_tokens`: number of prompt tokens. Default `0`. - `carry_initial_prompt`: prepend the initial prompt to each decode window. Default `False`. - - `language`: language code. Default `en`. + - `language`: language code. Default `''` (empty string, which auto-detects the language). - `detect_language`: enable automatic language detection during transcription. Default `False`. - `suppress_blank`: suppress blank outputs. Default `True`. - `suppress_non_speech_tokens`: Python alias for `suppress_nst`. Default `False`. diff --git a/pywhispercpp/model.pyi b/pywhispercpp/model.pyi index c2075fd..35cb735 100644 --- a/pywhispercpp/model.pyi +++ b/pywhispercpp/model.pyi @@ -79,7 +79,7 @@ class Model: prompt_tokens: Optional[Tuple[Any, ...]] = None, prompt_n_tokens: int = 0, carry_initial_prompt: bool = False, - language: str = 'en', + language: str = '', detect_language: bool = False, suppress_blank: bool = True, suppress_non_speech_tokens: bool = False, @@ -131,7 +131,7 @@ class Model: prompt_tokens: Optional[Tuple[Any, ...]] = None, prompt_n_tokens: int = 0, carry_initial_prompt: bool = False, - language: str = 'en', + language: str = '', detect_language: bool = False, suppress_blank: bool = True, suppress_non_speech_tokens: bool = False, From 490c545f808f2dacc9d1871e44fee4288a22eddb Mon Sep 17 00:00:00 2001 From: scottmonster <87917233+scottmonster@users.noreply.github.com> Date: Sat, 23 May 2026 20:00:43 -0500 Subject: [PATCH 16/16] update readme --- README.md | 102 +++++++++++++++++++++--------------------------------- 1 file changed, 39 insertions(+), 63 deletions(-) diff --git a/README.md 
b/README.md index 86ba9c0..018f18e 100644 
--- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ # pywhispercpp - Python bindings for [whisper.cpp](https://github.com/ggerganov/whisper.cpp) with a simple Pythonic API on top of it. [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) @@ -8,49 +7,38 @@ Python bindings for [whisper.cpp](https://github.com/ggerganov/whisper.cpp) with [![Downloads](https://static.pepy.tech/badge/pywhispercpp)](https://pepy.tech/project/pywhispercpp) # Table of contents - - -- [pywhispercpp](#pywhispercpp) -- [Table of contents](#table-of-contents) -- [Installation](#installation) - - [From source](#from-source) - - [Pre-built wheels](#pre-built-wheels) - - [NVIDIA GPU support](#nvidia-gpu-support) - - [CoreML support](#coreml-support) - - [Vulkan support](#vulkan-support) - - [OpenBLAS support](#openblas-support) - - [OpenVINO support](#openvino-support) -- [Quick start](#quick-start) -- [Examples](#examples) - - [CLI](#cli) - - [GUI](#gui) - - [Assistant](#assistant) -- [Advanced usage](#advanced-usage) -- [Discussions and contributions](#discussions-and-contributions) -- [License](#license) +* [Installation](#installation) + * [From source](#from-source) + * [Pre-built wheels](#pre-built-wheels) + * [NVIDIA GPU support](#nvidia-gpu-support) + * [CoreML support](#coreml-support) + * [Vulkan support](#vulkan-support) +* [Quick start](#quick-start) +* [Examples](#examples) + * [CLI](#cli) + * [GUI](#gui) + * [Assistant](#assistant) +* [Advanced usage](#advanced-usage) +* [Discussions and contributions](#discussions-and-contributions) +* [License](#license) # Installation ### From source - -- For the best performance, you need to install the package from source: - +* For the best performance, you need to install the package from source: ```shell pip install git+https://github.com/absadiki/pywhispercpp ``` - ### Pre-built wheels - -- Otherwise, Basic Pre-built CPU wheels are available on PYPI +* Otherwise, Basic Pre-built CPU wheels 
are available on PYPI ```shell pip install pywhispercpp # or pywhispercpp[examples] to install the extra dependencies needed for the examples ``` -[Optional] To transcribe files other than wav, you need to install ffmpeg: - +[Optional] To transcribe files other than wav, you need to install ffmpeg: ```shell # on Ubuntu or Debian sudo apt update && sudo apt install ffmpeg @@ -69,13 +57,11 @@ scoop install ffmpeg ``` ### NVIDIA GPU support - To Install the package with CUDA support, make sure you have [cuda](https://developer.nvidia.com/cuda-downloads) installed and use `GGML_CUDA=1`: ```shell GGML_CUDA=1 pip install git+https://github.com/absadiki/pywhispercpp ``` - ### CoreML support Install the package with `WHISPER_COREML=1`: @@ -95,7 +81,6 @@ GGML_VULKAN=1 pip install git+https://github.com/absadiki/pywhispercpp ### OpenBLAS support If OpenBLAS is installed, you can use `GGML_BLAS=1`. The other flags ensure you're installing fresh with the correct flags, and printing output for sanity checking. - ```shell GGML_BLAS=1 pip install git+https://github.com/absadiki/pywhispercpp --no-cache --force-reinstall -v ``` @@ -105,15 +90,16 @@ GGML_BLAS=1 pip install git+https://github.com/absadiki/pywhispercpp --no-cache Follow the the steps to download correct OpenVINO package (https://github.com/ggerganov/whisper.cpp?tab=readme-ov-file#openvino-support). Then init the OpenVINO environment and build. 
- ``` -source ~/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh +source ~/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh WHISPER_OPENVINO=1 pip install git+https://github.com/absadiki/pywhispercpp --no-cache --force-reinstall ``` Note that the toolkit for Ubuntu22 works on Ubuntu24 -\*\* **Feel free to update this list and submit a PR if you tested the package on other backends.** + +** __Feel free to update this list and submit a PR if you tested the package on other backends.__ + # Quick start @@ -135,22 +121,21 @@ model = Model('base.en', print_realtime=False, print_progress=False) segments = model.transcribe('file.mp3', new_segment_callback=print) ``` -- The model will be downloaded automatically, or you can use the path to a local model. -- You can pass any `whisper.cpp` [parameter](https://absadiki.github.io/pywhispercpp/#pywhispercpp.constants.PARAMS_SCHEMA) as a keyword argument to the `Model` class or to the `transcribe` function. -- Check the [Model](https://absadiki.github.io/pywhispercpp/#pywhispercpp.model.Model) class documentation for more details. + +* The model will be downloaded automatically, or you can use the path to a local model. +* You can pass any `whisper.cpp` [parameter](https://absadiki.github.io/pywhispercpp/#pywhispercpp.constants.PARAMS_SCHEMA) as a keyword argument to the `Model` class or to the `transcribe` function. +* Check the [Model](https://absadiki.github.io/pywhispercpp/#pywhispercpp.model.Model) class documentation for more details. # Examples ## CLI - -Just a straightforward example Command Line Interface. +Just a straightforward example Command Line Interface. 
You can use it as follows: ```shell pwcpp file.wav -m base --output-srt --print_realtime true ``` - -Run `pwcpp --help` to get the help message +Run ```pwcpp --help``` to get the help message ```shell usage: pwcpp [-h] [-m MODEL] [--version] [--processors PROCESSORS] [-otxt] [-ovtt] [-osrt] [-ocsv] [--strategy STRATEGY] @@ -244,17 +229,13 @@ options: ``` ## GUI - -If you prefer a Graphical User Interface, you can use the `pwcpp-gui` command which will launch A simple graphical interface built with PyQt5. - -- First you need to install the GUI dependencies: - +If you prefer a Graphical User Interface, you can use the `pwcpp-gui` command which will launch A simple graphical interface built with PyQt5. +* First you need to install the GUI dependencies: ```bash pip install pywhispercpp[gui] ``` -- Then you can run the GUI with: - +* Then you can run the GUI with: ```bash pwcpp-gui ``` @@ -267,25 +248,23 @@ The GUI provides a user-friendly way to: - View and export transcription results ## Assistant - -This is a simple example showcasing the use of `pywhispercpp` to create an assistant like example. -The idea is to use a Voice Activity Detector (VAD) to detect speech (in this example, we used webrtcvad), and when some speech is detected, we run the transcription. +This is a simple example showcasing the use of `pywhispercpp` to create an assistant like example. +The idea is to use a Voice Activity Detector (VAD) to detect speech (in this example, we used webrtcvad), and when some speech is detected, we run the transcription. It is inspired from the [whisper.cpp/examples/command](https://github.com/ggerganov/whisper.cpp/tree/master/examples/command) example. 
-You can check the source code [here](https://github.com/absadiki/pywhispercpp/blob/main/pywhispercpp/examples/assistant.py) +You can check the source code [here](https://github.com/absadiki/pywhispercpp/blob/main/pywhispercpp/examples/assistant.py) or you can use the class directly to create your own assistant: + ```python from pywhispercpp.examples.assistant import Assistant my_assistant = Assistant(commands_callback=print, n_threads=8) my_assistant.start() ``` - Here, we set the `commands_callback` to a simple print function, so the commands will just get printed on the screen. You can also run this example from the command line. - ```shell $ pwcpp-assistant --help @@ -302,15 +281,13 @@ options: -bd BLOCK_DURATION, --block_duration BLOCK_DURATION minimum time audio updates in ms, default to 30 ``` +------------- ---- - -- Check the [examples folder](https://github.com/absadiki/pywhispercpp/tree/main/pywhispercpp/examples) for more examples. +* Check the [examples folder](https://github.com/absadiki/pywhispercpp/tree/main/pywhispercpp/examples) for more examples. # Advanced usage - -- First check the [API documentation](https://absadiki.github.io/pywhispercpp/) for more advanced usage. -- If you are a more experienced user, you can access the exposed C-APIs directly from the binding module `_pywhispercpp`. +* First check the [API documentation](https://absadiki.github.io/pywhispercpp/) for more advanced usage. +* If you are a more experienced user, you can access the exposed C-APIs directly from the binding module `_pywhispercpp`. ```python import _pywhispercpp as pwcpp @@ -322,11 +299,10 @@ ctx = pwcpp.whisper_init_from_file_with_params( ``` # Discussions and contributions - If you find any bug, please open an [issue](https://github.com/absadiki/pywhispercpp/issues). If you have any feedback, or you want to share how you are using this project, feel free to use the [Discussions](https://github.com/absadiki/pywhispercpp/discussions) and open a new topic. 
# License -This project is licensed under the same license as [whisper.cpp](https://github.com/ggerganov/whisper.cpp/blob/master/LICENSE) (MIT [License](./LICENSE)). +This project is licensed under the same license as [whisper.cpp](https://github.com/ggerganov/whisper.cpp/blob/master/LICENSE) (MIT [License](./LICENSE)). \ No newline at end of file