Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ The first gen Chinese tokenizer uses jieba to cut, pypinyin, and pinyin-to-ipa.
### Vietnamese
- https://github.com/v-nhandt21/Viphoneme

### Hebrew

- https://github.com/thewh1teagle/phonikud

### TODO
- [ ] Data: Compress [data](https://github.com/hexgrad/misaki/tree/main/misaki/data) (no need for indented json) and eliminate redundancy between gold and silver dictionaries.
- [ ] Fallbacks: Train seq2seq fallback models on dictionaries using [this notebook](https://github.com/Kyubyong/nlp_made_easy/blob/master/PyTorch%20seq2seq%20template%20based%20on%20the%20g2p%20task.ipynb).
Expand Down
18 changes: 5 additions & 13 deletions misaki/he.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,14 @@
"""
Phonemize Hebrew using mishkal package from https://github.com/thewh1teagle/mishkal
Phonemize Hebrew using phonikud package from https://github.com/thewh1teagle/phonikud
"""

import mishkal
import phonikud

class HEG2P:
def __call__(self, text: str, preserve_punctuation = True, preserve_stress = True):
def __call__(self, text: str, preserve_punctuation = True, preserve_stress = True, **kwargs):
"""
Convert Hebrew text to IPA
Text is expected to be with diacritics (niqqud)
Enable debug to return Word objects that contain detailed conversion information
Text is expected to be with enhanced diacritics (nikud)
"""

return mishkal.phonemize(text, preserve_punctuation=preserve_punctuation, preserve_stress=preserve_stress)

def get_phonene_set(self):
"""
Return list with exact phonemes used in mishkal package
"""

return mishkal.get_phoneme_set()
return phonikud.phonemize(text, preserve_punctuation=preserve_punctuation, preserve_stress=preserve_stress, **kwargs)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ ja = ["fugashi", "jaconv", "mojimoji", "unidic", "pyopenjtalk"]
ko = ["jamo", "nltk"]
zh = ["jieba", "ordered-set", "pypinyin", "cn2an", "pypinyin-dict"]
vi = ["num2words", "spacy", "spacy-curated-transformers", "underthesea"]
he = ["mishkal-hebrew>=0.3.2"]
he = ["phonikud>=0.3.9"]

[build-system]
requires = ["hatchling"]
Expand Down
Loading