Skip to content

Commit

Permalink
Add tokenizer IPA
Browse files Browse the repository at this point in the history
  • Loading branch information
nickovchinnikov committed Dec 22, 2023
1 parent 4b09324 commit 3e435cb
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 0 deletions.
4 changes: 4 additions & 0 deletions docs/training/preprocess/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ Preprocessing `PreprocessLibriTTS` audio and text data for use with a `TacotronS

The Wav2VecAligner model is designed for aligning audio data with text data.
This class handles the training and validation of the Wav2VecAligner model.

### [TokenizerIPA](./tokenizer_ipa.md)

A tokenizer that converts text into IPA phoneme tokens, with support for punctuation.
1 change: 1 addition & 0 deletions docs/training/preprocess/tokenizer_ipa.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
::: training.preprocess.tokenizer_ipa
33 changes: 33 additions & 0 deletions training/preprocess/tests/test_tokenizer_ipa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import unittest

from training.preprocess.tokenizer_ipa import TokenizerIPA

class TestTokenizerIPA(unittest.TestCase):
    r"""Unit tests for the TokenizerIPA phoneme tokenizer."""

    def setUp(self):
        # A fresh tokenizer instance for every test case.
        self.tokenizer = TokenizerIPA()

    def test_init(self):
        """Default construction resolves to en_us and builds both components."""
        tok = self.tokenizer
        self.assertEqual(tok.lang, "en_us")
        self.assertIsNotNone(tok.phonemizer)
        self.assertIsNotNone(tok.tokenizer)

    def test_call(self):
        """Calling the tokenizer yields an IPA string and integer token ids."""
        phones_ipa, token_ids = self.tokenizer("hello world")
        self.assertIsInstance(phones_ipa, str)
        self.assertIsInstance(token_ids, list)
        self.assertTrue(all(isinstance(t, int) for t in token_ids))

    def test_call_with_punctuation(self):
        """Punctuation must change both the IPA output and the token ids."""
        plain_phones, plain_ids = self.tokenizer("hello world")
        punct_phones, punct_ids = self.tokenizer("Hello, world!")

        self.assertNotEqual(plain_phones, punct_phones)
        self.assertNotEqual(plain_ids, punct_ids)

if __name__ == '__main__':
    unittest.main()
61 changes: 61 additions & 0 deletions training/preprocess/tokenizer_ipa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from typing import Union

from dp.phonemizer import Phonemizer
from dp.preprocessing.text import SequenceTokenizer

from model.config import get_lang_map

class TokenizerIPA:
    r"""
    Tokenizer for International Phonetic Alphabet (IPA) phoneme sequences.

    Wraps a ``dp`` ``Phonemizer`` (text -> IPA phonemes) together with a
    ``SequenceTokenizer`` (phonemes -> integer ids) whose symbol inventory
    covers IPA characters plus common punctuation.

    Attributes:
        lang (str): Phonemizer language code resolved via ``get_lang_map``.
        phonemizer (Phonemizer): Converts raw text into IPA phonemes.
        tokenizer (SequenceTokenizer): Maps phoneme symbols to integer ids.
    """

    def __init__(
        self,
        lang: str = "en",
        phonemizer_checkpoint: str = "checkpoints/en_us_cmudict_ipa_forward.pt"
    ):
        r"""
        Builds the phonemizer and the phoneme-symbol tokenizer.

        Args:
            lang (str): Language key understood by ``get_lang_map``. Default is "en".
            phonemizer_checkpoint (str): Path to the phonemizer checkpoint file.
        """
        # Resolve the generic language key to the phonemizer-specific code.
        self.lang = get_lang_map(lang).phonemizer

        self.phonemizer = Phonemizer.from_checkpoint(phonemizer_checkpoint)

        # IPA characters recognized by the tokenizer.
        ipa_symbols = [
            'a', 'b', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'æ', 'ç', 'ð', 'ø', 'ŋ', 'œ', 'ɐ', 'ɑ', 'ɔ', 'ə', 'ɛ', 'ɝ', 'ɹ', 'ɡ', 'ɪ', 'ʁ', 'ʃ', 'ʊ', 'ʌ', 'ʏ', 'ʒ', 'ʔ', 'ˈ', 'ˌ', 'ː', '̃', '̍', '̥', '̩', '̯', '͡', 'θ',
        ]
        # Punctuation marks kept as tokens in their own right.
        punctuation = [
            '!', '?', ',', '.', '-', ':', ';', '"', "'", '(', ')',
        ]

        self.tokenizer = SequenceTokenizer(
            ipa_symbols + punctuation,
            languages=['de', 'en_us'],
            lowercase=True,
            char_repeats=1,
            append_start_end=True,
        )

    def __call__(self, text: str) -> tuple[Union[str, list[str]], list[int]]:
        r"""
        Phonemizes the input text, then tokenizes the resulting phonemes.

        Args:
            text (str): The input text to be tokenized.

        Returns:
            tuple: ``(phones_ipa, token_ids)`` — the IPA transcription produced
            by the phonemizer and its integer token encoding.
        """
        phones_ipa = self.phonemizer(text, lang=self.lang)
        token_ids = self.tokenizer(phones_ipa, language=self.lang)
        return phones_ipa, token_ids

0 comments on commit 3e435cb

Please sign in to comment.