Skip to content

Commit

Permalink
Add tokenizer IPA
Browse files Browse the repository at this point in the history
  • Loading branch information
nickovchinnikov committed Dec 22, 2023
1 parent 4b09324 commit 3e435cb
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 0 deletions.
4 changes: 4 additions & 0 deletions docs/training/preprocess/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ Preprocessing `PreprocessLibriTTS` audio and text data for use with a `TacotronS

The Wav2VecAligner model is designed for aligning audio data with text data.
This class handles the training and validation of the Wav2VecAligner model.

### [TokenizerIPA](./tokenizer_ipa.md)

A tokenizer that converts text into IPA phoneme tokens, with support for punctuation.
1 change: 1 addition & 0 deletions docs/training/preprocess/tokenizer_ipa.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
::: training.preprocess.tokenizer_ipa
33 changes: 33 additions & 0 deletions training/preprocess/tests/test_tokenizer_ipa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import unittest

from training.preprocess.tokenizer_ipa import TokenizerIPA

class TestTokenizerIPA(unittest.TestCase):
    r"""Unit tests for the TokenizerIPA phoneme tokenizer."""

    def setUp(self):
        # A fresh tokenizer instance for every test case.
        self.tokenizer = TokenizerIPA()

    def test_init(self):
        """Default construction resolves to en_us and builds both components."""
        tok = self.tokenizer
        self.assertEqual(tok.lang, "en_us")
        self.assertIsNotNone(tok.phonemizer)
        self.assertIsNotNone(tok.tokenizer)

    def test_call(self):
        """Calling the tokenizer yields an IPA string and integer token ids."""
        phones_ipa, token_ids = self.tokenizer("hello world")
        self.assertIsInstance(phones_ipa, str)
        self.assertIsInstance(token_ids, list)
        self.assertTrue(all(isinstance(t, int) for t in token_ids))

    def test_call_with_punctuation(self):
        """Punctuation must change both the IPA output and the token ids."""
        plain_phones, plain_ids = self.tokenizer("hello world")
        punct_phones, punct_ids = self.tokenizer("Hello, world!")

        self.assertNotEqual(plain_phones, punct_phones)
        self.assertNotEqual(plain_ids, punct_ids)

if __name__ == '__main__':
    unittest.main()
61 changes: 61 additions & 0 deletions training/preprocess/tokenizer_ipa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from typing import Union

from dp.phonemizer import Phonemizer
from dp.preprocessing.text import SequenceTokenizer

from model.config import get_lang_map

class TokenizerIPA:
    r"""
    Tokenizer for International Phonetic Alphabet (IPA) phoneme sequences.

    Wraps a ``dp`` ``Phonemizer`` (text -> IPA phonemes) together with a
    ``SequenceTokenizer`` (phonemes -> integer ids) whose symbol inventory
    covers IPA characters plus common punctuation.

    Attributes:
        lang (str): Phonemizer language code resolved via ``get_lang_map``.
        phonemizer (Phonemizer): Converts raw text into IPA phonemes.
        tokenizer (SequenceTokenizer): Maps phoneme symbols to integer ids.
    """

    def __init__(
        self,
        lang: str = "en",
        phonemizer_checkpoint: str = "checkpoints/en_us_cmudict_ipa_forward.pt"
    ):
        r"""
        Builds the phonemizer and the phoneme-symbol tokenizer.

        Args:
            lang (str): Language key understood by ``get_lang_map``. Default is "en".
            phonemizer_checkpoint (str): Path to the phonemizer checkpoint file.
        """
        # Resolve the generic language key to the phonemizer-specific code.
        self.lang = get_lang_map(lang).phonemizer

        self.phonemizer = Phonemizer.from_checkpoint(phonemizer_checkpoint)

        # IPA characters recognized by the tokenizer.
        ipa_symbols = [
            'a', 'b', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'æ', 'ç', 'ð', 'ø', 'ŋ', 'œ', 'ɐ', 'ɑ', 'ɔ', 'ə', 'ɛ', 'ɝ', 'ɹ', 'ɡ', 'ɪ', 'ʁ', 'ʃ', 'ʊ', 'ʌ', 'ʏ', 'ʒ', 'ʔ', 'ˈ', 'ˌ', 'ː', '̃', '̍', '̥', '̩', '̯', '͡', 'θ',
        ]
        # Punctuation marks kept as tokens in their own right.
        punctuation = [
            '!', '?', ',', '.', '-', ':', ';', '"', "'", '(', ')',
        ]

        self.tokenizer = SequenceTokenizer(
            ipa_symbols + punctuation,
            languages=['de', 'en_us'],
            lowercase=True,
            char_repeats=1,
            append_start_end=True,
        )

    def __call__(self, text: str) -> tuple[Union[str, list[str]], list[int]]:
        r"""
        Phonemizes the input text, then tokenizes the resulting phonemes.

        Args:
            text (str): The input text to be tokenized.

        Returns:
            tuple: ``(phones_ipa, token_ids)`` — the IPA transcription produced
            by the phonemizer and its integer token encoding.
        """
        phones_ipa = self.phonemizer(text, lang=self.lang)
        token_ids = self.tokenizer(phones_ipa, language=self.lang)
        return phones_ipa, token_ids

0 comments on commit 3e435cb

Please sign in to comment.