diff --git a/egs/csj/ASR/.gitignore b/egs/csj/ASR/.gitignore
new file mode 100644
index 0000000000..c0a162e206
--- /dev/null
+++ b/egs/csj/ASR/.gitignore
@@ -0,0 +1,7 @@
+librispeech_*.*
+todelete*
+lang*
+notify_tg.py
+finetune_*
+misc.ini
+.vscode/*
\ No newline at end of file
diff --git a/egs/csj/ASR/local/compute_fbank_csj.py b/egs/csj/ASR/local/compute_fbank_csj.py
new file mode 100644
index 0000000000..994dedbddd
--- /dev/null
+++ b/egs/csj/ASR/local/compute_fbank_csj.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+# Copyright 2022 The University of Electro-Communications (Author: Teo Wen Shen)  # noqa
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+import logging
+import os
+from itertools import islice
+from pathlib import Path
+from random import Random
+from typing import List, Tuple
+
+import torch
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    # fmt: off
+    # See the following for why LilcomChunkyWriter is preferred
+    # https://github.com/k2-fsa/icefall/pull/404
+    # https://github.com/lhotse-speech/lhotse/pull/527
+    # fmt: on
+    LilcomChunkyWriter,
+    RecordingSet,
+    SupervisionSet,
+)
+
+ARGPARSE_DESCRIPTION = """
+This script follows the espnet method of splitting the remaining core+noncore
+utterances into valid and train cutsets at an index which is by default 4000.
+
+In other words, the core+noncore utterances are shuffled, where 4000 utterances
+of the shuffled set go to the `valid` cutset and are not subject to speed
+perturbation. The remaining utterances become the `train` cutset and are speed-
+perturbed (0.9x, 1.0x, 1.1x).
+
+"""
+
+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slows things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+RNG_SEED = 42
+
+
+def make_cutset_blueprints(
+    manifest_dir: Path,
+    split: int,
+) -> List[Tuple[str, CutSet]]:
+    """Build the (name, CutSet) pairs whose features will be computed.
+
+    Each of the eval1..eval3 manifests becomes its own cut set. The core and
+    noncore manifests are concatenated, trimmed to supervisions and shuffled
+    with the fixed seed RNG_SEED; the first `split` cuts become `valid` and
+    the rest become `train`, extended with 0.9x and 1.1x speed-perturbed
+    copies.
+
+    Args:
+      manifest_dir:
+        Directory containing the csj_recordings_*.jsonl.gz and
+        csj_supervisions_*.jsonl.gz manifests.
+      split:
+        Number of shuffled core+noncore cuts that go to `valid`.
+
+    Returns:
+      (name, cut_set) pairs in the order eval1, eval2, eval3, valid, train.
+    """
+    cut_sets = []
+
+    # Create eval datasets
+    logging.info("Creating eval cuts.")
+    for i in range(1, 4):
+        cut_set = CutSet.from_manifests(
+            recordings=RecordingSet.from_file(
+                manifest_dir / f"csj_recordings_eval{i}.jsonl.gz"
+            ),
+            supervisions=SupervisionSet.from_file(
+                manifest_dir / f"csj_supervisions_eval{i}.jsonl.gz"
+            ),
+        )
+        cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
+        cut_sets.append((f"eval{i}", cut_set))
+
+    # Create train and valid cuts
+    logging.info(
+        "Loading, trimming, and shuffling the remaining core+noncore cuts."
+    )
+    recording_set = RecordingSet.from_file(
+        manifest_dir / "csj_recordings_core.jsonl.gz"
+    ) + RecordingSet.from_file(manifest_dir / "csj_recordings_noncore.jsonl.gz")
+    supervision_set = SupervisionSet.from_file(
+        manifest_dir / "csj_supervisions_core.jsonl.gz"
+    ) + SupervisionSet.from_file(
+        manifest_dir / "csj_supervisions_noncore.jsonl.gz"
+    )
+
+    cut_set = CutSet.from_manifests(
+        recordings=recording_set,
+        supervisions=supervision_set,
+    )
+    cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
+    # Fixed seed so that the valid/train partition is the same on every run.
+    cut_set = cut_set.shuffle(Random(RNG_SEED))
+
+    logging.info(
+        "Creating valid and train cuts from core and noncore, "
+        f"split at {split}."
+    )
+    valid_set = CutSet.from_cuts(islice(cut_set, 0, split))
+
+    # Only the training portion is speed-perturbed.
+    train_set = CutSet.from_cuts(islice(cut_set, split, None))
+    train_set = (
+        train_set + train_set.perturb_speed(0.9) + train_set.perturb_speed(1.1)
+    )
+
+    cut_sets.extend([("valid", valid_set), ("train", train_set)])
+
+    return cut_sets
+
+
+def get_args():
+    """Parse the command-line arguments of this script."""
+    parser = argparse.ArgumentParser(
+        description=ARGPARSE_DESCRIPTION,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    # Both directories are mandatory: main() joins paths onto them, which
+    # would otherwise fail with an opaque TypeError on None.
+    parser.add_argument(
+        "--manifest-dir",
+        type=Path,
+        required=True,
+        help="Path to save manifests",
+    )
+    parser.add_argument(
+        "--fbank-dir",
+        type=Path,
+        required=True,
+        help="Path to save fbank features",
+    )
+    parser.add_argument(
+        "--split", type=int, default=4000, help="Split at this index"
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    """Compute fbank features for every CSJ cut set unless already done."""
+    args = get_args()
+
+    extractor = Fbank(FbankConfig(num_mel_bins=80))
+    # os.cpu_count() may return None when the count cannot be determined.
+    num_jobs = min(16, os.cpu_count() or 1)
+
+    formatter = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    )
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    done_marker = args.fbank_dir / ".done"
+    if done_marker.exists():
+        logging.info(
+            "Previous fbank computed for CSJ found. "
+            f"Delete {done_marker} to allow recomputing fbank."
+        )
+        return
+
+    cut_sets = make_cutset_blueprints(args.manifest_dir, args.split)
+    for part, cut_set in cut_sets:
+        logging.info(f"Processing {part}")
+        cut_set = cut_set.compute_and_store_features(
+            extractor=extractor,
+            num_jobs=num_jobs,
+            storage_path=(args.fbank_dir / f"feats_{part}").as_posix(),
+            storage_type=LilcomChunkyWriter,
+        )
+        cut_set.to_file(args.manifest_dir / f"csj_cuts_{part}.jsonl.gz")
+
+    logging.info("All fbank computed for CSJ.")
+    done_marker.touch()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/csj/ASR/local/compute_fbank_musan.py b/egs/csj/ASR/local/compute_fbank_musan.py
new file mode 120000
index 0000000000..5833f2484e
--- /dev/null
+++ b/egs/csj/ASR/local/compute_fbank_musan.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/compute_fbank_musan.py
\ No newline at end of file
diff --git a/egs/csj/ASR/local/conf/disfluent.ini b/egs/csj/ASR/local/conf/disfluent.ini
new file mode 100644
index 0000000000..eb70673de7
--- /dev/null
+++ b/egs/csj/ASR/local/conf/disfluent.ini
@@ -0,0 +1,321 @@
+; # This section is ignored if this file is not supplied as the first config file to
+; # lhotse prepare csj
+[SEGMENTS]
+; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
+gap = 0.5
+; # Maximum length of segment (s).
+maxlen = 10
+; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
+minlen = 0.02
+; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
+; # Pass an empty string to avoid adding any symbol. It was "" in kaldi.
+; # If you intend to use a multicharacter string for gap_sym, remember to register the
+; # multicharacter string as part of userdef-string in prepare_lang_char.py.
+gap_sym = + +[CONSTANTS] +; # Name of this mode +MODE = disfluent +; # Suffixes to use after the word surface (no longer used) +MORPH = pos1 cForm cType2 pos2 +; # Used to differentiate between A tag and A_num tag +JPN_NUM = ゼロ 0 零 一 二 三 四 五 六 七 八 九 十 百 千 . +; # Dummy character to delineate multiline words +PLUS = + + +[DECISIONS] +; # TAG+'^'とは、タグが一つの転記単位に独立していない場合 +; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries + +; # フィラー、感情表出系感動詞 +; # 0 to remain, 1 to delete +; # Example: '(F ぎょっ)' +F = 0 +; # Example: '(L (F ン))', '比べ(F えー)る' +F^ = 0 +; # 言い直し、いいよどみなどによる語断片 +; # 0 to remain, 1 to delete +; # Example: '(D だ)(D だいが) 大学の学部の会議' +D = 0 +; # Example: '(L (D ドゥ)+(D ヒ))' +D^ = 0 +; # 助詞、助動詞、接辞の言い直し +; # 0 to remain, 1 to delete +; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか' +D2 = 0 +; # Example: '(X (D2 ノ))' +D2^ = 0 +; # 聞き取りや語彙の判断に自信がない場合 +; # 0 to remain, 1 to delete +; # Example: (? 字数) の +; # If no option: empty string is returned regardless of output +; # Example: '(?) で' +? = 0 +; # Example: '(D (? すー))+そう+です+よ+ね' +?^ = 0 +; # タグ?で、値は複数の候補が想定される場合 +; # 0 for main guess with matching morph info, 1 for second guess +; # Example: '(? 次数, 実数)', '(? これ,ここで)+(? 説明+し+た+方+が+いい+か+な)' +?, = 0 +; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))' +?,^ = 0 +; # 音や言葉に関するメタ的な引用 +; # 0 to remain, 1 to delete +; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)' +M = 0 +; # Example: '(L (M ヒ)+(M ヒ))', '(L (M (? 
ヒ+ヒ)))' +M^ = 0 +; # 外国語や古語、方言など +; # 0 to remain, 1 to delete +; # Example: '(O ザッツファイン)' +O = 0 +; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))' +O^ = 0 +; # 講演者の名前、差別語、誹謗中傷など +; # 0 to remain, 1 to delete +; # Example: '国語研の (R ××) です' +R = 0 +R^ = 0 +; # 非朗読対象発話(朗読における言い間違い等) +; # 0 to remain, 1 to delete +; # Example: '(X 実際は) 実際には' +X = 0 +; # Example: '(L (X (D2 ニ)))' +X^ = 0 +; # アルファベットや算用数字、記号の表記 +; # 0 to use Japanese form, 1 to use alphabet form +; # Example: '(A シーディーアール;CD-R)' +A = 1 +; # Example: 'スモール(A エヌ;N)', 'ラージ(A キュー;Q)', '(A ティーエフ;TF)+(A アイディーエフ;IDF)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)') +A^ = 1 +; # タグAで、単語は算用数字の場合 +; # 0 to use Japanese form, 1 to use Arabic numerals +; # Example: (A 二千;2000) +A_num = eval:self.notag +A_num^ = eval:self.notag +; # 何らかの原因で漢字表記できなくなった場合 +; # 0 to use broken form, 1 to use orthodox form +; # Example: '(K たち (F えー) ばな;橘)' +K = 1 +; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)' +K^ = 1 +; # 転訛、発音の怠けなど、一時的な発音エラー +; # 0 to use wrong form, 1 to use orthodox form +; # Example: '(W ギーツ;ギジュツ)' +W = 1 +; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)' +W^ = 1 +; # 語の読みに関する知識レベルのいい間違い +; # 0 to use wrong form, 1 to use orthodox form +; # Example: '(B シブタイ;ジュータイ)' +B = 0 +; # Example: 'データー(B カズ;スー)' +B^ = 0 +; # 笑いながら発話 +; # 0 to remain, 1 to delete +; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)' +笑 = 0 +; # Example: 'コク(笑 サイ+(D オン))', +笑^ = 0 +; # 泣きながら発話 +; # 0 to remain, 1 to delete +; # Example: '(泣 ドンナニ)' +泣 = 0 +泣^ = 0 +; # 咳をしながら発話 +; # 0 to remain, 1 to delete +; # Example: 'シャ(咳 リン) ノ' +咳 = 0 +; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)' +咳^ = 0 +; # ささやき声や独り言などの小さな声 +; # 0 to remain, 1 to delete +; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))+(? 
セツメー+シ+タ+ホー+ガ+イー+カ+ナ))' +L = 0 +; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト' +L^ = 0 + +[REPLACEMENTS] +; # ボーカルフライなどで母音が同定できない場合 + = +; # 「うん/うーん/ふーん」の音の特定が困難な場合 + = +; # 非語彙的な母音の引き延ばし + = +; # 非語彙的な子音の引き延ばし + = +; # 言語音と独立に講演者の笑いが生じている場合 +<笑> = +; # 言語音と独立に講演者の咳が生じている場合 +<咳> = +; # 言語音と独立に講演者の息が生じている場合 +<息> = +; # 講演者の泣き声 +<泣> = +; # 聴衆(司会者なども含む)の発話 +<フロア発話> = +; # 聴衆の笑い +<フロア笑> = +; # 聴衆の拍手 +<拍手> = +; # 講演者が発表中に用いたデモンストレーションの音声 +<デモ> = +; # 学会講演に発表時間を知らせるためにならすベルの音 +<ベル> = +; # 転記単位全体が再度読み直された場合 +<朗読間違い> = +; # 上記以外の音で特に目立った音 +<雑音> = +; # 0.2秒以上のポーズ +

= +; # Redacted information, for R +; # It is \x00D7 multiplication sign, not your normal 'x' +× = × + +[FIELDS] +; # Time information for segment +time = 3 +; # Word surface +surface = 5 +; # Word surface root form without CSJ tags +notag = 9 +; # Part Of Speech +pos1 = 11 +; # Conjugated Form +cForm = 12 +; # Conjugation Type +cType1 = 13 +; # Subcategory of POS +pos2 = 14 +; # Euphonic Change / Subcategory of Conjugation Type +cType2 = 15 +; # Other information +other = 16 +; # Pronunciation for lexicon +pron = 10 +; # Speaker ID +spk_id = 2 + +[KATAKANA2ROMAJI] +ア = 'a +イ = 'i +ウ = 'u +エ = 'e +オ = 'o +カ = ka +キ = ki +ク = ku +ケ = ke +コ = ko +ガ = ga +ギ = gi +グ = gu +ゲ = ge +ゴ = go +サ = sa +シ = si +ス = su +セ = se +ソ = so +ザ = za +ジ = zi +ズ = zu +ゼ = ze +ゾ = zo +タ = ta +チ = ti +ツ = tu +テ = te +ト = to +ダ = da +ヂ = di +ヅ = du +デ = de +ド = do +ナ = na +ニ = ni +ヌ = nu +ネ = ne +ノ = no +ハ = ha +ヒ = hi +フ = hu +ヘ = he +ホ = ho +バ = ba +ビ = bi +ブ = bu +ベ = be +ボ = bo +パ = pa +ピ = pi +プ = pu +ペ = pe +ポ = po +マ = ma +ミ = mi +ム = mu +メ = me +モ = mo +ヤ = ya +ユ = yu +ヨ = yo +ラ = ra +リ = ri +ル = ru +レ = re +ロ = ro +ワ = wa +ヰ = we +ヱ = wi +ヲ = wo +ン = ŋ +ッ = q +ー = - +キャ = kǐa +キュ = kǐu +キョ = kǐo +ギャ = gǐa +ギュ = gǐu +ギョ = gǐo +シャ = sǐa +シュ = sǐu +ショ = sǐo +ジャ = zǐa +ジュ = zǐu +ジョ = zǐo +チャ = tǐa +チュ = tǐu +チョ = tǐo +ヂャ = dǐa +ヂュ = dǐu +ヂョ = dǐo +ニャ = nǐa +ニュ = nǐu +ニョ = nǐo +ヒャ = hǐa +ヒュ = hǐu +ヒョ = hǐo +ビャ = bǐa +ビュ = bǐu +ビョ = bǐo +ピャ = pǐa +ピュ = pǐu +ピョ = pǐo +ミャ = mǐa +ミュ = mǐu +ミョ = mǐo +リャ = rǐa +リュ = rǐu +リョ = rǐo +ァ = a +ィ = i +ゥ = u +ェ = e +ォ = o +ヮ = ʍ +ヴ = vu +ャ = ǐa +ュ = ǐu +ョ = ǐo + diff --git a/egs/csj/ASR/local/conf/fluent.ini b/egs/csj/ASR/local/conf/fluent.ini new file mode 100644 index 0000000000..5d22f9eb8a --- /dev/null +++ b/egs/csj/ASR/local/conf/fluent.ini @@ -0,0 +1,321 @@ +; # This section is ignored if this file is not supplied as the first config file to +; # lhotse prepare csj +[SEGMENTS] +; # Allowed period of nonverbal noise. 
If exceeded, a new segment is created. +gap = 0.5 +; # Maximum length of segment (s). +maxlen = 10 +; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently. +minlen = 0.02 +; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`. +; # Pass an empty string to avoid adding any symbol. It was "" in kaldi. +; # If you intend to use a multicharacter string for gap_sym, remember to register the +; # multicharacter string as part of userdef-string in prepare_lang_char.py. +gap_sym = + +[CONSTANTS] +; # Name of this mode +MODE = fluent +; # Suffixes to use after the word surface (no longer used) +MORPH = pos1 cForm cType2 pos2 +; # Used to differentiate between A tag and A_num tag +JPN_NUM = ゼロ 0 零 一 二 三 四 五 六 七 八 九 十 百 千 . +; # Dummy character to delineate multiline words +PLUS = + + +[DECISIONS] +; # TAG+'^'とは、タグが一つの転記単位に独立していない場合 +; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries + +; # フィラー、感情表出系感動詞 +; # 0 to remain, 1 to delete +; # Example: '(F ぎょっ)' +F = 1 +; # Example: '(L (F ン))', '比べ(F えー)る' +F^ = 1 +; # 言い直し、いいよどみなどによる語断片 +; # 0 to remain, 1 to delete +; # Example: '(D だ)(D だいが) 大学の学部の会議' +D = 1 +; # Example: '(L (D ドゥ)+(D ヒ))' +D^ = 1 +; # 助詞、助動詞、接辞の言い直し +; # 0 to remain, 1 to delete +; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか' +D2 = 1 +; # Example: '(X (D2 ノ))' +D2^ = 1 +; # 聞き取りや語彙の判断に自信がない場合 +; # 0 to remain, 1 to delete +; # Example: (? 字数) の +; # If no option: empty string is returned regardless of output +; # Example: '(?) で' +? = 0 +; # Example: '(D (? すー))+そう+です+よ+ね' +?^ = 0 +; # タグ?で、値は複数の候補が想定される場合 +; # 0 for main guess with matching morph info, 1 for second guess +; # Example: '(? 次数, 実数)', '(? これ,ここで)+(? 説明+し+た+方+が+いい+か+な)' +?, = 0 +; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))' +?,^ = 0 +; # 音や言葉に関するメタ的な引用 +; # 0 to remain, 1 to delete +; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)' +M = 0 +; # Example: '(L (M ヒ)+(M ヒ))', '(L (M (? 
ヒ+ヒ)))' +M^ = 0 +; # 外国語や古語、方言など +; # 0 to remain, 1 to delete +; # Example: '(O ザッツファイン)' +O = 0 +; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))' +O^ = 0 +; # 講演者の名前、差別語、誹謗中傷など +; # 0 to remain, 1 to delete +; # Example: '国語研の (R ××) です' +R = 0 +R^ = 0 +; # 非朗読対象発話(朗読における言い間違い等) +; # 0 to remain, 1 to delete +; # Example: '(X 実際は) 実際には' +X = 0 +; # Example: '(L (X (D2 ニ)))' +X^ = 0 +; # アルファベットや算用数字、記号の表記 +; # 0 to use Japanese form, 1 to use alphabet form +; # Example: '(A シーディーアール;CD-R)' +A = 1 +; # Example: 'スモール(A エヌ;N)', 'ラージ(A キュー;Q)', '(A ティーエフ;TF)+(A アイディーエフ;IDF)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)') +A^ = 1 +; # タグAで、単語は算用数字の場合 +; # 0 to use Japanese form, 1 to use Arabic numerals +; # Example: (A 二千;2000) +A_num = eval:self.notag +A_num^ = eval:self.notag +; # 何らかの原因で漢字表記できなくなった場合 +; # 0 to use broken form, 1 to use orthodox form +; # Example: '(K たち (F えー) ばな;橘)' +K = 1 +; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)' +K^ = 1 +; # 転訛、発音の怠けなど、一時的な発音エラー +; # 0 to use wrong form, 1 to use orthodox form +; # Example: '(W ギーツ;ギジュツ)' +W = 1 +; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)' +W^ = 1 +; # 語の読みに関する知識レベルのいい間違い +; # 0 to use wrong form, 1 to use orthodox form +; # Example: '(B シブタイ;ジュータイ)' +B = 0 +; # Example: 'データー(B カズ;スー)' +B^ = 0 +; # 笑いながら発話 +; # 0 to remain, 1 to delete +; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)' +笑 = 0 +; # Example: 'コク(笑 サイ+(D オン))', +笑^ = 0 +; # 泣きながら発話 +; # 0 to remain, 1 to delete +; # Example: '(泣 ドンナニ)' +泣 = 0 +泣^ = 0 +; # 咳をしながら発話 +; # 0 to remain, 1 to delete +; # Example: 'シャ(咳 リン) ノ' +咳 = 0 +; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)' +咳^ = 0 +; # ささやき声や独り言などの小さな声 +; # 0 to remain, 1 to delete +; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))+(? 
セツメー+シ+タ+ホー+ガ+イー+カ+ナ))' +L = 0 +; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト' +L^ = 0 + +[REPLACEMENTS] +; # ボーカルフライなどで母音が同定できない場合 + = +; # 「うん/うーん/ふーん」の音の特定が困難な場合 + = +; # 非語彙的な母音の引き延ばし + = +; # 非語彙的な子音の引き延ばし + = +; # 言語音と独立に講演者の笑いが生じている場合 +<笑> = +; # 言語音と独立に講演者の咳が生じている場合 +<咳> = +; # 言語音と独立に講演者の息が生じている場合 +<息> = +; # 講演者の泣き声 +<泣> = +; # 聴衆(司会者なども含む)の発話 +<フロア発話> = +; # 聴衆の笑い +<フロア笑> = +; # 聴衆の拍手 +<拍手> = +; # 講演者が発表中に用いたデモンストレーションの音声 +<デモ> = +; # 学会講演に発表時間を知らせるためにならすベルの音 +<ベル> = +; # 転記単位全体が再度読み直された場合 +<朗読間違い> = +; # 上記以外の音で特に目立った音 +<雑音> = +; # 0.2秒以上のポーズ +

= +; # Redacted information, for R +; # It is \x00D7 multiplication sign, not your normal 'x' +× = × + +[FIELDS] +; # Time information for segment +time = 3 +; # Word surface +surface = 5 +; # Word surface root form without CSJ tags +notag = 9 +; # Part Of Speech +pos1 = 11 +; # Conjugated Form +cForm = 12 +; # Conjugation Type +cType1 = 13 +; # Subcategory of POS +pos2 = 14 +; # Euphonic Change / Subcategory of Conjugation Type +cType2 = 15 +; # Other information +other = 16 +; # Pronunciation for lexicon +pron = 10 +; # Speaker ID +spk_id = 2 + +[KATAKANA2ROMAJI] +ア = 'a +イ = 'i +ウ = 'u +エ = 'e +オ = 'o +カ = ka +キ = ki +ク = ku +ケ = ke +コ = ko +ガ = ga +ギ = gi +グ = gu +ゲ = ge +ゴ = go +サ = sa +シ = si +ス = su +セ = se +ソ = so +ザ = za +ジ = zi +ズ = zu +ゼ = ze +ゾ = zo +タ = ta +チ = ti +ツ = tu +テ = te +ト = to +ダ = da +ヂ = di +ヅ = du +デ = de +ド = do +ナ = na +ニ = ni +ヌ = nu +ネ = ne +ノ = no +ハ = ha +ヒ = hi +フ = hu +ヘ = he +ホ = ho +バ = ba +ビ = bi +ブ = bu +ベ = be +ボ = bo +パ = pa +ピ = pi +プ = pu +ペ = pe +ポ = po +マ = ma +ミ = mi +ム = mu +メ = me +モ = mo +ヤ = ya +ユ = yu +ヨ = yo +ラ = ra +リ = ri +ル = ru +レ = re +ロ = ro +ワ = wa +ヰ = we +ヱ = wi +ヲ = wo +ン = ŋ +ッ = q +ー = - +キャ = kǐa +キュ = kǐu +キョ = kǐo +ギャ = gǐa +ギュ = gǐu +ギョ = gǐo +シャ = sǐa +シュ = sǐu +ショ = sǐo +ジャ = zǐa +ジュ = zǐu +ジョ = zǐo +チャ = tǐa +チュ = tǐu +チョ = tǐo +ヂャ = dǐa +ヂュ = dǐu +ヂョ = dǐo +ニャ = nǐa +ニュ = nǐu +ニョ = nǐo +ヒャ = hǐa +ヒュ = hǐu +ヒョ = hǐo +ビャ = bǐa +ビュ = bǐu +ビョ = bǐo +ピャ = pǐa +ピュ = pǐu +ピョ = pǐo +ミャ = mǐa +ミュ = mǐu +ミョ = mǐo +リャ = rǐa +リュ = rǐu +リョ = rǐo +ァ = a +ィ = i +ゥ = u +ェ = e +ォ = o +ヮ = ʍ +ヴ = vu +ャ = ǐa +ュ = ǐu +ョ = ǐo + diff --git a/egs/csj/ASR/local/conf/number.ini b/egs/csj/ASR/local/conf/number.ini new file mode 100644 index 0000000000..2613c34095 --- /dev/null +++ b/egs/csj/ASR/local/conf/number.ini @@ -0,0 +1,321 @@ +; # This section is ignored if this file is not supplied as the first config file to +; # lhotse prepare csj +[SEGMENTS] +; # Allowed period of nonverbal noise. 
If exceeded, a new segment is created. +gap = 0.5 +; # Maximum length of segment (s). +maxlen = 10 +; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently. +minlen = 0.02 +; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`. +; # Pass an empty string to avoid adding any symbol. It was "" in kaldi. +; # If you intend to use a multicharacter string for gap_sym, remember to register the +; # multicharacter string as part of userdef-string in prepare_lang_char.py. +gap_sym = + +[CONSTANTS] +; # Name of this mode +MODE = number +; # Suffixes to use after the word surface (no longer used) +MORPH = pos1 cForm cType2 pos2 +; # Used to differentiate between A tag and A_num tag +JPN_NUM = ゼロ 0 零 一 二 三 四 五 六 七 八 九 十 百 千 . +; # Dummy character to delineate multiline words +PLUS = + + +[DECISIONS] +; # TAG+'^'とは、タグが一つの転記単位に独立していない場合 +; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries + +; # フィラー、感情表出系感動詞 +; # 0 to remain, 1 to delete +; # Example: '(F ぎょっ)' +F = 1 +; # Example: '(L (F ン))', '比べ(F えー)る' +F^ = 1 +; # 言い直し、いいよどみなどによる語断片 +; # 0 to remain, 1 to delete +; # Example: '(D だ)(D だいが) 大学の学部の会議' +D = 1 +; # Example: '(L (D ドゥ)+(D ヒ))' +D^ = 1 +; # 助詞、助動詞、接辞の言い直し +; # 0 to remain, 1 to delete +; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか' +D2 = 1 +; # Example: '(X (D2 ノ))' +D2^ = 1 +; # 聞き取りや語彙の判断に自信がない場合 +; # 0 to remain, 1 to delete +; # Example: (? 字数) の +; # If no option: empty string is returned regardless of output +; # Example: '(?) で' +? = 0 +; # Example: '(D (? すー))+そう+です+よ+ね' +?^ = 0 +; # タグ?で、値は複数の候補が想定される場合 +; # 0 for main guess with matching morph info, 1 for second guess +; # Example: '(? 次数, 実数)', '(? これ,ここで)+(? 説明+し+た+方+が+いい+か+な)' +?, = 0 +; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))' +?,^ = 0 +; # 音や言葉に関するメタ的な引用 +; # 0 to remain, 1 to delete +; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)' +M = 0 +; # Example: '(L (M ヒ)+(M ヒ))', '(L (M (? 
ヒ+ヒ)))' +M^ = 0 +; # 外国語や古語、方言など +; # 0 to remain, 1 to delete +; # Example: '(O ザッツファイン)' +O = 0 +; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))' +O^ = 0 +; # 講演者の名前、差別語、誹謗中傷など +; # 0 to remain, 1 to delete +; # Example: '国語研の (R ××) です' +R = 0 +R^ = 0 +; # 非朗読対象発話(朗読における言い間違い等) +; # 0 to remain, 1 to delete +; # Example: '(X 実際は) 実際には' +X = 0 +; # Example: '(L (X (D2 ニ)))' +X^ = 0 +; # アルファベットや算用数字、記号の表記 +; # 0 to use Japanese form, 1 to use alphabet form +; # Example: '(A シーディーアール;CD-R)' +A = 1 +; # Example: 'スモール(A エヌ;N)', 'ラージ(A キュー;Q)', '(A ティーエフ;TF)+(A アイディーエフ;IDF)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)') +A^ = 1 +; # タグAで、単語は算用数字の場合 +; # 0 to use Japanese form, 1 to use Arabic numerals +; # Example: (A 二千;2000) +A_num = 1 +A_num^ = 1 +; # 何らかの原因で漢字表記できなくなった場合 +; # 0 to use broken form, 1 to use orthodox form +; # Example: '(K たち (F えー) ばな;橘)' +K = 1 +; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)' +K^ = 1 +; # 転訛、発音の怠けなど、一時的な発音エラー +; # 0 to use wrong form, 1 to use orthodox form +; # Example: '(W ギーツ;ギジュツ)' +W = 1 +; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)' +W^ = 1 +; # 語の読みに関する知識レベルのいい間違い +; # 0 to use wrong form, 1 to use orthodox form +; # Example: '(B シブタイ;ジュータイ)' +B = 0 +; # Example: 'データー(B カズ;スー)' +B^ = 0 +; # 笑いながら発話 +; # 0 to remain, 1 to delete +; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)' +笑 = 0 +; # Example: 'コク(笑 サイ+(D オン))', +笑^ = 0 +; # 泣きながら発話 +; # 0 to remain, 1 to delete +; # Example: '(泣 ドンナニ)' +泣 = 0 +泣^ = 0 +; # 咳をしながら発話 +; # 0 to remain, 1 to delete +; # Example: 'シャ(咳 リン) ノ' +咳 = 0 +; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)' +咳^ = 0 +; # ささやき声や独り言などの小さな声 +; # 0 to remain, 1 to delete +; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))+(? 
セツメー+シ+タ+ホー+ガ+イー+カ+ナ))' +L = 0 +; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト' +L^ = 0 + +[REPLACEMENTS] +; # ボーカルフライなどで母音が同定できない場合 + = +; # 「うん/うーん/ふーん」の音の特定が困難な場合 + = +; # 非語彙的な母音の引き延ばし + = +; # 非語彙的な子音の引き延ばし + = +; # 言語音と独立に講演者の笑いが生じている場合 +<笑> = +; # 言語音と独立に講演者の咳が生じている場合 +<咳> = +; # 言語音と独立に講演者の息が生じている場合 +<息> = +; # 講演者の泣き声 +<泣> = +; # 聴衆(司会者なども含む)の発話 +<フロア発話> = +; # 聴衆の笑い +<フロア笑> = +; # 聴衆の拍手 +<拍手> = +; # 講演者が発表中に用いたデモンストレーションの音声 +<デモ> = +; # 学会講演に発表時間を知らせるためにならすベルの音 +<ベル> = +; # 転記単位全体が再度読み直された場合 +<朗読間違い> = +; # 上記以外の音で特に目立った音 +<雑音> = +; # 0.2秒以上のポーズ +

= +; # Redacted information, for R +; # It is \x00D7 multiplication sign, not your normal 'x' +× = × + +[FIELDS] +; # Time information for segment +time = 3 +; # Word surface +surface = 5 +; # Word surface root form without CSJ tags +notag = 9 +; # Part Of Speech +pos1 = 11 +; # Conjugated Form +cForm = 12 +; # Conjugation Type +cType1 = 13 +; # Subcategory of POS +pos2 = 14 +; # Euphonic Change / Subcategory of Conjugation Type +cType2 = 15 +; # Other information +other = 16 +; # Pronunciation for lexicon +pron = 10 +; # Speaker ID +spk_id = 2 + +[KATAKANA2ROMAJI] +ア = 'a +イ = 'i +ウ = 'u +エ = 'e +オ = 'o +カ = ka +キ = ki +ク = ku +ケ = ke +コ = ko +ガ = ga +ギ = gi +グ = gu +ゲ = ge +ゴ = go +サ = sa +シ = si +ス = su +セ = se +ソ = so +ザ = za +ジ = zi +ズ = zu +ゼ = ze +ゾ = zo +タ = ta +チ = ti +ツ = tu +テ = te +ト = to +ダ = da +ヂ = di +ヅ = du +デ = de +ド = do +ナ = na +ニ = ni +ヌ = nu +ネ = ne +ノ = no +ハ = ha +ヒ = hi +フ = hu +ヘ = he +ホ = ho +バ = ba +ビ = bi +ブ = bu +ベ = be +ボ = bo +パ = pa +ピ = pi +プ = pu +ペ = pe +ポ = po +マ = ma +ミ = mi +ム = mu +メ = me +モ = mo +ヤ = ya +ユ = yu +ヨ = yo +ラ = ra +リ = ri +ル = ru +レ = re +ロ = ro +ワ = wa +ヰ = we +ヱ = wi +ヲ = wo +ン = ŋ +ッ = q +ー = - +キャ = kǐa +キュ = kǐu +キョ = kǐo +ギャ = gǐa +ギュ = gǐu +ギョ = gǐo +シャ = sǐa +シュ = sǐu +ショ = sǐo +ジャ = zǐa +ジュ = zǐu +ジョ = zǐo +チャ = tǐa +チュ = tǐu +チョ = tǐo +ヂャ = dǐa +ヂュ = dǐu +ヂョ = dǐo +ニャ = nǐa +ニュ = nǐu +ニョ = nǐo +ヒャ = hǐa +ヒュ = hǐu +ヒョ = hǐo +ビャ = bǐa +ビュ = bǐu +ビョ = bǐo +ピャ = pǐa +ピュ = pǐu +ピョ = pǐo +ミャ = mǐa +ミュ = mǐu +ミョ = mǐo +リャ = rǐa +リュ = rǐu +リョ = rǐo +ァ = a +ィ = i +ゥ = u +ェ = e +ォ = o +ヮ = ʍ +ヴ = vu +ャ = ǐa +ュ = ǐu +ョ = ǐo + diff --git a/egs/csj/ASR/local/conf/symbol.ini b/egs/csj/ASR/local/conf/symbol.ini new file mode 100644 index 0000000000..8ba451dd5e --- /dev/null +++ b/egs/csj/ASR/local/conf/symbol.ini @@ -0,0 +1,322 @@ +; # This section is ignored if this file is not supplied as the first config file to +; # lhotse prepare csj +[SEGMENTS] +; # Allowed period of nonverbal noise. 
If exceeded, a new segment is created. +gap = 0.5 +; # Maximum length of segment (s). +maxlen = 10 +; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently. +minlen = 0.02 +; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`. +; # Pass an empty string to avoid adding any symbol. It was "" in kaldi. +; # If you intend to use a multicharacter string for gap_sym, remember to register the +; # multicharacter string as part of userdef-string in prepare_lang_char.py. +gap_sym = + +[CONSTANTS] +; # Name of this mode +; # See https://www.isca-speech.org/archive/pdfs/interspeech_2022/horii22_interspeech.pdf +MODE = symbol +; # Suffixes to use after the word surface (no longer used) +MORPH = pos1 cForm cType2 pos2 +; # Used to differentiate between A tag and A_num tag +JPN_NUM = ゼロ 0 零 一 二 三 四 五 六 七 八 九 十 百 千 . +; # Dummy character to delineate multiline words +PLUS = + + +[DECISIONS] +; # TAG+'^'とは、タグが一つの転記単位に独立していない場合 +; # The PLUS (fullwidth) sign '+' marks line boundaries for multiline entries + +; # フィラー、感情表出系感動詞 +; # 0 to remain, 1 to delete +; # Example: '(F ぎょっ)' +F = # +; # Example: '(L (F ン))', '比べ(F えー)る' +F^ = # +; # 言い直し、いいよどみなどによる語断片 +; # 0 to remain, 1 to delete +; # Example: '(D だ)(D だいが) 大学の学部の会議' +D = @ +; # Example: '(L (D ドゥ)+(D ヒ))' +D^ = @ +; # 助詞、助動詞、接辞の言い直し +; # 0 to remain, 1 to delete +; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか' +D2 = @ +; # Example: '(X (D2 ノ))' +D2^ = @ +; # 聞き取りや語彙の判断に自信がない場合 +; # 0 to remain, 1 to delete +; # Example: (? 字数) の +; # If no option: empty string is returned regardless of output +; # Example: '(?) で' +? = 0 +; # Example: '(D (? すー))+そう+です+よ+ね' +?^ = 0 +; # タグ?で、値は複数の候補が想定される場合 +; # 0 for main guess with matching morph info, 1 for second guess +; # Example: '(? 次数, 実数)', '(? これ,ここで)+(? 説明+し+た+方+が+いい+か+な)' +?, = 0 +; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? 
マシ+タ,マス))' +?,^ = 0 +; # 音や言葉に関するメタ的な引用 +; # 0 to remain, 1 to delete +; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)' +M = 0 +; # Example: '(L (M ヒ)+(M ヒ))', '(L (M (? ヒ+ヒ)))' +M^ = 0 +; # 外国語や古語、方言など +; # 0 to remain, 1 to delete +; # Example: '(O ザッツファイン)' +O = 0 +; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))' +O^ = 0 +; # 講演者の名前、差別語、誹謗中傷など +; # 0 to remain, 1 to delete +; # Example: '国語研の (R ××) です' +R = 0 +R^ = 0 +; # 非朗読対象発話(朗読における言い間違い等) +; # 0 to remain, 1 to delete +; # Example: '(X 実際は) 実際には' +X = 0 +; # Example: '(L (X (D2 ニ)))' +X^ = 0 +; # アルファベットや算用数字、記号の表記 +; # 0 to use Japanese form, 1 to use alphabet form +; # Example: '(A シーディーアール;CD-R)' +A = 1 +; # Example: 'スモール(A エヌ;N)', 'ラージ(A キュー;Q)', '(A ティーエフ;TF)+(A アイディーエフ;IDF)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)') +A^ = 1 +; # タグAで、単語は算用数字の場合 +; # 0 to use Japanese form, 1 to use Arabic numerals +; # Example: (A 二千;2000) +A_num = eval:self.notag +A_num^ = eval:self.notag +; # 何らかの原因で漢字表記できなくなった場合 +; # 0 to use broken form, 1 to use orthodox form +; # Example: '(K たち (F えー) ばな;橘)' +K = 1 +; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)' +K^ = 1 +; # 転訛、発音の怠けなど、一時的な発音エラー +; # 0 to use wrong form, 1 to use orthodox form +; # Example: '(W ギーツ;ギジュツ)' +W = 1 +; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)' +W^ = 1 +; # 語の読みに関する知識レベルのいい間違い +; # 0 to use wrong form, 1 to use orthodox form +; # Example: '(B シブタイ;ジュータイ)' +B = 0 +; # Example: 'データー(B カズ;スー)' +B^ = 0 +; # 笑いながら発話 +; # 0 to remain, 1 to delete +; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)' +笑 = 0 +; # Example: 'コク(笑 サイ+(D オン))', +笑^ = 0 +; # 泣きながら発話 +; # 0 to remain, 1 to delete +; # Example: '(泣 ドンナニ)' +泣 = 0 +泣^ = 0 +; # 咳をしながら発話 +; # 0 to remain, 1 to delete +; # Example: 'シャ(咳 リン) ノ' +咳 = 0 +; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)' +咳^ = 0 +; # ささやき声や独り言などの小さな声 +; # 0 to remain, 1 to delete +; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))+(? 
セツメー+シ+タ+ホー+ガ+イー+カ+ナ))' +L = 0 +; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト' +L^ = 0 + +[REPLACEMENTS] +; # ボーカルフライなどで母音が同定できない場合 + = +; # 「うん/うーん/ふーん」の音の特定が困難な場合 + = +; # 非語彙的な母音の引き延ばし + = +; # 非語彙的な子音の引き延ばし + = +; # 言語音と独立に講演者の笑いが生じている場合 +<笑> = +; # 言語音と独立に講演者の咳が生じている場合 +<咳> = +; # 言語音と独立に講演者の息が生じている場合 +<息> = +; # 講演者の泣き声 +<泣> = +; # 聴衆(司会者なども含む)の発話 +<フロア発話> = +; # 聴衆の笑い +<フロア笑> = +; # 聴衆の拍手 +<拍手> = +; # 講演者が発表中に用いたデモンストレーションの音声 +<デモ> = +; # 学会講演に発表時間を知らせるためにならすベルの音 +<ベル> = +; # 転記単位全体が再度読み直された場合 +<朗読間違い> = +; # 上記以外の音で特に目立った音 +<雑音> = +; # 0.2秒以上のポーズ +

= +; # Redacted information, for R +; # It is \x00D7 multiplication sign, not your normal 'x' +× = × + +[FIELDS] +; # Time information for segment +time = 3 +; # Word surface +surface = 5 +; # Word surface root form without CSJ tags +notag = 9 +; # Part Of Speech +pos1 = 11 +; # Conjugated Form +cForm = 12 +; # Conjugation Type +cType1 = 13 +; # Subcategory of POS +pos2 = 14 +; # Euphonic Change / Subcategory of Conjugation Type +cType2 = 15 +; # Other information +other = 16 +; # Pronunciation for lexicon +pron = 10 +; # Speaker ID +spk_id = 2 + +[KATAKANA2ROMAJI] +ア = 'a +イ = 'i +ウ = 'u +エ = 'e +オ = 'o +カ = ka +キ = ki +ク = ku +ケ = ke +コ = ko +ガ = ga +ギ = gi +グ = gu +ゲ = ge +ゴ = go +サ = sa +シ = si +ス = su +セ = se +ソ = so +ザ = za +ジ = zi +ズ = zu +ゼ = ze +ゾ = zo +タ = ta +チ = ti +ツ = tu +テ = te +ト = to +ダ = da +ヂ = di +ヅ = du +デ = de +ド = do +ナ = na +ニ = ni +ヌ = nu +ネ = ne +ノ = no +ハ = ha +ヒ = hi +フ = hu +ヘ = he +ホ = ho +バ = ba +ビ = bi +ブ = bu +ベ = be +ボ = bo +パ = pa +ピ = pi +プ = pu +ペ = pe +ポ = po +マ = ma +ミ = mi +ム = mu +メ = me +モ = mo +ヤ = ya +ユ = yu +ヨ = yo +ラ = ra +リ = ri +ル = ru +レ = re +ロ = ro +ワ = wa +ヰ = we +ヱ = wi +ヲ = wo +ン = ŋ +ッ = q +ー = - +キャ = kǐa +キュ = kǐu +キョ = kǐo +ギャ = gǐa +ギュ = gǐu +ギョ = gǐo +シャ = sǐa +シュ = sǐu +ショ = sǐo +ジャ = zǐa +ジュ = zǐu +ジョ = zǐo +チャ = tǐa +チュ = tǐu +チョ = tǐo +ヂャ = dǐa +ヂュ = dǐu +ヂョ = dǐo +ニャ = nǐa +ニュ = nǐu +ニョ = nǐo +ヒャ = hǐa +ヒュ = hǐu +ヒョ = hǐo +ビャ = bǐa +ビュ = bǐu +ビョ = bǐo +ピャ = pǐa +ピュ = pǐu +ピョ = pǐo +ミャ = mǐa +ミュ = mǐu +ミョ = mǐo +リャ = rǐa +リュ = rǐu +リョ = rǐo +ァ = a +ィ = i +ゥ = u +ェ = e +ォ = o +ヮ = ʍ +ヴ = vu +ャ = ǐa +ュ = ǐu +ョ = ǐo + diff --git a/egs/csj/ASR/local/display_manifest_statistics.py b/egs/csj/ASR/local/display_manifest_statistics.py new file mode 100644 index 0000000000..c9de21073b --- /dev/null +++ b/egs/csj/ASR/local/display_manifest_statistics.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. 
ARGPARSE_DESCRIPTION = """
This file displays duration statistics of utterances in a manifest.
You can use the displayed value to choose minimum/maximum duration
to remove short and long utterances during the training.

See the function `remove_short_and_long_utt()` in
pruned_transducer_stateless5/train.py for usage.
"""


def get_parser():
    """Parse this script's command-line arguments.

    Despite the name, this returns the *parsed* arguments, not the parser
    itself; the name is kept so existing callers keep working.

    Returns:
        argparse.Namespace with a single attribute, ``manifest_dir``
        (a ``pathlib.Path``).

    Raises:
        SystemExit: raised by argparse when ``--manifest-dir`` is missing or
            the arguments are otherwise malformed.
    """
    parser = argparse.ArgumentParser(
        description=ARGPARSE_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Required: without it ``manifest_dir`` would be None and ``main`` would
    # fail later with an AttributeError on ``None.glob`` instead of a usage
    # message.
    parser.add_argument(
        "--manifest-dir",
        type=Path,
        required=True,
        help="Path to cutset manifests",
    )

    return parser.parse_args()


def main():
    """Print ``describe()`` statistics for every CSJ cut manifest.

    Every file matching ``csj_cuts_*.jsonl.gz`` directly inside
    ``--manifest-dir`` is loaded with ``load_manifest`` and described on
    stdout, preceded by a separator and the file name.
    """
    args = get_parser()

    # Path.glob yields entries in arbitrary order; sort so that the report
    # is reproducible from run to run.
    for path in sorted(args.manifest_dir.glob("csj_cuts_*.jsonl.gz")):
        cuts: CutSet = load_manifest(path)

        print("\n---------------------------------\n")
        print(path.name + ":")
        cuts.describe()


if __name__ == "__main__":
    main()
(hh:mm:ss): 05:08:09 +Speech duration (hh:mm:ss): 05:08:09 (100.0%) +Duration statistics (seconds): +mean 4.6 +std 3.8 +min 0.1 +25% 1.5 +50% 3.4 +75% 7.0 +99% 13.8 +99.5% 14.8 +99.9% 16.0 +max 17.3 +Recordings available: 4000 +Features available: 4000 +Supervisions available: 4000 +SUPERVISION custom fields: +- fluent (in 4000 cuts) +- symbol (in 4000 cuts) +- disfluent (in 4000 cuts) +- number (in 4000 cuts) + +## train +Cuts count: 1291134 +Total duration (hh:mm:ss): 1596:37:27 +Speech duration (hh:mm:ss): 1596:37:27 (100.0%) +Duration statistics (seconds): +mean 4.5 +std 3.6 +min 0.0 +25% 1.6 +50% 3.3 +75% 6.4 +99% 14.0 +99.5% 14.8 +99.9% 16.6 +max 27.8 +Recordings available: 1291134 +Features available: 1291134 +Supervisions available: 1291134 +SUPERVISION custom fields: +- disfluent (in 1291134 cuts) +- fluent (in 1291134 cuts) +- symbol (in 1291134 cuts) +- number (in 1291134 cuts) +""" diff --git a/egs/csj/ASR/local/prepare_lang_char.py b/egs/csj/ASR/local/prepare_lang_char.py new file mode 100644 index 0000000000..e4d9968711 --- /dev/null +++ b/egs/csj/ASR/local/prepare_lang_char.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +# Copyright 2022 The University of Electro-Communications (Author: Teo Wen Shen) # noqa +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
ARGPARSE_DESCRIPTION = """
This script gathers all training transcripts of the specified {trans_mode} type
and produces a token_list that would be output set of the ASR system.

It splits transcripts by whitespace into lists, then, for each word in the
list, if the word does not appear in the list of user-defined multicharacter
strings, it further splits that word into individual characters to be counted
into the output token set.

It outputs 4 files into the lang directory:
- trans_mode: the name of transcript mode. If trans_mode was not specified,
  this will be an empty file.
- userdef_string: a list of user defined strings that should not be split
  further into individual characters. By default, it contains "", "",
  ""
- words_len: the total number of tokens in the output set.
- words.txt: a list of tokens in the output set. The length matches words_len.

"""


def get_args():
    """Parse this script's command-line arguments.

    Returns:
        argparse.Namespace with:
          - ``train_cut`` (Path, required): the train cut manifest.
          - ``trans_mode`` (str or None): key looked up in each
            supervision's ``custom`` dict; None selects ``supervision.text``.
          - ``lang_dir`` (Path or None): output directory.
          - ``userdef_string`` (Path or None): file of whitespace-separated
            strings that must not be split into characters.
    """
    parser = argparse.ArgumentParser(
        description=ARGPARSE_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument(
        "--train-cut", type=Path, required=True, help="Path to the train cut"
    )

    parser.add_argument(
        "--trans-mode",
        type=str,
        default=None,
        help=(
            "Name of the transcript mode to use. "
            "If lang-dir is not set, this will also name the lang-dir"
        ),
    )

    parser.add_argument(
        "--lang-dir",
        type=Path,
        default=None,
        help=(
            "Name of lang dir. "
            "If not set, this will default to lang_char_{trans-mode}"
        ),
    )

    parser.add_argument(
        "--userdef-string",
        type=Path,
        default=None,
        help="Multicharacter strings that do not need to be split",
    )

    return parser.parse_args()


def main():
    """Build the character-level token set and write the lang directory.

    Each transcript of the train cut set is split on whitespace; a word that
    is one of the user-defined strings is kept whole, any other word
    contributes its individual characters. The resulting tokens are written
    to ``words.txt`` (``token<TAB>index``), and ``words_len``,
    ``userdef_string`` and ``trans_mode`` are written next to it.

    Raises:
        KeyError: if ``--trans-mode`` names a transcript that a supervision
            does not carry in its ``custom`` fields.
    """
    args = get_args()

    logging.basicConfig(
        format=(
            "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
        ),
        level=logging.INFO,
    )

    lang_dir = args.lang_dir
    if not lang_dir:
        dir_name = "lang_char"
        if args.trans_mode:
            dir_name += f"_{args.trans_mode}"
        lang_dir = Path(dir_name)

    # The transcripts are Japanese; pin UTF-8 so that reading and writing do
    # not depend on the locale's default encoding.
    if args.userdef_string:
        userdef_string = set(
            args.userdef_string.read_text(encoding="utf-8").split()
        )
    else:
        userdef_string = set()

    # NOTE(review): these three special tokens are empty strings here, and
    # the script description also lists three identical empty defaults.
    # Angle-bracketed names (e.g. <blk>, <unk>, <sos/eos>) were presumably
    # stripped somewhere upstream - confirm against the original file.
    # The literals are left untouched.
    sysdef_string = ["", "", ""]
    userdef_string.update(sysdef_string)

    train_set: CutSet = CutSet.from_file(args.train_cut)

    words = set()
    logging.info(
        f"Creating vocabulary from {args.train_cut.name}"
        f" at {args.trans_mode} mode."
    )
    for cut in train_set:
        supervision = cut.supervisions[0]
        if args.trans_mode:
            try:
                text: str = supervision.custom[args.trans_mode]
            except (KeyError, TypeError) as err:
                # TypeError covers a supervision whose ``custom`` is None;
                # both cases mean the requested transcript is missing.
                raise KeyError(
                    f"Could not find {args.trans_mode} in "
                    f"{supervision.custom}"
                ) from err
        else:
            text = supervision.text

        for t in text.split():
            if t in userdef_string:
                words.add(t)
            else:
                # A str iterates character by character.
                words.update(t)

    # The special tokens get fixed positions below, so take them out of the
    # sorted body first.
    words -= set(sysdef_string)
    words = sorted(words)
    words = [""] + words + ["", ""]

    lang_dir.mkdir(parents=True, exist_ok=True)
    (lang_dir / "words.txt").write_text(
        "\n".join(f"{word}\t{i}" for i, word in enumerate(words)),
        encoding="utf-8",
    )

    (lang_dir / "words_len").write_text(f"{len(words)}", encoding="utf-8")

    # Sorted so the file content is reproducible (set order is arbitrary).
    (lang_dir / "userdef_string").write_text(
        "\n".join(sorted(userdef_string)), encoding="utf-8"
    )

    # write_text(None) raises TypeError; an unspecified mode is documented to
    # produce an empty file.
    (lang_dir / "trans_mode").write_text(
        args.trans_mode or "", encoding="utf-8"
    )
    logging.info("Done.")


if __name__ == "__main__":
    main()
def get_args():
    """Parse this script's command-line arguments.

    Returns:
        argparse.Namespace with ``manifest`` (a ``pathlib.Path``).

    Raises:
        SystemExit: raised by argparse when ``--manifest`` is missing.
    """
    parser = argparse.ArgumentParser()

    # Required: with no value ``main`` would otherwise fail on ``Path(None)``
    # instead of printing a usage message.
    parser.add_argument(
        "--manifest",
        type=Path,
        required=True,
        help="Path to the manifest file",
    )

    return parser.parse_args()


def validate_one_supervision_per_cut(c: "Cut"):
    """Check that the cut carries exactly one supervision.

    Raises:
        ValueError: if ``c.supervisions`` does not have length 1.
    """
    if len(c.supervisions) != 1:
        raise ValueError(f"{c.id} has {len(c.supervisions)} supervisions")


def validate_supervision_and_cut_time_bounds(c: "Cut"):
    """Check that the first supervision does not end after the cut ends.

    Only the end bound is checked. The start bound is deliberately not
    checked: when the cuts were trimmed from supervisions, the start time of
    the supervision can be less than the cut start time, see
    https://github.com/lhotse-speech/lhotse/issues/813

    Raises:
        ValueError: if the supervision's end time is larger than the cut's.
    """
    s = c.supervisions[0]

    if s.end > c.end:
        raise ValueError(
            f"{c.id}: Supervision end time {s.end} is larger "
            f"than cut end time {c.end}"
        )


def main():
    """Validate every cut of the manifest given by ``--manifest``.

    Example:
        python3 ./local/validate_manifest.py \\
            --manifest ./data/manifests/csj_cuts_train.jsonl.gz

    Raises:
        FileNotFoundError: if the manifest path is not an existing file.
        TypeError: if the manifest does not load as a ``CutSet``.
        ValueError: if any cut violates one of the checked assumptions.
    """
    args = get_args()

    manifest = args.manifest
    logging.info(f"Validating {manifest}")

    # Explicit raises rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently skip the validation.
    if not manifest.is_file():
        raise FileNotFoundError(f"{manifest} does not exist")
    cut_set = load_manifest(manifest)
    if not isinstance(cut_set, CutSet):
        raise TypeError(
            f"{manifest} is a {type(cut_set).__name__}, expected a CutSet"
        )

    for c in cut_set:
        validate_one_supervision_per_cut(c)
        validate_supervision_and_cut_time_bounds(c)


if __name__ == "__main__":
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )

    logging.basicConfig(format=formatter, level=logging.INFO)

    main()
# Optionally, you can generate other transcript formats by supplying your own
# config files. A few examples of these config files can be found in
# local/conf.

set -euo pipefail

# Number of parallel jobs passed to `lhotse prepare csj`. The job count used
# to be hard-coded to 4 while this variable was never read; the default is 4
# so that the default behaviour is unchanged.
nj=4
stage=-1
stop_stage=100

# NOTE(review): the derived defaults below are computed before
# shared/parse_options.sh runs, so overriding --csj-dir or --musan-dir on the
# command line presumably does not update trans_dir, musan_fbank_dir or
# musan_manifest_dir - confirm against parse_options.sh and pass those
# options explicitly if needed.
csj_dir=/mnt/minami_data_server/t2131178/corpus/CSJ
musan_dir=/mnt/minami_data_server/t2131178/corpus/musan/musan
trans_dir=$csj_dir/retranscript
csj_fbank_dir=/mnt/host/csj_data/fbank
musan_fbank_dir=$musan_dir/fbank
csj_manifest_dir=data/manifests
musan_manifest_dir=$musan_dir/manifests

. shared/parse_options.sh || exit 1

mkdir -p data

# Print a message prefixed with a timestamp and the caller's
# file:line:function.
log() {
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Prepare CSJ manifest"
  # If you want to generate more transcript modes, append the paths to those
  # config files with -c.
  # Example: lhotse prepare csj $csj_dir $trans_dir $csj_manifest_dir -c local/conf/disfluent.ini
  # NOTE: In case multiple config files are supplied, the second config file
  # and onwards will inherit the segment boundaries of the first config file.
  #
  # The marker used to be called .librispeech.done (a leftover name); it is
  # still honoured so that an already-prepared directory is not redone.
  if [ ! -e "$csj_manifest_dir/.csj.done" ] && [ ! -e "$csj_manifest_dir/.librispeech.done" ]; then
    lhotse prepare csj "$csj_dir" "$trans_dir" "$csj_manifest_dir" -j "$nj"
    mkdir -p "$csj_manifest_dir"
    touch "$csj_manifest_dir/.csj.done"
  fi
fi

if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Prepare musan manifest"
  mkdir -p "$musan_manifest_dir"
  if [ ! -e "$musan_manifest_dir/.musan.done" ]; then
    lhotse prepare musan "$musan_dir" "$musan_manifest_dir"
    touch "$musan_manifest_dir/.musan.done"
  fi
fi

if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  log "Stage 3: Compute CSJ fbank"
  if [ ! -e "$csj_fbank_dir/.csj-validated.done" ]; then
    python local/compute_fbank_csj.py --manifest-dir "$csj_manifest_dir" \
      --fbank-dir "$csj_fbank_dir"
    parts=(
      train
      valid
      eval1
      eval2
      eval3
    )
    for part in "${parts[@]}"; do
      python local/validate_manifest.py \
        --manifest "$csj_manifest_dir/csj_cuts_$part.jsonl.gz"
    done
    mkdir -p "$csj_fbank_dir"
    touch "$csj_fbank_dir/.csj-validated.done"
  fi
fi

if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  log "Stage 4: Prepare CSJ lang"
  modes=(disfluent)

  # If you want to prepare the lang directory for other transcript modes,
  # append the names of those modes to the array, for example:
  # modes+=(fluent symbol number)

  for mode in "${modes[@]}"; do
    python local/prepare_lang_char.py --trans-mode "$mode" \
      --train-cut "$csj_manifest_dir/csj_cuts_train.jsonl.gz" \
      --lang-dir "lang_char_$mode"
  done
fi

if [ "$stage" -le 5 ] && [ "$stop_stage" -ge 5 ]; then
  log "Stage 5: Compute fbank for musan"
  mkdir -p "$musan_fbank_dir"

  if [ ! -e "$musan_fbank_dir/.musan.done" ]; then
    python local/compute_fbank_musan.py --manifest-dir "$musan_manifest_dir" \
      --fbank-dir "$musan_fbank_dir"
    touch "$musan_fbank_dir/.musan.done"
  fi
fi

if [ "$stage" -le 6 ] && [ "$stop_stage" -ge 6 ]; then
  log "Stage 6: Show manifest statistics"
  python local/display_manifest_statistics.py --manifest-dir "$csj_manifest_dir" \
    > "$csj_manifest_dir/manifest_statistics.txt"
  cat "$csj_manifest_dir/manifest_statistics.txt"
fi