Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add centralized data preparation for OWSM #5478

Merged
merged 31 commits into from
Dec 5, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
3be1f40
add whisper data.sh for v1 and v2
jctian98 Oct 17, 2023
37ab173
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
92bf631
add OWSM v3 data recipe
jctian98 Oct 17, 2023
ac8e423
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Oct 17, 2023
a3c24bd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
063dc3f
fix ci issues
jctian98 Oct 18, 2023
5e14a62
update with ci issues
jctian98 Oct 18, 2023
7b707cd
change egs name from mixed_v* to owsm_v*
jctian98 Oct 23, 2023
14204e2
v3 should be ready except wsj
jctian98 Oct 30, 2023
ae05a6c
add wsj
jctian98 Oct 30, 2023
c515f76
update db.sh
jctian98 Oct 30, 2023
b53ce47
Merge branch 'master' into owsm_data
jctian98 Oct 30, 2023
ec109e2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 30, 2023
31ad173
almost finish all scripts
jctian98 Nov 10, 2023
8a09625
fix small problems
jctian98 Nov 10, 2023
952acf6
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 10, 2023
2fd2668
merge master
jctian98 Nov 10, 2023
c53afd0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 10, 2023
bdaf344
update the language mapping
jctian98 Nov 11, 2023
d379fd0
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 11, 2023
b2cb427
Merge branch 'master' into owsm_data
jctian98 Nov 11, 2023
f5e5414
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 11, 2023
51e3691
fix CI issue
jctian98 Nov 11, 2023
7f75d15
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 11, 2023
66176bc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 11, 2023
3d89d78
update wsj and commonvoice
jctian98 Nov 26, 2023
8f1e0fa
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 26, 2023
77fe14b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2023
c391765
update wsj text norm script
jctian98 Nov 26, 2023
642fd22
update wsj text norm 2
jctian98 Nov 26, 2023
ee00c6c
revise voxpopuli
jctian98 Nov 29, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix ci issues
  • Loading branch information
jctian98 committed Oct 18, 2023
commit 063dc3f4d5bc6351576eea71f32738126d58adf4
1 change: 1 addition & 0 deletions egs2/mixed_v3/s2t1/local/prepare_commonvoice.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ def parse_args():

if __name__ == "__main__":
args = parse_args()
logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.DEBUG)

# map CommonVoice language-id to ISO 639-3 standard code.
language_map = open("local/cv-iso-693-3.txt").readlines()
Expand Down
7 changes: 3 additions & 4 deletions egs2/mixed_v3/s2t1/local/prepare_fleurs.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,6 @@
except Exception:
traceback.print_exc()
logging.warning("Error importing datasets library")
logging.warning(
"datasets can be installed via espnet/tools/installers/install_datasets"
)
exit()


Expand Down Expand Up @@ -107,6 +104,7 @@ def main():
)

args = parser.parse_args()
logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.DEBUG)

fleurs_asr = load_dataset(
"google/xtreme_s", f"fleurs.all", cache_dir=args.cache, num_proc=16
Expand All @@ -131,7 +129,8 @@ def main():
lang_id_iso, lang_name = lang.part3, lang.name
lang_id_dict[lang_name] = lang_id_iso
logging.info(
f"Process FLEURS subset {lang_name} ({lang_id}) with ISO-693-3 id: {lang_id_iso}"
f"Process FLEURS subset {lang_name} ({lang_id})"
f"with ISO-693-3 id: {lang_id_iso}"
)

# Add missing terms manually.
Expand Down
190 changes: 0 additions & 190 deletions egs2/mixed_v3/s2t1/local/prepare_voxforge.py

This file was deleted.

8 changes: 6 additions & 2 deletions egs2/mixed_v3/s2t1/local/prepare_voxpopuli.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,11 +126,12 @@ def collect_data_asr(
continue

iso_src = toiso(src)
path_template = "ffmpeg -i {} -ac 1 -ar 16000 -f wav - |"
talks[event_id].append(
Utterance(
utt_id=f"{prefix}_asr_{event_id}_{r['id_']}",
wav_id=f"{prefix}_asr_{event_id}",
wav_path=f"ffmpeg -i {str(audio_path.resolve())} -ac 1 -ar 16000 -f wav - |",
wav_path=path_template.format(str(audio_path.resolve())),
start_time=start_time,
end_time=end_time,
lang=f"<{iso_src}>",
Expand Down Expand Up @@ -232,11 +233,12 @@ def collect_data_st(
# Note(jinchuan): Not sure if "event_id" would overlap across
# languages. So add src2tgt tag to wav_id to exclude this risk
iso_src, iso_tgt = toiso(src), toiso(tgt)
path_template = "ffmpeg -i {} -ac 1 -ar 16000 -f wav - |"
talks[event_id].append(
Utterance(
utt_id=f"{prefix}_st_{iso_src}2{iso_tgt}_{event_id}_{utt_id}",
wav_id=f"{prefix}_st_{iso_src}2{iso_tgt}_{event_id}",
wav_path=f"ffmpeg -i {str(audio_path.resolve())} -ac 1 -ar 16000 -f wav - |",
wav_path=path_template.format(str(audio_path.resolve())),
start_time=float(r["start_time"]),
end_time=float(r["end_time"]),
lang=f"<{iso_src}>",
Expand Down Expand Up @@ -275,6 +277,8 @@ def parse_args():
if __name__ == "__main__":
args = parse_args()

logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.DEBUG)

args.output_dir.mkdir(parents=True, exist_ok=True)

for split in args.splits:
Expand Down
30 changes: 19 additions & 11 deletions egs2/mixed_v3/s2t1/local/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,14 +189,20 @@ def merge_short_utterances(
end_time = utts[-1].end_time
lang = utts[0].lang
task = utts[0].task
utt_id = f"{wav_id}_{round(1000 * start_time):09d}_{round(1000 * end_time):09d}_{lang[1:-1]}_{task[1:-1]}"
utt_id = (
f"{wav_id}_{round(1000 * start_time):09d}_"
f"{round(1000 * end_time):09d}_{lang[1:-1]}_{task[1:-1]}"
)
text = " ".join([u.text for u in utts])
asr_text = " ".join([u.asr_text for u in utts])
prev_text = prev.text if prev is not None else SYMBOL_NA

text_with_time = ""
for u in utts:
text_with_time += f"{time2token(u.start_time - start_time)} {u.text.strip()}{time2token(u.end_time - start_time)}"
text_with_time += (
f"{time2token(u.start_time - start_time)} "
f"{u.text.strip()}{time2token(u.end_time - start_time)}"
)

return LongUtterance(
utt_id=utt_id,
Expand All @@ -221,17 +227,19 @@ def generate_long_utterances(
utts.sort(key=lambda x: x.start_time)

long_utts = [None]
l, r = 0, 0
while l < len(utts):
if r < len(utts) and utts[r].end_time - utts[l].start_time <= SPEECH_MAX_LEN:
r += 1
elif r > l:
long_utts.append(merge_short_utterances(utts[l:r], long_utts[-1]))
l = r
left, right = 0, 0
while left < len(utts):
if right < len(utts) and (
utts[right].end_time - utts[left].start_time <= SPEECH_MAX_LEN
):
right += 1
elif right > left:
long_utts.append(merge_short_utterances(utts[left:right], long_utts[-1]))
left = right
else: # skip the current utt if its length already exceeds the limit
long_utts.append(None)
l = r + 1
r = l
left = right + 1
right = left

long_utts = [u for u in long_utts if u is not None]
return long_utts