Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EDACC dataset automatic speech recognition #5996

Merged
merged 34 commits into espnet:master from uwanny:EdAcc-dataset
Jan 2, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
64f9775
data prep stage for edacc
uwanny Nov 30, 2024
2488ddc
split too large audio file limited memory on PSC, and verified implem…
uwanny Dec 12, 2024
70d2c9c
Merge remote-tracking branch 'origin/master' into EdAcc-dataset
uwanny Dec 12, 2024
17f3ad6
split and truncate too long test set
uwanny Dec 25, 2024
fb887af
update the training and decode config for wavLM, update run.sh
uwanny Dec 25, 2024
647e666
Merge branch 'master' into EdAcc-dataset
uwanny Dec 25, 2024
15f8a91
Merge branch 'espnet:master' into EdAcc-dataset
uwanny Dec 25, 2024
6d2848b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 25, 2024
6a6df59
fix the too long line issue, make test set split optional
uwanny Dec 25, 2024
6930b93
Merge branch 'EdAcc-dataset' of https://github.com/uwanny/espnet into…
uwanny Dec 25, 2024
8abea69
delete useless file
uwanny Dec 25, 2024
5c4e73d
solve line too long issue
uwanny Dec 25, 2024
db2a309
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 25, 2024
bee1b67
fix line too long
uwanny Dec 26, 2024
98623d5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 26, 2024
f8d73bb
add README
uwanny Dec 26, 2024
279a697
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 26, 2024
6135bab
update README, add missing file
uwanny Dec 26, 2024
475d159
remove duplicated file
uwanny Dec 26, 2024
362cb21
test line too long error
uwanny Dec 27, 2024
9033350
fix line too long, move to README
uwanny Dec 27, 2024
fbc1ec8
make data prep to multiple stages
uwanny Dec 27, 2024
0b40d51
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 27, 2024
6949d77
Update README.md in egs2
uwanny Dec 27, 2024
998c33c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 27, 2024
e6a6f11
Merge branch 'master' into EdAcc-dataset
uwanny Dec 29, 2024
95cf86f
Update README
uwanny Dec 30, 2024
a783392
update config, update run.sh
uwanny Dec 30, 2024
b157f5a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 30, 2024
3901058
update README
uwanny Dec 30, 2024
8912268
Merge branch 'EdAcc-dataset' of https://github.com/uwanny/espnet into…
uwanny Dec 30, 2024
bd05c27
trigger CI check
uwanny Dec 30, 2024
13d58fc
update README
uwanny Dec 31, 2024
2fe91b4
Merge branch 'master' into EdAcc-dataset
uwanny Dec 31, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix line too long
  • Loading branch information
uwanny committed Dec 26, 2024
commit bee1b67beae69a56a44403307d921024bd3e7883
43 changes: 33 additions & 10 deletions egs2/edacc/asr1/local/data_prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@ def generate_data_files(
segments = os.path.join(edacc_root, x, "segments")
utt2spk = os.path.join(edacc_root, x, "utt2spk")

os.makedirs(os.path.join(target_root, x), exist_ok=True)
os.makedirs(
os.path.join(target_root, x),
exist_ok=True,
)

# process utt2spk
utter_spk_map = {}
Expand All @@ -41,7 +44,9 @@ def generate_data_files(
elif "C30" in utter and int(utter[-9:]) < 387:
utter = utter.replace("C30", "C30_P1")
utter_spk_map[utter] = spk
utt2spk_out.write(f"{utter_spk_map[utter]}-{utter} {spk}\n")
utt2spk_out.write(
f"{utter_spk_map[utter]}-{utter} {spk}\n"
)
utt_list.append(utter)

if x == "dev":
Expand All @@ -66,22 +71,32 @@ def generate_data_files(
if os.path.exists(text):
with open(text, "r") as text:
for line in text:
utter, txt = line.strip().split(maxsplit=1)
(
utter,
txt,
) = line.strip().split(maxsplit=1)
# process utter for C30
if "C30" in utter and int(utter[-9:]) >= 387:
new_number = int(utter[-9:]) - 387
utter = "EDACC-C30_P2-" + f"{new_number:09d}"
elif "C30" in utter and int(utter[-9:]) < 387:
utter = utter.replace("C30", "C30_P1")
text_out.write(f"{utter_spk_map[utter]}-{utter} {txt}\n")
text_out.write(
f"{utter_spk_map[utter]}-{utter} {txt}\n"
)

# process segments and wav.scp
wavID_set = set()
with open(seg_out, "w") as seg_out, open(scp_out, "w") as scp_out:
if os.path.exists(segments):
with open(segments, "r") as segments:
for line in segments:
utter, wavID, start, end = line.strip().split()
(
utter,
wavID,
start,
end,
) = line.strip().split()
# process utter for C30
if "C30" in utter and int(utter[-9:]) >= 387:
new_number = int(utter[-9:]) - 387
Expand All @@ -90,22 +105,29 @@ def generate_data_files(
start = f"{float(start) - segment_length:.2f}"
end = f"{float(end) - segment_length:.2f}"
audio_path = os.path.join(
segmented_audio_path, f"{wavID}.wav"
segmented_audio_path,
f"{wavID}.wav",
)
elif "C30" in utter and int(utter[-9:]) < 387:
utter = utter.replace("C30", "C30_P1")
wavID = wavID.replace("C30", "C30_P1")
audio_path = os.path.join(
segmented_audio_path, f"{wavID}.wav"
segmented_audio_path,
f"{wavID}.wav",
)
else:
audio_path = os.path.join(
edacc_root, "data", f"{wavID}.wav"
edacc_root,
"data",
f"{wavID}.wav",
)
seg_out.write(
f"{utter_spk_map[utter]}-{utter} {wavID} {start} {end}\n"
)
if os.path.exists(audio_path) and wavID not in wavID_set:
if (
os.path.exists(audio_path)
and wavID not in wavID_set
):
scp_out.write(f"{wavID} {audio_path}\n")
wavID_set.add(wavID)

Expand All @@ -114,7 +136,8 @@ def generate_data_files(

if len(sys.argv) != 4:
print(
"Usage: python data_prep.py [edacc download directory] [target directory] [large audio path]"
"Usage: python data_prep.py [edacc download directory]"
"[target directory] [large audio path]"
)
sys.exit(1)

Expand Down
33 changes: 26 additions & 7 deletions egs2/edacc/asr1/local/truncate_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@

def truncate_test_set(test_dir, utterance_splits):
"""
Truncate specified utterances in a Kaldi test set directory and reorganize the folder.
Truncate specified utterances in a Kaldi test set directory.

Args:
test_dir (str): Path to the Kaldi test set directory.
utterance_splits (dict): A dictionary where keys are original utterance IDs, and values are lists of tuples,
each containing (new_utterance_id, start_time, end_time).
utterance_splits (dict):
A dictionary where keys are original utterance IDs,
and values are lists of tuples, each containing
(new_utterance_id, start_time, end_time).

Returns:
None
Expand Down Expand Up @@ -41,7 +43,12 @@ def truncate_test_set(test_dir, utterance_splits):
for line in text_file_input:
utter, _ = line.strip().split(maxsplit=1)
if utter in utterance_splits:
for new_utter, _, _, new_txt in utterance_splits[utter]:
for (
new_utter,
_,
_,
new_txt,
) in utterance_splits[utter]:
new_text.append(f"{new_utter} {new_txt}\n")
else:
new_text.append(line)
Expand All @@ -50,7 +57,12 @@ def truncate_test_set(test_dir, utterance_splits):
for line in utt2spk_file_input:
utter, spk = line.strip().split()
if utter in utterance_splits:
for new_utter, _, _, _ in utterance_splits[utter]:
for (
new_utter,
_,
_,
_,
) in utterance_splits[utter]:
new_utt2spk.append(f"{new_utter} {spk}\n")
else:
new_utt2spk.append(line)
Expand All @@ -59,8 +71,15 @@ def truncate_test_set(test_dir, utterance_splits):
for line in segment_file_input:
utter, id, start, end = line.strip().split()
if utter in utterance_splits:
for new_utter, new_start, new_end, _ in utterance_splits[utter]:
new_segment.append(f"{new_utter} {id} {new_start} {new_end}\n")
for (
new_utter,
new_start,
new_end,
_,
) in utterance_splits[utter]:
new_segment.append(
f"{new_utter} {id} {new_start} {new_end}\n"
)
else:
new_segment.append(line)

Expand Down
Loading