Skip to content

Commit

Permalink
Merge branch 'master' into pr_voxtlm
Browse files Browse the repository at this point in the history
  • Loading branch information
ftshijt authored Jan 30, 2024
2 parents f55e691 + 27f292d commit 8a48c23
Show file tree
Hide file tree
Showing 130 changed files with 1,055 additions and 683 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ repos:
exclude: ^(egs2/TEMPLATE/asr1/utils|egs2/TEMPLATE/asr1/steps|egs2/TEMPLATE/tts1/sid|tools/installers/patch_mwerSegmenter)

- repo: https://github.com/psf/black
rev: 23.12.1
rev: 24.1.1
hooks:
- id: black
exclude: ^(egs2/TEMPLATE/asr1/utils|egs2/TEMPLATE/asr1/steps|egs2/TEMPLATE/tts1/sid|doc)
Expand Down
3 changes: 1 addition & 2 deletions egs/must_c/st1/local/download_and_untar.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,10 @@ if [ ! -f ${tar_path} ]; then
echo ${instructions}
fi

if ! tar -zxvf ${tar_path} -d -C ${data}; then
if ! tar -zxvf ${tar_path} -C ${data}; then
echo "$0: error un-tarring archive ${tar_path}"
exit 1;
fi

touch ${data}/.complete_en_${lang}
echo "$0: Successfully downloaded and un-tarred ${tar_path}"

Expand Down
8 changes: 5 additions & 3 deletions egs2/TEMPLATE/asr1/pyscripts/utils/print_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@ def get_commandline_args(no_executable=True):

# Escape the extra characters for shell
argv = [
arg.replace("'", "'\\''")
if all(char not in arg for char in extra_chars)
else "'" + arg.replace("'", "'\\''") + "'"
(
arg.replace("'", "'\\''")
if all(char not in arg for char in extra_chars)
else "'" + arg.replace("'", "'\\''") + "'"
)
for arg in sys.argv
]

Expand Down
115 changes: 115 additions & 0 deletions egs2/TEMPLATE/asr1/utils/data/internal/modify_speaker_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/usr/bin/env python

from __future__ import print_function
import argparse, sys,os
from collections import defaultdict
# Command-line interface.  argparse re-wraps description and help text when
# it prints them, so the line breaks and indentation inside the triple-quoted
# strings below do not show up in the --help output.
parser = argparse.ArgumentParser(description="""
Combine consecutive utterances into fake speaker ids for a kind of
poor man's segmentation. Reads old utt2spk from standard input,
outputs new utt2spk to standard output.""")
parser.add_argument("--utts-per-spk-max", type=int, required=True,
                    help="Maximum number of utterances allowed per speaker")
parser.add_argument("--seconds-per-spk-max", type=float, required=True,
                    help="""Maximum duration in seconds allowed per speaker.
                    If this option is >0, --utt2dur option must be provided.""")
parser.add_argument("--utt2dur", type=str,
                    help="""Filename of input 'utt2dur' file (needed only if
                    --seconds-per-spk-max is provided)""")
# Kept as a 'true'/'false' string (not a store_true flag) so that callers can
# pass --respect-speaker-info=<value> explicitly.
parser.add_argument("--respect-speaker-info", type=str, default='true',
                    choices=['true', 'false'],
                    help="""If true, the output speakers will be split from
                    existing speakers.""")

args = parser.parse_args()

# utt2spk maps each utterance id to its original speaker id; spk2utt maps each
# speaker id to its utterances in the order they were read.
utt2spk = dict()
# an undefined spk2utt entry will default to an empty list.
spk2utt = defaultdict(list)

# The old utt2spk arrives on standard input, one "<utt> <spk>" pair per line.
# Any line that does not have exactly two whitespace-separated fields
# (including a blank line) aborts the script.
for line in sys.stdin:
    fields = line.split()
    if len(fields) != 2:
        sys.exit("modify_speaker_info.py: bad utt2spk line from standard input (expected two fields): " +
                 line)
    utt, spk = fields
    utt2spk[utt] = spk
    spk2utt[spk].append(utt)

# Durations are only needed when a per-speaker duration limit is active.
if args.seconds_per_spk_max > 0:
    if args.utt2dur is None:
        # Fail with a clear message instead of letting open(None) raise.
        sys.exit("modify_speaker_info.py: --utt2dur must be provided when "
                 "--seconds-per-spk-max is > 0")
    # utt2dur maps utterance id -> duration in seconds.
    utt2dur = dict()
    try:
        # 'with' guarantees the file is closed even if a line fails to parse.
        with open(args.utt2dur) as f:
            for line in f:
                fields = line.split()
                if len(fields) != 2:
                    sys.exit("modify_speaker_info.py: bad utt2dur line in {0} "
                             "(expected two fields): {1}".format(
                                 args.utt2dur, line))
                utt, dur = fields
                utt2dur[utt] = float(dur)
        # Every utterance read from utt2spk must have a known duration.
        for utt in utt2spk:
            if utt not in utt2dur:
                sys.exit("modify_speaker_info.py: utterance {0} not in utt2dur file {1}".format(
                    utt, args.utt2dur))
    except Exception as e:
        # Script boundary: turn I/O and float-parsing errors into a one-line
        # message.  sys.exit() above raises SystemExit, which is not an
        # Exception subclass and therefore passes through untouched.
        sys.exit("modify_speaker_info.py: problem reading utt2dur info: " + str(e))

def SplitIntoGroups(uttlist, utts_per_spk_max=None, seconds_per_spk_max=None,
                    durations=None):
    """Split a list of utterances into consecutive groups.

    A new group is started whenever adding the next utterance would exceed
    the active limits.  Note: the last group will tend to be shorter than
    the others; no attempt is made to fix this.

    Args:
        uttlist: utterance ids, in the order they should be grouped.
        utts_per_spk_max: maximum utterances per group; a value <= 0 disables
            this limit.  Defaults to the command-line value
            (args.utts_per_spk_max).
        seconds_per_spk_max: maximum total duration per group; a value <= 0
            disables this limit.  Defaults to the command-line value
            (args.seconds_per_spk_max).
        durations: mapping from utterance id to duration, consulted only when
            the duration limit is active.  Defaults to the module-level
            utt2dur table.

    Returns:
        A list of lists of utterance ids.  A single utterance longer than
        the duration limit still forms a group of its own.
    """
    if utts_per_spk_max is None:
        utts_per_spk_max = args.utts_per_spk_max
    if seconds_per_spk_max is None:
        seconds_per_spk_max = args.seconds_per_spk_max

    limit_count = utts_per_spk_max > 0
    limit_duration = seconds_per_spk_max > 0
    if limit_duration and durations is None:
        durations = utt2dur

    groups = []  # list of lists.
    cur_group = []
    cur_dur = 0.0
    for utt in uttlist:
        # Look the duration up once; it is irrelevant when the limit is off.
        dur = durations[utt] if limit_duration else 0.0
        group_full = limit_count and len(cur_group) == utts_per_spk_max
        # Never close an empty group on duration alone, otherwise an
        # over-long utterance could not be placed anywhere.
        too_long = (limit_duration and cur_group and
                    cur_dur + dur > seconds_per_spk_max)
        if group_full or too_long:
            groups.append(cur_group)
            cur_group = []
            cur_dur = 0.0
        cur_group.append(utt)
        cur_dur += dur
    if cur_group:
        groups.append(cur_group)
    return groups


def GetFormatString(d):
    """Return a zero-padded printf-style format as wide as d has digits.

    Gives '%01d' if d < 10, '%02d' if d < 100, and so on, so that numbers
    printed with the returned format sort correctly as strings.  Any value
    below 10 (including 0 and negative numbers) yields '%01d'.
    """
    num_digits = 1
    remaining = d
    while remaining >= 10:
        remaining //= 10  # integer division
        num_digits += 1
    # e.g. we might return the string '%01d' or '%02d'
    return '%0{0}d'.format(num_digits)


# Write the new utt2spk ("<utt> <new-spk>" per line) to standard output.
if args.respect_speaker_info == 'true':
    # Split only within each existing speaker, visiting speakers in sorted
    # order; the new ids are the old id plus a 1-based group number.
    for spk in sorted(spk2utt):
        uttlists = SplitIntoGroups(spk2utt[spk])
        format_string = '%s-' + GetFormatString(len(uttlists))
        for group_num, uttlist in enumerate(uttlists, 1):
            # the following might look like: '%s-%02d' % ('john_smith', 10),
            # giving 'john_smith-10'.
            this_spk = format_string % (spk, group_num)
            for utt in uttlist:
                print(utt, this_spk)
else:
    # Ignore the existing speakers: group all utterances, in sorted order of
    # utterance id, into generic 'speaker-N' ids.
    uttlists = SplitIntoGroups(sorted(utt2spk))
    format_string = 'speaker-' + GetFormatString(len(uttlists))
    for group_num, uttlist in enumerate(uttlists, 1):
        # the following might look like: 'speaker-%04d' % 106,
        # giving 'speaker-0106'.
        this_spk = format_string % group_num
        for utt in uttlist:
            print(utt, this_spk)

125 changes: 125 additions & 0 deletions egs2/TEMPLATE/asr1/utils/data/modify_speaker_info.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#!/usr/bin/env bash

# Copyright 2013-2016 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script copies a data directory (like utils/copy_data.sh) while
# modifying (splitting or merging) the speaker information in that data directory.
#
# This is done without looking at the data at all; we use only duration
# constraints and maximum-num-utts-per-speaker to assign contiguous
# sets of utterances to speakers.
#
# This has two general uses:
# (1) when dumping iVectors for training purposes, it's helpful to have
# a good variety of iVectors, and this can be accomplished by splitting
# speakers up into multiple copies of those speakers. We typically
# use the --utts-per-spk-max 2 option for this.
# (2) when dealing with data that is not diarized, and given that we
# haven't checked any diarization scripts into Kaldi yet, this
# script can do a "dumb" diarization that just groups consecutive
# utterances into groups based on length constraints.
# There are two cases here:

# a) With --respect-speaker-info true (the default),
# it only splits within existing speakers.
# This is suitable when you have existing speaker
# info that's meaningful in some way, e.g. represents
# individual recordings.
# b) With --respect-speaker-info false,
# it completely ignores the existing speaker information
# and constructs new speaker identities based on
# utterance names. This is suitable in scenarios when
# you have a one-to-one map between speakers and
# utterances.

# begin configuration section
# Defaults; -1 means "no limit" (see the usage text below).
utts_per_spk_max=-1
seconds_per_spk_max=-1
respect_speaker_info=true
# end configuration section

# NOTE(review): presumably parse_options.sh maps --utts-per-spk-max etc. onto
# the variables above and leaves only the positional arguments -- confirm
# against that script.
. utils/parse_options.sh

# Exactly two positional arguments are expected: <srcdir> and <destdir>.
if [ "$#" -ne 2 ]; then
  printf '%s\n' \
    "Usage: " \
    " $0 [options] <srcdir> <destdir>" \
    "e.g.:" \
    " $0 --utts-per-spk-max 2 data/train data/train-max2" \
    "Options" \
    " --utts-per-spk-max <n> # number of utterances per speaker maximum," \
    " # default -1 (meaning no maximum). E.g. 2." \
    " --seconds-per-spk-max <n> # number of seconds per speaker maximum," \
    " # default -1 (meaning no maximum). E.g. 60." \
    " --respect-speaker-info <true|false> # If true, respect the" \
    " # existing speaker map (i.e. do not" \
    " # assign utterances from different" \
    " # speakers to the same generated speaker)." \
    " # Default: true." \
    "Note: one or both of the --utts-per-spk-max or --seconds-per-spk-max" \
    "options is required."
  exit 1;
fi

# Run everything below in the C locale.
export LC_ALL=C

srcdir=$1
destdir=$2

# The destination's utt2spk is written while the source's is being read, so
# the two directories must differ.
if [ "$destdir" == "$srcdir" ]; then
  echo "$0: <srcdir> must be different from <destdir>."
  exit 1
fi

# At least one of the two limits has to be active.  Previously this only
# printed the message and carried on; it now stops, as the usage text
# ("one or both ... options is required") says it should.
if [ "$seconds_per_spk_max" == "-1" ] && ! [ "$utts_per_spk_max" -gt 0 ]; then
  echo "$0: one or both of the --utts-per-spk-max or --seconds-per-spk-max options must be provided."
  exit 1
fi

if [ ! -f "$srcdir/utt2spk" ]; then
  echo "$0: no such file $srcdir/utt2spk"
  exit 1;
fi

# From here on, abort on the first failing command or pipeline stage.
set -e -o pipefail

mkdir -p $destdir

# The duration table is only needed when a duration limit was requested.
utt2dur_opt=
if [ "$seconds_per_spk_max" != -1 ]; then
  # we need the utt2dur file.
  utils/data/get_utt2dur.sh $srcdir
  utt2dur_opt="--utt2dur=$srcdir/utt2dur"
fi

# Old utt2spk in on stdin, new utt2spk out on stdout.
# $utt2dur_opt is deliberately unquoted so that it vanishes when empty.
utils/data/internal/modify_speaker_info.py \
  $utt2dur_opt \
  --respect-speaker-info=$respect_speaker_info \
  --utts-per-spk-max=$utts_per_spk_max \
  --seconds-per-spk-max=$seconds_per_spk_max \
  <$srcdir/utt2spk >$destdir/utt2spk

utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt

# This script won't create the new cmvn.scp, it should be recomputed.
# A cmvn.scp already present in the destination is moved out of the way.
if [ -f $destdir/cmvn.scp ]; then
  mkdir -p $destdir/.backup
  mv $destdir/cmvn.scp $destdir/.backup
  echo "$0: moving $destdir/cmvn.scp to $destdir/.backup/cmvn.scp"
fi

# these things won't be affected by the change of speaker mapping.
for f in feats.scp segments wav.scp reco2file_and_channel text stm glm ctm; do
  if [ -f $srcdir/$f ]; then
    cp $srcdir/$f $destdir/
  fi
done


# Kept as separate assignments so that a failing 'wc' still aborts the script
# under 'set -e'.
num_spk_before=$(wc -l <$srcdir/spk2utt)
num_spk_after=$(wc -l <$destdir/spk2utt)

echo "$0: copied data from $srcdir to $destdir, number of speakers changed from $num_spk_before to $num_spk_after"

# Tell the validator which optional files the source did not have.
opts=
if [ ! -f $srcdir/feats.scp ]; then
  opts="--no-feats"
fi
if [ ! -f $srcdir/text ]; then
  opts="$opts --no-text"
fi
if [ ! -f $srcdir/wav.scp ]; then
  opts="$opts --no-wav"
fi

# $opts is deliberately unquoted: it holds zero or more separate flags.
utils/validate_data_dir.sh $opts $destdir
6 changes: 5 additions & 1 deletion egs2/ameboshi/svs1/local/prep_segments_from_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,11 @@ def make_segment(file_id, tempo, notes, threshold, sil=["P", "B"]):
and note.lyric == "か"
and notes[i + 2].lyric == "か"
)
or ("aria" in file_id and note.lyric == "でぃ" and notes[i - 1].lyric == "い")
or (
"aria" in file_id
and note.lyric == "でぃ"
and notes[i - 1].lyric == "い"
)
or ("aria" in file_id and note.lyric == "ぷ" and notes[i - 1].lyric == "じ")
or (
"antagata_dokosa" in file_id
Expand Down
1 change: 1 addition & 0 deletions egs2/aphasiabank/asr1/local/clean_hyp_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Remove [APH] and [NONAPH] tags from the hypothesis file.
Works for both character- and word-level tokenization.
"""

from argparse import ArgumentParser


Expand Down
1 change: 1 addition & 0 deletions egs2/aphasiabank/asr1/local/clean_score_dir.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Filter out *.trn files in score_cer and score_wer based on language and aph types
"""

import argparse
import os
from typing import Iterable
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Filter out *.trn files in score_cer and score_wer based on language and
aph types
"""

import argparse
import os
from typing import Iterable
Expand Down
1 change: 1 addition & 0 deletions egs2/aphasiabank/asr1/local/extract_sentence_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
Based on https://github.com/monirome/AphasiaBank/blob/main/clean_transcriptions.ipynb
"""

import os
import re
from argparse import ArgumentParser
Expand Down
1 change: 1 addition & 0 deletions egs2/aphasiabank/asr1/local/split_train_test_val.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Split AphasiaBank into train, test and val sets, according to config.py
"""

import os
from argparse import ArgumentParser

Expand Down
13 changes: 7 additions & 6 deletions egs2/chime7_task1/asr1/local/get_lhotse_manifests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Slightly modified versions of lhotse recipes scripts in
https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/
"""

import argparse
import glob
import json
Expand Down Expand Up @@ -172,9 +173,9 @@ def _get_channel(session, dset_part):
supervisions.append(
SupervisionSegment(
id=ex_id,
recording_id=session
if mic == "mdm"
else session + f"_{spk_id}",
recording_id=(
session if mic == "mdm" else session + f"_{spk_id}"
),
start=start,
duration=add_durations(end, -start, sampling_rate=16000),
channel=channel,
Expand Down Expand Up @@ -321,9 +322,9 @@ def prepare_dipco(
supervisions.append(
SupervisionSegment(
id=ex_id,
recording_id=session
if mic == "mdm"
else session + "_{}".format(spk_id),
recording_id=(
session if mic == "mdm" else session + "_{}".format(spk_id)
),
start=start,
duration=add_durations(end, -start, sampling_rate=16000),
channel=channel,
Expand Down
6 changes: 3 additions & 3 deletions egs2/chime7_task1/diar_asr1/local/pyannote_diarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,9 @@ def diarize_session(
segmentation.sum((0, 1))
) # not the best selection criteria
# however this keeps it simple and fast.
selected_audio[
:, math.floor(seg_b.start * fs) : math.floor(seg_b.end * fs)
] = c_seg[selection]
selected_audio[:, math.floor(seg_b.start * fs) : math.floor(seg_b.end * fs)] = (
c_seg[selection]
)
selected_seg.append(segmentation[..., selection])
# stack em
selected_seg = SlidingWindowFeature(
Expand Down
1 change: 1 addition & 0 deletions egs2/iam/ocr1/local/data_prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
Required packages:
Pillow
"""

import argparse
import os

Expand Down
Loading

0 comments on commit 8a48c23

Please sign in to comment.