forked from espnet/espnet
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into pr_voxtlm
- Loading branch information
Showing
130 changed files
with
1,055 additions
and
683 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
115 changes: 115 additions & 0 deletions
115
egs2/TEMPLATE/asr1/utils/data/internal/modify_speaker_info.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
#!/usr/bin/env python | ||
|
||
from __future__ import print_function | ||
import argparse, sys,os | ||
from collections import defaultdict | ||
# Command-line interface.  The old utt2spk is read from standard input and the
# new utt2spk is written to standard output; everything else is an option.
parser = argparse.ArgumentParser(description="""
Combine consecutive utterances into fake speaker ids for a kind of
poor man's segmentation. Reads old utt2spk from standard input,
outputs new utt2spk to standard output.""")
parser.add_argument("--utts-per-spk-max", type=int, required=True,
                    help="Maximum number of utterances allowed per speaker")
parser.add_argument("--seconds-per-spk-max", type=float, required=True,
                    help="""Maximum duration in seconds allowed per speaker.
                    If this option is >0, --utt2dur option must be provided.""")
parser.add_argument("--utt2dur", type=str,
                    help="""Filename of input 'utt2dur' file (needed only if
                    --seconds-per-spk-max is provided)""")
# Kept as a 'true'/'false' string (not a boolean flag) so callers can pass
# --respect-speaker-info=<value> straight through from shell variables.
parser.add_argument("--respect-speaker-info", type=str, default='true',
                    choices=['true', 'false'],
                    help="""If true, the output speakers will be split from
                    existing speakers.""")

args = parser.parse_args()

# The duration constraint cannot be applied without per-utterance durations;
# fail early with a clear message instead of a confusing open(None) error.
if args.seconds_per_spk_max > 0 and args.utt2dur is None:
    sys.exit("modify_speaker_info.py: --utt2dur must be provided when "
             "--seconds-per-spk-max is > 0")
|
||
# utt2spk maps each utterance id to its original speaker id.
utt2spk = dict()
# spk2utt maps each original speaker id to its utterances, in input order;
# an undefined spk2utt entry will default to an empty list.
spk2utt = defaultdict(list)

# Read the old utt2spk ("<utt> <spk>" per line) from standard input.
for line in sys.stdin:
    a = line.split()
    if len(a) != 2:
        sys.exit("modify_speaker_info.py: bad utt2spk line from standard input (expected two fields): " +
                 line)
    utt, spk = a
    utt2spk[utt] = spk
    spk2utt[spk].append(utt)
|
||
# Durations are only needed (and only loaded) when a per-speaker duration
# limit was requested.
if args.seconds_per_spk_max > 0:
    # utt2dur maps each utterance id to its duration in seconds.
    utt2dur = dict()
    try:
        # 'with' guarantees the file is closed, including on the sys.exit()
        # below (SystemExit is not an Exception subclass, so it is not
        # swallowed by the handler and still propagates).
        with open(args.utt2dur) as f:
            for line in f:
                a = line.split()
                if len(a) != 2:
                    sys.exit("modify_speaker_info.py: bad utt2dur line in file {0} "
                             "(expected two fields): {1}".format(args.utt2dur, line))
                utt, dur = a
                utt2dur[utt] = float(dur)
    except Exception as e:
        # Covers an unreadable/missing file and non-numeric durations.
        sys.exit("modify_speaker_info.py: problem reading utt2dur info: " + str(e))

    # Every utterance read from utt2spk must have a duration.
    for utt in utt2spk:
        if utt not in utt2dur:
            sys.exit("modify_speaker_info.py: utterance {0} not in utt2dur file {1}".format(
                utt, args.utt2dur))
|
||
# splits a list of utts into a list of lists, based on constraints from the
# command line args (or from the explicit keyword arguments, when given).
# Note: the last list will tend to be shorter than the others,
# we make no attempt to fix this.
def SplitIntoGroups(uttlist, utts_per_spk_max=None, seconds_per_spk_max=None,
                    durations=None):
    """Split `uttlist` into consecutive groups that respect the size limits.

    Args:
        uttlist: iterable of utterance ids, in the order they should be grouped.
        utts_per_spk_max: maximum number of utterances per group; values <= 0
            disable this limit.  Defaults to args.utts_per_spk_max.
        seconds_per_spk_max: maximum total duration per group; values <= 0
            disable this limit.  Defaults to args.seconds_per_spk_max.
        durations: mapping from utterance id to duration; only consulted when
            the duration limit is enabled.  Defaults to the global utt2dur.

    Returns:
        A list of non-empty lists of utterance ids.  A single utterance that is
        longer than the duration limit still forms a group of its own.
    """
    # Fall back to the module-level state only for values the caller omitted,
    # so existing one-argument calls behave exactly as before.
    if utts_per_spk_max is None:
        utts_per_spk_max = args.utts_per_spk_max
    if seconds_per_spk_max is None:
        seconds_per_spk_max = args.seconds_per_spk_max
    use_duration = seconds_per_spk_max > 0
    if use_duration and durations is None:
        durations = utt2dur

    ans = []  # list of lists.
    cur_uttlist = []
    cur_dur = 0.0
    for utt in uttlist:
        this_dur = durations[utt] if use_duration else 0.0
        group_is_full = (utts_per_spk_max > 0 and
                         len(cur_uttlist) == utts_per_spk_max)
        # Never close an empty group on duration alone: an over-long utterance
        # must still be placed somewhere.
        would_be_too_long = (use_duration and len(cur_uttlist) > 0 and
                             cur_dur + this_dur > seconds_per_spk_max)
        if group_is_full or would_be_too_long:
            ans.append(cur_uttlist)
            cur_uttlist = []
            cur_dur = 0.0
        cur_uttlist.append(utt)
        cur_dur += this_dur
    if cur_uttlist:
        ans.append(cur_uttlist)
    return ans
|
||
|
||
# Returns a zero-padded printf-style integer format whose width equals the
# number of decimal digits of d: '%01d' if d < 10, '%02d' if d < 100, etc.
# Numbers printed with it sort correctly as strings.
def GetFormatString(d):
    num_digits = 1
    upper_bound = 10
    # Grow the width until d fits below the next power of ten.
    while d >= upper_bound:
        upper_bound *= 10
        num_digits += 1
    # e.g. we might return the string '%01d' or '%02d'
    return '%0' + str(num_digits) + 'd'
|
||
|
||
# Write the new utt2spk ("<utt> <new-spk>" per line) to standard output.
if args.respect_speaker_info == 'true':
    # Split each existing speaker separately; new ids are '<old-spk>-<index>'.
    for spk in sorted(spk2utt.keys()):
        uttlists = SplitIntoGroups(spk2utt[spk])
        format_string = '%s-' + GetFormatString(len(uttlists))
        for group_index, uttlist in enumerate(uttlists, start=1):
            # the following might look like: '%s-%02d' % ('john_smith', 9 + 1),
            # giving 'john_smith-10'.
            this_spk = format_string % (spk, group_index)
            for utt in uttlist:
                print(utt, this_spk)
else:
    # Ignore existing speakers: group all utterances in sorted order and name
    # the groups 'speaker-<index>'.
    uttlists = SplitIntoGroups(sorted(utt2spk.keys()))
    format_string = 'speaker-' + GetFormatString(len(uttlists))
    for group_index, uttlist in enumerate(uttlists, start=1):
        # the following might look like: 'speaker-%04d' % (105 + 1),
        # giving 'speaker-0106'.
        this_spk = format_string % group_index
        for utt in uttlist:
            print(utt, this_spk)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Copyright 2013-2016 Johns Hopkins University (author: Daniel Povey) | ||
# Apache 2.0 | ||
|
||
# This script copies a data directory (like utils/copy_data.sh) while | ||
# modifying (splitting or merging) the speaker information in that data directory. | ||
# | ||
# This is done without looking at the data at all; we use only duration | ||
# constraints and maximum-num-utts-per-speaker to assign contiguous | ||
# sets of utterances to speakers. | ||
# | ||
# This has two general uses: | ||
# (1) when dumping iVectors for training purposes, it's helpful to have | ||
# a good variety of iVectors, and this can be accomplished by splitting | ||
# speakers up into multiple copies of those speakers. We typically | ||
# use the --utts-per-spk-max 2 option for this. | ||
# (2) when dealing with data that is not diarized, and given that we | ||
# haven't checked any diarization scripts into Kaldi yet, this | ||
# script can do a "dumb" diarization that just groups consecutive | ||
# utterances into groups based on length constraints. | ||
# There are two cases here: | ||
|
||
# a) With --respect-speaker-info true (the default), | ||
# it only splits within existing speakers. | ||
# This is suitable when you have existing speaker | ||
# info that's meaningful in some way, e.g. represents | ||
# individual recordings. | ||
# b) With --respect-speaker-info false, | ||
# it completely ignores the existing speaker information | ||
# and constructs new speaker identities based on | ||
# utterance names. This is suitable in scenarios when | ||
# you have a one-to-one map between speakers and | ||
# utterances. | ||
|
||
# begin configuration section
# These defaults can be overridden from the command line via
# utils/parse_options.sh (e.g. --utts-per-spk-max 2); -1 means "no limit".
utts_per_spk_max=-1
seconds_per_spk_max=-1
respect_speaker_info=true
# end configuration section

. utils/parse_options.sh

if [ $# != 2 ]; then
  echo "Usage: "
  echo " $0 [options] <srcdir> <destdir>"
  echo "e.g.:"
  echo " $0 --utts-per-spk-max 2 data/train data/train-max2"
  echo "Options"
  echo " --utts-per-spk-max <n> # number of utterances per speaker maximum,"
  echo " # default -1 (meaning no maximum). E.g. 2."
  echo " --seconds-per-spk-max <n> # number of seconds per speaker maximum,"
  echo " # default -1 (meaning no maximum). E.g. 60."
  echo " --respect-speaker-info <true|false> # If true, respect the"
  echo " # existing speaker map (i.e. do not"
  echo " # assign utterances from different"
  echo " # speakers to the same generated speaker)."
  echo " # Default: true."
  echo "Note: one or both of the --utts-per-spk-max or --seconds-per-spk-max"
  echo "options is required."
  exit 1;
fi

export LC_ALL=C

srcdir=$1
destdir=$2

if [ "$destdir" == "$srcdir" ]; then
  echo "$0: <srcdir> must be different from <destdir>."
  exit 1
fi

# At least one of the two limits has to be set, otherwise there is nothing to
# split on.  This is a hard error: stop here instead of carrying on.
if [ "$seconds_per_spk_max" == "-1" ] && ! [ "$utts_per_spk_max" -gt 0 ]; then
  echo "$0: one or both of the --utts-per-spk-max or --seconds-per-spk-max options must be provided."
  exit 1
fi

if [ ! -f "$srcdir/utt2spk" ]; then
  echo "$0: no such file $srcdir/utt2spk"
  exit 1;
fi
|
||
# From here on, any failing command (including inside a pipeline) aborts the script.
set -e;
set -o pipefail

mkdir -p "$destdir"

# Optional arguments are collected in arrays so that paths containing spaces
# are passed through intact and an empty list expands to nothing.
utt2dur_opt=()
if [ "$seconds_per_spk_max" != -1 ]; then
  # we need the utt2dur file.
  utils/data/get_utt2dur.sh "$srcdir"
  utt2dur_opt=("--utt2dur=$srcdir/utt2dur")
fi

utils/data/internal/modify_speaker_info.py \
  "${utt2dur_opt[@]}" --respect-speaker-info="$respect_speaker_info" \
  --utts-per-spk-max="$utts_per_spk_max" --seconds-per-spk-max="$seconds_per_spk_max" \
  <"$srcdir/utt2spk" >"$destdir/utt2spk"

utils/utt2spk_to_spk2utt.pl <"$destdir/utt2spk" >"$destdir/spk2utt"

# This script won't create the new cmvn.scp, it should be recomputed.
if [ -f "$destdir/cmvn.scp" ]; then
  mkdir -p "$destdir/.backup"
  mv "$destdir/cmvn.scp" "$destdir/.backup"
  echo "$0: moving $destdir/cmvn.scp to $destdir/.backup/cmvn.scp"
fi

# these things won't be affected by the change of speaker mapping.
for f in feats.scp segments wav.scp reco2file_and_channel text stm glm ctm; do
  if [ -f "$srcdir/$f" ]; then
    cp "$srcdir/$f" "$destdir/"
  fi
done

orig_num_spk=$(wc -l <"$srcdir/spk2utt")
new_num_spk=$(wc -l <"$destdir/spk2utt")

echo "$0: copied data from $srcdir to $destdir, number of speakers changed from $orig_num_spk to $new_num_spk"

# Tell the validator which optional files are legitimately absent.
opts=()
if [ ! -f "$srcdir/feats.scp" ]; then opts+=(--no-feats); fi
if [ ! -f "$srcdir/text" ]; then opts+=(--no-text); fi
if [ ! -f "$srcdir/wav.scp" ]; then opts+=(--no-wav); fi

utils/validate_data_dir.sh "${opts[@]}" "$destdir"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,7 @@ | |
Required packages: | ||
Pillow | ||
""" | ||
|
||
import argparse | ||
import os | ||
|
||
|
Oops, something went wrong.