Merge branch 'master' into pr_voxtlm

iamanigeeit · Jan 30, 2024 · 8a48c23 · 8a48c23
2 parents f55e691 + 27f292d
commit 8a48c23
Show file tree

Hide file tree

Showing 130 changed files with 1,055 additions and 683 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -14,7 +14,7 @@ repos:
         exclude: ^(egs2/TEMPLATE/asr1/utils|egs2/TEMPLATE/asr1/steps|egs2/TEMPLATE/tts1/sid|tools/installers/patch_mwerSegmenter)
 
 -   repo: https://github.com/psf/black
-    rev: 23.12.1
+    rev: 24.1.1
     hooks:
     -   id: black
         exclude: ^(egs2/TEMPLATE/asr1/utils|egs2/TEMPLATE/asr1/steps|egs2/TEMPLATE/tts1/sid|doc)

diff --git a/egs/must_c/st1/local/download_and_untar.sh b/egs/must_c/st1/local/download_and_untar.sh
@@ -62,11 +62,10 @@ if [ ! -f ${tar_path} ]; then
     echo ${instructions}
 fi
 
-if ! tar -zxvf ${tar_path} -d -C ${data}; then
+if ! tar -zxvf ${tar_path} -C ${data}; then
     echo "$0: error un-tarring archive ${tar_path}"
     exit 1;
 fi
-
 touch ${data}/.complete_en_${lang}
 echo "$0: Successfully downloaded and un-tarred ${tar_path}"
 

diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/print_args.py b/egs2/TEMPLATE/asr1/pyscripts/utils/print_args.py
@@ -25,9 +25,11 @@ def get_commandline_args(no_executable=True):
 
     # Escape the extra characters for shell
     argv = [
-        arg.replace("'", "'\\''")
-        if all(char not in arg for char in extra_chars)
-        else "'" + arg.replace("'", "'\\''") + "'"
+        (
+            arg.replace("'", "'\\''")
+            if all(char not in arg for char in extra_chars)
+            else "'" + arg.replace("'", "'\\''") + "'"
+        )
         for arg in sys.argv
     ]
 

diff --git a/egs2/TEMPLATE/asr1/utils/data/internal/modify_speaker_info.py b/egs2/TEMPLATE/asr1/utils/data/internal/modify_speaker_info.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import argparse, sys,os
+from collections import defaultdict
+# Command-line interface.  Both limits are mandatory arguments; a value that is
+# not > 0 leaves the corresponding limit unused by the grouping code below.
+parser = argparse.ArgumentParser(description="""
+Combine consecutive utterances into fake speaker ids for a kind of
+poor man's segmentation.  Reads old utt2spk from standard input,
+outputs new utt2spk to standard output.""")
+parser.add_argument("--utts-per-spk-max", type=int, required=True,
+                    help="Maximum number of utterances allowed per speaker")
+parser.add_argument("--seconds-per-spk-max", type=float, required=True,
+                    help="""Maximum duration in seconds allowed per speaker.
+                         If this option is >0, --utt2dur option must be provided.""")
+parser.add_argument("--utt2dur", type=str,
+                    help="""Filename of input 'utt2dur' file (needed only if
+                    --seconds-per-spk-max is provided)""")
+# Takes the literal strings 'true'/'false' (not a bool); the value is compared
+# against the string 'true' when the output is generated.
+parser.add_argument("--respect-speaker-info", type=str, default='true',
+                    choices=['true', 'false'],
+                    help="""If true, the output speakers will be split from
+                    existing speakers.""")
+
+args = parser.parse_args()
+
+# utt2spk maps utterance-id -> speaker-id as read from standard input.
+utt2spk = {}
+# spk2utt maps speaker-id -> utterance-ids in input order; looking up an
+# unseen speaker yields a fresh empty list.
+spk2utt = defaultdict(list)
+
+# Every input line must consist of exactly two whitespace-separated fields:
+# <utterance-id> <speaker-id>.
+for line in sys.stdin:
+    fields = line.split()
+    if len(fields) != 2:
+        sys.exit("modify_speaker_info.py: bad utt2spk line from standard input (expected two fields): " +
+                 line)
+    utt, spk = fields
+    utt2spk[utt] = spk
+    spk2utt[spk].append(utt)
+
+# Utterance durations are only loaded when a per-speaker duration limit is set.
+if args.seconds_per_spk_max > 0:
+    # utt2dur maps utterance-id -> duration in seconds (float).
+    utt2dur = {}
+    try:
+        # 'with' closes the file on every path, including the sys.exit() calls.
+        with open(args.utt2dur) as f:
+            for line in f:
+                fields = line.split()
+                if len(fields) != 2:
+                    sys.exit("modify_speaker_info.py: bad line in utt2dur file {0} "
+                             "(expected two fields): {1}".format(args.utt2dur, line))
+                utt, dur = fields
+                utt2dur[utt] = float(dur)
+        # Every utterance read from utt2spk needs a duration, otherwise the
+        # grouping code would fail on the lookup later.
+        for utt in utt2spk:
+            if utt not in utt2dur:
+                sys.exit("modify_speaker_info.py: utterance {0} not in utt2dur file {1}".format(
+                        utt, args.utt2dur))
+    except Exception as e:
+        # SystemExit raised by sys.exit() above is not an Exception subclass, so
+        # those messages pass through untouched; this handler covers failures
+        # such as an unreadable file or a non-numeric duration field.
+        sys.exit("modify_speaker_info.py: problem reading utt2dur info: " + str(e))
+
+# splits a list of utts into a list of lists, based on constraints from the
+# command line args.  Note: the last list will tend to be shorter than the others,
+# we make no attempt to fix this.
+def SplitIntoGroups(uttlist):
+    """Partition uttlist, preserving order, into consecutive groups.
+
+    A new group is started before an utterance when either
+      - args.utts_per_spk_max is > 0 and the current group already holds that
+        many utterances, or
+      - args.seconds_per_spk_max is > 0, the current group is non-empty, and
+        adding the utterance's duration (from utt2dur) would exceed that limit.
+    A single utterance longer than the duration limit still gets a group of
+    its own.  Returns a list of lists of utterance ids.
+    """
+    count_capped = args.utts_per_spk_max > 0
+    dur_capped = args.seconds_per_spk_max > 0
+    groups = []
+    group = []
+    group_dur = 0.0
+    for utt in uttlist:
+        hit_count_cap = count_capped and len(group) == args.utts_per_spk_max
+        # The duration lookup stays inside the short-circuit so it only happens
+        # when the count limit has not already forced a split.
+        if hit_count_cap or (dur_capped and group and
+                             group_dur + utt2dur[utt] > args.seconds_per_spk_max):
+            groups.append(group)
+            group = []
+            group_dur = 0.0
+        group.append(utt)
+        if dur_capped:
+            group_dur += utt2dur[utt]
+    # Flush the trailing, possibly shorter, group.
+    if group:
+        groups.append(group)
+    return groups
+
+
+# This function will return '%01d' if d < 10, '%02d' if d < 100, and so on.
+# It's for printf printing of numbers in such a way that sorted order will be
+# correct.
+def GetFormatString(d):
+    """Return a zero-padded printf-style integer format wide enough for d.
+
+    Gives '%01d' when d < 10, '%02d' when d < 100, and so on, so that numbers
+    up to d printed with the returned format sort correctly as strings.
+    """
+    width = 1
+    remaining = d
+    # Each floor-division by ten strips one decimal digit.
+    while remaining >= 10:
+        remaining = remaining // 10
+        width += 1
+    return '%0' + str(width) + 'd'
+
+
+# Write the new utt2spk to standard output, one "<utt> <new-speaker>" per line.
+if args.respect_speaker_info == 'true':
+    # Split within each existing speaker, visiting speakers in sorted order.
+    for spk in sorted(spk2utt):
+        groups = SplitIntoGroups(spk2utt[spk])
+        spk_format = '%s-' + GetFormatString(len(groups))
+        for index, group in enumerate(groups, 1):
+            # e.g. '%s-%02d' % ('john_smith', 9 + 1) gives 'john_smith-10'.
+            new_spk = spk_format % (spk, index)
+            for utt in group:
+                print(utt, new_spk)
+else:
+    # Ignore existing speakers: group all utterances in sorted utterance order.
+    groups = SplitIntoGroups(sorted(utt2spk))
+    spk_format = 'speaker-' + GetFormatString(len(groups))
+    for index, group in enumerate(groups, 1):
+        # e.g. 'speaker-%04d' % (105 + 1) gives 'speaker-0106'.
+        new_spk = spk_format % index
+        for utt in group:
+            print(utt, new_spk)
+
diff --git a/egs2/TEMPLATE/asr1/utils/data/modify_speaker_info.sh b/egs2/TEMPLATE/asr1/utils/data/modify_speaker_info.sh
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+
+# Copyright 2013-2016  Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+
+# This script copies a data directory (like utils/copy_data.sh) while
+# modifying (splitting or merging) the speaker information in that data directory.
+#
+# This is done without looking at the data at all; we use only duration
+# constraints and maximum-num-utts-per-speaker to assign contiguous
+# sets of utterances to speakers.
+#
+# This has two general uses:
+# (1) when dumping iVectors for training purposes, it's helpful to have
+#   a good variety of iVectors, and this can be accomplished by splitting
+#   speakers up into multiple copies of those speakers.  We typically
+#   use the --utts-per-spk-max 2 option for this.
+# (2) when dealing with data that is not diarized, and given that we
+#   haven't checked any diarization scripts into Kaldi yet, this
+#   script can do a "dumb" diarization that just groups consecutive
+#   utterances into groups based on length constraints.
+#   There are two cases here:
+
+#       a) With --respect-speaker-info true (the default),
+#         it only splits within existing speakers.
+#         This is suitable when you have existing speaker
+#         info that's meaningful in some way, e.g. represents
+#         individual recordings.
+#      b) With --respect-speaker-info false,
+#        it completely ignores the existing speaker information
+#        and constructs new speaker identities based on
+#        utterance names.  This is suitable in scenarios when
+#        you have a one-to-one map between speakers and
+#        utterances.
+
+# begin configuration section
+utts_per_spk_max=-1
+seconds_per_spk_max=-1
+respect_speaker_info=true
+# end configuration section
+
+. utils/parse_options.sh
+
+if [ $# != 2 ]; then
+  echo "Usage: "
+  echo "  $0 [options] <srcdir> <destdir>"
+  echo "e.g.:"
+  echo " $0 --utts-per-spk-max 2 data/train data/train-max2"
+  echo "Options"
+  echo "   --utts-per-spk-max <n>  # number of utterances per speaker maximum,"
+  echo "                           # default -1 (meaning no maximum).  E.g. 2."
+  echo "   --seconds-per-spk-max <n> # number of seconds per speaker maximum,"
+  echo "                             # default -1 (meaning no maximum).  E.g. 60."
+  echo "   --respect-speaker-info <true|false>  # If true, respect the"
+  echo "                                        # existing speaker map (i.e. do not"
+  echo "                                        # assign utterances from different"
+  echo "                                        # speakers to the same generated speaker)."
+  echo "                                        # Default: true."
+  echo "Note: one or both of the --utts-per-spk-max or --seconds-per-spk-max"
+  echo "options is required."
+  exit 1;
+fi
+
+export LC_ALL=C
+
+srcdir=$1
+destdir=$2
+
+if [ "$destdir"  == "$srcdir" ]; then
+  echo "$0: <srcdir> must be different from <destdir>."
+  exit 1
+fi
+
+if [ "$seconds_per_spk_max" == "-1" ] && ! [ "$utts_per_spk_max" -gt 0 ]; then
+  # Neither limit was supplied, so there is nothing to split on: stop here
+  # instead of carrying on past the error message.
+  echo "$0: one or both of the --utts-per-spk-max or --seconds-per-spk-max options must be provided."
+  exit 1
+fi
+
+if [ ! -f $srcdir/utt2spk ]; then
+  echo "$0: no such file $srcdir/utt2spk"
+  exit 1;
+fi
+
+set -e;
+set -o pipefail
+
+mkdir -p $destdir
+
+if [ "$seconds_per_spk_max" != -1 ]; then
+  # we need the utt2dur file.
+  utils/data/get_utt2dur.sh $srcdir
+  utt2dur_opt="--utt2dur=$srcdir/utt2dur"
+else
+  utt2dur_opt=
+fi
+
+utils/data/internal/modify_speaker_info.py \
+   $utt2dur_opt --respect-speaker-info=$respect_speaker_info \
+  --utts-per-spk-max=$utts_per_spk_max --seconds-per-spk-max=$seconds_per_spk_max \
+  <$srcdir/utt2spk >$destdir/utt2spk
+
+utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt
+
+# This script won't create the new cmvn.scp, it should be recomputed.
+if [ -f $destdir/cmvn.scp ]; then
+  mkdir -p $destdir/.backup
+  mv $destdir/cmvn.scp $destdir/.backup
+  echo "$0: moving $destdir/cmvn.scp to $destdir/.backup/cmvn.scp"
+fi
+
+# these things won't be affected by the change of speaker mapping.
+for f in feats.scp segments wav.scp reco2file_and_channel text stm glm ctm; do
+  [ -f $srcdir/$f ] && cp $srcdir/$f $destdir/
+done
+
+
+orig_num_spk=$(wc -l <$srcdir/spk2utt)
+new_num_spk=$(wc -l <$destdir/spk2utt)
+
+echo "$0: copied data from $srcdir to $destdir, number of speakers changed from $orig_num_spk to $new_num_spk"
+opts=
+[ ! -f $srcdir/feats.scp ] && opts="--no-feats"
+[ ! -f $srcdir/text ] && opts="$opts --no-text"
+[ ! -f $srcdir/wav.scp ] && opts="$opts --no-wav"
+
+utils/validate_data_dir.sh $opts $destdir
diff --git a/egs2/ameboshi/svs1/local/prep_segments_from_xml.py b/egs2/ameboshi/svs1/local/prep_segments_from_xml.py
@@ -128,7 +128,11 @@ def make_segment(file_id, tempo, notes, threshold, sil=["P", "B"]):
                 and note.lyric == "か"
                 and notes[i + 2].lyric == "か"
             )
-            or ("aria" in file_id and note.lyric == "でぃ" and notes[i - 1].lyric == "い")
+            or (
+                "aria" in file_id
+                and note.lyric == "でぃ"
+                and notes[i - 1].lyric == "い"
+            )
             or ("aria" in file_id and note.lyric == "ぷ" and notes[i - 1].lyric == "じ")
             or (
                 "antagata_dokosa" in file_id

diff --git a/egs2/aphasiabank/asr1/local/clean_hyp_annotations.py b/egs2/aphasiabank/asr1/local/clean_hyp_annotations.py
@@ -2,6 +2,7 @@
 Remove [APH] and [NONAPH] tags from the hypothesis file.
 Works for both character- and word-level tokenization.
 """
+
 from argparse import ArgumentParser
 
 

diff --git a/egs2/aphasiabank/asr1/local/clean_score_dir.py b/egs2/aphasiabank/asr1/local/clean_score_dir.py
@@ -1,6 +1,7 @@
 """
 Filter out *.trn files in score_cer and score_wer based on language and aph types
 """
+
 import argparse
 import os
 from typing import Iterable

diff --git a/egs2/aphasiabank/asr1/local/clean_score_dir_per_severity.py b/egs2/aphasiabank/asr1/local/clean_score_dir_per_severity.py
@@ -2,6 +2,7 @@
 Filter out *.trn files in score_cer and score_wer based on language and
 aph types
 """
+
 import argparse
 import os
 from typing import Iterable

diff --git a/egs2/aphasiabank/asr1/local/extract_sentence_info.py b/egs2/aphasiabank/asr1/local/extract_sentence_info.py
@@ -3,6 +3,7 @@
 
 Based on https://github.com/monirome/AphasiaBank/blob/main/clean_transcriptions.ipynb
 """
+
 import os
 import re
 from argparse import ArgumentParser

diff --git a/egs2/aphasiabank/asr1/local/split_train_test_val.py b/egs2/aphasiabank/asr1/local/split_train_test_val.py
@@ -1,6 +1,7 @@
 """
 Split AphasiaBank into train, test and val sets, according to config.py
 """
+
 import os
 from argparse import ArgumentParser
 

diff --git a/egs2/chime7_task1/asr1/local/get_lhotse_manifests.py b/egs2/chime7_task1/asr1/local/get_lhotse_manifests.py
@@ -2,6 +2,7 @@
 Slightly modified versions of lhotse recipes scripts in
 https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/
 """
+
 import argparse
 import glob
 import json
@@ -172,9 +173,9 @@ def _get_channel(session, dset_part):
                 supervisions.append(
                     SupervisionSegment(
                         id=ex_id,
-                        recording_id=session
-                        if mic == "mdm"
-                        else session + f"_{spk_id}",
+                        recording_id=(
+                            session if mic == "mdm" else session + f"_{spk_id}"
+                        ),
                         start=start,
                         duration=add_durations(end, -start, sampling_rate=16000),
                         channel=channel,
@@ -321,9 +322,9 @@ def prepare_dipco(
                 supervisions.append(
                     SupervisionSegment(
                         id=ex_id,
-                        recording_id=session
-                        if mic == "mdm"
-                        else session + "_{}".format(spk_id),
+                        recording_id=(
+                            session if mic == "mdm" else session + "_{}".format(spk_id)
+                        ),
                         start=start,
                         duration=add_durations(end, -start, sampling_rate=16000),
                         channel=channel,

diff --git a/egs2/chime7_task1/diar_asr1/local/pyannote_diarize.py b/egs2/chime7_task1/diar_asr1/local/pyannote_diarize.py
@@ -172,9 +172,9 @@ def diarize_session(
             segmentation.sum((0, 1))
         )  # not the best selection criteria
         # however this keeps it simple and fast.
-        selected_audio[
-            :, math.floor(seg_b.start * fs) : math.floor(seg_b.end * fs)
-        ] = c_seg[selection]
+        selected_audio[:, math.floor(seg_b.start * fs) : math.floor(seg_b.end * fs)] = (
+            c_seg[selection]
+        )
         selected_seg.append(segmentation[..., selection])
     # stack em
     selected_seg = SlidingWindowFeature(

diff --git a/egs2/iam/ocr1/local/data_prep.py b/egs2/iam/ocr1/local/data_prep.py
@@ -17,6 +17,7 @@
 Required packages:
     Pillow
 """
+
 import argparse
 import os
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,6 +2,7 @@ @@
     Remove [APH] and [NONAPH] tags from the hypothesis file.
     Works for both character- and word-level tokenization.
     """
     from argparse import ArgumentParser
@@ Expand Down @@