fix ci issues

espnet · sw005320 · Dec 5, 2023 · Oct 17, 2023 · Oct 17, 2023 · Oct 17, 2023
commit 063dc3f4d5bc6351576eea71f32738126d58adf4
diff --git a/egs2/mixed_v3/s2t1/local/prepare_commonvoice.py b/egs2/mixed_v3/s2t1/local/prepare_commonvoice.py
@@ -131,6 +131,7 @@ def parse_args():
 
 if __name__ == "__main__":
     args = parse_args()
+    logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.DEBUG)
 
     # map CommonVoice language-id to ISO-693-3 standard code.
     language_map = open("local/cv-iso-693-3.txt").readlines()

diff --git a/egs2/mixed_v3/s2t1/local/prepare_fleurs.py b/egs2/mixed_v3/s2t1/local/prepare_fleurs.py
@@ -20,9 +20,6 @@
 except Exception:
     traceback.print_exc()
     logging.warning("Error importing datasets library")
-    logging.warning(
-        "datasets can be installed via espnet/tools/installers/install_datasets"
-    )
     exit()
 
 
@@ -107,6 +104,7 @@ def main():
     )
 
     args = parser.parse_args()
+    logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.DEBUG)
 
     fleurs_asr = load_dataset(
         "google/xtreme_s", f"fleurs.all", cache_dir=args.cache, num_proc=16
@@ -131,7 +129,8 @@ def main():
         lang_id_iso, lang_name = lang.part3, lang.name
         lang_id_dict[lang_name] = lang_id_iso
         logging.info(
-            f"Process FLEURS subset {lang_name} ({lang_id}) with ISO-693-3 id: {lang_id_iso}"
+            f"Process FLEURS subset {lang_name} ({lang_id}) "
+            f"with ISO-693-3 id: {lang_id_iso}"
         )
 
     # Add missing terms manually.

diff --git a/egs2/mixed_v3/s2t1/local/prepare_voxforge.py b/egs2/mixed_v3/s2t1/local/prepare_voxforge.py
diff --git a/egs2/mixed_v3/s2t1/local/prepare_voxpopuli.py b/egs2/mixed_v3/s2t1/local/prepare_voxpopuli.py
@@ -126,11 +126,12 @@ def collect_data_asr(
             continue
 
         iso_src = toiso(src)
+        path_template = "ffmpeg -i {} -ac 1 -ar 16000 -f wav - |"
         talks[event_id].append(
             Utterance(
                 utt_id=f"{prefix}_asr_{event_id}_{r['id_']}",
                 wav_id=f"{prefix}_asr_{event_id}",
-                wav_path=f"ffmpeg -i {str(audio_path.resolve())} -ac 1 -ar 16000 -f wav - |",
+                wav_path=path_template.format(str(audio_path.resolve())),
                 start_time=start_time,
                 end_time=end_time,
                 lang=f"<{iso_src}>",
@@ -232,11 +233,12 @@ def collect_data_st(
         # Note(jinchuan): Not sure if "event_id" would overlap across
         # languages. So add src2tgt tag to wav_id to exclude this risk
         iso_src, iso_tgt = toiso(src), toiso(tgt)
+        path_template = "ffmpeg -i {} -ac 1 -ar 16000 -f wav - |"
         talks[event_id].append(
             Utterance(
                 utt_id=f"{prefix}_st_{iso_src}2{iso_tgt}_{event_id}_{utt_id}",
                 wav_id=f"{prefix}_st_{iso_src}2{iso_tgt}_{event_id}",
-                wav_path=f"ffmpeg -i {str(audio_path.resolve())} -ac 1 -ar 16000 -f wav - |",
+                wav_path=path_template.format(str(audio_path.resolve())),
                 start_time=float(r["start_time"]),
                 end_time=float(r["end_time"]),
                 lang=f"<{iso_src}>",
@@ -275,6 +277,8 @@ def parse_args():
 if __name__ == "__main__":
     args = parse_args()
 
+    logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.DEBUG)
+
     args.output_dir.mkdir(parents=True, exist_ok=True)
 
     for split in args.splits:

diff --git a/egs2/mixed_v3/s2t1/local/utils.py b/egs2/mixed_v3/s2t1/local/utils.py
@@ -189,14 +189,20 @@ def merge_short_utterances(
     end_time = utts[-1].end_time
     lang = utts[0].lang
     task = utts[0].task
-    utt_id = f"{wav_id}_{round(1000 * start_time):09d}_{round(1000 * end_time):09d}_{lang[1:-1]}_{task[1:-1]}"
+    utt_id = (
+        f"{wav_id}_{round(1000 * start_time):09d}_"
+        f"{round(1000 * end_time):09d}_{lang[1:-1]}_{task[1:-1]}"
+    )
     text = " ".join([u.text for u in utts])
     asr_text = " ".join([u.asr_text for u in utts])
     prev_text = prev.text if prev is not None else SYMBOL_NA
 
     text_with_time = ""
     for u in utts:
-        text_with_time += f"{time2token(u.start_time - start_time)} {u.text.strip()}{time2token(u.end_time - start_time)}"
+        text_with_time += (
+            f"{time2token(u.start_time - start_time)} "
+            f"{u.text.strip()}{time2token(u.end_time - start_time)}"
+        )
 
     return LongUtterance(
         utt_id=utt_id,
@@ -221,17 +227,19 @@ def generate_long_utterances(
     utts.sort(key=lambda x: x.start_time)
 
     long_utts = [None]
-    l, r = 0, 0
-    while l < len(utts):
-        if r < len(utts) and utts[r].end_time - utts[l].start_time <= SPEECH_MAX_LEN:
-            r += 1
-        elif r > l:
-            long_utts.append(merge_short_utterances(utts[l:r], long_utts[-1]))
-            l = r
+    left, right = 0, 0
+    while left < len(utts):
+        if right < len(utts) and (
+            utts[right].end_time - utts[left].start_time <= SPEECH_MAX_LEN
+        ):
+            right += 1
+        elif right > left:
+            long_utts.append(merge_short_utterances(utts[left:right], long_utts[-1]))
+            left = right
         else:  # skip the current utt if its length already exceeds the limit
             long_utts.append(None)
-            l = r + 1
-            r = l
+            left = right + 1
+            right = left
 
     long_utts = [u for u in long_utts if u is not None]
     return long_utts