From 37aa8550d532e9a28fb7630f5bf07d01a6b0313a Mon Sep 17 00:00:00 2001
From: Vineel Pratap <vineelkpratap@fb.com>
Date: Tue, 30 Mar 2021 20:41:56 -0700
Subject: [PATCH] Generic script to convert kaldi datasets to list file format

Summary: As per title

Reviewed By: avidov

Differential Revision: D27444664

fbshipit-source-id: ff964962cec567fa26cf072175b5734a1adc7be4
---
 data/utils/kaldi_to_listfile.py | 139 ++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 data/utils/kaldi_to_listfile.py

diff --git a/data/utils/kaldi_to_listfile.py b/data/utils/kaldi_to_listfile.py
new file mode 100644
index 00000000..99f4cb50
--- /dev/null
+++ b/data/utils/kaldi_to_listfile.py
@@ -0,0 +1,139 @@
+"""
+Copyright (c) Facebook, Inc. and its affiliates.
+All rights reserved.
+This source code is licensed under the BSD-style license found in the
+LICENSE file in the root directory of this source tree.
+----------
+Script to package kaldi data directory into a form readable in
+wav2letter++ pipelines
+
+Command : python3 prepare.py --src [...] --dst [...]
+Replace [...] with appropriate path
+
+`src` directory is the path to kaldi data directory typically
+prepared with `prepare_data.sh` script.
+
+`dst` directory is the path to store (segmented) audio files and the
+list file that is used by wav2letter++ pipelines to load data.
+
+"""
+
+from __future__ import (
+    absolute_import,
+    division,
+    print_function,
+    unicode_literals,
+)
+
+import argparse
+import os
+import re
+from multiprocessing import Pool
+
+import sox
+from tqdm import tqdm
+
+
+def run_segment(item):
+    uid, val = item
+    infile, start_sec, end_sec, outfile = val
+    sox_tfm = sox.Transformer()
+    sox_tfm.set_output_format(
+        file_type="flac", encoding="signed-integer", bits=16
+    )
+    sox_tfm.trim(start_sec, end_sec)
+    sox_tfm.build(infile, outfile)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Script to generate list file from Kaldi data dir"
+    )
+    parser.add_argument(
+        "--src",
+        help="input kaldi data directory. Must contain "
+        "'text', 'segments' and 'wav.scp' files",
+    )
+    parser.add_argument(
+        "--dst", help="destination directory where to store data",
+    )
+    parser.add_argument(
+        "--name", help="name of the output list file", default="data.lst"
+    )
+    parser.add_argument(
+        "-p",
+        "--process",
+        help="number of process for multiprocessing",
+        default=8,
+        type=int,
+    )
+
+    args = parser.parse_args()
+
+    wav_files = {}
+    cache = {}
+    cmds = []
+    with open(f"{args.src}/wav.scp") as f:
+        for line in f:
+            # handles two possible cases
+            # Case 1: ID followed by wav file
+            # Ex: S03_U01.CH1 /path/S03_U01.CH1.wav
+            # Case 2: ID followed by sox script
+            # Ex: P09_S03.L sox /path/S03_P09.wav -t wav - remix 1 |
+            wid, wav_handle = line.strip().split(" ", 1)
+            if wav_handle in cache:
+                wav_file = cache[wav_handle]
+            elif wav_handle.startswith("sox"):
+                hsh = re.sub("[^0-9a-zA-Z]+", "", wav_handle)
+                wav_file = "/tmp/{}.wav".format(hsh)
+                cmds.append(
+                    wav_handle.replace(" - ", " " + wav_file + " ").replace(
+                        "|", ""
+                    )
+                )
+            else:
+                wav_file = wav_handle
+            wav_files[wid] = wav_file
+    print("Found {} wav files".format(len(wav_files)))
+
+    print("Running {} wav commands ...".format(len(cmds)))
+
+    def run_command(cmd):
+        os.system(cmd)
+
+    p = Pool(args.process)
+    list(tqdm(p.imap(run_command, cmds), total=len(cmds),))
+
+    transcripts = {}
+    with open(f"{args.src}/text") as f:
+        for line in f:
+            line_split = line.strip().split()
+            transcripts[line_split[0]] = " ".join(line_split[1:])
+    print("Found {} transcripts".format(len(transcripts)))
+
+    segments = {}
+    with open(f"{args.src}/segments") as f:
+        for line in f:
+            uid, wid, start_sec, end_sec = line.strip().split(" ", 3)
+            start_sec = float(start_sec)
+            end_sec = float(end_sec)
+            outfile = f"{args.dst}/audio/{uid}.flac"
+            segments[uid] = (wav_files[wid], start_sec, end_sec, outfile)
+    print("Found {} segments".format(len(segments)))
+
+    os.makedirs(f"{args.dst}", exist_ok=True)
+    os.makedirs(f"{args.dst}/audio", exist_ok=True)
+
+    print("Creating segmented audio files ...")
+    list(tqdm(p.imap(run_segment, segments.items()), total=len(segments),))
+
+    print("Writing to list file ...")
+    with open(f"{args.dst}/{args.name}", "w") as fo:
+        for uid, val in segments.items():
+            _, start_sec, end_sec, outfile = val
+            duration = "{:.2f}".format((end_sec - start_sec) * 1000)
+            fo.write(
+                "\t".join([uid, outfile, duration, transcripts[uid]]) + "\n"
+            )
+
+    print("Done!")