Use jsonl for CutSet in the LibriSpeech recipe. (k2-fsa#397)

* Use jsonl for cutsets in the librispeech recipe.
* Use lazy cutset for all recipes.
* More fixes to use lazy CutSet.
* Remove force=True from logging to support Python < 3.8
* Minor fixes.
* Fix style issues.
tramphero · Jun 6, 2022 · f1abce7 · f1abce7
1 parent e5884f8
commit f1abce7
Show file tree

Hide file tree

Showing 68 changed files with 701 additions and 1,097 deletions.
diff --git a/.github/workflows/run-gigaspeech-2022-05-13.yml b/.github/workflows/run-gigaspeech-2022-05-13.yml
@@ -59,6 +59,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf
 
       - name: Cache kaldifeat
         id: my-cache

diff --git a/.github/workflows/run-librispeech-2022-03-12.yml b/.github/workflows/run-librispeech-2022-03-12.yml
@@ -59,6 +59,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf
 
       - name: Cache kaldifeat
         id: my-cache
@@ -99,7 +101,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

diff --git a/.github/workflows/run-librispeech-2022-04-29.yml b/.github/workflows/run-librispeech-2022-04-29.yml
@@ -59,6 +59,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf
 
       - name: Cache kaldifeat
         id: my-cache
@@ -99,7 +101,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

diff --git a/.github/workflows/run-librispeech-2022-05-13.yml b/.github/workflows/run-librispeech-2022-05-13.yml
@@ -59,6 +59,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf
 
       - name: Cache kaldifeat
         id: my-cache
@@ -99,7 +101,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

diff --git a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
@@ -59,6 +59,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf
 
       - name: Cache kaldifeat
         id: my-cache
@@ -99,7 +101,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

diff --git a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
@@ -59,6 +59,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf
 
       - name: Cache kaldifeat
         id: my-cache
@@ -99,7 +101,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
@@ -58,6 +58,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf
 
       - name: Cache kaldifeat
         id: my-cache
@@ -98,7 +100,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
@@ -58,6 +58,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf
 
       - name: Cache kaldifeat
         id: my-cache
@@ -98,7 +100,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

diff --git a/.github/workflows/run-pretrained-transducer-stateless.yml b/.github/workflows/run-pretrained-transducer-stateless.yml
@@ -58,6 +58,8 @@ jobs:
       - name: Install Python dependencies
         run: |
           grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf
 
       - name: Cache kaldifeat
         id: my-cache
@@ -98,7 +100,7 @@ jobs:
         with:
           path: |
             ~/tmp/fbank-libri
-          key: cache-libri-fbank-test-clean-and-test-other
+          key: cache-libri-fbank-test-clean-and-test-other-v2
 
       - name: Compute fbank for LibriSpeech test-clean and test-other
         if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

diff --git a/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py b/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py
@@ -43,7 +43,7 @@
 
 
 def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
-    src_dir = Path("data/manifests/aidatatang_200zh")
+    src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
     num_jobs = min(15, os.cpu_count())
 
@@ -52,22 +52,28 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
         "dev",
         "test",
     )
+    prefix = "aidatatang"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
-        prefix="aidatatang",
-        suffix="jsonl.gz",
         dataset_parts=dataset_parts,
         output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
 
     extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
 
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
+
+            for sup in m["supervisions"]:
+                sup.custom = {"origin": "aidatatang_200zh"}
+
             cut_set = CutSet.from_manifests(
                 recordings=m["recordings"],
                 supervisions=m["supervisions"],
@@ -80,13 +86,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
                 )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
                 storage_type=ChunkedLilcomHdf5Writer,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+
+            cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
 
 
 def get_args():

diff --git a/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py b/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py
@@ -25,27 +25,27 @@
 """
 
 
-from lhotse import load_manifest
+from lhotse import load_manifest_lazy
 
 
 def main():
     paths = [
-        "./data/fbank/cuts_train.json.gz",
-        "./data/fbank/cuts_dev.json.gz",
-        "./data/fbank/cuts_test.json.gz",
+        "./data/fbank/aidatatang_cuts_train.jsonl.gz",
+        "./data/fbank/aidatatang_cuts_dev.jsonl.gz",
+        "./data/fbank/aidatatang_cuts_test.jsonl.gz",
     ]
 
     for path in paths:
         print(f"Starting display the statistics for {path}")
-        cuts = load_manifest(path)
+        cuts = load_manifest_lazy(path)
         cuts.describe()
 
 
 if __name__ == "__main__":
     main()
 
 """
-Starting display the statistics for ./data/fbank/cuts_train.json.gz
+Starting display the statistics for ./data/fbank/aidatatang_cuts_train.jsonl.gz
 Cuts count: 494715
 Total duration (hours): 422.6
 Speech duration (hours): 422.6 (100.0%)
@@ -61,7 +61,7 @@ def main():
 99.5%   8.0
 99.9%   9.5
 max     18.1
-Starting display the statistics for ./data/fbank/cuts_dev.json.gz
+Starting display the statistics for ./data/fbank/aidatatang_cuts_dev.jsonl.gz
 Cuts count: 24216
 Total duration (hours): 20.2
 Speech duration (hours): 20.2 (100.0%)
@@ -77,7 +77,7 @@ def main():
 99.5%   7.3
 99.9%   8.8
 max     11.3
-Starting display the statistics for ./data/fbank/cuts_test.json.gz
+Starting display the statistics for ./data/fbank/aidatatang_cuts_test.jsonl.gz
 Cuts count: 48144
 Total duration (hours): 40.2
 Speech duration (hours): 40.2 (100.0%)

diff --git a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py
@@ -27,11 +27,10 @@
     CutSet,
     Fbank,
     FbankConfig,
-    load_manifest,
+    load_manifest_lazy,
     set_caching_enabled,
 )
 from lhotse.dataset import (
-    BucketingSampler,
     CutConcatenate,
     CutMix,
     DynamicBucketingSampler,
@@ -205,8 +204,8 @@ def train_dataloaders(
             The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
-            self.args.manifest_dir / "cuts_musan.json.gz"
+        cuts_musan = load_manifest_lazy(
+            self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
 
         transforms = []
@@ -290,13 +289,12 @@ def train_dataloaders(
             )
 
         if self.args.bucketing_sampler:
-            logging.info("Using BucketingSampler.")
-            train_sampler = BucketingSampler(
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
-                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -402,14 +400,20 @@ def test_dataloaders(self, cuts: CutSet) -> DataLoader:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
+        )
 
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "aidatatang_cuts_dev.jsonl.gz"
+        )
 
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
+        )
diff --git a/egs/aishell/ASR/conformer_ctc/train.py b/egs/aishell/ASR/conformer_ctc/train.py
@@ -195,9 +195,9 @@ def get_params() -> AttributeDict:
             "best_train_epoch": -1,
             "best_valid_epoch": -1,
             "batch_idx_train": 0,
-            "log_interval": 10,
+            "log_interval": 50,
             "reset_interval": 200,
-            "valid_interval": 3000,
+            "valid_interval": 2000,
             # parameters for k2.ctc_loss
             "beam_size": 10,
             "reduction": "sum",