Skip to content

Commit

Permalink
Use jsonl for CutSet in the LibriSpeech recipe. (k2-fsa#397)
Browse files (browse the repository at this point in the history)
* Use jsonl for CutSets in the LibriSpeech recipe.

* Use lazy CutSet for all recipes.

* More fixes to use lazy CutSet.

* Remove force=True from logging to support Python < 3.8

* Minor fixes.

* Fix style issues.
  • Loading branch information
csukuangfj authored Jun 6, 2022
1 parent e5884f8 commit f1abce7
Show file tree
Hide file tree
Showing 68 changed files with 701 additions and 1,097 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/run-gigaspeech-2022-05-13.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
Expand Down
4 changes: 3 additions & 1 deletion .github/workflows/run-librispeech-2022-03-12.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
Expand Down Expand Up @@ -99,7 +101,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2

- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
Expand Down
4 changes: 3 additions & 1 deletion .github/workflows/run-librispeech-2022-04-29.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
Expand Down Expand Up @@ -99,7 +101,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2

- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
Expand Down
4 changes: 3 additions & 1 deletion .github/workflows/run-librispeech-2022-05-13.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
Expand Down Expand Up @@ -99,7 +101,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2

- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
Expand Down Expand Up @@ -99,7 +101,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2

- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
Expand Down Expand Up @@ -99,7 +101,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2

- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
Expand Down Expand Up @@ -98,7 +100,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2

- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
Expand Down Expand Up @@ -98,7 +100,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2

- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
Expand Down
4 changes: 3 additions & 1 deletion .github/workflows/run-pretrained-transducer-stateless.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
Expand Down Expand Up @@ -98,7 +100,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2

- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
Expand Down
19 changes: 13 additions & 6 deletions egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@


def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
src_dir = Path("data/manifests/aidatatang_200zh")
src_dir = Path("data/manifests")
output_dir = Path("data/fbank")
num_jobs = min(15, os.cpu_count())

Expand All @@ -52,22 +52,28 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
"dev",
"test",
)
prefix = "aidatatang"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
prefix="aidatatang",
suffix="jsonl.gz",
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None

extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file():
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")

for sup in m["supervisions"]:
sup.custom = {"origin": "aidatatang_200zh"}

cut_set = CutSet.from_manifests(
recordings=m["recordings"],
supervisions=m["supervisions"],
Expand All @@ -80,13 +86,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=ChunkedLilcomHdf5Writer,
)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")

cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")


def get_args():
Expand Down
16 changes: 8 additions & 8 deletions egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,27 +25,27 @@
"""


from lhotse import load_manifest
from lhotse import load_manifest_lazy


def main():
paths = [
"./data/fbank/cuts_train.json.gz",
"./data/fbank/cuts_dev.json.gz",
"./data/fbank/cuts_test.json.gz",
"./data/fbank/aidatatang_cuts_train.jsonl.gz",
"./data/fbank/aidatatang_cuts_dev.jsonl.gz",
"./data/fbank/aidatatang_cuts_test.jsonl.gz",
]

for path in paths:
print(f"Starting display the statistics for {path}")
cuts = load_manifest(path)
cuts = load_manifest_lazy(path)
cuts.describe()


if __name__ == "__main__":
main()

"""
Starting display the statistics for ./data/fbank/cuts_train.json.gz
Starting display the statistics for ./data/fbank/aidatatang_cuts_train.jsonl.gz
Cuts count: 494715
Total duration (hours): 422.6
Speech duration (hours): 422.6 (100.0%)
Expand All @@ -61,7 +61,7 @@ def main():
99.5% 8.0
99.9% 9.5
max 18.1
Starting display the statistics for ./data/fbank/cuts_dev.json.gz
Starting display the statistics for ./data/fbank/aidatatang_cuts_dev.jsonl.gz
Cuts count: 24216
Total duration (hours): 20.2
Speech duration (hours): 20.2 (100.0%)
Expand All @@ -77,7 +77,7 @@ def main():
99.5% 7.3
99.9% 8.8
max 11.3
Starting display the statistics for ./data/fbank/cuts_test.json.gz
Starting display the statistics for ./data/fbank/aidatatang_cuts_test.jsonl.gz
Cuts count: 48144
Total duration (hours): 40.2
Speech duration (hours): 40.2 (100.0%)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,10 @@
CutSet,
Fbank,
FbankConfig,
load_manifest,
load_manifest_lazy,
set_caching_enabled,
)
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
CutMix,
DynamicBucketingSampler,
Expand Down Expand Up @@ -205,8 +204,8 @@ def train_dataloaders(
The state dict for the training sampler.
"""
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)

transforms = []
Expand Down Expand Up @@ -290,13 +289,12 @@ def train_dataloaders(
)

if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
else:
Expand Down Expand Up @@ -402,14 +400,20 @@ def test_dataloaders(self, cuts: CutSet) -> DataLoader:
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
)

@lru_cache()
def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "aidatatang_cuts_dev.jsonl.gz"
)

@lru_cache()
def test_cuts(self) -> List[CutSet]:
logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
)
4 changes: 2 additions & 2 deletions egs/aishell/ASR/conformer_ctc/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,9 +195,9 @@ def get_params() -> AttributeDict:
"best_train_epoch": -1,
"best_valid_epoch": -1,
"batch_idx_train": 0,
"log_interval": 10,
"log_interval": 50,
"reset_interval": 200,
"valid_interval": 3000,
"valid_interval": 2000,
# parameters for k2.ctc_loss
"beam_size": 10,
"reduction": "sum",
Expand Down
Loading

0 comments on commit f1abce7

Please sign in to comment.