Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upload extracted codebook indexes #429

Merged
merged 17 commits into from
Jun 21, 2022
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
support to join manifest files
  • Loading branch information
yaozengwei committed Jun 15, 2022
commit 1ed96824a0393c644f63eb5a752c227c93050fb1
35 changes: 29 additions & 6 deletions egs/librispeech/ASR/pruned_transducer_stateless6/vq_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def __init__(self, params: AttributeDict):
setup_logger(f"{self.vq_dir}/log-vq_extraction")

def init_dirs(self):
# TODO:
# vq_dir is the root dir for quantizer:
# training data/ quantizer / extracted codebook indexes
self.vq_dir = (
Expand Down Expand Up @@ -229,15 +230,40 @@ def split_ori_manifests(self):
split_cmd = f"lhotse split {self.params.world_size} {ori_manifest} {self.manifest_dir}"
os.system(f"{split_cmd}")

def join_manifests(self):
    """
    Attach the extracted codebook indexes to the original training cuts.

    For every subset in self.params.subsets:
      1. load "librispeech_cuts_train-{subset}-vq.jsonl.gz" from
         self.dst_manifest_dir (cuts carrying codebook_indexes);
      2. load "librispeech_cuts_train-{subset}.jsonl.gz" from
         self.ori_manifest_dir (the original cuts);
      3. copy codebook_indexes from each vq cut onto the original cut
         that has the same id;
      4. write the updated original cuts to
         "librispeech_cuts_train-{subset}.jsonl.gz" in
         self.dst_manifest_dir.

    Raises:
      AssertionError: if, after aligning the two manifests, a pair of
        cuts does not share the same id.
    """
    for subset in self.params.subsets:
        vq_manifest_path = (
            self.dst_manifest_dir
            / f"librispeech_cuts_train-{subset}-vq.jsonl.gz"
        )
        ori_manifest_path = (
            self.ori_manifest_dir
            / f"librispeech_cuts_train-{subset}.jsonl.gz"
        )
        dst_vq_manifest_path = (
            self.dst_manifest_dir
            / f"librispeech_cuts_train-{subset}.jsonl.gz"
        )

        cuts_vq = load_manifest(vq_manifest_path)
        cuts_ori = load_manifest(ori_manifest_path)
        # Align the vq cuts with the original manifest so that both sets
        # can be walked in lockstep below.
        # NOTE(review): relies on sort_like() returning the vq cuts in the
        # order of cuts_ori -- confirm against the lhotse version in use.
        cuts_vq = cuts_vq.sort_like(cuts_ori)

        for cut_vq, cut_ori in zip(cuts_vq, cuts_ori):
            # Guard against pairing codebook indexes with the wrong cut.
            assert cut_vq.id == cut_ori.id, (cut_vq.id, cut_ori.id)
            cut_ori.codebook_indexes = cut_vq.codebook_indexes

        CutSet.from_cuts(cuts_ori).to_jsonl(dst_vq_manifest_path)

def merge_vq_manifests(self):
"""
Merge the generated manifests that include vq codebook indexes and store the result in self.dst_manifest_dir.
"""
for subset in self.params.subsets:
vq_manifests = f"{self.manifest_dir}/with_codebook_indexes-cuts_train-{subset}*.jsonl.gz"
vq_manifests = f"{self.manifest_dir}/with_codebook_indexes-librispeech-cuts_train-{subset}*.jsonl.gz"
dst_vq_manifest = (
self.dst_manifest_dir
/ f"librispeech_cuts_train-{subset}.jsonl.gz"
/ f"librispeech_cuts_train-{subset}-vq.jsonl.gz"
)
if 1 == self.params.world_size:
merge_cmd = f"cp {vq_manifests} {dst_vq_manifest}"
Expand Down Expand Up @@ -385,12 +411,9 @@ def extract_codebook_indexes_imp(self):
message += f" by job {self.params.manifest_index}"
logging.info(f"{message}.")

if batch_idx >= 1:
break

json_file_path = (
self.manifest_dir
/ f"with_codebook_indexes-cuts_train-{manifest_file_id}.jsonl.gz" # noqa
/ f"with_codebook_indexes-librispeech-cuts_train-{manifest_file_id}.jsonl.gz" # noqa
)
CutSet.from_cuts(new_cuts).to_jsonl(json_file_path)

Expand Down