Skip to content

Commit

Permalink
Merge branch 'lhotse-speech:master' into csj
Browse files (browse the repository at this point in the history)
  • Loading branch information
teowenshen authored Oct 12, 2022
2 parents: ca2430b + a5cf356; commit: 6e8b659
Show file tree
Hide file tree
Showing 8 changed files with 45 additions and 20 deletions.
8 changes: 5 additions & 3 deletions lhotse/qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from lhotse.cut import Cut, CutSet, MixedCut, MonoCut, PaddingCut
from lhotse.features import Features, FeatureSet
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import compute_num_frames, overlaps
from lhotse.utils import compute_num_frames, is_equal_or_contains, overlaps

_VALIDATORS: Dict[str, Callable] = {}

Expand Down Expand Up @@ -110,7 +110,7 @@ def validate_recordings_and_supervisions(
f"Supervision {s.id}: exceeded the bounds of its corresponding recording "
f"(supervision spans [{s.start}, {s.end}]; recording spans [0, {r.duration}])"
)
assert s.channel in r.channel_ids, (
assert is_equal_or_contains(r.channel_ids, s.channel), (
f"Supervision {s.id}: channel {s.channel} does not exist in its corresponding Recording "
f"(recording channels: {r.channel_ids})"
)
Expand Down Expand Up @@ -457,7 +457,9 @@ def validate_supervision_set(supervisions: SupervisionSet, **kwargs) -> None:
for rid, sups in supervisions._segments_by_recording_id.items():
cntr_per_channel = defaultdict(int)
for s in sups:
cntr_per_channel[s.channel] += int(s.start == 0)
# channel can be an int or a list (in which case we convert it to a tuple)
c = s.channel if isinstance(s.channel, int) else tuple(s.channel)
cntr_per_channel[c] += int(s.start == 0)
for channel, count in cntr_per_channel.items():
if count > 1:
logging.warning(
Expand Down
2 changes: 1 addition & 1 deletion lhotse/recipes/aishell4.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def prepare_aishell4(
recording_id=idx,
start=start,
duration=round(end - start, 4),
channel=0,
channel=recording.channel_ids,
language="Chinese",
speaker=spk_id,
text=text.strip(),
Expand Down
2 changes: 1 addition & 1 deletion lhotse/recipes/ali_meeting.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def prepare_ali_meeting(
recording_id=recording.id,
start=start,
duration=round(end - start, 4),
channel=0,
channel=0 if mic == "near" else list(range(8)),
language="Chinese",
speaker=spk_id,
gender=gender,
Expand Down
2 changes: 1 addition & 1 deletion lhotse/recipes/ami.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,7 @@ def prepare_supervision_other(
recording_id=recording.id,
start=seg_info.start_time,
duration=duration,
channel=0,
channel=recording.channel_ids,
language="English",
speaker=seg_info.speaker,
gender=seg_info.gender,
Expand Down
3 changes: 3 additions & 0 deletions lhotse/recipes/aspire.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@ def prepare_aspire(
speaker=speaker,
text=seg.text,
language="English",
channel=0
if mic == "single"
else recording_set[session].channel_ids,
)
for i, seg in enumerate(segs)
]
Expand Down
2 changes: 1 addition & 1 deletion lhotse/recipes/icsi.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,7 @@ def prepare_supervision_other(
recording_id=recording.id,
start=seg_info.start_time,
duration=duration,
channel=source.channels[0],
channel=recording.channel_ids,
language="English",
speaker=seg_info.speaker,
gender=seg_info.gender,
Expand Down
26 changes: 16 additions & 10 deletions lhotse/recipes/libricss.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
validate_recordings_and_supervisions,
)
from lhotse.audio import Recording
from lhotse.utils import Pathlike
from lhotse.utils import Pathlike, fastcopy

# fmt: off
# The following mapping is courtesy Zhuo Chen (Microsoft). It is not available in the original
Expand Down Expand Up @@ -141,9 +141,7 @@ def prepare_libricss(
:return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
"""
assert type in ["mdm", "ihm-mix", "ihm"]

manifests = {}
assert type in ["mdm", "sdm", "ihm-mix", "ihm"]

corpus_dir = Path(corpus_dir)
corpus_dir = (
Expand All @@ -165,12 +163,22 @@ def prepare_libricss(
if type == "ihm"
else session / "record" / "raw_recording.wav"
)
recordings.append(
Recording.from_file(audio_path, recording_id=recording_id)
)
recording = Recording.from_file(audio_path, recording_id=recording_id)

if type == "sdm":
recordings.append(fastcopy(recording, channel_ids=[0]))
else:
recordings.append(recording)

for idx, seg in enumerate(
parse_transcript(session / "transcription" / "meeting_info.txt")
):
if type == "ihm-mix" or type == "sdm":
channel = 0
elif type == "ihm":
channel = SPK_TO_CHANNEL_MAP[session.name][seg[2]]
else:
channel = list(range(7))
segments.append(
SupervisionSegment(
id=f"{recording_id}-{idx}",
Expand All @@ -180,9 +188,7 @@ def prepare_libricss(
text=seg[4],
language="English",
speaker=seg[2],
channel=SPK_TO_CHANNEL_MAP[session.name][seg[2]]
if type == "ihm"
else 0,
channel=channel,
)
)

Expand Down
20 changes: 17 additions & 3 deletions lhotse/workflows/whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,14 @@

import torch

from lhotse import CutSet, MonoCut, Recording, RecordingSet, SupervisionSegment
from lhotse import (
CutSet,
MonoCut,
Recording,
RecordingSet,
SupervisionSegment,
add_durations,
)
from lhotse.qa import trim_supervisions_to_recordings
from lhotse.utils import fastcopy, is_module_available

Expand Down Expand Up @@ -68,7 +75,9 @@ def _annotate_recordings(
id=f"{recording.id}-{segment['id']:06d}",
recording_id=recording.id,
start=round(segment["start"], ndigits=8),
duration=round(segment["end"], ndigits=8),
duration=add_durations(
segment["end"], -segment["start"], sampling_rate=16000
),
text=segment["text"].strip(),
language=result["language"],
)
Expand Down Expand Up @@ -107,7 +116,12 @@ def _annotate_cuts(cuts: CutSet, language: str, model_name: str, device: str):
id=f"{cut.id}-{segment['id']:06d}",
recording_id=cut.recording_id,
start=round(segment["start"], ndigits=8),
duration=max(cut.duration, round(segment["end"], ndigits=8)),
duration=max(
cut.duration,
add_durations(
segment["end"], -segment["start"], sampling_rate=16000
),
),
text=segment["text"].strip(),
language=result["language"],
)
Expand Down

0 comments on commit 6e8b659

Please sign in to comment.