Merge branch 'lhotse-speech:master' into csj

lhotse-speech · Oct 12, 2022 · 6e8b659 · 6e8b659
2 parents ca2430b + a5cf356
commit 6e8b659
Show file tree

Hide file tree

Showing 8 changed files with 45 additions and 20 deletions.
diff --git a/lhotse/qa.py b/lhotse/qa.py
@@ -14,7 +14,7 @@
 from lhotse.cut import Cut, CutSet, MixedCut, MonoCut, PaddingCut
 from lhotse.features import Features, FeatureSet
 from lhotse.supervision import SupervisionSegment, SupervisionSet
-from lhotse.utils import compute_num_frames, overlaps
+from lhotse.utils import compute_num_frames, is_equal_or_contains, overlaps
 
 _VALIDATORS: Dict[str, Callable] = {}
 
@@ -110,7 +110,7 @@ def validate_recordings_and_supervisions(
             f"Supervision {s.id}: exceeded the bounds of its corresponding recording "
             f"(supervision spans [{s.start}, {s.end}]; recording spans [0, {r.duration}])"
         )
-        assert s.channel in r.channel_ids, (
+        assert is_equal_or_contains(r.channel_ids, s.channel), (
             f"Supervision {s.id}: channel {s.channel} does not exist in its corresponding Recording "
             f"(recording channels: {r.channel_ids})"
         )
@@ -457,7 +457,9 @@ def validate_supervision_set(supervisions: SupervisionSet, **kwargs) -> None:
     for rid, sups in supervisions._segments_by_recording_id.items():
         cntr_per_channel = defaultdict(int)
         for s in sups:
-            cntr_per_channel[s.channel] += int(s.start == 0)
+            # channel can be an int or a list (in which case we convert it to a tuple)
+            c = s.channel if isinstance(s.channel, int) else tuple(s.channel)
+            cntr_per_channel[c] += int(s.start == 0)
         for channel, count in cntr_per_channel.items():
             if count > 1:
                 logging.warning(

diff --git a/lhotse/recipes/aishell4.py b/lhotse/recipes/aishell4.py
@@ -132,7 +132,7 @@ def prepare_aishell4(
                             recording_id=idx,
                             start=start,
                             duration=round(end - start, 4),
-                            channel=0,
+                            channel=recording.channel_ids,
                             language="Chinese",
                             speaker=spk_id,
                             text=text.strip(),

diff --git a/lhotse/recipes/ali_meeting.py b/lhotse/recipes/ali_meeting.py
@@ -156,7 +156,7 @@ def prepare_ali_meeting(
                             recording_id=recording.id,
                             start=start,
                             duration=round(end - start, 4),
-                            channel=0,
+                            channel=0 if mic == "near" else list(range(8)),
                             language="Chinese",
                             speaker=spk_id,
                             gender=gender,

diff --git a/lhotse/recipes/ami.py b/lhotse/recipes/ami.py
@@ -550,7 +550,7 @@ def prepare_supervision_other(
                         recording_id=recording.id,
                         start=seg_info.start_time,
                         duration=duration,
-                        channel=0,
+                        channel=recording.channel_ids,
                         language="English",
                         speaker=seg_info.speaker,
                         gender=seg_info.gender,

diff --git a/lhotse/recipes/aspire.py b/lhotse/recipes/aspire.py
@@ -158,6 +158,9 @@ def prepare_aspire(
                     speaker=speaker,
                     text=seg.text,
                     language="English",
+                    channel=0
+                    if mic == "single"
+                    else recording_set[session].channel_ids,
                 )
                 for i, seg in enumerate(segs)
             ]

diff --git a/lhotse/recipes/icsi.py b/lhotse/recipes/icsi.py
@@ -469,7 +469,7 @@ def prepare_supervision_other(
                         recording_id=recording.id,
                         start=seg_info.start_time,
                         duration=duration,
-                        channel=source.channels[0],
+                        channel=recording.channel_ids,
                         language="English",
                         speaker=seg_info.speaker,
                         gender=seg_info.gender,

diff --git a/lhotse/recipes/libricss.py b/lhotse/recipes/libricss.py
@@ -19,7 +19,7 @@
     validate_recordings_and_supervisions,
 )
 from lhotse.audio import Recording
-from lhotse.utils import Pathlike
+from lhotse.utils import Pathlike, fastcopy
 
 # fmt: off
 # The following mapping is courtesy Zhuo Chen (Microsoft). It is not available in the original
@@ -141,9 +141,7 @@ def prepare_libricss(
     :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
 
     """
-    assert type in ["mdm", "ihm-mix", "ihm"]
-
-    manifests = {}
+    assert type in ["mdm", "sdm", "ihm-mix", "ihm"]
 
     corpus_dir = Path(corpus_dir)
     corpus_dir = (
@@ -165,12 +163,22 @@ def prepare_libricss(
                 if type == "ihm"
                 else session / "record" / "raw_recording.wav"
             )
-            recordings.append(
-                Recording.from_file(audio_path, recording_id=recording_id)
-            )
+            recording = Recording.from_file(audio_path, recording_id=recording_id)
+
+            if type == "sdm":
+                recordings.append(fastcopy(recording, channel_ids=[0]))
+            else:
+                recordings.append(recording)
+
             for idx, seg in enumerate(
                 parse_transcript(session / "transcription" / "meeting_info.txt")
             ):
+                if type == "ihm-mix" or type == "sdm":
+                    channel = 0
+                elif type == "ihm":
+                    channel = SPK_TO_CHANNEL_MAP[session.name][seg[2]]
+                else:
+                    channel = list(range(7))
                 segments.append(
                     SupervisionSegment(
                         id=f"{recording_id}-{idx}",
@@ -180,9 +188,7 @@ def prepare_libricss(
                         text=seg[4],
                         language="English",
                         speaker=seg[2],
-                        channel=SPK_TO_CHANNEL_MAP[session.name][seg[2]]
-                        if type == "ihm"
-                        else 0,
+                        channel=channel,
                     )
                 )
 

diff --git a/lhotse/workflows/whisper.py b/lhotse/workflows/whisper.py
@@ -3,7 +3,14 @@
 
 import torch
 
-from lhotse import CutSet, MonoCut, Recording, RecordingSet, SupervisionSegment
+from lhotse import (
+    CutSet,
+    MonoCut,
+    Recording,
+    RecordingSet,
+    SupervisionSegment,
+    add_durations,
+)
 from lhotse.qa import trim_supervisions_to_recordings
 from lhotse.utils import fastcopy, is_module_available
 
@@ -68,7 +75,9 @@ def _annotate_recordings(
                 id=f"{recording.id}-{segment['id']:06d}",
                 recording_id=recording.id,
                 start=round(segment["start"], ndigits=8),
-                duration=round(segment["end"], ndigits=8),
+                duration=add_durations(
+                    segment["end"], -segment["start"], sampling_rate=16000
+                ),
                 text=segment["text"].strip(),
                 language=result["language"],
             )
@@ -107,7 +116,12 @@ def _annotate_cuts(cuts: CutSet, language: str, model_name: str, device: str):
                 id=f"{cut.id}-{segment['id']:06d}",
                 recording_id=cut.recording_id,
                 start=round(segment["start"], ndigits=8),
-                duration=max(cut.duration, round(segment["end"], ndigits=8)),
+                duration=max(
+                    cut.duration,
+                    add_durations(
+                        segment["end"], -segment["start"], sampling_rate=16000
+                    ),
+                ),
                 text=segment["text"].strip(),
                 language=result["language"],
             )