Fine-tuning BEATs on AudioSet-2M multi-label classification #6006

Draft: wants to merge 57 commits into base: master

Changes shown below are from 1 commit of the 57. Commit history:
5d70791  take data normalization via config  (Shikhar-S, Dec 10, 2024)
23914db  add linear decoder for classification tasks  (Shikhar-S, Dec 10, 2024)
225a26f  add 5-fold data prep  (Shikhar-S, Dec 10, 2024)
76f17d5  add config for beats fine-tuning on esc  (Shikhar-S, Dec 10, 2024)
6b8e3d2  [pre-commit.ci] auto fixes from pre-commit.com hooks  (pre-commit-ci[bot], Dec 10, 2024)
468f817  cleanup  (Shikhar-S, Dec 10, 2024)
6a47f63  add readme template  (Shikhar-S, Dec 10, 2024)
fa5f449  Merge branch 'esc' of github.com:Shikhar-S/espnet into esc  (Shikhar-S, Dec 10, 2024)
02bcb79  add results  (Shikhar-S, Dec 10, 2024)
8d3bb3b  [pre-commit.ci] auto fixes from pre-commit.com hooks  (pre-commit-ci[bot], Dec 10, 2024)
fd36ebe  restore default slurm config  (Shikhar-S, Dec 10, 2024)
9148500  add files for cls task  (Shikhar-S, Dec 10, 2024)
c167bd3  fix beats test  (Shikhar-S, Dec 10, 2024)
4e07464  Merge branch 'esc' of github.com:Shikhar-S/espnet into esc  (Shikhar-S, Dec 10, 2024)
4756126  add more test for linear decoder  (Shikhar-S, Dec 11, 2024)
56009b9  add roll augmentation  (Shikhar-S, Dec 11, 2024)
6dcc151  Merge branch 'esc' into cls  (Shikhar-S, Dec 11, 2024)
cb80e61  add quantized rolling, separate vocab concerns in linear decoder  (Shikhar-S, Dec 13, 2024)
b897137  add quantized rolling, separate vocab concerns in linear decoder  (Shikhar-S, Dec 13, 2024)
ca9ca1e  add dropout to linear decoder, clean up  (Shikhar-S, Dec 15, 2024)
22a3c82  cleanup, add model links  (Shikhar-S, Dec 15, 2024)
da315fe  clean up decoder  (Shikhar-S, Dec 15, 2024)
e714871  add inferene and training files for classification  (Shikhar-S, Dec 15, 2024)
0e5a217  clean up decoder  (Shikhar-S, Dec 15, 2024)
452291f  add dropout to linear decoder, clean up  (Shikhar-S, Dec 15, 2024)
e3881fc  cleanup, add model links  (Shikhar-S, Dec 15, 2024)
409893d  [pre-commit.ci] auto fixes from pre-commit.com hooks  (pre-commit-ci[bot], Dec 15, 2024)
16728b9  fix lin dec test  (Shikhar-S, Dec 15, 2024)
6f1a151  fix beats test, the first version had unnecessary dimension always se…  (Shikhar-S, Dec 15, 2024)
bbd2127  Merge branch 'esc' of github.com:Shikhar-S/espnet into esc  (Shikhar-S, Dec 15, 2024)
54b2706  merge esc  (Shikhar-S, Dec 15, 2024)
83fa462  unstable: saving work from babel, initial template for cls task with …  (Shikhar-S, Dec 25, 2024)
c836ede  Merge branch 'espnet:master' into cls  (Shikhar-S, Dec 27, 2024)
77ad502  [pre-commit.ci] auto fixes from pre-commit.com hooks  (pre-commit-ci[bot], Dec 27, 2024)
d14d6e2  fix cls and setup shells  (Shikhar-S, Dec 27, 2024)
78ae554  add tests for classification metrics  (Shikhar-S, Dec 28, 2024)
bf732ce  merge remote  (Shikhar-S, Dec 28, 2024)
ed25ca7  [pre-commit.ci] auto fixes from pre-commit.com hooks  (pre-commit-ci[bot], Dec 28, 2024)
7a83600  add scoring scripts  (Shikhar-S, Dec 29, 2024)
6cbd753  add mixup augmentation  (Shikhar-S, Dec 29, 2024)
7447d30  add script to show results  (Shikhar-S, Dec 29, 2024)
6aed560  Merge branch 'cls' of github.com:Shikhar-S/espnet into cls  (Shikhar-S, Dec 29, 2024)
bab37d4  [pre-commit.ci] auto fixes from pre-commit.com hooks  (pre-commit-ci[bot], Dec 29, 2024)
4a754da  add test for mixup, change mixup rate  (Shikhar-S, Jan 2, 2025)
018e5bd  Merge branch 'cls' of github.com:Shikhar-S/espnet into cls  (Shikhar-S, Jan 2, 2025)
5a3e173  [pre-commit.ci] auto fixes from pre-commit.com hooks  (pre-commit-ci[bot], Jan 2, 2025)
3c371a8  add recipe for AudioSet-20k  (Shikhar-S, Jan 2, 2025)
635b3ad  Merge branch 'cls' of github.com:Shikhar-S/espnet into cls  (Shikhar-S, Jan 2, 2025)
170b673  [pre-commit.ci] auto fixes from pre-commit.com hooks  (pre-commit-ci[bot], Jan 2, 2025)
2a6ec9b  tune mixup, use full training data  (Shikhar-S, Jan 4, 2025)
79329f1  Merge branch 'cls' of github.com:Shikhar-S/espnet into cls  (Shikhar-S, Jan 4, 2025)
b1bf3e7  add audio filtering stage  (Shikhar-S, Jan 4, 2025)
5ad1a95  add option to truncate long audio and repeat small audio, fix ci  (Shikhar-S, Jan 5, 2025)
9508135  add packing and hf upload stages  (Shikhar-S, Jan 5, 2025)
9634114  fix ci  (Shikhar-S, Jan 5, 2025)
ad72b4e  add skeleton code for AudioSet-2M finetuning  (Shikhar-S, Jan 5, 2025)
d41e3c7  [pre-commit.ci] auto fixes from pre-commit.com hooks  (pre-commit-ci[bot], Jan 5, 2025)
add linear decoder for classification tasks
Shikhar-S committed Dec 10, 2024
commit 23914db1bf1b6098f79af615b2aaa7cd80fc7eb2
83 changes: 83 additions & 0 deletions espnet2/asr/decoder/linear_decoder.py
@@ -0,0 +1,83 @@
"""A simple linear layer decoder.

This can be used for classification tasks from sequence input.
"""

from typing import Optional

import torch
from espnet2.asr.decoder.abs_decoder import AbsDecoder
from espnet.nets.pytorch_backend.nets_utils import make_pad_mask
from typeguard import typechecked


class LinearDecoder(AbsDecoder):
    """Pool encoder states and project them to class logits with one linear layer."""

    @typechecked
    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        pooling: str = "CLS",
    ):
        """Initialize the module."""
        super().__init__()

        self.input_dim = encoder_output_size
        self.output_dim = vocab_size
        self.linear_out = torch.nn.Linear(self.input_dim, self.output_dim)
        assert pooling in [
            "mean",
            "max",
            "CLS",
        ], f"Invalid pooling: {pooling}. Should be 'mean', 'max' or 'CLS'."
        self.pooling = pooling

    def forward(
        self,
        hs_pad: torch.Tensor,
        hlens: torch.Tensor,
        ys_in_pad: Optional[torch.Tensor] = None,
        ys_in_lens: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Pool the padded encoder output and compute class logits.

        Args:
            hs_pad: (B, Tmax, D)
            hlens: (B,)
            ys_in_pad, ys_in_lens: unused; kept for interface compatibility.
        Returns:
            output: (B, n_classes)
        """
        mask = make_pad_mask(lengths=hlens, xs=hs_pad, length_dim=1).to(hs_pad.device)
        if self.pooling == "mean":
            unmasked_entries = (~mask).to(dtype=hs_pad.dtype)
            input_feature = (hs_pad * unmasked_entries).sum(dim=1)
            input_feature = input_feature / unmasked_entries.sum(dim=1)
        elif self.pooling == "max":
            input_feature = hs_pad.masked_fill(mask, float("-inf"))
            input_feature, _ = torch.max(input_feature, dim=1)
        elif self.pooling == "CLS":
            # Use the first (CLS-like) frame as the utterance representation.
            input_feature = hs_pad[:, 0, :]

        output = self.linear_out(input_feature)  # Get logits, (B, vocab_size)

        # Fix blank, unk and sos/eos logits to -inf.
        # This ensures that they are never selected at inference.
        output[:, 0] = float("-inf")
        output[:, 1] = float("-inf")
        output[:, -1] = float("-inf")
        return output

    def score(self, ys, state, x):
        """Classify x: return per-class log-probabilities and a None state."""
        hs_len = torch.tensor([x.shape[0]], dtype=torch.long).to(x.device)
        logits = self.forward(
            x.unsqueeze(0),
            hs_len,
        )
        logp = torch.nn.functional.log_softmax(logits, dim=-1)
        return logp.squeeze(0), None

    def output_size(self) -> int:
        """Get the output size."""
        return self.output_dim
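
For orientation, here is a minimal usage sketch of the decoder added above, outside a full ESPnet model. It is not part of the diff; the batch size, lengths, and the 768-dim encoder output are illustrative assumptions.

import torch

from espnet2.asr.decoder.linear_decoder import LinearDecoder

# Hypothetical sizes: 10-entry vocabulary (incl. blank/unk/sos-eos), 768-dim encoder states.
decoder = LinearDecoder(vocab_size=10, encoder_output_size=768, pooling="mean")

hs_pad = torch.randn(4, 50, 768)        # (B, Tmax, D) padded encoder output
hlens = torch.tensor([50, 42, 30, 17])  # (B,) true lengths before padding

logits = decoder(hs_pad, hlens)                 # (B, 10); indices 0, 1 and -1 are forced to -inf
probs = torch.softmax(logits[:, 2:-1], dim=-1)  # distribution over the real classes only
print(logits.shape, probs.sum(dim=-1))          # probabilities sum to 1 per utterance
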
44 changes: 43 additions & 1 deletion espnet2/asr/espnet_model.py
@@ -8,6 +8,7 @@

from espnet2.asr.ctc import CTC
from espnet2.asr.decoder.abs_decoder import AbsDecoder
from espnet2.asr.decoder.linear_decoder import LinearDecoder
from espnet2.asr.encoder.abs_encoder import AbsEncoder
from espnet2.asr.frontend.abs_frontend import AbsFrontend
from espnet2.asr.postencoder.abs_postencoder import AbsPostEncoder
@@ -110,6 +111,7 @@ def __init__(
)

self.use_transducer_decoder = joint_network is not None
self.use_linear_decoder = isinstance(decoder, LinearDecoder)

self.error_calculator = None

@@ -155,6 +157,12 @@ def __init__(
self.error_calculator = ErrorCalculator(
token_list, sym_space, sym_blank, report_cer, report_wer
)
elif self.use_linear_decoder:
assert ctc_weight == 0.0, "CTC is not supported with LinearDecoder."
self.decoder = decoder
self.criterion_classif = torch.nn.CrossEntropyLoss(
ignore_index=ignore_id, label_smoothing=lsm_weight
)
else:
# we set self.decoder = None in the CTC mode since
# self.decoder parameters were never used and PyTorch complained
@@ -243,6 +251,7 @@ def forward(
loss_att, acc_att, cer_att, wer_att = None, None, None, None
loss_ctc, cer_ctc = None, None
loss_transducer, cer_transducer, wer_transducer = None, None, None
loss_classif, acc_classif = None, None
stats = dict()

# 1. CTC branch
@@ -325,8 +334,13 @@ def forward(
stats["cer_transducer"] = cer_transducer
stats["wer_transducer"] = wer_transducer

elif self.use_linear_decoder:
# 2b. Linear decoder branch for classification tasks
loss, acc = self._calc_classif_loss(encoder_out, encoder_out_lens, text)
stats["loss"] = loss
stats["acc"] = acc
else:
# 2b. Attention decoder branch
# 2c. Attention decoder branch
if self.ctc_weight != 1.0:
loss_att, acc_att, cer_att, wer_att = self._calc_att_loss(
encoder_out, encoder_out_lens, text, text_lengths
@@ -672,3 +686,31 @@ def _calc_batch_ctc_loss(
        loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, text_lengths)
        self.ctc.reduce = do_reduce
        return loss_ctc

    def _calc_classif_loss(
        self,
        encoder_out: torch.Tensor,
        encoder_out_lens: torch.Tensor,
        labels: torch.Tensor,
    ):
        """Compute classification loss and accuracy.

        Args:
            encoder_out: Encoder output sequences. (B, T, D_enc)
            encoder_out_lens: Encoder output sequence lengths. (B,)
            labels: Label IDs. (B, 1)
        Returns:
            loss_classif: Classification loss value.
            acc_classif: Classification accuracy.
        """
        assert labels.dim() == 2, labels.shape
        assert labels.shape[1] == 1, labels.shape
        logits = self.decoder(encoder_out, encoder_out_lens)  # (B, n_class + 3)
        # Keep only the real class logits; drop blank (0), unk (1) and sos/eos (-1).
        logits = logits[:, 2:-1]  # (B, n_class)
        assert logits.shape[1] == self.vocab_size - 3, logits.shape
        # Shift labels down by 2 so they index into the class-only logits
        # (blank and unk occupy indices 0 and 1 of the full vocabulary).
        labels = labels - 2
        loss_classif = self.criterion_classif(logits, labels.squeeze(-1))
        acc_classif = th_accuracy(logits, labels, ignore_label=self.ignore_id)
        return loss_classif, acc_classif
24 changes: 14 additions & 10 deletions espnet2/bin/asr_inference.py
@@ -65,6 +65,10 @@
]
]

logger = logging.getLogger(__name__)
# NOTE(shikhar): We use contextual logging here because
# RTF calculation looks for "INFO: " as a prefix in the logs.


class Speech2Text:
"""Speech2Text class
@@ -157,7 +161,7 @@ def __init__(
asr_model.to(dtype=getattr(torch, dtype)).eval()

if quantize_asr_model:
logging.info("Use quantized asr model for decoding.")
logger.info("Use quantized asr model for decoding.")

asr_model = torch.quantization.quantize_dynamic(
asr_model, qconfig_spec=qconfig_spec, dtype=quantize_dtype
@@ -180,7 +184,7 @@ def __init__(
)

if quantize_lm:
logging.info("Use quantized lm for decoding.")
logger.info("Use quantized lm for decoding.")

lm = torch.quantization.quantize_dynamic(
lm, qconfig_spec=qconfig_spec, dtype=quantize_dtype
@@ -337,7 +341,7 @@ def __init__(
raise NotImplementedError(
"BeamSearchTimeSync with batching is not yet supported."
)
logging.info("BeamSearchTimeSync implementation is selected.")
logger.info("BeamSearchTimeSync implementation is selected.")

scorers["ctc"] = asr_model.ctc
beam_search = BeamSearchTimeSync(
@@ -371,14 +375,14 @@ def __init__(
if streaming:
beam_search.__class__ = BatchBeamSearchOnlineSim
beam_search.set_streaming_config(asr_train_config)
logging.info(
logger.info(
"BatchBeamSearchOnlineSim implementation is selected."
)
else:
beam_search.__class__ = BatchBeamSearch
logging.info("BatchBeamSearch implementation is selected.")
logger.info("BatchBeamSearch implementation is selected.")
else:
logging.warning(
logger.warning(
f"As non-batch scorers {non_batch} are found, "
f"fall back to non-batch implementation."
)
@@ -387,8 +391,8 @@ def __init__(
for scorer in scorers.values():
if isinstance(scorer, torch.nn.Module):
scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
logging.info(f"Beam_search: {beam_search}")
logging.info(f"Decoding device={device}, dtype={dtype}")
logger.info(f"Beam_search: {beam_search}")
logger.info(f"Decoding device={device}, dtype={dtype}")

# 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
if token_type is None:
@@ -466,7 +470,7 @@ def __init__(
beam_search.set_hyp_primer(
list(converter.tokenizer.tokenizer.convert_tokens_to_ids(a1))
)
logging.info(f"Text tokenizer: {tokenizer}")
logger.info(f"Text tokenizer: {tokenizer}")

self.asr_model = asr_model
self.asr_train_args = asr_train_args
@@ -513,7 +517,7 @@ def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> Union[
# lengths: (1,)
lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
batch = {"speech": speech, "speech_lengths": lengths}
logging.info("speech length: " + str(speech.size(1)))
logger.info("speech length: " + str(speech.size(1)))

# a. To device
batch = to_device(batch, device=self.device)
2 changes: 2 additions & 0 deletions espnet2/tasks/asr.py
@@ -11,6 +11,7 @@
from espnet2.asr.decoder.hugging_face_transformers_decoder import ( # noqa: H301
HuggingFaceTransformersDecoder,
)
from espnet2.asr.decoder.linear_decoder import LinearDecoder
from espnet2.asr.decoder.mlm_decoder import MLMDecoder
from espnet2.asr.decoder.rnn_decoder import RNNDecoder
from espnet2.asr.decoder.s4_decoder import S4Decoder
@@ -189,6 +190,7 @@
whisper=OpenAIWhisperDecoder,
hugging_face_transformers=HuggingFaceTransformersDecoder,
s4=S4Decoder,
linear_decoder=LinearDecoder,
),
type_check=AbsDecoder,
default=None,
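
With LinearDecoder registered under the name linear_decoder above, a model configuration can select it like any other decoder. A sketch of that resolution in Python follows; the decoder_choices object and its get_class call are assumed from how espnet2 task classes usually expose their ClassChoices tables, and the 527 + 3 vocabulary size for AudioSet is illustrative.

from espnet2.asr.decoder.linear_decoder import LinearDecoder
from espnet2.tasks.asr import decoder_choices  # assumed module-level ClassChoices instance

decoder_cls = decoder_choices.get_class("linear_decoder")
assert decoder_cls is LinearDecoder

# e.g. 527 AudioSet classes + blank/unk/sos-eos, with a BEATs-sized encoder output.
decoder = decoder_cls(vocab_size=530, encoder_output_size=768, pooling="mean")
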
16 changes: 16 additions & 0 deletions test/espnet2/asr/decoder/test_linear_decoder.py
@@ -0,0 +1,16 @@
import pytest
import torch
from espnet2.asr.decoder.linear_decoder import LinearDecoder


@pytest.mark.execution_timeout(30)
@pytest.mark.parametrize("vocab_size", [10, 5])
@pytest.mark.parametrize("encoder_output_size", [4, 21])
@pytest.mark.parametrize("pooling", ["mean", "max", "CLS"])
def test_LinearDecoder_forward_backward(vocab_size, encoder_output_size, pooling):
    decoder = LinearDecoder(vocab_size, encoder_output_size, pooling)
    x = torch.randn(2, 10, encoder_output_size, requires_grad=True)
    x_len = torch.randint(1, 10, [2], dtype=torch.long)
    logits = decoder(x, x_len)
    assert logits.shape == (2, vocab_size), logits.shape
    logits.sum().backward()
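
A complementary check of the score() interface, reusing the imports above, could look like the following sketch; it is not part of this commit.

@pytest.mark.parametrize("pooling", ["mean", "max", "CLS"])
def test_LinearDecoder_score(pooling):
    # Hypothetical companion test: score() should return per-class log-probabilities
    # for a single unbatched sequence of encoder states, plus a None state.
    decoder = LinearDecoder(vocab_size=10, encoder_output_size=8, pooling=pooling)
    x = torch.randn(12, 8)  # (T, D)
    logp, state = decoder.score(ys=None, state=None, x=x)
    assert logp.shape == (10,)
    assert state is None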