run server

sooftware · Apr 28, 2020 · d019d07 · d019d07
1 parent 9d7a516
commit d019d07
Show file tree

Hide file tree

Showing 20 changed files with 630 additions and 96 deletions.
diff --git a/README.md b/README.md
@@ -1,8 +1,8 @@
-# **End-to-end Speech Recognition**  
+# **End-to-End Speech Recognition**  
 
 ### Character-unit based End-to-End Korean Speech Recognition  
 
-[<img src="https://github.com/gentaiscool/end2end-asr-pytorch/raw/master/img/pytorch-logo-dark.png" height=18>](https://pytorch.org/) <img src="https://img.shields.io/badge/License-Apache--2.0-yellow" height=20> [<img src="https://img.shields.io/badge/chat-on%20gitter-4fb99a" height=20>](https://gitter.im/Korean-Speech-Recognition/community) <img src="https://img.shields.io/badge/Fixing-Beam%20Search-red" height=20>  
+[<img src="https://github.com/gentaiscool/end2end-asr-pytorch/raw/master/img/pytorch-logo-dark.png" height=18>](https://pytorch.org/) <img src="https://img.shields.io/badge/License-Apache--2.0-yellow" height=20> [<img src="https://img.shields.io/badge/chat-on%20gitter-4fb99a" height=20>](https://gitter.im/Korean-Speech-Recognition/community)
 
 ### [**Documentation**](https://sooftware.github.io/End-to-end-Speech-Recognition/)   
 
@@ -72,7 +72,7 @@ ListenAttendSpell(
 ```
 
 We use the [AI Hub 1000h](http://www.aihub.or.kr/aidata/105) dataset, which contains 1,000 hours of Korean voice data. Our project is currently in progress.   
-At present our top model has recorded an **82.3% CRR**, and we are working for a higher recognition rate.  
+At present our top model has recorded an **80% CRR**, and we are working for a higher recognition rate.  
 
 Our model has also recorded a **91% CRR** on the [Kadi-zeroth dataset](https://github.com/goodatlas/zeroth).  
 

diff --git a/data/data_list/debug_list.csv b/data/data_list/debug_list.csv
@@ -1,9 +1,2 @@
-audio,label
+audio,label
 KaiSpeech_000001.pcm,KaiSpeech_label_000001.txt
-KaiSpeech_000002.pcm,KaiSpeech_label_000002.txt
-KaiSpeech_000003.pcm,KaiSpeech_label_000003.txt
-KaiSpeech_000004.pcm,KaiSpeech_label_000004.txt
-KaiSpeech_000005.pcm,KaiSpeech_label_000005.txt
-KaiSpeech_000006.pcm,KaiSpeech_label_000006.txt
-KaiSpeech_000007.pcm,KaiSpeech_label_000007.txt
-KaiSpeech_000008.pcm,KaiSpeech_label_000008.txt
diff --git a/data/pickle/target_dict.bin.bin b/data/pickle/target_dict.bin.bin
diff --git a/model/attention.py b/model/attention.py
@@ -51,9 +51,9 @@ def forward(self, queries, values):
         values = values.permute(2, 0, 1, 3).contiguous().view(-1, value_length, self.dim)
 
         attn_score = torch.bmm(queries, values.transpose(1, 2))
-        alignment = F.softmax(attn_score, dim=2)
+        align = F.softmax(attn_score, dim=2)
 
-        attn_val = torch.bmm(alignment, values).view(self.n_head, batch_size, query_length, self.dim)
+        attn_val = torch.bmm(align, values).view(self.n_head, batch_size, query_length, self.dim)
         attn_val = attn_val.permute(1, 2, 0, 3).contiguous().view(batch_size, query_length, -1)
 
         combined = torch.cat([attn_val, preserved], dim=2)

diff --git a/model/beam.py b/model/beam.py
@@ -1,41 +1,39 @@
 import torch
 import torch.nn.functional as F
-from package.definition import char2id, PAD_token
+import copy
+from package.definition import char2id, id2char
+from package.utils import label_to_string
 
 
 class Beam:
     r"""
     Applying Beam-Search during decoding process.
-
     Args:
         k (int) : size of beam
         decoder (torch.nn.Module) : get pointer of decoder object to get multiple parameters at once
         batch_size (int) : mini-batch size during infer
         max_length (int) :  a maximum allowed length for the sequence to be processed
-
     Inputs: decoder_input, encoder_outputs
         - **decoder_input** (torch.Tensor): initial input of decoder - <sos>
         - **encoder_outputs** (torch.Tensor): tensor with containing the outputs of the encoder.
-
     Returns: y_hats
         - **y_hats** (batch, seq_len): predicted y values (y_hat) by the model
-
     Examples::
-
         >>> beam = Beam(k, decoder, batch_size, max_length, device)
         >>> y_hats = beam.search(inputs, encoder_outputs)
     """
 
     def __init__(self, k, decoder, batch_size, max_length, device):
 
-        assert k > 1, "beam size (k) should be bigger than 1"
+        # assert k > 1, "beam size (k) should be bigger than 1"
 
         self.max_length = max_length
         self.n_layers = decoder.n_layers
         self.rnn = decoder.rnn
         self.embedding = decoder.embedding
         self.attention = decoder.attention
         self.hidden_dim = decoder.hidden_dim
+        self.decoder = decoder
         self.fc = decoder.fc
         self.eos_id = decoder.eos_id
         self.beams = None
@@ -47,37 +45,45 @@ def __init__(self, k, decoder, batch_size, max_length, device):
 
     def search(self, input_, encoder_outputs):
         batch_size = encoder_outputs.size(0)
-        h_state = None
+        h_state = self.decoder.init_state(batch_size)
 
+        # input_ : Bx1 (sos_id)   encoder_outputs : BxSxH
         step_outputs, h_state = self.forward_step(input_, h_state, encoder_outputs)
+        # step_outputs : BxC   h_state : KxBxH
         self.cumulative_probs, self.beams = step_outputs.topk(self.k)  # BxK
+        # self.cumulative_probs : BxK  probabilities   self.beams : BxK  indices
+
+        input_ = copy.deepcopy(self.beams)
+        self.beams = self.beams.unsqueeze(2)  # BxK => BxKx1
 
-        input_ = self.beams
-        self.beams = self.beams.unsqueeze(2)
+        repeated_h_state = h_state.repeat(1, 1, self.k)
+        repeated_encoder_outputs = encoder_outputs.repeat(1, 1, self.k)
+        # input_ : BxK   self.beams : BxKx1
 
         for di in range(self.max_length - 1):
             if self._is_done():
                 break
 
-            step_outputs, h_state = self.forward_step(input_, h_state, encoder_outputs)
-            probs, values = step_outputs.topk(self.k)
+            step_outputs, repeated_h_state = self.forward_beam_step(input_, repeated_h_state, repeated_encoder_outputs)
 
-            self.cumulative_probs /= self.get_length_penalty(length=di + 1, alpha=1.2, min_length=5)
-            probs = self.cumulative_probs.unsqueeze(1) + probs
+            # step_outputs: BxKxC     probs: BxKxK     values: BxKxK
+            probs, values = step_outputs.topk(self.k)
+            # probs = probs.unsqueeze(1)  # k = 1
+            probs = (probs.permute(0, 2, 1) + self.cumulative_probs.unsqueeze(1)).permute(0, 2, 1)
 
             probs = probs.view(batch_size, self.k * self.k)
             values = values.view(batch_size, self.k * self.k)
 
-            topk_probs, topk_status_ids = probs.topk(self.k)
-            topk_values = torch.LongTensor(batch_size, self.k)
+            topk_probs, topk_status_ids = probs.topk(self.k)  # BxK^2 = > BxK
+            prev_beams_ids = (topk_status_ids // self.k)
 
-            prev_beams = torch.LongTensor(self.beams.size())
-            prev_beams_ids = (topk_status_ids // self.k).view(batch_size, self.k)
+            topk_values = torch.zeros((batch_size, self.k), dtype=torch.long)
+            prev_beams = torch.zeros(self.beams.size(), dtype=torch.long)
 
             for batch_num, batch in enumerate(topk_status_ids):
                 for beam_idx, topk_status_idx in enumerate(batch):
                     topk_values[batch_num, beam_idx] = values[batch_num, topk_status_idx]
-                    prev_beams[batch_num, beam_idx] = self.beams[batch_num, prev_beams_ids[batch_num, beam_idx]]
+                    prev_beams[batch_num, beam_idx] = copy.deepcopy(self.beams[batch_num, prev_beams_ids[batch_num, beam_idx]])
 
             self.beams = torch.cat([prev_beams, topk_values.unsqueeze(2)], dim=2).to(self.device)
             self.cumulative_probs = topk_probs.to(self.device)
@@ -88,67 +94,95 @@ def search(self, input_, encoder_outputs):
                 next_ = [1] * batch_size
 
                 for (batch_num, beam_idx) in zip(*done_ids):
-                    self.sentences[batch_num].append(self.beams[batch_num, beam_idx])
-                    self.sentence_probs[batch_num].append(self.cumulative_probs[batch_num, beam_idx])
-                    self._replace_beam(
+                    self.sentences[batch_num].append(copy.deepcopy(self.beams[batch_num, beam_idx]))
+                    self.sentence_probs[batch_num].append(copy.deepcopy(self.cumulative_probs[batch_num, beam_idx]))
+                    eos_count = self._replace_beam(
                         probs=probs,
                         values=values,
                         done_ids=(batch_num, beam_idx),
-                        next_=next_[batch_num]
+                        next_=next_[batch_num],
+                        eos_count=1
                     )
-                    next_[batch_num] += 1
 
-            input_ = topk_values
+                    next_[batch_num] += eos_count
 
-        return self._get_best()
+            input_ = copy.deepcopy(self.beams[:, :, -1])
+
+        return self.get_best()
 
     def forward_step(self, input_, h_state, encoder_outputs):
+        """
+        :param input_: (batch_size, beam_size)
+        :param h_state: (beam_size, batch_size, hidden_dim)
+        :param encoder_outputs: (batch_size, seq_len, hidden_dim)
+        :return: step_outputs: (batch_size, beam_size, class_num)
+        """
+
+        batch_size = encoder_outputs.size(0)
+        seq_length = input_.size(1)
+
+        embedded = self.embedding(input_).to(self.device)
+
+        output, h_state = self.rnn(embedded, h_state)
+        context = self.attention(output, encoder_outputs)
+
+        predicted_softmax = F.log_softmax(self.fc(context.contiguous().view(-1, self.hidden_dim)), dim=1)
+        predicted_softmax = predicted_softmax.view(batch_size, seq_length, -1)
+        step_outputs = predicted_softmax.squeeze(1)
+
+        return step_outputs, h_state
+
+    def forward_beam_step(self, input_, h_state, encoder_outputs):
         batch_size = encoder_outputs.size(0)
         seq_length = input_.size(1)
 
         embedded = self.embedding(input_).to(self.device)
 
         output, h_state = self.rnn(embedded, h_state)
-        output = self.attention(output, encoder_outputs)
+        context = self.attention(output, encoder_outputs)
 
-        predicted_softmax = F.log_softmax(self.fc(output.contiguous().view(-1, self.hidden_dim)), dim=1)
+        predicted_softmax = F.log_softmax(self.fc(context.contiguous().view(-1, self.hidden_dim)), dim=1)
         predicted_softmax = predicted_softmax.view(batch_size, seq_length, -1)
         step_outputs = predicted_softmax.squeeze(1)
 
         return step_outputs, h_state
 
-    def _get_best(self):
+    def get_best(self):
         y_hats = list()
 
+        for batch_num, batch in enumerate(self.sentences):
+            for beam_idx, beam in enumerate(batch):
+                self.sentence_probs[batch_num][beam_idx] /= self.get_length_penalty(len(beam))
+
         for batch_num, batch in enumerate(self.sentences):
             # if there is no terminated sentences, bring ongoing sentence which has the highest probability instead
             if len(batch) == 0:
                 prob_batch = self.cumulative_probs[batch_num].to(self.device)
                 top_beam_idx = int(prob_batch.topk(1)[1])
-                y_hats.append(self.beams[batch_num, top_beam_idx])
+                y_hats.append(copy.deepcopy(self.beams[batch_num, top_beam_idx]))
 
             # bring highest probability sentence
             else:
                 top_beam_idx = int(torch.FloatTensor(self.sentence_probs[batch_num]).topk(1)[1])
-                y_hats.append(self.sentences[batch_num][top_beam_idx])
+                y_hats.append(copy.deepcopy(self.sentences[batch_num][top_beam_idx]))
 
         y_hats = self._match_len(y_hats).to(self.device)
 
         return y_hats
 
     def _match_len(self, y_hats):
-        batch_size = y_hats.size(0)
+        batch_size = len(y_hats)
         max_length = -1
 
         for y_hat in y_hats:
             if len(y_hat) > max_length:
                 max_length = len(y_hat)
 
-        matched = torch.LongTensor(batch_size, max_length).to(self.device)
+        matched = torch.zeros((batch_size, max_length), dtype=torch.long).to(self.device)
 
         for batch_num, y_hat in enumerate(y_hats):
             matched[batch_num, :len(y_hat)] = y_hat
-            matched[batch_num, len(y_hat):] = int(char2id[PAD_token])
+            matched[batch_num, len(y_hat):] = int(char2id[' '])
 
         return matched
 
@@ -168,8 +202,11 @@ def get_length_penalty(self, length, alpha=1.2, min_length=5):
         """
         return ((min_length + length) / (min_length + 1)) ** alpha
 
-    def _replace_beam(self, probs, values, done_ids, next_):
-        """ Replaces a beam that ends with <eos> with a beam with the next higher probability. """
+    def _replace_beam(self, probs, values, done_ids, next_, eos_count):
+        """ Replaces a beam that ends with <eos> with a beam with the next higher probability.
+
+        probs BxK^2
+        """
         done_batch_num, done_beam_idx = done_ids
 
         replace_ids = probs.topk(self.k + next_)[1]
@@ -179,10 +216,18 @@ def _replace_beam(self, probs, values, done_ids, next_):
         new_value = values[done_batch_num, replace_idx].to(self.device)
 
         prev_beam_idx = (replace_idx // self.k)
-        prev_beam = self.beams[done_batch_num, prev_beam_idx]
+        prev_beam = copy.deepcopy(self.beams[done_batch_num, prev_beam_idx])
         prev_beam = prev_beam[:-1].to(self.device)
 
         new_beam = torch.cat([prev_beam, new_value.view(1)])
 
-        self.beams[done_batch_num, done_beam_idx] = new_beam
-        self.cumulative_probs[done_batch_num, done_beam_idx] = new_prob
+        if int(new_value) == self.eos_id:
+            self.sentences[done_batch_num].append(copy.deepcopy(new_beam))
+            self.sentence_probs[done_batch_num].append(copy.deepcopy(new_prob))
+            eos_count = self._replace_beam(probs, values, done_ids, next_ + eos_count, eos_count + 1)
+
+        else:
+            self.beams[done_batch_num, done_beam_idx] = copy.deepcopy(new_beam)
+            self.cumulative_probs[done_batch_num, done_beam_idx] = copy.deepcopy(new_prob)
+
+        return eos_count
diff --git a/model/speller.py b/model/speller.py
@@ -92,7 +92,7 @@ def forward(self, inputs, listener_outputs, teacher_forcing_ratio=0.90, use_beam
         decode_outputs = list()
         use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
 
-        h_state = self._init_state(batch_size)
+        h_state = self.init_state(batch_size)
 
         if use_beam_search:  # TopK Decoding
             input_ = inputs[:, 0].unsqueeze(1)
@@ -137,7 +137,7 @@ def forward(self, inputs, listener_outputs, teacher_forcing_ratio=0.90, use_beam
 
         return y_hats, logits
 
-    def _init_state(self, batch_size):
+    def init_state(self, batch_size):
         if isinstance(self.rnn, nn.LSTM):
             h_0 = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(self.device)
             c_0 = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(self.device)