Skip to content

Commit

Permalink
run server
Browse files Browse the repository at this point in the history
  • Loading branch information
sooftware committed Apr 28, 2020
1 parent 9d7a516 commit d019d07
Show file tree
Hide file tree
Showing 20 changed files with 630 additions and 96 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# **End-to-end Speech Recognition**
# **End-to-End Speech Recognition**

### Character-unit based End-to-End Korean Speech Recognition

[<img src="https://github.com/gentaiscool/end2end-asr-pytorch/raw/master/img/pytorch-logo-dark.png" height=18>](https://pytorch.org/) <img src="https://img.shields.io/badge/License-Apache--2.0-yellow" height=20> [<img src="https://img.shields.io/badge/chat-on%20gitter-4fb99a" height=20>](https://gitter.im/Korean-Speech-Recognition/community) <img src="https://img.shields.io/badge/Fixing-Beam%20Search-red" height=20>
[<img src="https://github.com/gentaiscool/end2end-asr-pytorch/raw/master/img/pytorch-logo-dark.png" height=18>](https://pytorch.org/) <img src="https://img.shields.io/badge/License-Apache--2.0-yellow" height=20> [<img src="https://img.shields.io/badge/chat-on%20gitter-4fb99a" height=20>](https://gitter.im/Korean-Speech-Recognition/community)

### [**Documentation**](https://sooftware.github.io/End-to-end-Speech-Recognition/)

Expand Down Expand Up @@ -72,7 +72,7 @@ ListenAttendSpell(
```

We use the [AI Hub 1000h](http://www.aihub.or.kr/aidata/105) dataset, which contains 1,000 hours of Korean voice data. Our project is currently in progress.
At present our top model has recorded an **82.3% CRR**, and we are working for a higher recognition rate.
At present our top model has recorded an **80% CRR**, and we are working for a higher recognition rate.

Our model has also recorded a **91% CRR** on the [Kadi-zeroth dataset](https://github.com/goodatlas/zeroth).

Expand Down
9 changes: 1 addition & 8 deletions data/data_list/debug_list.csv
Original file line number Diff line number Diff line change
@@ -1,9 +1,2 @@
audio,label
audio,label
KaiSpeech_000001.pcm,KaiSpeech_label_000001.txt
KaiSpeech_000002.pcm,KaiSpeech_label_000002.txt
KaiSpeech_000003.pcm,KaiSpeech_label_000003.txt
KaiSpeech_000004.pcm,KaiSpeech_label_000004.txt
KaiSpeech_000005.pcm,KaiSpeech_label_000005.txt
KaiSpeech_000006.pcm,KaiSpeech_label_000006.txt
KaiSpeech_000007.pcm,KaiSpeech_label_000007.txt
KaiSpeech_000008.pcm,KaiSpeech_label_000008.txt
Binary file modified data/pickle/target_dict.bin.bin
Binary file not shown.
4 changes: 2 additions & 2 deletions model/attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ def forward(self, queries, values):
values = values.permute(2, 0, 1, 3).contiguous().view(-1, value_length, self.dim)

attn_score = torch.bmm(queries, values.transpose(1, 2))
alignment = F.softmax(attn_score, dim=2)
align = F.softmax(attn_score, dim=2)

attn_val = torch.bmm(alignment, values).view(self.n_head, batch_size, query_length, self.dim)
attn_val = torch.bmm(align, values).view(self.n_head, batch_size, query_length, self.dim)
attn_val = attn_val.permute(1, 2, 0, 3).contiguous().view(batch_size, query_length, -1)

combined = torch.cat([attn_val, preserved], dim=2)
Expand Down
123 changes: 84 additions & 39 deletions model/beam.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,39 @@
import torch
import torch.nn.functional as F
from package.definition import char2id, PAD_token
import copy
from package.definition import char2id, id2char
from package.utils import label_to_string


class Beam:
r"""
Applying Beam-Search during decoding process.
Args:
k (int) : size of beam
decoder (torch.nn.Module) : get pointer of decoder object to get multiple parameters at once
batch_size (int) : mini-batch size during infer
max_length (int) : a maximum allowed length for the sequence to be processed
Inputs: decoder_input, encoder_outputs
- **decoder_input** (torch.Tensor): initial input of decoder - <sos>
- **encoder_outputs** (torch.Tensor): tensor with containing the outputs of the encoder.
Returns: y_hats
- **y_hats** (batch, seq_len): predicted y values (y_hat) by the model
Examples::
>>> beam = Beam(k, decoder, batch_size, max_length, F.log_softmax)
>>> y_hats = beam.search(inputs, encoder_outputs)
"""

def __init__(self, k, decoder, batch_size, max_length, device):

assert k > 1, "beam size (k) should be bigger than 1"
# assert k > 1, "beam size (k) should be bigger than 1"

self.max_length = max_length
self.n_layers = decoder.n_layers
self.rnn = decoder.rnn
self.embedding = decoder.embedding
self.attention = decoder.attention
self.hidden_dim = decoder.hidden_dim
self.decoder = decoder
self.fc = decoder.fc
self.eos_id = decoder.eos_id
self.beams = None
Expand All @@ -47,37 +45,45 @@ def __init__(self, k, decoder, batch_size, max_length, device):

def search(self, input_, encoder_outputs):
batch_size = encoder_outputs.size(0)
h_state = None
h_state = self.decoder.init_state(batch_size)

# input_ : Bx1 (sos_id) encoder_outputs : BxSxH
step_outputs, h_state = self.forward_step(input_, h_state, encoder_outputs)
# step_outputs : BxC h_state : KxBxH
self.cumulative_probs, self.beams = step_outputs.topk(self.k) # BxK
# self.cumulative_probs : BxK 확률 self.beam : BxK 인덱스

input_ = copy.deepcopy(self.beams)
self.beams = self.beams.unsqueeze(2) # BxK => BxKx1

input_ = self.beams
self.beams = self.beams.unsqueeze(2)
repeated_h_state = h_state.repeat(1, 1, self.k)
repeated_encoder_outputs = encoder_outputs.repeat(1, 1, self.k)
# input_ : BxK self.beams : BxKx1

for di in range(self.max_length - 1):
if self._is_done():
break

step_outputs, h_state = self.forward_step(input_, h_state, encoder_outputs)
probs, values = step_outputs.topk(self.k)
step_outputs, repeated_h_state = self.forward_beam_step(input_, repeated_h_state, repeated_encoder_outputs)

self.cumulative_probs /= self.get_length_penalty(length=di + 1, alpha=1.2, min_length=5)
probs = self.cumulative_probs.unsqueeze(1) + probs
# step_outputs: BxKxC probs: BxKxK values: BxKxK
probs, values = step_outputs.topk(self.k)
# probs = probs.unsqueeze(1) # k = 1
probs = (probs.permute(0, 2, 1) + self.cumulative_probs.unsqueeze(1)).permute(0, 2, 1)

probs = probs.view(batch_size, self.k * self.k)
values = values.view(batch_size, self.k * self.k)

topk_probs, topk_status_ids = probs.topk(self.k)
topk_values = torch.LongTensor(batch_size, self.k)
topk_probs, topk_status_ids = probs.topk(self.k) # BxK^2 = > BxK
prev_beams_ids = (topk_status_ids // self.k)

prev_beams = torch.LongTensor(self.beams.size())
prev_beams_ids = (topk_status_ids // self.k).view(batch_size, self.k)
topk_values = torch.zeros((batch_size, self.k), dtype=torch.long)
prev_beams = torch.zeros(self.beams.size(), dtype=torch.long)

for batch_num, batch in enumerate(topk_status_ids):
for beam_idx, topk_status_idx in enumerate(batch):
topk_values[batch_num, beam_idx] = values[batch_num, topk_status_idx]
prev_beams[batch_num, beam_idx] = self.beams[batch_num, prev_beams_ids[batch_num, beam_idx]]
prev_beams[batch_num, beam_idx] = copy.deepcopy(self.beams[batch_num, prev_beams_ids[batch_num, beam_idx]])

self.beams = torch.cat([prev_beams, topk_values.unsqueeze(2)], dim=2).to(self.device)
self.cumulative_probs = topk_probs.to(self.device)
Expand All @@ -88,67 +94,95 @@ def search(self, input_, encoder_outputs):
next_ = [1] * batch_size

for (batch_num, beam_idx) in zip(*done_ids):
self.sentences[batch_num].append(self.beams[batch_num, beam_idx])
self.sentence_probs[batch_num].append(self.cumulative_probs[batch_num, beam_idx])
self._replace_beam(
self.sentences[batch_num].append(copy.deepcopy(self.beams[batch_num, beam_idx]))
self.sentence_probs[batch_num].append(copy.deepcopy(self.cumulative_probs[batch_num, beam_idx]))
eos_count = self._replace_beam(
probs=probs,
values=values,
done_ids=(batch_num, beam_idx),
next_=next_[batch_num]
next_=next_[batch_num],
eos_count=1
)
next_[batch_num] += 1

input_ = topk_values
next_[batch_num] += eos_count

return self._get_best()
input_ = copy.deepcopy(self.beams[:, :, -1])

return self.get_best()

def forward_step(self, input_, h_state, encoder_outputs):
"""
:param input_: (batch_size, beam_size)
:param h_state: (beam_size, batch_size, hidden_dim)
:param encoder_outputs: (batch_size, seq_len, hidden_dim)
:return: step_outputs: (batch_size, beam_size, class_num)
"""

batch_size = encoder_outputs.size(0)
seq_length = input_.size(1)

embedded = self.embedding(input_).to(self.device)

output, h_state = self.rnn(embedded, h_state)
context = self.attention(output, encoder_outputs)

predicted_softmax = F.log_softmax(self.fc(context.contiguous().view(-1, self.hidden_dim)), dim=1)
predicted_softmax = predicted_softmax.view(batch_size, seq_length, -1)
step_outputs = predicted_softmax.squeeze(1)

return step_outputs, h_state

def forward_beam_step(self, input_, h_state, encoder_outputs):
batch_size = encoder_outputs.size(0)
seq_length = input_.size(1)

embedded = self.embedding(input_).to(self.device)

output, h_state = self.rnn(embedded, h_state)
output = self.attention(output, encoder_outputs)
context = self.attention(output, encoder_outputs)

predicted_softmax = F.log_softmax(self.fc(output.contiguous().view(-1, self.hidden_dim)), dim=1)
predicted_softmax = F.log_softmax(self.fc(context.contiguous().view(-1, self.hidden_dim)), dim=1)
predicted_softmax = predicted_softmax.view(batch_size, seq_length, -1)
step_outputs = predicted_softmax.squeeze(1)

return step_outputs, h_state

def _get_best(self):
def get_best(self):
y_hats = list()

for batch_num, batch in enumerate(self.sentences):
for beam_idx, beam in enumerate(batch):
self.sentence_probs[batch_num][beam_idx] /= self.get_length_penalty(len(beam))

for batch_num, batch in enumerate(self.sentences):
# if there is no terminated sentences, bring ongoing sentence which has the highest probability instead
if len(batch) == 0:
prob_batch = self.cumulative_probs[batch_num].to(self.device)
top_beam_idx = int(prob_batch.topk(1)[1])
y_hats.append(self.beams[batch_num, top_beam_idx])
y_hats.append(copy.deepcopy(self.beams[batch_num, top_beam_idx]))

# bring highest probability sentence
else:
top_beam_idx = int(torch.FloatTensor(self.sentence_probs[batch_num]).topk(1)[1])
y_hats.append(self.sentences[batch_num][top_beam_idx])
y_hats.append(copy.deepcopy(self.sentences[batch_num][top_beam_idx]))

y_hats = self._match_len(y_hats).to(self.device)

return y_hats

def _match_len(self, y_hats):
batch_size = y_hats.size(0)
batch_size = len(y_hats)
max_length = -1

for y_hat in y_hats:
if len(y_hat) > max_length:
max_length = len(y_hat)

matched = torch.LongTensor(batch_size, max_length).to(self.device)
matched = torch.zeros((batch_size, max_length), dtype=torch.long).to(self.device)

for batch_num, y_hat in enumerate(y_hats):
matched[batch_num, :len(y_hat)] = y_hat
matched[batch_num, len(y_hat):] = int(char2id[PAD_token])
matched[batch_num, len(y_hat):] = int(char2id[' '])

return matched

Expand All @@ -168,8 +202,11 @@ def get_length_penalty(self, length, alpha=1.2, min_length=5):
"""
return ((min_length + length) / (min_length + 1)) ** alpha

def _replace_beam(self, probs, values, done_ids, next_):
""" Replaces a beam that ends with <eos> with a beam with the next higher probability. """
def _replace_beam(self, probs, values, done_ids, next_, eos_count):
""" Replaces a beam that ends with <eos> with a beam with the next higher probability.
probs BxK^2
"""
done_batch_num, done_beam_idx = done_ids

replace_ids = probs.topk(self.k + next_)[1]
Expand All @@ -179,10 +216,18 @@ def _replace_beam(self, probs, values, done_ids, next_):
new_value = values[done_batch_num, replace_idx].to(self.device)

prev_beam_idx = (replace_idx // self.k)
prev_beam = self.beams[done_batch_num, prev_beam_idx]
prev_beam = copy.deepcopy(self.beams[done_batch_num, prev_beam_idx])
prev_beam = prev_beam[:-1].to(self.device)

new_beam = torch.cat([prev_beam, new_value.view(1)])

self.beams[done_batch_num, done_beam_idx] = new_beam
self.cumulative_probs[done_batch_num, done_beam_idx] = new_prob
if int(new_value) == self.eos_id:
self.sentences[done_batch_num].append(copy.deepcopy(new_beam))
self.sentence_probs[done_batch_num].append(copy.deepcopy(new_prob))
eos_count = self._replace_beam(probs, values, done_ids, next_ + eos_count, eos_count + 1)

else:
self.beams[done_batch_num, done_beam_idx] = copy.deepcopy(new_beam)
self.cumulative_probs[done_batch_num, done_beam_idx] = copy.deepcopy(new_prob)

return eos_count
4 changes: 2 additions & 2 deletions model/speller.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def forward(self, inputs, listener_outputs, teacher_forcing_ratio=0.90, use_beam
decode_outputs = list()
use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

h_state = self._init_state(batch_size)
h_state = self.init_state(batch_size)

if use_beam_search: # TopK Decoding
input_ = inputs[:, 0].unsqueeze(1)
Expand Down Expand Up @@ -137,7 +137,7 @@ def forward(self, inputs, listener_outputs, teacher_forcing_ratio=0.90, use_beam

return y_hats, logits

def _init_state(self, batch_size):
def init_state(self, batch_size):
if isinstance(self.rnn, nn.LSTM):
h_0 = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(self.device)
c_0 = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(self.device)
Expand Down
Loading

0 comments on commit d019d07

Please sign in to comment.