Skip to content

Commit

Permalink
[update] Change the behavior of WordAlign class
Browse files Browse the repository at this point in the history
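- Separate get_similarity_matrix from WordAlign into a module-level function
- WordAlign instances now store their parameters (lambda_, use_hungarian) as instance attributes
- Use the __call__ method in place of the vectorize and align methods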
  • Loading branch information
m-yoshinaka committed Oct 5, 2020
1 parent 9914525 commit c79fb96
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 49 deletions.
23 changes: 11 additions & 12 deletions sapphire/sapphire.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,33 @@
from .word_alignment import FastTextVectorize, WordAlign
from .phrase_alignment import PhraseExtract, PhraseAlign
from sapphire.word_alignment import (
FastTextVectorize, WordAlign, get_similarity_matrix
)
from sapphire.phrase_alignment import PhraseExtract, PhraseAlign


class Sapphire(object):

def __init__(self, model):
self.vectorizer = FastTextVectorize(model)
self.word_aligner = WordAlign()
self.set_params()
self.word_aligner = WordAlign(self.lambda_, self.use_hungarian)
self.extractor = PhraseExtract()
self.phrase_aligner = PhraseAlign()
self.set_params()

def set_params(self, lambda_=0.6, delta=0.6, alpha=0.01, hungarian=False):
self.lambda_ = lambda_
self.delta = delta
self.alpha = alpha
self.hungarian = hungarian
self.use_hungarian = hungarian

def align(self, tokens_src: list, tokens_trg: list):
len_src = len(tokens_src)
len_trg = len(tokens_trg)

vectors_src = self.vectorizer.vectorize(tokens_src)
vectors_trg = self.vectorizer.vectorize(tokens_trg)
vectors_src = self.vectorizer(tokens_src)
vectors_trg = self.vectorizer(tokens_trg)

sim_matrix = self.word_aligner.similarity_matrix(vectors_src,
vectors_trg)
word_alignment = self.word_aligner.align(sim_matrix,
self.lambda_,
self.hungarian)
sim_matrix = get_similarity_matrix(vectors_src, vectors_trg)
word_alignment = self.word_aligner(sim_matrix)

phrase_pairs = self.extractor.extract(word_alignment,
vectors_src,
Expand Down
73 changes: 36 additions & 37 deletions sapphire/word_alignment.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import itertools
import numpy as np
from itertools import product
from scipy.spatial.distance import cosine
from scipy.optimize import linear_sum_assignment
from scipy.spatial import distance


class WordEmbedding(object):

def __init__(self):
pass

def vectorize(self, word: list) -> np.array:
def __call__(self, words):
pass


Expand All @@ -20,7 +20,7 @@ def __init__(self, model):
self.model = model
self.dim = model.get_dimension()

def vectorize(self, words: list) -> np.array:
def __call__(self, words):
vector = []

if words:
Expand All @@ -32,32 +32,37 @@ def vectorize(self, words: list) -> np.array:
return np.array(vector)


class WordAlign(object):
def get_similarity_matrix(src_vectors, trg_vectors):
len_src = len(src_vectors)
len_trg = len(trg_vectors)
sim_matrix = np.zeros((len_src, len_trg))

def __init__(self):
self.name = ''
for (src_idx, src_vec), (trg_idx, trg_vec) in product(
enumerate(src_vectors), enumerate(trg_vectors)
):
sim_matrix[src_idx][trg_idx] = 1 - cosine(src_vec, trg_vec)

@staticmethod
def similarity_matrix(vectors_src: np.array, vectors_trg: np.array) -> np.ndarray:
len_src = len(vectors_src)
len_trg = len(vectors_trg)
return sim_matrix

sim_matrix = np.zeros((len_src, len_trg))

for (id_src, vec_src), (id_trg, vec_trg) in itertools.product(enumerate(vectors_src), enumerate(vectors_trg)):
sim_matrix[id_src][id_trg] = 1 - distance.cosine(vec_src, vec_trg)
class WordAlign(object):

return sim_matrix
def __init__(self, lambda_, use_hungarian):
self.lambda_ = lambda_
self.use_hungarian = use_hungarian

@staticmethod
def _hungarian_assign(sim_matrix):

len_src, len_trg = sim_matrix.shape
cost_matrix = np.ones((len_src, len_trg))
def __call__(self, sim_matrix):
if self.use_hungarian:
alignments = self._hungarian_assign(sim_matrix)
else:
alignments = self._grow_diag_final(sim_matrix)

for id_src, id_trg in itertools.product(range(len_src), range(len_trg)):
cost_matrix[id_src][id_trg] -= sim_matrix[id_src][id_trg]
return [(s + 1, t + 1) for s, t in alignments
if sim_matrix[s][t] >= self.lambda_]

@staticmethod
def _hungarian_assign(sim_matrix):
cost_matrix = - sim_matrix
aligned_src, aligned_trg = linear_sum_assignment(cost_matrix)
alignments = [(s, t) for s, t in zip(aligned_src, aligned_trg)]

Expand All @@ -68,22 +73,25 @@ def _grow_diag_final(sim_matrix):

def _grow_diag():
point_added = False
for src, trg in itertools.product(range(len_src), range(len_trg)):
for src, trg in product(range(len_src), range(len_trg)):
if not align_matrix[src][trg]:
continue
for ns, nt in neighbors:
if src + ns < 0 or src + ns >= len_src or trg + nt < 0 or trg + nt >= len_trg:
if src + ns < 0 or src + ns >= len_src \
or trg + nt < 0 or trg + nt >= len_trg:
continue
if (not np.any(align_matrix[src + ns, :]) or not np.any(align_matrix[:, trg + nt])) \
if (not np.any(align_matrix[src + ns, :])
or not np.any(align_matrix[:, trg + nt])) \
and union_matrix[src + ns][trg + nt]:
align_matrix[src + ns][trg + nt] = 1
point_added = True
if point_added:
_grow_diag()

def _final(matrix):
for src, trg in itertools.product(range(len_src), range(len_trg)):
if (not np.any(align_matrix[src, :]) or not np.any(align_matrix[:, trg])) \
for src, trg in product(range(len_src), range(len_trg)):
if (not np.any(align_matrix[src, :])
or not np.any(align_matrix[:, trg])) \
and matrix[src][trg]:
align_matrix[src][trg] = 1

Expand All @@ -107,17 +115,8 @@ def _final(matrix):
_final(trg2src)

alignments = []
for s, t in itertools.product(range(len_src), range(len_trg)):
for s, t in product(range(len_src), range(len_trg)):
if align_matrix[s][t]:
alignments.append((s, t))

return alignments

def align(self, sim_matrix: np.ndarray, gamma, hungarian) -> list:
### 1-index alignment ###
if hungarian:
alignments = self._hungarian_assign(sim_matrix)
else:
alignments = self._grow_diag_final(sim_matrix)

return [(s + 1, t + 1) for s, t in alignments if sim_matrix[s][t] >= gamma]

0 comments on commit c79fb96

Please sign in to comment.