Skip to content

Commit

Permalink
Move text normalization outside chunking method
Browse files (browse the repository at this point in the history)
  • Loading branch information
osma committed Feb 4, 2019
1 parent 3e46e99 commit 794bfb5
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 6 deletions.
11 changes: 10 additions & 1 deletion annif/backend/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,18 @@ def train(self, corpus, project):
self._create_train_file(corpus, project)
self._create_model()

def _predict_chunks(self, chunktexts, project, limit):
normalized_chunks = []
for chunktext in chunktexts:
normalized = self._normalize_text(project, chunktext)
if normalized != '':
normalized_chunks.append(normalized)
return self._model.predict(normalized_chunks, limit)

def _analyze_chunks(self, chunktexts, project):
limit = int(self.params['limit'])
chunklabels, chunkscores = self._model.predict(chunktexts, limit)
chunklabels, chunkscores = self._predict_chunks(
chunktexts, project, limit)
label_scores = collections.defaultdict(float)
for labels, scores in zip(chunklabels, chunkscores):
for label, score in zip(labels, scores):
Expand Down
5 changes: 1 addition & 4 deletions annif/backend/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,7 @@ def _analyze(self, text, project, params):
chunksize = int(params['chunksize'])
chunktexts = []
for i in range(0, len(sentences), chunksize):
chunktext = ' '.join(sentences[i:i + chunksize])
normalized = self._normalize_text(project, chunktext)
if normalized != '':
chunktexts.append(normalized)
chunktexts.append(' '.join(sentences[i:i + chunksize]))
self.debug('Split sentences into {} chunks'.format(len(chunktexts)))
if len(chunktexts) == 0: # nothing to analyze, empty result
return ListAnalysisResult(hits=[], subject_index=project.subjects)
Expand Down
5 changes: 4 additions & 1 deletion annif/backend/vw_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,10 @@ def train(self, corpus, project):
def _analyze_chunks(self, chunktexts, project):
results = []
for chunktext in chunktexts:
example = ' | {}'.format(chunktext)
normalized = self._normalize_text(project, chunktext)
if normalized == '':
continue
example = ' | {}'.format(normalized)
result = self._model.predict(example)
if self.algorithm == 'multilabel_oaa':
# result is a list of subject IDs - need to vectorize
Expand Down

0 comments on commit 794bfb5

Please sign in to comment.