Skip to content

Commit

Permalink
first implementation of EstNLTK analyzer support
Browse files Browse the repository at this point in the history
  • Loading branch information
osma committed Nov 22, 2024
1 parent d907024 commit 94d29db
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 1 deletion.
3 changes: 2 additions & 1 deletion annif/analyzer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import annif
from annif.util import parse_args

from . import simple, simplemma, snowball, spacy, voikko
from . import estnltk, simple, simplemma, snowball, spacy, voikko

if TYPE_CHECKING:
from annif.analyzer.analyzer import Analyzer
Expand Down Expand Up @@ -42,3 +42,4 @@ def get_analyzer(analyzerspec: str) -> Analyzer:
register_analyzer(simplemma.SimplemmaAnalyzer)
register_analyzer(voikko.VoikkoAnalyzer)
register_analyzer(spacy.SpacyAnalyzer)
register_analyzer(estnltk.EstNLTKAnalyzer)
28 changes: 28 additions & 0 deletions annif/analyzer/estnltk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""EstNLTK analyzer for Annif which uses EstNLTK for lemmatization"""

from __future__ import annotations

import annif.util
from annif.exception import OperationFailedException

from . import analyzer


class EstNLTKAnalyzer(analyzer.Analyzer):
    """Analyzer that uses EstNLTK to turn text into a list of lemmas."""

    name = "estnltk"

    def __init__(self, param: str | None = None, **kwargs) -> None:
        """Store the analyzer parameter and initialize the base analyzer.

        Args:
            param: Optional positional argument. It is only stored on the
                instance; nothing in this class reads it, so it may be
                omitted (e.g. when the analyzer is requested simply as
                "estnltk").
            **kwargs: Passed through unchanged to the base class
                initializer.
        """
        self.param = param
        super().__init__(**kwargs)

    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
        """Return the lemmas of the words in *text*.

        Args:
            text: Input text; leading and trailing whitespace is stripped
                before analysis.
            filter: When true, keep only lemmas accepted by
                ``self.is_valid_token``; when false, return every lemma.

        Returns:
            One lemma per entry of the EstNLTK ``lemma`` attribute, in
            document order.
        """
        # Imported here rather than at module level so that importing this
        # module does not require the optional estnltk package.
        import estnltk

        txt = estnltk.Text(text.strip())
        txt.tag_layer()
        # Each entry of txt.lemma is a sequence of lemma candidates
        # (presumably one per alternative analysis -- TODO confirm); only
        # the first candidate is used.
        return [
            lemma
            for lemma in (candidates[0] for candidates in txt.lemma)
            if not filter or self.is_valid_token(lemma)
        ]
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ huggingface-hub = "~0.25.1"

fasttext-wheel = { version = "0.9.2", optional = true }
voikko = { version = "0.5.*", optional = true }
estnltk = { version = "1.7.3", optional = true }
tensorflow-cpu = { version = "~2.17.0", optional = true }
lmdb = { version = "~1.5.1", optional = true }
omikuji = { version = "0.5.*", optional = true }
Expand All @@ -73,6 +74,7 @@ schemathesis = "3.*.*"
[tool.poetry.extras]
fasttext = ["fasttext-wheel"]
voikko = ["voikko"]
estnltk = ["estnltk"]
nn = ["tensorflow-cpu", "lmdb"]
omikuji = ["omikuji"]
yake = ["yake"]
Expand Down
49 changes: 49 additions & 0 deletions tests/test_analyzer_estnltk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""Unit tests for EstNLTK analyzer in Annif"""

import pytest

import annif.analyzer

# Skip this whole module when the optional EstNLTK package is not installed.
# The check targets "estnltk" itself: annif.analyzer.estnltk only imports
# EstNLTK lazily inside tokenize_words(), so importing the Annif module
# succeeds even without EstNLTK and would never trigger the skip.
estnltk = pytest.importorskip("estnltk")


def test_estnltk_tokenize_words():
    """Default (filtered) tokenization returns only the accepted lemmas.

    Compared with the unfiltered case, the expected output contains
    neither the sentence-final periods nor the lemma "ka".
    """
    sample = """
Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
"""
    expected = [
        "aga",
        "kõik",
        "juhtuma",
        "iseenesest",
        "köök",
        "olema",
        "kõik",
        "endine",
    ]

    estnltk_analyzer = annif.analyzer.get_analyzer("estnltk")
    lemmas = estnltk_analyzer.tokenize_words(sample)

    assert lemmas == expected


def test_estnltk_tokenize_words_no_filter():
    """With filter=False every lemma is returned, punctuation included."""
    sample = """
Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
"""
    expected = [
        "aga",
        "kõik",
        "juhtuma",
        "iseenesest",
        ".",
        "ka",
        "köök",
        "olema",
        "kõik",
        "endine",
        ".",
    ]

    estnltk_analyzer = annif.analyzer.get_analyzer("estnltk")
    lemmas = estnltk_analyzer.tokenize_words(sample, filter=False)

    assert lemmas == expected

0 comments on commit 94d29db

Please sign in to comment.