first implementation of EstNLTK analyzer support

NatLibFi · Nov 22, 2024 · 94d29db · 94d29db
1 parent d907024
commit 94d29db
Show file tree

Hide file tree

Showing 4 changed files with 81 additions and 1 deletion.
diff --git a/annif/analyzer/__init__.py b/annif/analyzer/__init__.py
@@ -8,7 +8,7 @@
 import annif
 from annif.util import parse_args
 
-from . import simple, simplemma, snowball, spacy, voikko
+from . import estnltk, simple, simplemma, snowball, spacy, voikko
 
 if TYPE_CHECKING:
     from annif.analyzer.analyzer import Analyzer
@@ -42,3 +42,4 @@ def get_analyzer(analyzerspec: str) -> Analyzer:
 register_analyzer(simplemma.SimplemmaAnalyzer)
 register_analyzer(voikko.VoikkoAnalyzer)
 register_analyzer(spacy.SpacyAnalyzer)
+register_analyzer(estnltk.EstNLTKAnalyzer)
diff --git a/annif/analyzer/estnltk.py b/annif/analyzer/estnltk.py
@@ -0,0 +1,28 @@
+"""EstNLTK analyzer for Annif which uses EstNLTK for lemmatization"""
+
+from __future__ import annotations
+
+import annif.util
+from annif.exception import OperationFailedException
+
+from . import analyzer
+
+
+class EstNLTKAnalyzer(analyzer.Analyzer):
+    """Annif analyzer that lemmatizes text with the EstNLTK library."""
+
+    name = "estnltk"
+
+    def __init__(self, param: str, **kwargs) -> None:
+        self.param = param
+        super().__init__(**kwargs)
+
+    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
+        """Return the first lemma of each token; drop invalid ones if filter."""
+        import estnltk  # local import: estnltk is an optional dependency
+
+        tagged = estnltk.Text(text.strip())
+        tagged.tag_layer()
+        # .lemma entries are sequences (presumably alternative lemmas); keep first
+        first_lemmas = (candidates[0] for candidates in tagged.lemma)
+        return [w for w in first_lemmas if not filter or self.is_valid_token(w)]
diff --git a/pyproject.toml b/pyproject.toml
@@ -51,6 +51,7 @@ huggingface-hub = "~0.25.1"
 
 fasttext-wheel = { version = "0.9.2", optional = true }
 voikko = { version = "0.5.*", optional = true }
+estnltk = { version = "1.7.3", optional = true }
 tensorflow-cpu = { version = "~2.17.0", optional = true }
 lmdb = { version = "~1.5.1", optional = true }
 omikuji = { version = "0.5.*", optional = true }
@@ -73,6 +74,7 @@ schemathesis = "3.*.*"
 [tool.poetry.extras]
 fasttext = ["fasttext-wheel"]
 voikko = ["voikko"]
+estnltk = ["estnltk"]
 nn = ["tensorflow-cpu", "lmdb"]
 omikuji = ["omikuji"]
 yake = ["yake"]

diff --git a/tests/test_analyzer_estnltk.py b/tests/test_analyzer_estnltk.py
@@ -0,0 +1,49 @@
+"""Unit tests for EstNLTK analyzer in Annif"""
+
+import pytest
+
+import annif.analyzer
+
+estnltk = pytest.importorskip("estnltk")  # analyzer module imports it lazily
+
+
+def test_estnltk_tokenize_words():
+    """With the default filter, the "." tokens and "ka" are left out."""
+    sample = """
+        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
+        """
+    expected = [
+        "aga",
+        "kõik",
+        "juhtuma",
+        "iseenesest",
+        "köök",
+        "olema",
+        "kõik",
+        "endine",
+    ]
+    lemmatizer = annif.analyzer.get_analyzer("estnltk")
+    assert lemmatizer.tokenize_words(sample) == expected
+
+
+def test_estnltk_tokenize_words_no_filter():
+    """With filter=False every lemma is kept, including "." and "ka"."""
+    sample = """
+        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
+        """
+    expected = [
+        "aga",
+        "kõik",
+        "juhtuma",
+        "iseenesest",
+        ".",
+        "ka",
+        "köök",
+        "olema",
+        "kõik",
+        "endine",
+        ".",
+    ]
+    lemmatizer = annif.analyzer.get_analyzer("estnltk")
+    unfiltered = lemmatizer.tokenize_words(sample, filter=False)
+    assert unfiltered == expected