diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index e4fbe17e..544446b3 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -85,7 +85,7 @@ jobs: fi # For Python 3.10: if [[ ${{ matrix.python-version }} == '3.10' ]]; then - poetry install -E "fasttext spacy"; + poetry install -E "fasttext spacy estnltk"; # download the small English pretrained spaCy model needed by spacy analyzer poetry run python -m spacy download en_core_web_sm --upgrade-strategy only-if-needed fi diff --git a/README.md b/README.md index 88d47309..c92ec653 100644 --- a/README.md +++ b/README.md @@ -223,13 +223,24 @@ https://doi.org/10.18352/lq.10285 # License -The code in this repository is licensed under Apache License 2.0, except for the -dependencies included under `annif/static/css` and `annif/static/js`, -which have their own licenses, see the file headers for details. -Please note that the [YAKE](https://github.com/LIAAD/yake) library is licended -under [GPLv3](https://www.gnu.org/licenses/gpl-3.0.txt), while Annif is -licensed under the Apache License 2.0. The licenses are compatible, but -depending on legal interpretation, the terms of the GPLv3 (for example the -requirement to publish corresponding source code when publishing an executable -application) may be considered to apply to the whole of Annif+Yake if you -decide to install the optional Yake dependency. +The code in this repository is licensed under Apache License 2.0, except for +the dependencies included under `annif/static/css` and `annif/static/js`, +which have their own licenses; see the file headers for details. 
+ +Please note that the [YAKE](https://github.com/LIAAD/yake) library is +licensed under [GPLv3](https://www.gnu.org/licenses/gpl-3.0.txt) and the +[EstNLTK-core](https://github.com/estnltk/estnltk/tree/main/estnltk_core) +library is licensed under +[GPLv2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.html), while Annif +itself is licensed under the Apache License 2.0. It is commonly accepted +that the GPLv3 and Apache 2.0 licenses are compatible at least in one +direction (GPLv3 is more restrictive than the Apache License), while the +compatibility between GPLv2 and Apache 2.0 licenses is a more difficult +question with arguments made both for and against license compatibility; +obviously it also depends on the legal environment. The Annif developers +make no legal claims; we simply provide the software and allow the user to +install these optional extensions if they consider it appropriate. Depending +on legal interpretation, the terms of the GPL (for example the requirement +to publish corresponding source code when publishing an executable +application) may be considered to apply to the whole of Annif+extensions if +you decide to install the optional Yake and/or EstNLTK dependencies. diff --git a/annif/analyzer/__init__.py b/annif/analyzer/__init__.py index 81f52511..fcd57baf 100644 --- a/annif/analyzer/__init__.py +++ b/annif/analyzer/__init__.py @@ -8,7 +8,7 @@ import annif from annif.util import parse_args -from . import simple, simplemma, snowball, spacy, voikko +from . 
class EstNLTKAnalyzer(analyzer.Analyzer):
    """Annif analyzer that lemmatizes text using the optional EstNLTK library.

    The ``estnltk`` package is never imported at module load time; it is
    looked up in :meth:`is_available` and imported inside
    :meth:`tokenize_words`, so this module can be imported even when the
    optional dependency is not installed.
    """

    name = "estnltk"

    @staticmethod
    def is_available() -> bool:
        """Return True iff the ``estnltk`` package can be located."""
        # A plain ``import importlib`` does not guarantee that the
        # ``importlib.util`` submodule has been loaded; relying on it can
        # raise AttributeError depending on what else was imported earlier.
        # Import the submodule explicitly before using it.
        import importlib.util

        return importlib.util.find_spec("estnltk") is not None

    def __init__(self, param: str, **kwargs) -> None:
        """Store the analyzer parameter and initialize the base analyzer.

        Args:
            param: analyzer parameter; it is only stored on the instance
                and is not otherwise used in this class.
            **kwargs: forwarded unchanged to the base class initializer.
        """
        self.param = param
        super().__init__(**kwargs)

    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
        """Return the lemmas of the words in ``text``.

        Args:
            text: the text to analyze; surrounding whitespace is stripped
                before it is handed to EstNLTK.
            filter: when True, keep only lemmas accepted by
                ``self.is_valid_token``; when False, return every lemma.

        Returns:
            A list of lemmas in the order EstNLTK reports them.
        """
        # Imported lazily because EstNLTK is an optional dependency.
        import estnltk

        txt = estnltk.Text(text.strip())
        # NOTE(review): tag_layer() is presumably what populates the
        # ``lemma`` attribute read below -- confirm against EstNLTK docs.
        txt.tag_layer()
        # Each entry of ``txt.lemma`` is indexed with [0], i.e. only the
        # first lemma reported for each entry is used.
        first_lemmas = (candidates[0] for candidates in txt.lemma)
        return [
            lemma
            for lemma in first_lemmas
            if not filter or self.is_valid_token(lemma)
        ]
def test_estnltk_tokenize_words():
    """With the default filtering, only the expected lemmas are returned."""
    estonian = annif.analyzer.get_analyzer("estnltk")
    sample = """
        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
        """
    expected = [
        "aga",
        "kõik",
        "juhtuma",
        "iseenesest",
        "köök",
        "olema",
        "kõik",
        "endine",
    ]

    lemmas = estonian.tokenize_words(sample)

    assert lemmas == expected


def test_estnltk_tokenize_words_no_filter():
    """With filter=False, the "." and "ka" tokens are returned as well."""
    estonian = annif.analyzer.get_analyzer("estnltk")
    sample = """
        Aga kõik juhtus iseenesest. Ka köögis oli kõik endine.
        """
    expected = [
        "aga",
        "kõik",
        "juhtuma",
        "iseenesest",
        ".",
        "ka",
        "köök",
        "olema",
        "kõik",
        "endine",
        ".",
    ]

    lemmas = estonian.tokenize_words(sample, filter=False)

    assert lemmas == expected