From 642281e54535f7aca53187c5eb76846499a26b6e Mon Sep 17 00:00:00 2001
From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com>
Date: Thu, 19 Sep 2024 15:16:28 +0300
Subject: [PATCH 1/4] Automate NLTK datapackage punkt_tab download for Analyzers

---
 annif/analyzer/analyzer.py | 16 ++++++++++++++++
 tests/test_analyzer.py     | 11 +++++++++++
 2 files changed, 27 insertions(+)

diff --git a/annif/analyzer/analyzer.py b/annif/analyzer/analyzer.py
index 25bdb6b57..d08de6480 100644
--- a/annif/analyzer/analyzer.py
+++ b/annif/analyzer/analyzer.py
@@ -6,6 +6,10 @@
 import functools
 import unicodedata
 
+import annif
+
+logger = annif.logger
+
 _KEY_TOKEN_MIN_LENGTH = "token_min_length"
 
 
@@ -21,6 +25,18 @@ def __init__(self, **kwargs) -> None:
         if _KEY_TOKEN_MIN_LENGTH in kwargs:
             self.token_min_length = int(kwargs[_KEY_TOKEN_MIN_LENGTH])
 
+        import nltk.data
+
+        try:
+            nltk.data.find("tokenizers/punkt_tab")
+        except LookupError as err:
+            logger.debug(str(err))
+            if "punkt_tab" in str(err):  # "punkt_tab" is surrounded by color code tags
+                logger.warning(
+                    'NLTK datapackage "punkt_tab" not found, downloading it now.'
+                )
+                nltk.download("punkt_tab")
+
     def tokenize_sentences(self, text: str) -> list[str]:
         """Tokenize a piece of text (e.g. a document) into sentences."""
         import nltk.tokenize
diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py
index ecddfbb37..ff4b70793 100644
--- a/tests/test_analyzer.py
+++ b/tests/test_analyzer.py
@@ -1,5 +1,7 @@
 """Unit tests for analyzers in Annif"""
 
+from unittest import mock
+
 import pytest
 
 import annif.analyzer
@@ -15,6 +17,15 @@ def test_get_analyzer_badspec():
         annif.analyzer.get_analyzer("()")
 
 
+@mock.patch("nltk.data.find", side_effect=LookupError("Resource punkt_tab not found"))
+@mock.patch("nltk.download")
+def test_nltk_data_missing(download, find):
+    annif.analyzer.get_analyzer("snowball(english)")
+    assert find.called
+    assert download.called
+    assert download.call_args == mock.call("punkt_tab")
+
+
 def test_english_analyzer_normalize_word():
     analyzer = annif.analyzer.get_analyzer("snowball(english)")
     assert analyzer._normalize_word("running") == "run"

From c047aeaa2b6656a5038eb35f83429bd3c6c3f532 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com>
Date: Thu, 19 Sep 2024 16:30:05 +0300
Subject: [PATCH 2/4] Use variable for NLTK tokenizer datapackage name (punkt_tab)

---
 annif/analyzer/analyzer.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/annif/analyzer/analyzer.py b/annif/analyzer/analyzer.py
index d08de6480..4b647c18b 100644
--- a/annif/analyzer/analyzer.py
+++ b/annif/analyzer/analyzer.py
@@ -11,6 +11,7 @@
 logger = annif.logger
 
 _KEY_TOKEN_MIN_LENGTH = "token_min_length"
+_NLTK_TOKENIZER_DATA = "punkt_tab"
 
 
 class Analyzer(metaclass=abc.ABCMeta):
@@ -28,14 +29,15 @@ def __init__(self, **kwargs) -> None:
         import nltk.data
 
         try:
-            nltk.data.find("tokenizers/punkt_tab")
+            nltk.data.find("tokenizers/" + _NLTK_TOKENIZER_DATA)
         except LookupError as err:
             logger.debug(str(err))
-            if "punkt_tab" in str(err):  # "punkt_tab" is surrounded by color code tags
+            if _NLTK_TOKENIZER_DATA in str(err):
                 logger.warning(
-                    'NLTK datapackage "punkt_tab" not found, downloading it now.'
+                    f'NLTK datapackage "{_NLTK_TOKENIZER_DATA}" not found, '
+                    "downloading it now."
                 )
-                nltk.download("punkt_tab")
+                nltk.download(_NLTK_TOKENIZER_DATA)
 
     def tokenize_sentences(self, text: str) -> list[str]:
         """Tokenize a piece of text (e.g. a document) into sentences."""

From f01d351079f0d1879ea3a7fc9b47f21fa997f562 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com>
Date: Thu, 19 Sep 2024 16:33:44 +0300
Subject: [PATCH 3/4] Remove instructions to download NLTK data

---
 README.md | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/README.md b/README.md
index 41e3a9f7f..707fe201a 100644
--- a/README.md
+++ b/README.md
@@ -38,10 +38,6 @@ The recommended way is to install Annif from
     source annif-venv/bin/activate
     pip install annif
 
-You will also need NLTK data files:
-
-    python -m nltk.downloader punkt_tab
-
 Start up the application:
 
     annif
@@ -113,10 +109,6 @@ Enter the virtual environment:
 
     poetry shell
 
-You will also need NLTK data files:
-
-    python -m nltk.downloader punkt_tab
-
 Start up the application:
 
     annif

From 45be777cfd12609150973b95d496cf232862526f Mon Sep 17 00:00:00 2001
From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com>
Date: Tue, 24 Sep 2024 08:21:31 +0300
Subject: [PATCH 4/4] Reraise LookupErrors that do not mention punkt_tab

---
 annif/analyzer/analyzer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/annif/analyzer/analyzer.py b/annif/analyzer/analyzer.py
index 4b647c18b..129b882ab 100644
--- a/annif/analyzer/analyzer.py
+++ b/annif/analyzer/analyzer.py
@@ -38,6 +38,8 @@ def __init__(self, **kwargs) -> None:
                     "downloading it now."
                 )
                 nltk.download(_NLTK_TOKENIZER_DATA)
+            else:
+                raise
 
     def tokenize_sentences(self, text: str) -> list[str]:
         """Tokenize a piece of text (e.g. a document) into sentences."""