Use black code style #640

Merged, 13 commits, Nov 7, 2022

Changes from 12 commits
5 changes: 5 additions & 0 deletions .codeclimate.yml
@@ -1,6 +1,11 @@
 engines:
   pep8:
     enabled: true
+    checks:
+      E203: # Check whitespace before ':'
+        enabled: false
+      E501: # Line length checks
+        enabled: false
   radon:
     enabled: true
   duplication:
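A note on the two pycodestyle checks disabled above: E501 duplicates line-length enforcement (Black wraps lines itself, at 88 columns by default), and E203 flags the whitespace that Black deliberately inserts around `:` in slices with complex bounds, so the two tools would otherwise fight. A minimal illustration (hypothetical snippet, not from this repo):

```python
# Hypothetical example of Black's slice formatting.
ham = list(range(10))
lower, upper, offset = 1, 7, 2

# Black puts spaces around ":" when the bounds are expressions; pycodestyle's
# E203 ("whitespace before ':'") would flag the space before the colon.
chunk = ham[lower + offset : upper + offset]
print(chunk)  # [3, 4, 5, 6, 7, 8]
```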
2 changes: 2 additions & 0 deletions .git-blame-ignore-revs
@@ -0,0 +1,2 @@
+# Migrate code style to Black
+3bc18907354a40f1d89dca1833a2719ba7fb0933
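The hash listed here is the mass-reformatting commit; recording it lets `git blame` skip it so authorship of the surrounding code stays visible. Each clone has to opt in, though, with a one-time setup command (standard Git, version 2.23 or newer; not specific to this repo):

```bash
git config blame.ignoreRevsFile .git-blame-ignore-revs
```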
19 changes: 11 additions & 8 deletions .github/workflows/cicd.yml
@@ -12,6 +12,14 @@ env:
   PIPX_BIN_DIR: "/home/runner/.local/bin"
   POETRY_VERSION: "1.2.0"
 jobs:
+
+  lint:
+    runs-on: ubuntu-latest
+    name: lint with Black
+    steps:
+      - uses: actions/checkout@v3
+      - uses: psf/black@6b42c2b8c9f9bd666120a2c19b8da509fe477f27
+
   test:
     runs-on: ubuntu-latest
     timeout-minutes: 15
@@ -23,6 +31,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
      - name: Install system packages
+
        run: |
          sudo apt-get install \
            libvoikko1 \
@@ -63,12 +72,6 @@ jobs:
          fi
          poetry run python -m nltk.downloader punkt

-      - name: Lint with flake8
-        run: |
-          # stop the build if there are Python syntax errors or undefined names
-          poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-          poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with pytest
        run: |
          poetry run pytest --cov=./ --cov-report xml
@@ -77,7 +80,7 @@ jobs:

   publish-docker-latest:
     name: publish latest Docker image
-    needs: test
+    needs: [lint, test]
     runs-on: ubuntu-20.04
     timeout-minutes: 15
     if: github.event_name == 'push' && github.ref == 'refs/heads/master'
@@ -104,7 +107,7 @@ jobs:

   publish-release:
     name: publish release
-    needs: test
+    needs: [lint, test]
     runs-on: ubuntu-20.04
     if: github.event_name == 'push' && contains(github.ref, 'refs/tags/')
     steps:
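Two details worth noting in the workflow changes above: the psf/black action is pinned to a full commit SHA rather than a mutable tag, and both publish jobs now gate on the new lint job as well as the tests. To reproduce roughly what the lint gate checks before pushing (assuming Black is available in the Poetry environment, as the README section below describes):

```bash
# Report files Black would reformat, without modifying anything.
poetry run black --check --diff .
```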
13 changes: 13 additions & 0 deletions README.md
@@ -11,6 +11,7 @@
 [![LGTM: Python](https://img.shields.io/lgtm/grade/python/g/NatLibFi/Annif.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/NatLibFi/Annif/context:python)
 [![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=NatLibFi_Annif&metric=alert_status)](https://sonarcloud.io/dashboard?id=NatLibFi_Annif)
 [![docs](https://readthedocs.org/projects/annif/badge/?version=latest)](https://annif.readthedocs.io/en/latest/index.html)
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

 Annif is an automated subject indexing toolkit. It was originally created as
 a statistical automated indexing tool that used metadata from the
@@ -99,6 +100,18 @@ Run `poetry shell` to enter the virtual environment and then run `pytest`.
 To have the test suite watch for changes in code and run automatically, use
 pytest-watch by running `ptw`.

+## Code style
+
+Annif code should follow the [Black style](https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html).
+The Black tool is included as a development dependency; you can run `black .` in the project root to autoformat code.
+You can set up a [pre-commit hook](https://git-scm.com/book/en/v2/Customizing-Git-Git-Hooks) that runs Black formatting and flake8 linting on every git commit by placing the following in the file `.git/hooks/pre-commit` (which should have execute permission set):
+```bash
+#!/bin/sh
+
+black .
+flake8
+```
+
 # Getting help

 Many resources are available:
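A follow-up note on the pre-commit hook added in the README hunk above: git only runs hooks that are executable, so after creating the file, one way to set the permission is (standard shell, for illustration):

```bash
chmod +x .git/hooks/pre-commit
```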
21 changes: 11 additions & 10 deletions annif/__init__.py
@@ -6,7 +6,7 @@
 import connexion
 from flask_cors import CORS

-logger = logging.getLogger('annif')
+logger = logging.getLogger("annif")

 import annif.backend  # noqa

@@ -15,29 +15,30 @@ def create_app(config_name=None):
     # 'cxapp' here is the Connexion application that has a normal Flask app
     # as a property (cxapp.app)

-    specdir = os.path.join(os.path.dirname(__file__), 'swagger')
+    specdir = os.path.join(os.path.dirname(__file__), "swagger")
     cxapp = connexion.App(__name__, specification_dir=specdir)
     if config_name is None:
-        config_name = os.environ.get('ANNIF_CONFIG')
+        config_name = os.environ.get("ANNIF_CONFIG")
     if config_name is None:
-        if os.environ.get('FLASK_RUN_FROM_CLI') == 'true':
-            config_name = 'annif.default_config.Config'
+        if os.environ.get("FLASK_RUN_FROM_CLI") == "true":
+            config_name = "annif.default_config.Config"
         else:
-            config_name = 'annif.default_config.ProductionConfig'
-    logger.debug('creating app with configuration %s', config_name)
+            config_name = "annif.default_config.ProductionConfig"
+    logger.debug("creating app with configuration %s", config_name)
     cxapp.app.config.from_object(config_name)
-    cxapp.app.config.from_envvar('ANNIF_SETTINGS', silent=True)
+    cxapp.app.config.from_envvar("ANNIF_SETTINGS", silent=True)

-    cxapp.add_api('annif.yaml')
+    cxapp.add_api("annif.yaml")

     # add CORS support
     CORS(cxapp.app)

-    if cxapp.app.config['INITIALIZE_PROJECTS']:
+    if cxapp.app.config["INITIALIZE_PROJECTS"]:
         annif.registry.initialize_projects(cxapp.app)

     # register the views via blueprints
     from annif.views import bp
+
     cxapp.app.register_blueprint(bp)

     # return the Flask app
7 changes: 4 additions & 3 deletions annif/analyzer/__init__.py
@@ -15,10 +15,9 @@ def register_analyzer(analyzer):


 def get_analyzer(analyzerspec):
-    match = re.match(r'(\w+)(\((.*)\))?', analyzerspec)
+    match = re.match(r"(\w+)(\((.*)\))?", analyzerspec)
     if match is None:
-        raise ValueError(
-            "Invalid analyzer specification {}".format(analyzerspec))
+        raise ValueError("Invalid analyzer specification {}".format(analyzerspec))

     analyzer = match.group(1)
     posargs, kwargs = parse_args(match.group(3))
@@ -36,12 +35,14 @@
 # Optional analyzers
 try:
     from . import voikko
+
     register_analyzer(voikko.VoikkoAnalyzer)
 except ImportError:
     annif.logger.debug("voikko not available, not enabling voikko analyzer")

 try:
     from . import spacy
+
     register_analyzer(spacy.SpacyAnalyzer)
 except ImportError:
     annif.logger.debug("spaCy not available, not enabling spacy analyzer")
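For context on the regex in `get_analyzer` above: it accepts either a bare analyzer name or a name followed by parenthesized arguments. A short sketch of what the pattern captures (illustrative spec strings, not taken from the diff):

```python
import re

# Group 1 is the analyzer name; group 3 is the raw argument string
# (None when no parentheses are given), later handled by parse_args().
for spec in ["simple", "snowball(english)"]:
    match = re.match(r"(\w+)(\((.*)\))?", spec)
    print(match.group(1), match.group(3))
# simple None
# snowball english
```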
14 changes: 9 additions & 5 deletions annif/analyzer/analyzer.py
@@ -4,7 +4,7 @@
 import functools
 import unicodedata

-_KEY_TOKEN_MIN_LENGTH = 'token_min_length'
+_KEY_TOKEN_MIN_LENGTH = "token_min_length"


 class Analyzer(metaclass=abc.ABCMeta):
@@ -22,6 +22,7 @@ def __init__(self, **kwargs):
     def tokenize_sentences(self, text):
         """Tokenize a piece of text (e.g. a document) into sentences."""
         import nltk.tokenize
+
         return nltk.tokenize.sent_tokenize(text)

     @functools.lru_cache(maxsize=50000)
@@ -31,7 +32,7 @@ def is_valid_token(self, word):
             return False
         for char in word:
             category = unicodedata.category(char)
-            if category[0] == 'L':  # letter
+            if category[0] == "L":  # letter
                 return True
         return False

@@ -41,9 +42,12 @@ def tokenize_words(self, text, filter=True):
         punctuation, numbers or very short words)"""

         import nltk.tokenize
-        return [self._normalize_word(word)
-                for word in nltk.tokenize.word_tokenize(text)
-                if (not filter or self.is_valid_token(word))]
+
+        return [
+            self._normalize_word(word)
+            for word in nltk.tokenize.word_tokenize(text)
+            if (not filter or self.is_valid_token(word))
+        ]

     def _normalize_word(self, word):
         """Normalize (stem or lemmatize) a word form into a normal form."""
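As an aside, the token filter in `tokenize_words` has two parts: a minimum length (the `token_min_length` parameter) and the requirement, implemented in `is_valid_token`, that a token contain at least one Unicode letter. A standalone sketch of the letter check (simplified to drop the length handling):

```python
import unicodedata

def has_letter(word):
    # True if any character falls in a Unicode "Letter" category (Lu, Ll, ...).
    return any(unicodedata.category(char)[0] == "L" for char in word)

print(has_letter("cats"))   # True
print(has_letter("123"))    # False: digits only
print(has_letter("täysi"))  # True: non-ASCII letters count too
```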
1 change: 1 addition & 0 deletions annif/analyzer/snowball.py
@@ -10,6 +10,7 @@ class SnowballAnalyzer(analyzer.Analyzer):
     def __init__(self, param, **kwargs):
         self.param = param
         import nltk.stem.snowball
+
         self.stemmer = nltk.stem.snowball.SnowballStemmer(param)
         super().__init__(**kwargs)

19 changes: 11 additions & 8 deletions annif/analyzer/spacy.py
@@ -4,32 +4,35 @@
 from annif.exception import OperationFailedException
 import annif.util

-_KEY_LOWERCASE = 'lowercase'
+_KEY_LOWERCASE = "lowercase"


 class SpacyAnalyzer(analyzer.Analyzer):
     name = "spacy"

     def __init__(self, param, **kwargs):
         import spacy
+
         self.param = param
         try:
-            self.nlp = spacy.load(param, exclude=['ner', 'parser'])
+            self.nlp = spacy.load(param, exclude=["ner", "parser"])
         except IOError as err:
             raise OperationFailedException(
-                f"Loading spaCy model '{param}' failed - " +
-                f"please download the model.\n{err}")
+                f"Loading spaCy model '{param}' failed - "
+                + f"please download the model.\n{err}"
+            )
         if _KEY_LOWERCASE in kwargs:
             self.lowercase = annif.util.boolean(kwargs[_KEY_LOWERCASE])
         else:
             self.lowercase = False
         super().__init__(**kwargs)

     def tokenize_words(self, text, filter=True):
-        lemmas = [lemma
-                  for lemma in (token.lemma_
-                                for token in self.nlp(text.strip()))
-                  if (not filter or self.is_valid_token(lemma))]
+        lemmas = [
+            lemma
+            for lemma in (token.lemma_ for token in self.nlp(text.strip()))
+            if (not filter or self.is_valid_token(lemma))
+        ]
         if self.lowercase:
             return [lemma.lower() for lemma in lemmas]
         else:
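To make the `lowercase` handling above concrete, a hedged usage sketch: it assumes the spaCy model `en_core_web_sm` has been downloaded, and that the spec-string form `spacy(en_core_web_sm,lowercase=true)` is accepted by `parse_args` (both are assumptions for illustration):

```python
from annif.analyzer import get_analyzer

# Hypothetical spec string: positional arg = spaCy model name,
# keyword arg = the lowercase flag parsed via annif.util.boolean().
analyzer = get_analyzer("spacy(en_core_web_sm,lowercase=true)")
print(analyzer.tokenize_words("The cats were running"))
# Lemmatized, then lowercased: e.g. ['the', 'cat', 'be', 'run']
```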
6 changes: 3 additions & 3 deletions annif/analyzer/voikko.py
@@ -18,13 +18,13 @@ def __getstate__(self):
         instance is set to None because as a ctypes object it cannot be
         pickled."""

-        return {'param': self.param, 'voikko': None}
+        return {"param": self.param, "voikko": None}

     @functools.lru_cache(maxsize=500000)
     def _normalize_word(self, word):
         if self.voikko is None:
             self.voikko = voikko.libvoikko.Voikko(self.param)
         result = self.voikko.analyze(word)
-        if len(result) > 0 and 'BASEFORM' in result[0]:
-            return result[0]['BASEFORM']
+        if len(result) > 0 and "BASEFORM" in result[0]:
+            return result[0]["BASEFORM"]
         return word
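The docstring above explains the pickling constraint: a libvoikko handle is a ctypes object, which the pickle module cannot serialize, so `__getstate__` stores `None` and the handle is re-created lazily in `_normalize_word`. A minimal generic sketch of the pattern (not Annif code):

```python
import pickle

class LazyHandle:
    def __init__(self):
        self.handle = object()  # stand-in for an unpicklable ctypes handle

    def __getstate__(self):
        # Drop the problematic member; it is rebuilt lazily after unpickling.
        return {"handle": None}

restored = pickle.loads(pickle.dumps(LazyHandle()))
print(restored.handle)  # None: the real class re-creates it on first use
```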