chore: Move from lintian to a sphinx spelling plugin (canonical#3639)

georgehyde-dot · Dec 8, 2023 · 305ec6b · 305ec6b
1 parent 3e7caf3
commit 305ec6b
Show file tree

Hide file tree

Showing 8 changed files with 409 additions and 43 deletions.
diff --git a/.github/workflows/check_format.yml b/.github/workflows/check_format.yml
@@ -87,10 +87,12 @@ jobs:
       - name: "Install dependencies"
         run: |
           sudo DEBIAN_FRONTEND=noninteractive apt-get -qy update
-          sudo DEBIAN_FRONTEND=noninteractive apt-get -qy install tox lintian
+          sudo DEBIAN_FRONTEND=noninteractive apt-get -qy install tox
       - name: "Spellcheck"
         run: |
-          make check_spelling
+          tox
+        env:
+          TOXENV: doc-spelling
       - name: "Build docs"
         env:
           TOXENV: doc

diff --git a/Makefile b/Makefile
@@ -124,47 +124,8 @@ fmt:
 fmt-tip:
 	tox -e do_format_tip && tox -e check_format_tip
 
-# Spell check && filter false positives
-_CHECK_SPELLING := find doc -type f -exec spellintian {} + | \
-       grep -v -e 'doc/rtd/topics/cli.rst: modules modules' \
-               -e 'doc/examples/cloud-config-mcollective.txt: WARNING WARNING' \
-               -e 'doc/examples/cloud-config-power-state.txt: Bye Bye' \
-               -e 'doc/examples/cloud-config.txt: Bye Bye' \
-               -e 'doc/rtd/topics/cli.rst: DOCS DOCS' \
-               -e 'doc/summit/2023_summit_shared_notes.md: Moustafa Moustafa' \
-               -e 'dependant'
-
-
-# For CI we require a failing return code when spellintian finds spelling errors
-check_spelling:
-	@! $(_CHECK_SPELLING)
-
-# Manipulate the output of spellintian into a valid "sed" command which is run
-# to fix the error
-#
-# Example spellintian output:
-#
-# doc/examples/kernel-cmdline.txt: everthing -> everything
-#
-# The "fix_spelling" target manipulates the above output into the following command
-# and runs that command.
-#
-# sed -i "s/everthing/everything/g" doc/examples/kernel-cmdline.txt
-#
-# awk notes:
-#
-# -F ': | -> ' means use the strings ": " or " -> " as field delimeters
-# \046 is octal for double quote
-# $$2 will contain the second field, ($ must be escaped because this is in a Makefile)
-#
-# Limitation: duplicate words with newline between them are not automatically fixed
-fix_spelling:
-	@$(_CHECK_SPELLING) | \
-		sed 's/ (duplicate word)//g' | \
-		awk -F ': | -> ' '{printf "sed -i \047s/%s/%s/g\047 %s\n", $$2, $$3, $$1}' | \
-		sh
 
 .PHONY: all check test lint clean rpm srpm deb deb-src yaml
 .PHONY: check_version clean_pyc
-.PHONY: unittest style-check fix_spelling render-template benchmark-generator
-.PHONY: clean_pytest clean_packaging check_spelling clean_release doc
+.PHONY: unittest style-check render-template benchmark-generator
+.PHONY: clean_pytest clean_packaging clean_release doc
diff --git a/doc-requirements.txt b/doc-requirements.txt
@@ -6,3 +6,4 @@ sphinx==7.1.2
 sphinx-design
 sphinx-copybutton
 sphinx-notfound-page
+sphinxcontrib-spelling
diff --git a/doc/rtd/conf.py b/doc/rtd/conf.py
@@ -36,8 +36,17 @@
     "sphinx.ext.autodoc",
     "sphinx.ext.autosectionlabel",
     "sphinx.ext.viewcode",
+    "sphinxcontrib.spelling",
 ]
 
+
+# Spelling settings for sphinxcontrib.spelling
+# https://docs.ubuntu.com/styleguide/en/
+spelling_warning = True
+
+# Uses case-independent spelling matches from doc/rtd/spelling_word_list.txt
+spelling_filters = ["spelling.WordListFilter"]
+
 # The suffix of source filenames.
 source_suffix = ".rst"
 

diff --git a/doc/rtd/spelling.py b/doc/rtd/spelling.py
@@ -0,0 +1,79 @@
+import pathlib
+import re
+
+import enchant
+
+
+class WordListFilter(enchant.tokenize.Filter):
+    word_list = "spelling_word_list.txt"
+    regex_list = "spelling_regex_list.txt"
+
+    def __init__(self, *args, **kwargs):
+        """Use two files for ignoring correctly spelled words
+
+        - spelling_word_list.txt: a list of exact matches to ignore
+        - spelling_regex_list.txt: a list of regular expressions to ignore
+
+        Splits tokens on "/" and "-".
+        """
+        super().__init__(*args, *kwargs)
+        directory = pathlib.Path(__file__).parent
+        with open(directory.joinpath(self.word_list)) as f:
+            lines = f.read().splitlines()
+            self._validate_lines(lines)
+            self.word_set = set(lines)
+            print(f"Loaded {self.word_list}: {lines})")
+        with open(directory.joinpath(self.regex_list)) as f:
+            regex_lines = f.read().splitlines()
+            self.regex_set = set(regex_lines)
+            print(f"Loaded {self.regex_list}: {regex_lines}")
+
+    def _validate_lines(self, lines):
+        """Assert that the word_list file is legible and orderly"""
+        for line in lines:
+            if line != line.lower():
+                raise Exception(
+                    f"Uppercase characters in {self.word_list} detected. "
+                    "Please use lowercase characters for legibility."
+                )
+        if lines != sorted(lines):
+            first_missordered = next_item = previous_item = None
+            for item_a, item_b in zip(lines, sorted(lines)):
+                if first_missordered:
+                    next_item = item_a
+                    break
+                elif item_a != item_b:
+                    first_missordered = item_a
+                else:
+                    previous_item = item_a
+            unordered = (
+                f"[..., {previous_item}, {first_missordered}, "
+                f"{next_item}, ...]"
+            )
+            raise Exception(
+                f"Unsorted {self.word_list} detected. "
+                f"Please sort for legibility. Unordered list: {unordered}"
+            )
+
+    def _in_word_list(self, word):
+        """Lowercase match the set of words in spelling_word_list.txt"""
+        return word.lower() in self.word_set
+
+    def _in_word_regex(self, word):
+        """Regex match the expressions in spelling_regex_list.txt"""
+        for regex in self.regex_set:
+            out = re.search(regex, word)
+            if out:
+                return True
+
+    def _skip(self, word):
+        """Skip words and regex expressions in the allowlist files"""
+        return self._in_word_list(word) or self._in_word_regex(word)
+
+    def _split(self, word):
+        """split words into sub-tokens on - and /"""
+        if "-" in word or "/" in word:
+            for i, token in enumerate(re.split("-|/", word)):
+                if self._skip(token):
+                    continue
+                yield token, i
diff --git a/doc/rtd/spelling_regex_list.txt b/doc/rtd/spelling_regex_list.txt
@@ -0,0 +1,14 @@
+\.py$
+\.sources$
+\.list$
+\.yml$
+\.yaml$
+cloud-init
+ami\-
+ubuntu\:
+IPv[46]
+^/
+$/
+ecdsa-sha2-nistp
+ed25519
+1.0/config/user