tools/check.py

#! /usr/bin/env python

"""
Validate Software Carpentry lessons
according to the Markdown template specification described here:
http://software-carpentry.org/blog/2014/10/new-lesson-template-v2.html

Validates the presence of headings, as well as specific sub-nodes.
Contains validators for several kinds of template.

Call at command line with flag -h to see options and usage instructions.
"""

import argparse
import collections
import functools
import glob
import hashlib
import logging
import os
import re
import sys

import CommonMark
import yaml

import validation_helpers as vh

# Running total of calls made through ``logging.error`` during this run.
NUMBER_OF_ERRORS = 0


def incr_error(func):
    """Return a version of *func* that bumps ``NUMBER_OF_ERRORS`` per call.

    The counter is incremented before *func* runs; the wrapped function's
    return value is passed through unchanged.
    """
    @functools.wraps(func)
    def counting_call(*positional, **named):
        global NUMBER_OF_ERRORS
        NUMBER_OF_ERRORS = NUMBER_OF_ERRORS + 1
        outcome = func(*positional, **named)
        return outcome
    return counting_call


# Patch the module-level helper so every logging.error(...) call is counted.
logging.error = incr_error(logging.error)


class MarkdownValidator(object):
    """Base class for Markdown validation.

    Contains the basic validation skeleton, to be extended for specific
    page types. Subclasses tune behavior through the class attributes below
    and by overriding ``_run_tests`` and/or ``_partition_links``.

    Every ``_validate_*`` method logs the problems it finds and returns a
    truthy value when the check passed and a falsy value otherwise.
    """
    HEADINGS = []  # List of strings containing expected heading text

    # Callout boxes (blockquote items) have special rules.
    # Dict of tuples for each callout type: {style: (title, min, max)}
    # A title of None means any title is accepted; a min/max of None means
    # that bound is not enforced.
    CALLOUTS = {}

    WARN_ON_EXTRA_HEADINGS = False  # Warn when other headings are present?

    # Validate YAML doc headers: dict of {header text: validation_func}
    DOC_HEADERS = {}

    def __init__(self, filename=None, markdown=None):
        """Load and parse the Markdown document to be validated.

        Validator accepts either the path to a file containing Markdown,
        OR a valid Markdown string. The latter is useful for unit testing.

        :param filename: path to a Markdown file; takes precedence over
            ``markdown`` when given
        :param markdown: Markdown source text, used only when no filename
            is supplied
        """
        self.filename = filename

        if filename:
            # Expect Markdown files to be in same directory as the input file
            self.markdown_dir = os.path.dirname(filename)
            self.lesson_dir = self.markdown_dir
            # Plain 'r' is used instead of the legacy 'rU': the 'U' flag was
            # removed in Python 3.11 (ValueError), and Python 3 text mode
            # already applies universal-newline translation.
            with open(filename, 'r') as f:
                self.markdown = f.read()
        else:
            # Look for linked content in ../pages (relative to this file)
            self.lesson_dir = os.path.abspath(
                os.path.join(os.path.dirname(__file__), os.pardir))

            self.markdown_dir = self.lesson_dir
            self.markdown = markdown

        ast = self._parse_markdown(self.markdown)
        self.ast = vh.CommonMarkHelper(ast)

        # Keep track of how many times callout box styles are used
        self._callout_counts = collections.Counter()

    def _parse_markdown(self, markdown):
        """Parse a Markdown string with CommonMark and return the result"""
        parser = CommonMark.DocParser()
        ast = parser.parse(markdown)
        return ast

    def _validate_no_fixme(self):
        """Validate that the file contains no lines marked 'FIXME'

        This is based on the raw markdown, not the ast. The match is
        case-insensitive and every offending line is reported (1-based)."""
        valid = True
        for i, line in enumerate(self.markdown.splitlines()):
            if re.search("FIXME", line, re.IGNORECASE):
                logging.error(
                    "In {0}: "
                    "Line {1} contains FIXME, indicating "
                    "work in progress".format(self.filename, i+1))
                valid = False
        return valid

    def _validate_hrs(self):
        """Validate header

        Verify that the header section at top of document
        is bracketed by two horizontal rules, i.e. that the first and third
        top-level nodes of the document are both horizontal rules."""
        valid = True
        try:
            hr_nodes = [self.ast.children[0], self.ast.children[2]]
        except IndexError:
            # Fewer than three top-level nodes: no room for a header section
            logging.error(
                "In {0}: "
                "Document must include header sections".format(self.filename))
            return False

        for hr in hr_nodes:
            if not self.ast.is_hr(hr):
                logging.error(
                    "In {0}: "
                    "Expected --- at line: {1}".format(
                        self.filename, hr.start_line))
                valid = False
        return valid

    def _validate_one_doc_header_row(self, label, content):
        """Validate a single row of the document header section

        :param label: the header key found in the document
        :param content: the value associated with that key
        :return: False for labels not listed in ``DOC_HEADERS`` (logged as
            a warning); otherwise the result of the label's validation
            function applied to ``content``
        """
        if label not in self.DOC_HEADERS:
            logging.warning(
                "In {0}: "
                "Unrecognized label in header section: {1}".format(
                    self.filename, label))
            return False

        validation_function = self.DOC_HEADERS[label]
        validate_header = validation_function(content)
        if not validate_header:
            logging.error(
                "In {0}: "
                "Contents of document header field for label {1} "
                "do not follow expected format".format(self.filename, label))
        return validate_header

    # Methods related to specific validation. Can override specific tests.
    def _validate_doc_headers(self):
        """Validate the document header section.

        Pass only if the header of the document contains the specified
            sections with the expected contents"""

        # Test: Header section should be wrapped in hrs
        has_hrs = self._validate_hrs()

        # The header text is the node between the two horizontal rules.
        # An IndexError here (document too short) is trapped by validate().
        header_node = self.ast.children[1]
        header_text = '\n'.join(header_node.strings)

        # Parse headers as YAML. safe_load is used rather than yaml.load:
        # the header comes from a user-edited file, plain YAML types are all
        # that is needed, and yaml.load without an explicit Loader is
        # rejected by PyYAML >= 6.
        header_yaml = yaml.safe_load(header_text)
        if not isinstance(header_yaml, dict):
            # Covers both an empty header (None) and a bare scalar (str)
            logging.error("In {0}: "
                          "Expected YAML markup with labels "
                          "{1}".format(self.filename,
                                       list(self.DOC_HEADERS)))
            return False

        # Test: Labeled YAML should match expected format
        test_headers = [self._validate_one_doc_header_row(k, v)
                        for k, v in header_yaml.items()]

        # Test: Must have all expected header lines, and no others.
        only_headers = (len(header_yaml) == len(self.DOC_HEADERS))

        # If expected headings are missing, print an informative message
        missing_headings = [h for h in self.DOC_HEADERS
                            if h not in header_yaml]

        for h in missing_headings:
            logging.error("In {0}: "
                          "Header section is missing expected "
                          "row '{1}'".format(self.filename, h))

        return has_hrs and all(test_headers) and only_headers

    def _validate_section_heading_order(self, ast_node=None, headings=None):
        """Verify that section headings appear, and in the order expected

        :param ast_node: node whose section headings are inspected; when
            omitted, the whole document is checked against ``self.HEADINGS``
        :param headings: expected heading titles for ``ast_node``; falls
            back to ``self.HEADINGS`` when omitted
        """
        if ast_node is None:
            ast_node = self.ast.data
            headings = self.HEADINGS
        elif headings is None:
            # A node without an explicit expectation list previously crashed
            # while iterating over None; use the class-level default instead.
            headings = self.HEADINGS

        heading_nodes = self.ast.get_section_headings(ast_node)
        # All headings should be exactly level 2
        correct_level = True
        for n in heading_nodes:
            if n.level != 2:
                logging.error(
                    "In {0}: "
                    "Heading at line {1} should be level 2".format(
                        self.filename, n.start_line))
                correct_level = False

        heading_labels = [vh.strip_attrs(n.strings[0]) for n in heading_nodes]

        # Check for missing and extra headings
        missing_headings = [expected_heading for expected_heading in headings
                            if expected_heading not in heading_labels]

        extra_headings = [found_heading for found_heading in heading_labels
                          if found_heading not in headings]

        for h in missing_headings:
            logging.error(
                "In {0}: "
                "Document is missing expected heading: {1}".format(
                    self.filename, h))

        if self.WARN_ON_EXTRA_HEADINGS is True:
            for h in extra_headings:
                logging.error(
                    "In {0}: "
                    "Document contains heading "
                    "not specified in the template: {1}".format(
                        self.filename, h))
            no_extra = (len(extra_headings) == 0)
        else:
            no_extra = True

        # Check that the subset of headings
        # in the template spec matches order in the document.
        # Only meaningful when nothing is missing; otherwise the missing
        # headings have already been reported above.
        valid_order = True
        headings_overlap = [h for h in heading_labels if h in headings]
        if len(missing_headings) == 0 and headings_overlap != headings:
            valid_order = False
            logging.error(
                "In {0}: "
                "Document headings do not match "
                "the order specified by the template".format(self.filename))

        return (len(missing_headings) == 0) and \
            valid_order and no_extra and correct_level

    def _validate_one_callout(self, callout_node):
        """
        Logic to validate a single callout box (defined as a blockquoted
        section that starts with a heading). Checks that:

        * First child of callout box should be a lvl 2 header with
          known title & CSS style
        * Callout box must have at least one child after the heading

        An additional test is done in another function:
        * Checks # times callout style appears in document, minc <= n <= maxc

        Side effect: increments ``self._callout_counts`` for the style found.
        """
        heading_node = callout_node.children[0]
        valid_head_lvl = self.ast.is_heading(heading_node, heading_level=2)
        title, styles = self.ast.get_heading_info(heading_node)

        if not valid_head_lvl:
            logging.error("In {0}: "
                          "Callout box titled '{1}' must start with a "
                          "level 2 heading".format(self.filename, title))

        try:
            # Only the first listed style identifies the callout type
            style = styles[0]
        except IndexError:
            logging.error(
                "In {0}: "
                "Callout section titled '{1}' must specify "
                "a CSS style".format(self.filename, title))
            return False

        # Track # times this style is used in any callout
        self._callout_counts[style] += 1

        # Verify style actually in callout spec
        if style not in self.CALLOUTS:
            spec_title = None
            valid_style = False
        else:
            spec_title, _, _ = self.CALLOUTS[style]
            valid_style = True

        has_children = self.ast.has_number_children(callout_node, minc=2)
        if spec_title is not None and title != spec_title:
            # Callout box must have specified heading title
            logging.error(
                "In {0}: "
                "Callout section with style '{1}' should have "
                "title '{2}'".format(self.filename, style, spec_title))
            valid_title = False
        else:
            valid_title = True

        res = (valid_style and valid_title and has_children and valid_head_lvl)
        return res

    def _validate_callouts(self):
        """
        Validate all sections that appear as callouts

        The style is a better determinant of callout than the title
        """
        callout_nodes = self.ast.get_callouts()
        callouts_valid = True

        # Validate all the callout nodes present.
        # The result is stored first so that every callout is inspected
        # (and reported) even after an earlier one has failed.
        for n in callout_nodes:
            res = self._validate_one_callout(n)
            callouts_valid = callouts_valid and res

        found_styles = self._callout_counts

        # Issue error if style is not present correct # times
        # (a min of None counts as 0, a max of None as "no upper limit")
        missing_styles = [style
                          for style, (title, minc, maxc) in self.CALLOUTS.items()
                          if not ((minc or 0) <= found_styles[style]
                                  <= (maxc or sys.maxsize))]
        unknown_styles = [k for k in found_styles if k not in self.CALLOUTS]

        for style in unknown_styles:
            logging.error("In {0}: "
                          "Found callout box with unrecognized "
                          "style '{1}'".format(self.filename, style))

        for style in missing_styles:
            minc = self.CALLOUTS[style][1]
            maxc = self.CALLOUTS[style][2]
            logging.error("In {0}: "
                          "Expected between min {1} and max {2} callout boxes "
                          "with style '{3}'".format(
                self.filename, minc, maxc, style))

        return (callouts_valid and
                len(missing_styles) == 0 and len(unknown_styles) == 0)

    # Link validation methods
    def _validate_one_html_link(self, link_node, check_text=False):
        """
        Any local html file being linked was generated as part of the lesson.
        Therefore, file links (.html) must have a Markdown file
            in the expected folder.

        The title of the linked Markdown document should match the link text
        (only verified when ``check_text`` is True).
        """
        dest, link_text = self.ast.get_link_info(link_node)

        # HTML files in same folder are made from Markdown; special tests
        fn = dest.split("#")[0]  # Split anchor name from filename
        expected_md_fn = os.path.splitext(fn)[0] + os.extsep + "md"
        expected_md_path = os.path.join(self.markdown_dir,
                                        expected_md_fn)
        if not os.path.isfile(expected_md_path):
            logging.error(
                "In {0}: "
                "The document links to {1}, but could not find "
                "the expected markdown file {2}".format(
                    self.filename, fn, expected_md_path))
            return False

        if check_text is True:
            # If file exists, parse and validate link text = node title.
            # Plain 'r' mode: the legacy 'U' flag was removed in Python 3.11.
            with open(expected_md_path, 'r') as link_dest_file:
                dest_contents = link_dest_file.read()

            dest_ast = self._parse_markdown(dest_contents)
            dest_ast = vh.CommonMarkHelper(dest_ast)
            dest_page_title = dest_ast.get_doc_header_subtitle()

            if dest_page_title != link_text:
                logging.error(
                    "In {0}: "
                    "The linked page {1} exists, but "
                    "the link text '{2}' does not match the "
                    "(sub)title of that page, '{3}'.".format(
                        self.filename, dest,
                        link_text, dest_page_title))
                return False
        return True

    def _validate_one_link(self, link_node, check_text=False):
        """Logic to validate a single link to a file asset

        Performs special checks for links to a local markdown file.

        For links or images, just verify that a file exists.
        Web/ftp/mailto URLs and same-page anchors are skipped (and pass).
        """
        dest, link_text = self.ast.get_link_info(link_node)

        if re.match(r"^[\w,\s-]+\.(html?)", dest, re.IGNORECASE):
            # Validate local html links have matching md file
            return self._validate_one_html_link(link_node,
                                                check_text=check_text)
        elif not re.match(r"^((https?|ftp)://.+)|mailto:",
                          dest, re.IGNORECASE)\
                and not re.match(r"^#.*", dest):
            # If not web or email URL, and not anchor on same page, then
            #  verify that local file exists
            dest_path = os.path.join(self.lesson_dir, dest)
            dest_path = dest_path.split("#")[0]  # Split anchor from filename
            if not os.path.isfile(dest_path):
                fn = dest.split("#")[0]  # Split anchor name from filename
                logging.error(
                    "In {0}: "
                    "Could not find the linked asset file "
                    "{1} in {2}. If this is a URL, it must be "
                    "prefixed with http(s):// or ftp://.".format(
                        self.filename, fn, dest_path))
                return False
        else:
            logging.debug(
                "In {0}: "
                "Skipped validation of link {1}".format(self.filename, dest))
        return True

    def _partition_links(self):
        """Fetch links in document. If this template has special requirements
        for link text (eg only some links' text should match dest page title),
        filter the list accordingly.

        Default behavior: don't check the text of any links

        :return: tuple ``(check_text, no_check_text)`` of link node lists
        """
        check_text = []
        no_check_text = self.ast.find_external_links()

        return check_text, no_check_text

    def _validate_links(self):
        """Validate all references to external content

        This includes links AND images: these are the two types of node that
        CommonMark assigns a .destination property"""
        check_text, no_check_text = self._partition_links()

        valid = True
        for link_node in check_text:
            res = self._validate_one_link(link_node, check_text=True)
            valid = valid and res

        for link_node in no_check_text:
            res = self._validate_one_link(link_node, check_text=False)
            valid = valid and res
        return valid

    def _run_tests(self):
        """
        Let user override the list of tests to be performed.

        All tests are run eagerly (the list is built before ``all`` is
        applied), so every problem is logged rather than just the first.

        Error trapping is handled by the validate() wrapper method.
        """
        tests = [self._validate_no_fixme(),
                 self._validate_doc_headers(),
                 self._validate_section_heading_order(),
                 self._validate_callouts(),
                 self._validate_links()]

        return all(tests)

    def validate(self):
        """Perform all required validations. Wrap in exception handler

        An IndexError raised by any test (typically a document with too few
        top-level nodes) is reported as a failure instead of propagating."""
        try:
            return self._run_tests()
        except IndexError:
            logging.error("Document is missing critical sections")
            return False


class IndexPageValidator(MarkdownValidator):
    """Validator for the lesson homepage (index.md)."""
    HEADINGS = [
        'Topics',
        'Other Resources',
    ]

    DOC_HEADERS = {
        'layout': vh.is_str,
        'title': vh.is_str,
    }

    CALLOUTS = {
        'prereq': ("Prerequisites", 1, 1),
        'getready': ("Getting ready", 1, 1),
    }

    def _partition_links(self):
        """Split links by whether their text must match the target's title.

        Links whose destination contains an anchor ('#') are exempt from
        the text check; every other link in index.md is checked."""
        with_text_check = []
        without_text_check = []

        for node in self.ast.find_external_links():
            if '#' not in node.destination:
                with_text_check.append(node)
            else:
                without_text_check.append(node)

        return with_text_check, without_text_check

    def _validate_intro_section(self):
        """Check that the fourth top-level node is a paragraph.

        That node is expected to hold the introductory text; an error is
        logged when it is anything other than a paragraph."""
        candidate = self.ast.children[3]
        is_intro = self.ast.is_paragraph(candidate)
        if is_intro:
            return is_intro

        logging.error(
            "In {0}: "
            "Expected paragraph of introductory text at {1}".format(
                self.filename, candidate.start_line))
        return is_intro

    def _run_tests(self):
        """Run the base checks first, then the index-specific ones."""
        base_outcome = super(IndexPageValidator, self)._run_tests()
        own_outcomes = [self._validate_intro_section()]
        return all(own_outcomes) and base_outcome


class TopicPageValidator(MarkdownValidator):
    """Validator for the Markdown of a topic page, eg 01-topicname.md"""
    DOC_HEADERS = {
        "layout": vh.is_str,
        "title": vh.is_str,
        "subtitle": vh.is_str,
        "minutes": vh.is_numeric,
    }

    CALLOUTS = {
        "objectives": ("Learning Objectives", 1, 1),
        "callout": (None, 0, None),
        "challenge": (None, 0, None),
    }

    def _validate_has_no_headings(self):
        """Warn when the page has section headings of its own.

        Topic pages are expected to carry headings only inside blockquote
        elements. Finding section headings produces a warning, never a
        failure: this check always returns True."""
        found = self.ast.get_section_headings()
        if len(found) > 0:
            # Individual heading msgs are logged by validate_section_heading_order
            logging.warning(
                "In {0}: "
                "Sub-headings are often a sign "
                "a lesson needs to be split into multiple topics. "
                "Please make sure this subsection doesn't belong "
                "in a separate lesson.".format(self.filename))

        return True

    def _run_tests(self):
        """Run the base checks first, then the topic-specific ones."""
        base_outcome = super(TopicPageValidator, self)._run_tests()
        own_outcomes = [self._validate_has_no_headings()]
        return all(own_outcomes) and base_outcome


class ReferencePageValidator(MarkdownValidator):
    """Validate reference.md"""
    HEADINGS = ["Glossary"]
    WARN_ON_EXTRA_HEADINGS = False

    DOC_HEADERS = {"layout": vh.is_str,
                   "title": vh.is_str,
                   "subtitle": vh.is_str}

    def _partition_links(self):
        """For reference.md, only check that text of link matches
        dest page subtitle if the link is in a heading

        :return: tuple ``(check_text, dont_check_text)`` of link node lists
        """
        all_links = self.ast.find_external_links()
        check_text = self.ast.find_external_links(
            parent_crit=self.ast.is_heading)
        dont_check_text = [n for n in all_links if n not in check_text]
        return check_text, dont_check_text

    def _validate_glossary_entry(self, glossary_entry):
        """Validate glossary entry

        Glossary entry must be formatted in conformance with Pandoc's
        ```definition_lists``` extension.

        That syntax isn't supported by the CommonMark parser, so we identify
        terms manually.

        :param glossary_entry: sequence of text lines making up the entry;
            the first line is the term, the second starts the definition
        :return: True when the entry has at least two lines and the second
            one starts with a colon followed by three spaces
        """
        # An entry with no lines at all is reported like any other
        # too-short entry instead of raising IndexError.
        glossary_keyword = glossary_entry[0] if glossary_entry else ""
        if len(glossary_entry) < 2:
            logging.error(
                "In {0}: "
                "Glossary entry '{1}' must have at least two lines- "
                "a term and a definition.".format(
                    self.filename, glossary_keyword))
            return False

        # Only the first definition line (the second line of the entry)
        # has a required prefix; later lines are free-form.
        if not re.match("^:   ", glossary_entry[1]):
            # The quoted prefix matches the pattern above: ':' + 3 spaces
            logging.error(
                "In {0}: "
                "At glossary entry '{1}' "
                "First line of definition must "
                "start with ':   '.".format(
                    self.filename, glossary_keyword))
            return False
        return True

    def _validate_glossary(self):
        """Validate the glossary section.

        Assumes that the glossary is at the end of the file:
            everything after the "Glossary" heading is treated as an entry.
        The presence of that heading is not enforced here; with no such
        heading nothing is checked and the result is True.

        Verifies that the only things in the glossary are definition items.
        """
        is_glossary_valid = True
        in_glossary = False
        for node in self.ast.children:
            if in_glossary:
                # Evaluate the entry before combining results so that every
                # malformed entry is reported, not only the first one.
                entry_ok = self._validate_glossary_entry(node.strings)
                is_glossary_valid = is_glossary_valid and entry_ok
            elif self.ast.is_heading(node) and "Glossary" in node.strings:
                in_glossary = True

        return is_glossary_valid

    def _run_tests(self):
        """Run the glossary check, then the base checks."""
        tests = [self._validate_glossary()]
        parent_tests = super(ReferencePageValidator, self)._run_tests()
        return all(tests) and parent_tests


class InstructorPageValidator(MarkdownValidator):
    """Lightweight validator for the Instructor's Guide (instructors.md)"""
    HEADINGS = ["Overall"]
    WARN_ON_EXTRA_HEADINGS = False

    DOC_HEADERS = {
        "layout": vh.is_str,
        "title": vh.is_str,
        "subtitle": vh.is_str,
    }

    def _partition_links(self):
        """Split links by whether their text must match the target page.

        In instructors.md only links located in a heading have their text
        compared with the destination page's subtitle."""
        every_link = self.ast.find_external_links()
        heading_links = self.ast.find_external_links(
            parent_crit=self.ast.is_heading)

        other_links = []
        for node in every_link:
            if node not in heading_links:
                other_links.append(node)

        return heading_links, other_links


class LicensePageValidator(MarkdownValidator):
    """Validator for LICENSE.md, a file users are not meant to edit"""
    def _run_tests(self):
        """Replace the base tests with a single md5 comparison.

        The digest of the raw Markdown text must equal the known digest of
        the stock license file."""
        # TODO: This hash is specific to the license for english-language repo
        reference_digest = '051a04b8ffe580ba6b7018fb4fd72a50'

        hasher = hashlib.md5()
        text = self.markdown
        try:
            hasher.update(text)
        except TypeError:
            # Workaround for hashing in python3
            hasher.update(text.encode('utf-8'))

        if hasher.hexdigest() != reference_digest:
            logging.error("The provided license file should not be modified.")
            return False
        return True


class DiscussionPageValidator(MarkdownValidator):
    """Validator for the discussion page (discussion.md).

    The body of that page is mostly free-form, so only the document
    header fields are specified here.
    """
    WARN_ON_EXTRA_HEADINGS = False

    DOC_HEADERS = {
        "layout": vh.is_str,
        "title": vh.is_str,
        "subtitle": vh.is_str,
    }


# Registry tying each lesson template name to its validator; used by the CLI.
#   Layout: {name: (Validator, filename_pattern)}
#   The pattern is a regular expression applied to a file's basename.
LESSON_TEMPLATES = {
    "index": (IndexPageValidator, "^index"),
    "topic": (TopicPageValidator, "^[0-9]{2}-.*"),
    "reference": (ReferencePageValidator, "^reference"),
    "instructor": (InstructorPageValidator, "^instructors"),
    "license": (LicensePageValidator, "^LICENSE"),
    "discussion": (DiscussionPageValidator, "^discussion"),
}

# Files in the lesson directory that are never validated
SKIP_FILES = (
    "CONDUCT.md",
    "CONTRIBUTING.md",
    "DESIGN.md",
    "FAQ.md",
    "LAYOUT.md",
    "README.md",
)


def identify_template(filepath):
    """Pick the template that applies to a single file.

    The file's basename is tested against each registered filename
    pattern; the name of the first template whose pattern matches is
    returned, or None when no pattern matches."""
    basename = os.path.basename(filepath)
    for name, entry in LESSON_TEMPLATES.items():
        filename_pattern = entry[1]
        if re.search(filename_pattern, basename) is not None:
            return name

    return None


def validate_single(filepath, template=None):
    """Validate one Markdown file against a lesson template.

    When no template is given, one is chosen from the file name. Files
    listed in SKIP_FILES pass without being inspected. Returns the result
    of the validator, or False when no template could be identified."""
    # Silently pass certain non-lesson files without validating them
    if os.path.basename(filepath) in SKIP_FILES:
        return True

    if not template:
        template = identify_template(filepath)

    if template is None:
        logging.error(
            "Validation failed for {0}: "
            "Could not automatically identify correct template.".format(
                filepath))
        return False

    logging.debug(
        "Beginning validation of {0} using template {1}".format(
            filepath, template))

    validator_class = LESSON_TEMPLATES[template][0]
    outcome = validator_class(filepath).validate()

    if outcome is not True:
        logging.debug("File {0} failed validation: "
                      "see error log for details".format(filepath))
    else:
        logging.debug("File {0} successfully passed validation".format(
            filepath))

    return outcome


def validate_folder(path, template=None):
    """Validate every ``*.md`` file found directly inside a folder.

    Returns False (after logging an error) when the folder holds no
    Markdown files; otherwise the combined result of validating each."""
    # Find files based on extension
    markdown_files = glob.glob(os.path.join(path, "*.md"))

    if len(markdown_files) == 0:
        logging.error(
            "No Markdown files were found "
            "in specified directory {0}".format(path))
        return False

    folder_ok = True
    for markdown_file in markdown_files:
        # Validate first, combine second: every file gets checked
        file_ok = validate_single(markdown_file, template=template)
        folder_ok = folder_ok and file_ok
    return folder_ok


def start_logging(level=logging.INFO):
    """Set up logging so that messages are written to standard output.

    Each record is rendered as "<LEVEL>: <message>"."""
    settings = {
        "stream": sys.stdout,
        "level": level,
        "format": "%(levelname)s: %(message)s",
    }
    logging.basicConfig(**settings)


def command_line():
    """Parse the command-line arguments and return the resulting namespace.

    Accepted arguments: any number of file or folder paths (defaulting to
    the current working directory), an optional template name, and a
    debug switch."""
    arg_parser = argparse.ArgumentParser()

    arg_parser.add_argument(
        "file_or_path",
        nargs="*",
        default=[os.getcwd()],
        help="The individual pathname")

    template_help = ("The type of template to apply to all file(s). "
                     "If not specified, will auto-identify template.")
    arg_parser.add_argument(
        '-t', '--template',
        choices=LESSON_TEMPLATES.keys(),
        help=template_help)

    arg_parser.add_argument(
        '-d', '--debug',
        action='store_true',
        help="Enable debug information.")

    parsed = arg_parser.parse_args()
    return parsed


def check_required_files(dir_to_validate):
    """Report which of the required lesson files are absent from a folder.

    Each required name is treated as a glob pattern relative to
    ``dir_to_validate``; an error is logged for every pattern without a
    match. Returns True only when all patterns matched something."""
    required_patterns = (
        "01-*.md",
        "CONDUCT.md",
        "CONTRIBUTING.md",
        "discussion.md",
        "index.md",
        "instructors.md",
        "LICENSE.md",
        "README.md",
        "reference.md",
    )

    missing = [pattern for pattern in required_patterns
               if not glob.glob(os.path.join(dir_to_validate, pattern))]

    for pattern in missing:
        logging.error(
            "Missing file {0}.".format(pattern))

    return not missing


def get_files_to_validate(file_or_path):
    """Expand the given paths into the files and folders to validate.

    Folders contribute every ``*.md`` file directly inside them and are
    also recorded themselves; existing files are taken as-is; anything
    else is logged as an error and left out.

    Returns a ``(files, folders)`` tuple of lists."""
    found_files = []
    found_dirs = []

    for entry in file_or_path:
        if os.path.isdir(entry):
            found_files += glob.glob(os.path.join(entry, "*.md"))
            found_dirs.append(entry)
            continue

        if os.path.isfile(entry):
            found_files.append(entry)
            continue

        logging.error(
            "The specified file or folder {0} does not exist; "
            "could not perform validation".format(entry))

    return found_files, found_dirs


def main(parsed_args_obj):
    """Run the validation described by the parsed arguments, then exit.

    :param parsed_args_obj: namespace providing ``debug``, ``template``
        and ``file_or_path`` attributes, as returned by ``command_line()``

    Never returns: the process exits with the number of logged errors as
    its status, capped at 255 (0 therefore means no errors were logged).
    """
    if parsed_args_obj.debug:
        log_level = "DEBUG"
    else:
        log_level = "WARNING"
    start_logging(log_level)

    template = parsed_args_obj.template

    all_valid = True

    files_to_validate, dirs_to_validate = get_files_to_validate(
        parsed_args_obj.file_or_path)

    # If user ask to validate only one file don't check for required files.
    for d in dirs_to_validate:
        # Run the check before combining results; a bare
        # `all_valid and check(...)` would skip every directory after the
        # first failure and leave its missing files unreported.
        res = check_required_files(d)
        all_valid = all_valid and res

    for fn in files_to_validate:
        res = validate_single(fn, template=template)

        all_valid = all_valid and res

    if all_valid is True:
        logging.debug("All Markdown files successfully passed validation.")
    else:
        logging.warning(
            "{0} errors were encountered during validation. "
            "See log for details.".format(NUMBER_OF_ERRORS))

    # Exit statuses are truncated to 8 bits on POSIX systems, so an
    # unclamped count of 256 errors would be reported as success (0).
    sys.exit(min(NUMBER_OF_ERRORS, 255))


if __name__ == "__main__":
    # Script entry point: parse the command line, then validate and exit.
    main(command_line())