MM-11451 Added autolinking to Markdown parser (mattermost#9151)

* MM-11451 Added autolinking to Markdown parser
* Added missing headers
* Added mailto and tel links
nicospacez · Jul 24, 2018 · c8d3e42 · c8d3e42
1 parent bfb2640
commit c8d3e42
Show file tree

Hide file tree

Showing 6 changed files with 1,002 additions and 1 deletion.
diff --git a/utils/markdown/autolink.go b/utils/markdown/autolink.go
@@ -0,0 +1,253 @@
+// Copyright (c) 2017-present Mattermost, Inc. All Rights Reserved.
+// See License.txt for license information.
+
+package markdown
+
+import (
+	"regexp"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+// Based off of extensions/autolink.c from https://github.com/github/cmark
+
+// DefaultUrlSchemes lists the URL schemes that isSchemeAllowed accepts when deciding whether text of
+// the form "scheme://..." may be turned into a link. Entries are compared case-insensitively.
+var DefaultUrlSchemes = []string{"http", "https", "ftp", "mailto", "tel"}
+
+// wwwAutolinkPattern matches the prefix that every www autolink must start with: "www", up to three
+// digits, and a period (for example "www." or "www2."). It is compiled once at package initialization
+// instead of on every call to parseWWWAutolink.
+var wwwAutolinkPattern = regexp.MustCompile(`^www\d{0,3}\.`)
+
+// Given a string with a w at the given position, tries to parse and return a link starting with "www."
+// if one exists. If the text at the given position isn't a link, returns an empty string. Equivalent to
+// www_match from the reference code.
+//
+// A link is only recognized at the very start of data, or directly after a whitespace byte or a byte
+// accepted by isAllowedBeforeWWWLink. The link extends to the next whitespace byte (or the end of data)
+// and is then passed through trimTrailingCharactersFromLink.
+func parseWWWAutolink(data string, position int) string {
+	// Nothing can match outside of the string
+	if position < 0 || position >= len(data) {
+		return ""
+	}
+
+	// Check that this isn't part of another word. Every position other than the first has a preceding
+	// byte that needs to be inspected, including position 1.
+	if position > 0 {
+		prevChar := data[position-1]
+
+		if !isWhitespaceByte(prevChar) && !isAllowedBeforeWWWLink(prevChar) {
+			return ""
+		}
+	}
+
+	// Check that this starts with www
+	if len(data)-position < 4 || !wwwAutolinkPattern.MatchString(data[position:]) {
+		return ""
+	}
+
+	domainLength := checkDomain(data[position:], false)
+	if domainLength == 0 {
+		return ""
+	}
+
+	// Grab all text until the end of the string or the next whitespace character
+	end := position + domainLength
+	for end < len(data) && !isWhitespaceByte(data[end]) {
+		end++
+	}
+
+	// Trim trailing punctuation
+	return trimTrailingCharactersFromLink(data[position:end])
+}
+
+// Returns true if a www autolink is allowed to start directly after the given byte even though that
+// byte isn't whitespace. The GFM extended autolink rules and www_match from the reference code allow
+// '*', '_', '~' and '(' here, so that text such as "(www.example.com)" is linked. ')' is accepted as
+// well because earlier versions of this function allowed it.
+func isAllowedBeforeWWWLink(c byte) bool {
+	switch c {
+	case '*', '_', '~', '(', ')':
+		return true
+	default:
+		return false
+	}
+}
+
+// Given a string with a : at the given position, tries to parse and return a link starting with a URL scheme
+// if one exists. If the text around the given position isn't a link, returns an empty string. Equivalent to
+// url_match from the reference code.
+//
+// The scheme is the run of alphanumeric bytes directly before the colon and must be accepted by
+// isSchemeAllowed. The returned link starts at the scheme, extends to the next whitespace byte (or the end
+// of data) and is then passed through trimTrailingCharactersFromLink.
+func parseURLAutolink(data string, position int) string {
+	// A scheme needs at least one byte before the colon. Without this check, a colon at the very start of
+	// the text would produce a negative start index below.
+	if position < 1 {
+		return ""
+	}
+
+	// Check that a :// exists. This doesn't match the clients that treat the slashes as optional.
+	if len(data)-position < 4 || data[position+1] != '/' || data[position+2] != '/' {
+		return ""
+	}
+
+	// Walk backwards over the scheme
+	start := position - 1
+	for start > 0 && isAlphanumericByte(data[start-1]) {
+		start--
+	}
+
+	// Ensure that the URL scheme is allowed and that at least one character after the scheme is valid.
+	scheme := data[start:position]
+	host := data[position+3:]
+	if !isSchemeAllowed(scheme) || !isValidHostCharacter(host) {
+		return ""
+	}
+
+	domainLength := checkDomain(host, true)
+	if domainLength == 0 {
+		return ""
+	}
+
+	// domainLength is relative to the host rather than to the colon, so this starts a few bytes before
+	// the end of the domain. The scan below moves forward to the next whitespace character either way.
+	end := position + domainLength
+
+	// Grab all text until the end of the string or the next whitespace character
+	for end < len(data) && !isWhitespaceByte(data[end]) {
+		end++
+	}
+
+	// Trim trailing punctuation
+	return trimTrailingCharactersFromLink(data[start:end])
+}
+
+// Reports whether the given scheme matches one of the entries in DefaultUrlSchemes, ignoring case.
+func isSchemeAllowed(scheme string) bool {
+	// Note that this doesn't support the custom URL schemes implemented by the client
+	schemes := DefaultUrlSchemes
+
+	for i := 0; i < len(schemes); i++ {
+		if strings.EqualFold(schemes[i], scheme) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// Given a string starting with a URL, returns the number of valid characters that make up the URL's domain.
+// Returns 0 if the string doesn't start with a domain name, which includes an empty string and any domain
+// containing an underscore. allowShort determines whether or not the domain needs to contain a period to be
+// considered valid. Equivalent to check_domain from the reference code.
+func checkDomain(data string, allowShort bool) int {
+	// An empty string can't start with a domain name
+	if len(data) == 0 {
+		return 0
+	}
+
+	foundPeriod := false
+
+	// As in the reference code, the scan starts at the second byte and stops before the last one, so the
+	// first and last bytes of data are not examined here.
+	i := 1
+	for ; i < len(data)-1; i++ {
+		c := data[i]
+
+		if c == '_' {
+			// Domains containing an underscore are rejected outright
+			return 0
+		}
+
+		if c == '.' {
+			foundPeriod = true
+			continue
+		}
+
+		if c != '-' && !isValidHostCharacter(data[i:]) {
+			break
+		}
+	}
+
+	// If allowShort is set, accept any string of valid domain characters. Otherwise, a valid domain just
+	// requires at least a single period. Note that the period check isn't entirely necessary because we
+	// already know the string starts with "www." when this is called from parseWWWAutolink
+	if allowShort || foundPeriod {
+		return i
+	}
+
+	return 0
+}
+
+// Reports whether the first rune of the provided link may appear in a domain name, which means that it
+// is neither whitespace nor punctuation. An empty string or one that starts with a rune that decodes to
+// utf8.RuneError is never valid. Equivalent to is_valid_hostchar from the reference code.
+func isValidHostCharacter(link string) bool {
+	first, _ := utf8.DecodeRuneInString(link)
+
+	switch {
+	case first == utf8.RuneError:
+		return false
+	case unicode.IsSpace(first), unicode.IsPunct(first):
+		return false
+	}
+
+	return true
+}
+
+// Removes any trailing characters such as punctuation or stray brackets that shouldn't be part of the link.
+// Equivalent to autolink_delim from the reference code.
+//
+// The link is first cut off before the first open angle bracket, if there is one. After that, the
+// following are removed from the end until none apply: characters rejected by canEndAutolink, a trailing
+// HTML entity such as "&amp;" (or just the semicolon when it doesn't end an entity), and closing brackets
+// that have no matching opening bracket. Returns the remaining text, which may be empty.
+func trimTrailingCharactersFromLink(link string) string {
+	runes := []rune(link)
+	linkEnd := len(runes)
+
+	// Cut off the link before an open angle bracket if it contains one
+	for i, c := range runes {
+		if c == '<' {
+			linkEnd = i
+			break
+		}
+	}
+
+	// Count the brackets once up front instead of on every trailing bracket. None of the other trimming
+	// steps below ever remove a bracket, so the counts only change when a closing bracket is trimmed.
+	numOpening := 0
+	numClosing := 0
+	for _, c := range runes[:linkEnd] {
+		switch c {
+		case '(':
+			numOpening++
+		case ')':
+			numClosing++
+		}
+	}
+
+	for linkEnd > 0 {
+		c := runes[linkEnd-1]
+
+		if !canEndAutolink(c) {
+			// Trim trailing quotes, periods, etc
+			linkEnd--
+			continue
+		}
+
+		if c == ';' {
+			// Trim a trailing HTML entity by walking backwards over the letters before the semicolon
+			newEnd := linkEnd - 2
+
+			for newEnd > 0 && ((runes[newEnd] >= 'a' && runes[newEnd] <= 'z') || (runes[newEnd] >= 'A' && runes[newEnd] <= 'Z')) {
+				newEnd--
+			}
+
+			if newEnd < linkEnd-2 && runes[newEnd] == '&' {
+				linkEnd = newEnd
+			} else {
+				// This isn't actually an HTML entity, so just trim the semicolon
+				linkEnd--
+			}
+
+			continue
+		}
+
+		// Only allow an autolink ending with a bracket if that bracket is part of a matching pair of brackets.
+		// If there are more closing brackets than opening ones, remove the extra bracket
+		//
+		// Examples (input text => output linked portion):
+		//
+		//  http://www.pokemon.com/Pikachu_(Electric)
+		//    => http://www.pokemon.com/Pikachu_(Electric)
+		//
+		//  http://www.pokemon.com/Pikachu_((Electric)
+		//    => http://www.pokemon.com/Pikachu_((Electric)
+		//
+		//  http://www.pokemon.com/Pikachu_(Electric))
+		//    => http://www.pokemon.com/Pikachu_(Electric)
+		//
+		//  http://www.pokemon.com/Pikachu_((Electric))
+		//    => http://www.pokemon.com/Pikachu_((Electric))
+		if c == ')' && numClosing > numOpening {
+			numClosing--
+			linkEnd--
+			continue
+		}
+
+		// Either the brackets are balanced or there's no special characters at the end of the link, so
+		// we've found the end
+		break
+	}
+
+	return string(runes[:linkEnd])
+}
+
+// Reports whether the given character is allowed to be the last character of an autolink. Trailing
+// punctuation, emphasis markers and quotes are not, so that they can be trimmed off the end of a link.
+func canEndAutolink(c rune) bool {
+	const disallowed = `?!.,:*_~'"`
+
+	return !strings.ContainsRune(disallowed, c)
+}