Skip to content

Commit

Permalink
MM-11451 Added autolinking to Markdown parser (mattermost#9151)
Browse files Browse the repository at this point in the history
* MM-11451 Added autolinking to Markdown parser

* Added missing headers

* Added mailto and tel links
  • Loading branch information
hmhealey authored and crspeller committed Jul 24, 2018
1 parent bfb2640 commit c8d3e42
Show file tree
Hide file tree
Showing 6 changed files with 1,002 additions and 1 deletion.
253 changes: 253 additions & 0 deletions utils/markdown/autolink.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
// Copyright (c) 2017-present Mattermost, Inc. All Rights Reserved.
// See License.txt for license information.

package markdown

import (
"regexp"
"strings"
"unicode"
"unicode/utf8"
)

// Based off of extensions/autolink.c from https://github.com/github/cmark

var (
DefaultUrlSchemes = []string{"http", "https", "ftp", "mailto", "tel"}
)

// Given a string with a w at the given position, tries to parse and return a link starting with "www."
// if one exists. If the text at the given position isn't a link, returns an empty string. Equivalent to
// www_match from the reference code.
func parseWWWAutolink(data string, position int) string {
// Check that this isn't part of another word
if position > 1 {
prevChar := data[position-1]

if !isWhitespaceByte(prevChar) && !isAllowedBeforeWWWLink(prevChar) {
return ""
}
}

// Check that this starts with www
if len(data)-position < 4 || !regexp.MustCompile(`^www\d{0,3}\.`).MatchString(data[position:]) {
return ""
}

end := checkDomain(data[position:], false)
if end == 0 {
return ""
}

end += position

// Grab all text until the end of the string or the next whitespace character
for end < len(data) && !isWhitespaceByte(data[end]) {
end += 1
}

// Trim trailing punctuation
link := trimTrailingCharactersFromLink(data[position:end])
if link == "" {
return ""
}

return link
}

func isAllowedBeforeWWWLink(c byte) bool {
switch c {
case '*', '_', '~', ')':
return true
default:
return false
}
}

// Given a string with a : at the given position, tried to parse and return a link starting with a URL scheme
// if one exists. If the text around the given position isn't a link, returns an empty string. Equivalent to
// url_match from the reference code.
func parseURLAutolink(data string, position int) string {
// Check that a :// exists. This doesn't match the clients that treat the slashes as optional.
if len(data)-position < 4 || data[position+1] != '/' || data[position+2] != '/' {
return ""
}

start := position - 1
for start > 0 && isAlphanumericByte(data[start-1]) {
start -= 1
}

// Ensure that the URL scheme is allowed and that at least one character after the scheme is valid.
scheme := data[start:position]
if !isSchemeAllowed(scheme) || !isValidHostCharacter(data[position+3:]) {
return ""
}

end := checkDomain(data[position+3:], true)
if end == 0 {
return ""
}

end += position

// Grab all text until the end of the string or the next whitespace character
for end < len(data) && !isWhitespaceByte(data[end]) {
end += 1
}

// Trim trailing punctuation
link := trimTrailingCharactersFromLink(data[start:end])
if link == "" {
return ""
}

return link
}

func isSchemeAllowed(scheme string) bool {
// Note that this doesn't support the custom URL schemes implemented by the client
for _, allowed := range DefaultUrlSchemes {
if strings.EqualFold(allowed, scheme) {
return true
}
}

return false
}

// Given a string starting with a URL, returns the number of valid characters that make up the URL's domain.
// Returns 0 if the string doesn't start with a domain name. allowShort determines whether or not the domain
// needs to contain a period to be considered valid. Equivalent to check_domain from the reference code.
func checkDomain(data string, allowShort bool) int {
foundUnderscore := false
foundPeriod := false

i := 1
for ; i < len(data)-1; i++ {
if data[i] == '_' {
foundUnderscore = true
break
} else if data[i] == '.' {
foundPeriod = true
} else if !isValidHostCharacter(data[i:]) && data[i] != '-' {
break
}
}

if foundUnderscore {
return 0
}

if allowShort {
// If allowShort is set, accept any string of valid domain characters
return i
}

// If allowShort isn't set, a valid domain just requires at least a single period. Note that this
// logic isn't entirely necessary because we already know the string starts with "www." when
// this is called from parseWWWAutolink
if foundPeriod {
return i
} else {
return 0
}
}

// Returns true if the provided link starts with a valid character for a domain name. Equivalent to
// is_valid_hostchar from the reference code.
func isValidHostCharacter(link string) bool {
c, _ := utf8.DecodeRuneInString(link)
if c == utf8.RuneError {
return false
}

return !unicode.IsSpace(c) && !unicode.IsPunct(c)
}

// Removes any trailing characters such as punctuation or stray brackets that shouldn't be part of the link.
// Equivalent to autolink_delim from the reference code.
func trimTrailingCharactersFromLink(link string) string {
runes := []rune(link)
linkEnd := len(runes)

// Cut off the link before an open angle bracket if it contains one
for i, c := range runes {
if c == '<' {
linkEnd = i
break
}
}

for linkEnd > 0 {
c := runes[linkEnd-1]

if !canEndAutolink(c) {
// Trim trailing quotes, periods, etc
linkEnd = linkEnd - 1
} else if c == ';' {
// Trim a trailing HTML entity
newEnd := linkEnd - 2

for newEnd > 0 && ((runes[newEnd] >= 'a' && runes[newEnd] <= 'z') || (runes[newEnd] >= 'A' && runes[newEnd] <= 'Z')) {
newEnd -= 1
}

if newEnd < linkEnd-2 && runes[newEnd] == '&' {
linkEnd = newEnd
} else {
// This isn't actually an HTML entity, so just trim the semicolon
linkEnd = linkEnd - 1
}
} else if c == ')' {
// Only allow an autolink ending with a bracket if that bracket is part of a matching pair of brackets.
// If there are more closing brackets than opening ones, remove the extra bracket

numClosing := 0
numOpening := 0

// Examples (input text => output linked portion):
//
// http://www.pokemon.com/Pikachu_(Electric)
// => http://www.pokemon.com/Pikachu_(Electric)
//
// http://www.pokemon.com/Pikachu_((Electric)
// => http://www.pokemon.com/Pikachu_((Electric)
//
// http://www.pokemon.com/Pikachu_(Electric))
// => http://www.pokemon.com/Pikachu_(Electric)
//
// http://www.pokemon.com/Pikachu_((Electric))
// => http://www.pokemon.com/Pikachu_((Electric))

for i := 0; i < linkEnd; i++ {
if runes[i] == '(' {
numOpening += 1
} else if runes[i] == ')' {
numClosing += 1
}
}

if numClosing <= numOpening {
// There's fewer or equal closing brackets, so we've found the end of the link
break
}

linkEnd -= 1
} else {
// There's no special characters at the end of the link, so we're at the end
break
}
}

return string(runes[:linkEnd])
}

func canEndAutolink(c rune) bool {
switch c {
case '?', '!', '.', ',', ':', '*', '_', '~', '\'', '"':
return false
default:
return true
}
}
Loading

0 comments on commit c8d3e42

Please sign in to comment.