forked from mattermost/mattermost
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
MM-11451 Added autolinking to Markdown parser (mattermost#9151)
* MM-11451 Added autolinking to Markdown parser * Added missing headers * Added mailto and tel links
- Loading branch information
Showing
6 changed files
with
1,002 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,253 @@ | ||
// Copyright (c) 2017-present Mattermost, Inc. All Rights Reserved. | ||
// See License.txt for license information. | ||
|
||
package markdown | ||
|
||
import ( | ||
"regexp" | ||
"strings" | ||
"unicode" | ||
"unicode/utf8" | ||
) | ||
|
||
// Based off of extensions/autolink.c from https://github.com/github/cmark | ||
|
||
// DefaultUrlSchemes lists the URL schemes that isSchemeAllowed accepts when parseURLAutolink
// checks the text in front of a ':'. The comparison against these entries ignores case.
var DefaultUrlSchemes = []string{"http", "https", "ftp", "mailto", "tel"}
|
||
// Given a string with a w at the given position, tries to parse and return a link starting with "www." | ||
// if one exists. If the text at the given position isn't a link, returns an empty string. Equivalent to | ||
// www_match from the reference code. | ||
func parseWWWAutolink(data string, position int) string { | ||
// Check that this isn't part of another word | ||
if position > 1 { | ||
prevChar := data[position-1] | ||
|
||
if !isWhitespaceByte(prevChar) && !isAllowedBeforeWWWLink(prevChar) { | ||
return "" | ||
} | ||
} | ||
|
||
// Check that this starts with www | ||
if len(data)-position < 4 || !regexp.MustCompile(`^www\d{0,3}\.`).MatchString(data[position:]) { | ||
return "" | ||
} | ||
|
||
end := checkDomain(data[position:], false) | ||
if end == 0 { | ||
return "" | ||
} | ||
|
||
end += position | ||
|
||
// Grab all text until the end of the string or the next whitespace character | ||
for end < len(data) && !isWhitespaceByte(data[end]) { | ||
end += 1 | ||
} | ||
|
||
// Trim trailing punctuation | ||
link := trimTrailingCharactersFromLink(data[position:end]) | ||
if link == "" { | ||
return "" | ||
} | ||
|
||
return link | ||
} | ||
|
||
// Returns true if the given byte may directly precede a "www." autolink without the link being
// treated as part of another word. The GFM extended autolink rules name '*', '_', '~' and '(' as
// the delimiters that may come before such a link, which lets text like "(www.example.com)" be
// linked. ')' is not in that list but is still accepted so that input that was linked before
// continues to be linked.
func isAllowedBeforeWWWLink(c byte) bool {
	switch c {
	case '*', '_', '~', '(', ')':
		return true
	default:
		return false
	}
}
|
||
// Given a string with a : at the given position, tried to parse and return a link starting with a URL scheme | ||
// if one exists. If the text around the given position isn't a link, returns an empty string. Equivalent to | ||
// url_match from the reference code. | ||
func parseURLAutolink(data string, position int) string { | ||
// Check that a :// exists. This doesn't match the clients that treat the slashes as optional. | ||
if len(data)-position < 4 || data[position+1] != '/' || data[position+2] != '/' { | ||
return "" | ||
} | ||
|
||
start := position - 1 | ||
for start > 0 && isAlphanumericByte(data[start-1]) { | ||
start -= 1 | ||
} | ||
|
||
// Ensure that the URL scheme is allowed and that at least one character after the scheme is valid. | ||
scheme := data[start:position] | ||
if !isSchemeAllowed(scheme) || !isValidHostCharacter(data[position+3:]) { | ||
return "" | ||
} | ||
|
||
end := checkDomain(data[position+3:], true) | ||
if end == 0 { | ||
return "" | ||
} | ||
|
||
end += position | ||
|
||
// Grab all text until the end of the string or the next whitespace character | ||
for end < len(data) && !isWhitespaceByte(data[end]) { | ||
end += 1 | ||
} | ||
|
||
// Trim trailing punctuation | ||
link := trimTrailingCharactersFromLink(data[start:end]) | ||
if link == "" { | ||
return "" | ||
} | ||
|
||
return link | ||
} | ||
|
||
func isSchemeAllowed(scheme string) bool { | ||
// Note that this doesn't support the custom URL schemes implemented by the client | ||
for _, allowed := range DefaultUrlSchemes { | ||
if strings.EqualFold(allowed, scheme) { | ||
return true | ||
} | ||
} | ||
|
||
return false | ||
} | ||
|
||
// Given a string starting with a URL, returns the number of valid characters that make up the URL's domain. | ||
// Returns 0 if the string doesn't start with a domain name. allowShort determines whether or not the domain | ||
// needs to contain a period to be considered valid. Equivalent to check_domain from the reference code. | ||
func checkDomain(data string, allowShort bool) int { | ||
foundUnderscore := false | ||
foundPeriod := false | ||
|
||
i := 1 | ||
for ; i < len(data)-1; i++ { | ||
if data[i] == '_' { | ||
foundUnderscore = true | ||
break | ||
} else if data[i] == '.' { | ||
foundPeriod = true | ||
} else if !isValidHostCharacter(data[i:]) && data[i] != '-' { | ||
break | ||
} | ||
} | ||
|
||
if foundUnderscore { | ||
return 0 | ||
} | ||
|
||
if allowShort { | ||
// If allowShort is set, accept any string of valid domain characters | ||
return i | ||
} | ||
|
||
// If allowShort isn't set, a valid domain just requires at least a single period. Note that this | ||
// logic isn't entirely necessary because we already know the string starts with "www." when | ||
// this is called from parseWWWAutolink | ||
if foundPeriod { | ||
return i | ||
} else { | ||
return 0 | ||
} | ||
} | ||
|
||
// Reports whether the first rune of link may appear in a domain name: it must decode to something
// other than utf8.RuneError (so an empty string or invalid UTF-8 is rejected) and must be neither
// whitespace nor punctuation. Equivalent to is_valid_hostchar from the reference code.
func isValidHostCharacter(link string) bool {
	first, _ := utf8.DecodeRuneInString(link)

	switch {
	case first == utf8.RuneError:
		return false
	case unicode.IsSpace(first), unicode.IsPunct(first):
		return false
	}

	return true
}
|
||
// Removes any trailing characters such as punctuation or stray brackets that shouldn't be part of the link. | ||
// Equivalent to autolink_delim from the reference code. | ||
func trimTrailingCharactersFromLink(link string) string { | ||
runes := []rune(link) | ||
linkEnd := len(runes) | ||
|
||
// Cut off the link before an open angle bracket if it contains one | ||
for i, c := range runes { | ||
if c == '<' { | ||
linkEnd = i | ||
break | ||
} | ||
} | ||
|
||
for linkEnd > 0 { | ||
c := runes[linkEnd-1] | ||
|
||
if !canEndAutolink(c) { | ||
// Trim trailing quotes, periods, etc | ||
linkEnd = linkEnd - 1 | ||
} else if c == ';' { | ||
// Trim a trailing HTML entity | ||
newEnd := linkEnd - 2 | ||
|
||
for newEnd > 0 && ((runes[newEnd] >= 'a' && runes[newEnd] <= 'z') || (runes[newEnd] >= 'A' && runes[newEnd] <= 'Z')) { | ||
newEnd -= 1 | ||
} | ||
|
||
if newEnd < linkEnd-2 && runes[newEnd] == '&' { | ||
linkEnd = newEnd | ||
} else { | ||
// This isn't actually an HTML entity, so just trim the semicolon | ||
linkEnd = linkEnd - 1 | ||
} | ||
} else if c == ')' { | ||
// Only allow an autolink ending with a bracket if that bracket is part of a matching pair of brackets. | ||
// If there are more closing brackets than opening ones, remove the extra bracket | ||
|
||
numClosing := 0 | ||
numOpening := 0 | ||
|
||
// Examples (input text => output linked portion): | ||
// | ||
// http://www.pokemon.com/Pikachu_(Electric) | ||
// => http://www.pokemon.com/Pikachu_(Electric) | ||
// | ||
// http://www.pokemon.com/Pikachu_((Electric) | ||
// => http://www.pokemon.com/Pikachu_((Electric) | ||
// | ||
// http://www.pokemon.com/Pikachu_(Electric)) | ||
// => http://www.pokemon.com/Pikachu_(Electric) | ||
// | ||
// http://www.pokemon.com/Pikachu_((Electric)) | ||
// => http://www.pokemon.com/Pikachu_((Electric)) | ||
|
||
for i := 0; i < linkEnd; i++ { | ||
if runes[i] == '(' { | ||
numOpening += 1 | ||
} else if runes[i] == ')' { | ||
numClosing += 1 | ||
} | ||
} | ||
|
||
if numClosing <= numOpening { | ||
// There's fewer or equal closing brackets, so we've found the end of the link | ||
break | ||
} | ||
|
||
linkEnd -= 1 | ||
} else { | ||
// There's no special characters at the end of the link, so we're at the end | ||
break | ||
} | ||
} | ||
|
||
return string(runes[:linkEnd]) | ||
} | ||
|
||
// Reports whether c may be the final character of an autolink. The punctuation and emphasis
// characters ? ! . , : * _ ~ ' and " may not end a link; every other rune may.
func canEndAutolink(c rune) bool {
	const disallowedEndings = `?!.,:*_~'"`

	return !strings.ContainsRune(disallowedEndings, c)
}
Oops, something went wrong.