2021-08-12 20:03:24 +01:00
|
|
|
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
|
|
|
|
// See LICENSE for licensing information
|
|
|
|
|
|
|
|
// Package xurls extracts urls from plain text using regular expressions.
|
|
|
|
package xurls
|
|
|
|
|
|
|
|
import (
|
|
|
|
"regexp"
|
|
|
|
"strings"
|
2022-09-28 18:30:40 +01:00
|
|
|
"sync"
|
2021-08-12 20:03:24 +01:00
|
|
|
"unicode/utf8"
|
|
|
|
)
|
|
|
|
|
|
|
|
//go:generate go run ./generate/tldsgen
|
|
|
|
//go:generate go run ./generate/schemesgen
|
|
|
|
//go:generate go run ./generate/unicodegen
|
|
|
|
|
|
|
|
const (
|
2022-09-28 18:30:40 +01:00
|
|
|
// pathCont is based on https://www.rfc-editor.org/rfc/rfc3987#section-2.2
|
|
|
|
// but does not match separators anywhere or most puncutation in final position,
|
|
|
|
// to avoid creating asymmetries like
|
|
|
|
// `Did you know that **<a href="...">https://example.com/**</a> is reserved for documentation?`
|
|
|
|
// from `Did you know that **https://example.com/** is reserved for documentation?`.
|
|
|
|
unreservedChar = `a-zA-Z0-9\-._~`
|
|
|
|
endUnreservedChar = `a-zA-Z0-9\-_~`
|
|
|
|
midSubDelimChar = `!$&'*+,;=`
|
|
|
|
endSubDelimChar = `$&+=`
|
|
|
|
midIPathSegmentChar = unreservedChar + `%` + midSubDelimChar + `:@` + allowedUcsChar
|
|
|
|
endIPathSegmentChar = endUnreservedChar + `%` + endSubDelimChar + allowedUcsCharMinusPunc
|
|
|
|
iPrivateChar = `\x{E000}-\x{F8FF}\x{F0000}-\x{FFFFD}\x{100000}-\x{10FFFD}`
|
|
|
|
midIChar = `/?#\\` + midIPathSegmentChar + iPrivateChar
|
|
|
|
endIChar = `/#` + endIPathSegmentChar + iPrivateChar
|
|
|
|
wellParen = `\((?:[` + midIChar + `]|\([` + midIChar + `]*\))*\)`
|
|
|
|
wellBrack = `\[(?:[` + midIChar + `]|\[[` + midIChar + `]*\])*\]`
|
|
|
|
wellBrace = `\{(?:[` + midIChar + `]|\{[` + midIChar + `]*\})*\}`
|
|
|
|
wellAll = wellParen + `|` + wellBrack + `|` + wellBrace
|
|
|
|
pathCont = `(?:[` + midIChar + `]*(?:` + wellAll + `|[` + endIChar + `]))+`
|
|
|
|
|
2021-08-12 20:03:24 +01:00
|
|
|
letter = `\p{L}`
|
|
|
|
mark = `\p{M}`
|
|
|
|
number = `\p{N}`
|
|
|
|
iriChar = letter + mark + number
|
2022-09-28 18:30:40 +01:00
|
|
|
iri = `[` + iriChar + `](?:[` + iriChar + `\-]*[` + iriChar + `])?`
|
|
|
|
subdomain = `(?:` + iri + `\.)+`
|
|
|
|
octet = `(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])`
|
|
|
|
ipv4Addr = octet + `\.` + octet + `\.` + octet + `\.` + octet
|
|
|
|
|
|
|
|
// ipv6Addr is based on https://datatracker.ietf.org/doc/html/rfc4291#section-2.2
|
|
|
|
// with a specific alternative for each valid count of leading 16-bit hexadecimal "chomps"
|
|
|
|
// that have not been replaced with a `::` elision.
|
|
|
|
h4 = `[0-9a-fA-F]{1,4}`
|
|
|
|
ipv6AddrMinusEmpty = `(?:` +
|
|
|
|
// 7 colon-terminated chomps, followed by a final chomp or the rest of an elision.
|
|
|
|
`(?:` + h4 + `:){7}(?:` + h4 + `|:)|` +
|
|
|
|
// 6 chomps, followed by an IPv4 address or elision with final chomp or final elision.
|
|
|
|
`(?:` + h4 + `:){6}(?:` + ipv4Addr + `|:` + h4 + `|:)|` +
|
|
|
|
// 5 chomps, followed by an elision with optional IPv4 or up to 2 final chomps.
|
|
|
|
`(?:` + h4 + `:){5}(?::` + ipv4Addr + `|(?::` + h4 + `){1,2}|:)|` +
|
|
|
|
// 4 chomps, followed by an elision with optional IPv4 (optionally preceded by a chomp) or
|
|
|
|
// up to 3 final chomps.
|
|
|
|
`(?:` + h4 + `:){4}(?:(?::` + h4 + `){0,1}:` + ipv4Addr + `|(?::` + h4 + `){1,3}|:)|` +
|
|
|
|
// 3 chomps, followed by an elision with optional IPv4 (preceded by up to 2 chomps) or
|
|
|
|
// up to 4 final chomps.
|
|
|
|
`(?:` + h4 + `:){3}(?:(?::` + h4 + `){0,2}:` + ipv4Addr + `|(?::` + h4 + `){1,4}|:)|` +
|
|
|
|
// 2 chomps, followed by an elision with optional IPv4 (preceded by up to 3 chomps) or
|
|
|
|
// up to 5 final chomps.
|
|
|
|
`(?:` + h4 + `:){2}(?:(?::` + h4 + `){0,3}:` + ipv4Addr + `|(?::` + h4 + `){1,5}|:)|` +
|
|
|
|
// 1 chomp, followed by an elision with optional IPv4 (preceded by up to 4 chomps) or
|
|
|
|
// up to 6 final chomps.
|
|
|
|
`(?:` + h4 + `:){1}(?:(?::` + h4 + `){0,4}:` + ipv4Addr + `|(?::` + h4 + `){1,6}|:)|` +
|
|
|
|
// elision, followed by optional IPv4 (preceded by up to 5 chomps) or
|
|
|
|
// up to 7 final chomps.
|
|
|
|
// `:` is an intentionally omitted alternative, to avoid matching `::`.
|
|
|
|
`:(?:(?::` + h4 + `){0,5}:` + ipv4Addr + `|(?::` + h4 + `){1,7})` +
|
|
|
|
`)`
|
|
|
|
ipv6Addr = `(?:` + ipv6AddrMinusEmpty + `|::)`
|
|
|
|
ipAddrMinusEmpty = `(?:` + ipv6AddrMinusEmpty + `|\b` + ipv4Addr + `\b)`
|
|
|
|
port = `(?::[0-9]*)?`
|
2021-08-12 20:03:24 +01:00
|
|
|
)
|
|
|
|
|
|
|
|
// AnyScheme can be passed to StrictMatchingScheme to match any possibly valid
|
|
|
|
// scheme, and not just the known ones.
|
2022-09-28 18:30:40 +01:00
|
|
|
var AnyScheme = `(?:[a-zA-Z][a-zA-Z.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)`
|
2021-08-12 20:03:24 +01:00
|
|
|
|
|
|
|
// SchemesNoAuthority is a sorted list of some well-known url schemes that are
|
|
|
|
// followed by ":" instead of "://". The list includes both officially
|
|
|
|
// registered and unofficial schemes.
|
|
|
|
var SchemesNoAuthority = []string{
|
|
|
|
`bitcoin`, // Bitcoin
|
|
|
|
`cid`, // Content-ID
|
|
|
|
`file`, // Files
|
|
|
|
`magnet`, // Torrent magnets
|
|
|
|
`mailto`, // Mail
|
|
|
|
`mid`, // Message-ID
|
|
|
|
`sms`, // SMS
|
|
|
|
`tel`, // Telephone
|
|
|
|
`xmpp`, // XMPP
|
|
|
|
}
|
|
|
|
|
|
|
|
// SchemesUnofficial is a sorted list of some well-known url schemes which
|
|
|
|
// aren't officially registered just yet. They tend to correspond to software.
|
|
|
|
//
|
|
|
|
// Mostly collected from https://en.wikipedia.org/wiki/List_of_URI_schemes#Unofficial_but_common_URI_schemes.
|
|
|
|
var SchemesUnofficial = []string{
|
2022-09-28 18:30:40 +01:00
|
|
|
`gemini`, // gemini
|
|
|
|
`jdbc`, // Java database Connectivity
|
|
|
|
`moz-extension`, // Firefox extension
|
|
|
|
`postgres`, // PostgreSQL (short form)
|
|
|
|
`postgresql`, // PostgreSQL
|
|
|
|
`slack`, // Slack
|
|
|
|
`zoommtg`, // Zoom (desktop)
|
|
|
|
`zoomus`, // Zoom (mobile)
|
2021-08-12 20:03:24 +01:00
|
|
|
}
|
|
|
|
|
2022-09-28 18:30:40 +01:00
|
|
|
// The regular expressions are compiled when the API is first called.
|
|
|
|
// Any subsequent calls will use the same regular expression pointers.
|
|
|
|
//
|
|
|
|
// We do not need to make a copy of them for each API call,
|
|
|
|
// as Copy is now only useful if one copy calls Longest but not another,
|
|
|
|
// and we always call Longest after compiling the regular expression.
|
|
|
|
var (
|
|
|
|
strictRe *regexp.Regexp
|
|
|
|
strictInit sync.Once
|
|
|
|
|
|
|
|
relaxedRe *regexp.Regexp
|
|
|
|
relaxedInit sync.Once
|
|
|
|
)
|
|
|
|
|
2021-08-12 20:03:24 +01:00
|
|
|
func anyOf(strs ...string) string {
|
|
|
|
var b strings.Builder
|
2022-09-28 18:30:40 +01:00
|
|
|
b.WriteString("(?:")
|
2021-08-12 20:03:24 +01:00
|
|
|
for i, s := range strs {
|
|
|
|
if i != 0 {
|
|
|
|
b.WriteByte('|')
|
|
|
|
}
|
|
|
|
b.WriteString(regexp.QuoteMeta(s))
|
|
|
|
}
|
|
|
|
b.WriteByte(')')
|
|
|
|
return b.String()
|
|
|
|
}
|
|
|
|
|
|
|
|
func strictExp() string {
|
2022-09-28 18:30:40 +01:00
|
|
|
schemes := `(?:(?i)(?:` + anyOf(Schemes...) + `|` + anyOf(SchemesUnofficial...) + `)://|` + anyOf(SchemesNoAuthority...) + `:)`
|
|
|
|
return schemes + pathCont
|
2021-08-12 20:03:24 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
func relaxedExp() string {
|
|
|
|
var asciiTLDs, unicodeTLDs []string
|
|
|
|
for i, tld := range TLDs {
|
|
|
|
if tld[0] >= utf8.RuneSelf {
|
|
|
|
asciiTLDs = TLDs[:i:i]
|
|
|
|
unicodeTLDs = TLDs[i:]
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
punycode := `xn--[a-z0-9-]+`
|
|
|
|
|
|
|
|
// Use \b to make sure ASCII TLDs are immediately followed by a word break.
|
|
|
|
// We can't do that with unicode TLDs, as they don't see following
|
|
|
|
// whitespace as a word break.
|
2022-09-28 18:30:40 +01:00
|
|
|
tlds := `(?:(?i)` + punycode + `|` + anyOf(append(asciiTLDs, PseudoTLDs...)...) + `\b|` + anyOf(unicodeTLDs...) + `)`
|
|
|
|
domain := subdomain + tlds
|
2021-08-12 20:03:24 +01:00
|
|
|
|
2022-09-28 18:30:40 +01:00
|
|
|
hostName := `(?:` + domain + `|\[` + ipv6Addr + `\]|\b` + ipv4Addr + `\b)`
|
|
|
|
webURL := hostName + port + `(?:/` + pathCont + `|/)?`
|
|
|
|
email := `[a-zA-Z0-9._%\-+]+@` + domain
|
|
|
|
return strictExp() + `|` + webURL + `|` + email + `|` + ipv6AddrMinusEmpty
|
2021-08-12 20:03:24 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Strict produces a regexp that matches any URL with a scheme in either the
|
|
|
|
// Schemes or SchemesNoAuthority lists.
|
|
|
|
func Strict() *regexp.Regexp {
|
2022-09-28 18:30:40 +01:00
|
|
|
strictInit.Do(func() {
|
|
|
|
strictRe = regexp.MustCompile(strictExp())
|
|
|
|
strictRe.Longest()
|
|
|
|
})
|
|
|
|
return strictRe
|
2021-08-12 20:03:24 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Relaxed produces a regexp that matches any URL matched by Strict, plus any
|
|
|
|
// URL with no scheme or email address.
|
|
|
|
func Relaxed() *regexp.Regexp {
|
2022-09-28 18:30:40 +01:00
|
|
|
relaxedInit.Do(func() {
|
|
|
|
relaxedRe = regexp.MustCompile(relaxedExp())
|
|
|
|
relaxedRe.Longest()
|
|
|
|
})
|
|
|
|
return relaxedRe
|
2021-08-12 20:03:24 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// StrictMatchingScheme produces a regexp similar to Strict, but requiring that
|
|
|
|
// the scheme match the given regular expression. See AnyScheme too.
|
|
|
|
func StrictMatchingScheme(exp string) (*regexp.Regexp, error) {
|
2022-09-28 18:30:40 +01:00
|
|
|
strictMatching := `(?i)(?:` + exp + `)(?-i)` + pathCont
|
2021-08-12 20:03:24 +01:00
|
|
|
re, err := regexp.Compile(strictMatching)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
re.Longest()
|
|
|
|
return re, nil
|
|
|
|
}
|