gotosocial/internal/text/sanitize.go

// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

package text

import (
	"html"
	"regexp"
	"strings"

	"github.com/microcosm-cc/bluemonday"
)

// regular is the HTML policy used for sanitizing text
// while letting a curated set of elements through. It is
// modelled on bluemonday's default UGC policy, adjusted
// to our own needs.
// See: https://github.com/microcosm-cc/bluemonday#usage
var regular = func() *bluemonday.Policy {
	// Attribute value patterns used further down; compiled
	// once here, when the package-level policy is built.
	var (
		// Empty value, or "open" in any letter case.
		detailsOpen = regexp.MustCompile(`(?i)^(|open)$`)

		// "language-" followed by one or more ASCII alphanumerics.
		codeLanguage = regexp.MustCompile("^language-[a-zA-Z0-9]+$")
	)

	policy := bluemonday.NewPolicy()

	// Enable the standard attributes ("id", "title", and the
	// language related "dir" and "lang") on every element
	// that this policy ends up allowing.
	policy.AllowStandardAttributes()

	/*
		LAYOUT AND FORMATTING
	*/

	// Permit "article" and "aside", without extra attributes.
	// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/aside
	policy.AllowElements("article", "aside")

	// Permit "details" along with its "open" attribute, whose
	// value must be either empty or the word "open".
	// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/details
	policy.AllowAttrs("open").Matching(detailsOpen).OnElements("details")

	// Permit "section", without extra attributes.
	// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/section
	policy.AllowElements("section")

	// Permit "summary", without extra attributes.
	// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/summary
	policy.AllowElements("summary")

	// Permit headings "h1" to "h6", without extra attributes.
	policy.AllowElements("h1", "h2", "h3", "h4", "h5", "h6")

	// Permit "hgroup", without extra attributes.
	// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/hgroup
	policy.AllowElements("hgroup")

	// Permit "blockquote" with a "cite" attribute,
	// which is expected to hold a standard URL.
	policy.AllowAttrs("cite").OnElements("blockquote")

	// Permit basic structural elements, without extra attributes.
	policy.AllowElements("br", "div", "hr", "p", "span", "wbr")

	// Permit the inline phrasing elements.
	policy.AllowElements("abbr", "acronym", "cite", "code", "dfn", "em",
		"figcaption", "mark", "s", "samp", "strong", "sub", "sup", "var")

	// Permit "q"; its "cite" attribute is a URL
	// and so falls under the URL policies below.
	policy.AllowAttrs("cite").OnElements("q")

	// Permit "time" with an ISO 8601 "datetime" attribute.
	policy.AllowAttrs("datetime").Matching(bluemonday.ISO8601).OnElements("time")

	// Permit purely presentational block and inline
	// elements: bold, italics, underline, strikethrough
	// and friends, which carry no semantic meaning.
	policy.AllowElements("b", "i", "pre", "small", "strike", "tt", "u")

	// Permit "del" and "ins" with their
	// "cite" and "datetime" attributes.
	policy.AllowAttrs("cite").Matching(bluemonday.Paragraph).OnElements("del", "ins")
	policy.AllowAttrs("datetime").Matching(bluemonday.ISO8601).OnElements("del", "ins")

	// Permit ordered, unordered, and definition lists.
	policy.AllowLists()

	// Mentions rely on "class" being kept on "span". Assembled, they look like:
	// `<span class="h-card"><a href="https://example.org/users/targetAccount" class="u-url mention">@<span>someusername</span></a></span>`
	policy.AllowAttrs("class").OnElements("span")

	/*
		LANGUAGE FORMATTING
	*/

	// Permit "dir" on "bdi" and "bdo".
	// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/dir
	policy.AllowAttrs("dir").Matching(bluemonday.Direction).OnElements("bdi", "bdo")

	// Permit the ruby annotation elements. See:
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/rp
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/rt
	// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/ruby
	policy.AllowElements("rp", "rt", "ruby")

	/*
		CODE BLOCKS
	*/

	// Keep language tags ("language-xyz" classes) on "code".
	policy.AllowAttrs("class").Matching(codeLanguage).OnElements("code")

	// Intended to leave HTML inside code blocks alone.
	// NOTE(review): confirm that this matches bluemonday's
	// documented semantics for SkipElementsContent.
	policy.SkipElementsContent("code", "pre")

	/*
		LINKS AND LINK SAFETY.
	*/

	// Permit hyperlinks, keeping "class", "href" and "rel".
	policy.AllowAttrs("class", "href", "rel").OnElements("a")

	// Only accept URLs that net/url.Parse() can parse.
	policy.RequireParseableURLs(true)

	// Restrict URLs to the most common schemes.
	policy.AllowURLSchemes("mailto", "http", "https")

	// Require rel="noreferrer" on links.
	// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel/noreferrer
	policy.RequireNoReferrerOnLinks(true)

	// Require rel="nofollow" on fully qualified (not relative) links.
	// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel#nofollow
	policy.RequireNoFollowOnFullyQualifiedLinks(true)

	// Require crossorigin="anonymous".
	// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/crossorigin#anonymous
	policy.RequireCrossOriginAnonymous(true)

	// Add target="_blank" to fully qualified links.
	// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/a#target
	policy.AddTargetBlankToFullyQualifiedLinks(true)

	return policy
}()

// strict is bluemonday's StrictPolicy. In the words
// of the bluemonday documentation, this policy:
//
// '[C]an be thought of as equivalent to stripping all HTML
// elements and their attributes as it has nothing on its allowlist.
// An example usage scenario would be blog post titles where HTML
// tags are not expected at all and if they are then the elements
// and the content of the elements should be stripped. This is a
// very strict policy.'
//
// Source: https://github.com/microcosm-cc/bluemonday#usage
var strict = bluemonday.StrictPolicy()

// removeHTML passes the given string through the
// strict policy, which has nothing on its allowlist,
// and returns the sanitized result.
func removeHTML(in string) string {
	stripped := strict.Sanitize(in)
	return stripped
}

// SanitizeToHTML passes the given string through the
// regular policy and returns the result: HTML that the
// policy permits is kept, anything else is sanitized away.
func SanitizeToHTML(in string) string {
	sanitized := regular.Sanitize(in)
	return sanitized
}

// SanitizeToPlaintext converts the given string to plaintext:
// HTML entities are unescaped, all detected HTML is removed,
// entities are unescaped once more, and surrounding whitespace
// is trimmed from the result.
func SanitizeToPlaintext(in string) string {
	// Unescape before stripping, so that HTML hidden
	// behind entities is visible to the strict policy.
	unescaped := html.UnescapeString(in)

	// Strip all detected HTML.
	stripped := removeHTML(unescaped)

	// Unescape the sanitized output to get back to
	// plaintext, then drop surrounding whitespace.
	plain := html.UnescapeString(stripped)
	return strings.TrimSpace(plain)
}
[chore] Improve copyright header handling (#1608) * [chore] Remove years from all license headers Years or year ranges aren't required in license headers. Many projects have removed them in recent years and it avoids a bit of yearly toil. In many cases our copyright claim was also a bit dodgy since we added the 2021-2023 header to files created after 2021 but you can't claim copyright into the past that way. * [chore] Add license header check This ensures a license header is always added to any new file. This avoids maintainers/reviewers needing to remember to check for and ask for it in case a contribution doesn't include it. * [chore] Add missing license headers * [chore] Further updates to license header * Use the more common // indentend comment format * Remove the hack we had for the linter now that we use the // format * Add SPDX license identifier 2023-03-12 16:00:57 +01:00			`// GoToSocial`
			`// Copyright (C) GoToSocial Authors admin@gotosocial.org`
			`// SPDX-License-Identifier: AGPL-3.0-or-later`
			`//`
			`// This program is free software: you can redistribute it and/or modify`
			`// it under the terms of the GNU Affero General Public License as published by`
			`// the Free Software Foundation, either version 3 of the License, or`
			`// (at your option) any later version.`
			`//`
			`// This program is distributed in the hope that it will be useful,`
			`// but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`// GNU Affero General Public License for more details.`
			`//`
			`// You should have received a copy of the GNU Affero General Public License`
			`// along with this program. If not, see <http://www.gnu.org/licenses/>.`
sanitize html for statuses + instance (#97) * sanitize html for statuses + instance * sanitization 2021-07-13 16:03:51 +02:00
Markdown Statuses (#116) * parse markdown statuses if desired * add some preliminary docs for writing posts 2021-07-26 20:25:54 +02:00			`package text`
sanitize html for statuses + instance (#97) * sanitize html for statuses + instance * sanitization 2021-07-13 16:03:51 +02:00
			`import (`
[bugfix] html escape special characters in text instead of totally removing them (#719) * remove minify dependency * tidy up some tests * remove pre + postformat funcs * rework sanitization + formatting * update tests * add some more markdown tests 2022-07-19 15:21:17 +02:00			`"html"`
Text/status parsing fixes (#141) * aaaaaa * vendor minify * update + test markdown parsing 2021-08-16 19:17:56 +02:00			`"regexp"`
[bugfix] html escape special characters in text instead of totally removing them (#719) * remove minify dependency * tidy up some tests * remove pre + postformat funcs * rework sanitization + formatting * update tests * add some more markdown tests 2022-07-19 15:21:17 +02:00			`"strings"`
Text/status parsing fixes (#141) * aaaaaa * vendor minify * update + test markdown parsing 2021-08-16 19:17:56 +02:00
sanitize html for statuses + instance (#97) * sanitize html for statuses + instance * sanitization 2021-07-13 16:03:51 +02:00			`"github.com/microcosm-cc/bluemonday"`
			`)`

[bugfix] Use custom bluemonday policy to disallow inline img tags (#2100) 2023-08-11 14:40:11 +02:00			`// Regular HTML policy is an adapted version of the default`
			`// bluemonday UGC policy, with some tweaks of our own.`
			`// See: https://github.com/microcosm-cc/bluemonday#usage`
			`var regular bluemonday.Policy = func() bluemonday.Policy {`
			`p := bluemonday.NewPolicy()`

			`// AllowStandardAttributes will enable "id", "title" and`
			`// the language specific attributes "dir" and "lang" on`
			`// all elements that are allowed`
			`p.AllowStandardAttributes()`

			`/*`
			`LAYOUT AND FORMATTING`
			`*/`

			`// "aside" is permitted and takes no attributes.`
			`// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/aside`
			`p.AllowElements("article", "aside")`

			`// "details" is permitted, including the "open" attribute`
			`// which can either be blank or the value "open".`
			`// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/details`
			p.AllowAttrs("open").Matching(regexp.MustCompile(`(?i)^(\|open)$`)).OnElements("details")

			`// "section" is permitted and takes no attributes.`
			`// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/section`
			`p.AllowElements("section")`

			`// "summary" is permitted and takes no attributes.`
			`// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/summary`
			`p.AllowElements("summary")`

			`// "h1" through "h6" are permitted and take no attributes.`
			`p.AllowElements("h1", "h2", "h3", "h4", "h5", "h6")`

			`// "hgroup" is permitted and takes no attributes.`
			`// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/hgroup`
			`p.AllowElements("hgroup")`

			`// "blockquote" is permitted, including the "cite"`
			`// attribute which must be a standard URL.`
			`p.AllowAttrs("cite").OnElements("blockquote")`

			`// "br" "div" "hr" "p" "span" "wbr" are permitted and take no attributes`
			`p.AllowElements("br", "div", "hr", "p", "span", "wbr")`

			`// The following are all inline phrasing elements:`
			`p.AllowElements("abbr", "acronym", "cite", "code", "dfn", "em",`
			`"figcaption", "mark", "s", "samp", "strong", "sub", "sup", "var")`

			`// "q" is permitted and "cite" is a URL and handled by URL policies`
			`p.AllowAttrs("cite").OnElements("q")`

			`// "time" is permitted`
			`p.AllowAttrs("datetime").Matching(bluemonday.ISO8601).OnElements("time")`

			`// Block and inline elements that impart no`
			`// semantic meaning but style the document.`
			`// Underlines, italics, bold, strikethrough etc.`
			`p.AllowElements("b", "i", "pre", "small", "strike", "tt", "u")`

			`// "del" "ins" are permitted`
			`p.AllowAttrs("cite").Matching(bluemonday.Paragraph).OnElements("del", "ins")`
			`p.AllowAttrs("datetime").Matching(bluemonday.ISO8601).OnElements("del", "ins")`

			`// Enable ordered, unordered, and definition lists.`
			`p.AllowLists()`

			`// Class needed on span for mentions, which look like this when assembled:`
			// `<span class="h-card"><a href="https://example.org/users/targetAccount" class="u-url mention">@<span>someusername</span></a></span>`
			`p.AllowAttrs("class").OnElements("span")`

			`/*`
			`LANGUAGE FORMATTING`
			`*/`

			`// "bdi" "bdo" are permitted on "dir".`
			`// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/dir`
			`p.AllowAttrs("dir").Matching(bluemonday.Direction).OnElements("bdi", "bdo")`

			`// "rp" "rt" "ruby" are permitted. See:`
			`// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/rp`
			`// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/rt`
			`// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/ruby`
			`p.AllowElements("rp", "rt", "ruby")`

			`/*`
			`CODE BLOCKS`
			`*/`

			`// Permit language tags for code elements.`
			`p.AllowAttrs("class").Matching(regexp.MustCompile("^language-[a-zA-Z0-9]+$")).OnElements("code")`

			`// Don't sanitize HTML inside code blocks.`
			`p.SkipElementsContent("code", "pre")`

			`/*`
			`LINKS AND LINK SAFETY.`
			`*/`

			`// Permit hyperlinks.`
			`p.AllowAttrs("class", "href", "rel").OnElements("a")`

			`// URLs must be parseable by net/url.Parse().`
			`p.RequireParseableURLs(true)`

			`// Most common URL schemes only.`
			`p.AllowURLSchemes("mailto", "http", "https")`

			`// Force rel="noreferrer".`
			`// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel/noreferrer`
			`p.RequireNoReferrerOnLinks(true)`

			`// Add rel="nofollow" on all fully qualified (not relative) links.`
			`// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel#nofollow`
			`p.RequireNoFollowOnFullyQualifiedLinks(true)`

			`// Force crossorigin="anonymous"`
			`// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/crossorigin#anonymous`
			`p.RequireCrossOriginAnonymous(true)`

			`// Force target="_blank".`
			`// See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/a#target`
			`p.AddTargetBlankToFullyQualifiedLinks(true)`

			`return p`
			`}()`

			`// '[C]an be thought of as equivalent to stripping all HTML`
			`// elements and their attributes as it has nothing on its allowlist.`
			`// An example usage scenario would be blog post titles where HTML`
			`// tags are not expected at all and if they are then the elements`
			`// and the content of the elements should be stripped. This is a`
			`// very strict policy.'`
sanitize html for statuses + instance (#97) * sanitize html for statuses + instance * sanitization 2021-07-13 16:03:51 +02:00			`//`
			`// Source: https://github.com/microcosm-cc/bluemonday#usage`
			`var strict *bluemonday.Policy = bluemonday.StrictPolicy()`

[bugfix] Use custom bluemonday policy to disallow inline img tags (#2100) 2023-08-11 14:40:11 +02:00			`// removeHTML strictly removes all recognized`
			`// HTML elements from the given string.`
[bugfix] Fix HTML escaping in instance title (#607) * move caption sanitization -> sanitize.go * use sanitizeplaintext rather than removehtml * rename sanitizecaption to sanitizeplaintext * avoid removing html twice from statuses * unexport remoteHTML it's no longer used outside the text package so this makes it less confusing * test instance PATCH 2022-05-26 11:37:13 +02:00			`func removeHTML(in string) string {`
			`return strict.Sanitize(in)`
			`}`

[bugfix] Use custom bluemonday policy to disallow inline img tags (#2100) 2023-08-11 14:40:11 +02:00			`// SanitizeToHTML sanitizes only risky html elements`
			`// from the given string, allowing safe ones through.`
			`func SanitizeToHTML(in string) string {`
sanitize html for statuses + instance (#97) * sanitize html for statuses + instance * sanitization 2021-07-13 16:03:51 +02:00			`return regular.Sanitize(in)`
			`}`

[bugfix] Use custom bluemonday policy to disallow inline img tags (#2100) 2023-08-11 14:40:11 +02:00			`// SanitizeToPlaintext runs text through basic sanitization.`
			`// This removes any html elements that were in the string,`
			`// and returns clean plaintext.`
			`func SanitizeToPlaintext(in string) string {`
			`// Unescape first to catch any tricky critters.`
[bugfix] html escape special characters in text instead of totally removing them (#719) * remove minify dependency * tidy up some tests * remove pre + postformat funcs * rework sanitization + formatting * update tests * add some more markdown tests 2022-07-19 15:21:17 +02:00			`content := html.UnescapeString(in)`
[bugfix] Use custom bluemonday policy to disallow inline img tags (#2100) 2023-08-11 14:40:11 +02:00
			`// Remove all detected HTML.`
[bugfix] Fix HTML escaping in instance title (#607) * move caption sanitization -> sanitize.go * use sanitizeplaintext rather than removehtml * rename sanitizecaption to sanitizeplaintext * avoid removing html twice from statuses * unexport remoteHTML it's no longer used outside the text package so this makes it less confusing * test instance PATCH 2022-05-26 11:37:13 +02:00			`content = removeHTML(content)`
[bugfix] Use custom bluemonday policy to disallow inline img tags (#2100) 2023-08-11 14:40:11 +02:00
			`// Unescape again to return plaintext.`
[bugfix] html escape special characters in text instead of totally removing them (#719) * remove minify dependency * tidy up some tests * remove pre + postformat funcs * rework sanitization + formatting * update tests * add some more markdown tests 2022-07-19 15:21:17 +02:00			`content = html.UnescapeString(content)`
			`return strings.TrimSpace(content)`
sanitize html for statuses + instance (#97) * sanitize html for statuses + instance * sanitization 2021-07-13 16:03:51 +02:00			`}`