2023-03-12 16:00:57 +01:00
// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
2021-09-01 18:29:25 +02:00
package regexes
import (
2022-05-07 17:55:27 +02:00
"bytes"
2021-09-01 18:29:25 +02:00
"regexp"
2022-05-07 17:55:27 +02:00
"sync"
"mvdan.cc/xurls/v2"
2021-09-01 18:29:25 +02:00
)
// Building blocks for the package's regular expressions.
//
// The first group holds literal URI path segments; the second holds
// reusable pattern fragments; the third composes both into anchored
// path patterns. All patterns use raw string literals, so every
// character between the backticks (including whitespace) is
// significant to the regex engine.
const (
	users     = "users"
	actors    = "actors"
	statuses  = "statuses"
	inbox     = "inbox"
	outbox    = "outbox"
	followers = "followers"
	following = "following"
	liked     = "liked"
	publicKey = "main-key"
	follow    = "follow"
	blocks    = "blocks"
	reports   = "reports"

	schemes                  = `(http|https)://`                                         // Allowed URI protocols for parsing links in text.
	alphaNumeric             = `\p{L}\p{M}*|\p{N}`                                       // A single number or script character in any language, including chars with accents.
	usernameGrp              = `(?:` + alphaNumeric + `|\.|\-|\_)`                       // Non-capturing group that matches against a single valid username character.
	domainGrp                = `(?:` + alphaNumeric + `|\.|\-|\:)`                       // Non-capturing group that matches against a single valid domain character.
	mentionName              = `^@(` + usernameGrp + `+)(?:@(` + domainGrp + `+))?$`     // Extract parts of one mention, maybe including domain.
	mentionFinder            = `(?:^|\s)(@` + usernameGrp + `+(?:@` + domainGrp + `+)?)` // Extract all mentions from a text, each mention may include domain.
	emojiShortcode           = `\w{2,30}`                                                // Pattern for emoji shortcodes. maximumEmojiShortcodeLength = 30
	emojiFinder              = `(?:\b)?:(` + emojiShortcode + `):(?:\b)?`                // Extract all emoji shortcodes from a text.
	emojiValidator           = `^` + emojiShortcode + `$`                                // Validate a single emoji shortcode.
	usernameStrict           = `^[a-z0-9_]{1,64}$`                                       // Pattern for usernames on THIS instance. maximumUsernameLength = 64
	usernameRelaxed          = `[a-z0-9_\.]{1,}`                                         // Relaxed version of username that can match instance accounts too.
	misskeyReportNotesFinder = `(?m)(?:^Note: ((?:http|https):\/\/.*)$)`                 // Extract reported Note URIs from the text of a Misskey report/flag.
	ulid                     = `[0123456789ABCDEFGHJKMNPQRSTVWXYZ]{26}`                  // Pattern for ULID.
	ulidValidate             = `^` + ulid + `$`                                          // Validate one ULID.

	/*
		Path parts / capture.

		Every pattern below is anchored at both ends and tolerates
		an optional leading slash. User-scoped paths share
		userPathPrefix, whose single capture group is the username.
	*/

	userPathPrefix    = `^/?` + users + `/(` + usernameRelaxed + `)`
	userPath          = userPathPrefix + `$`
	userWebPathPrefix = `^/?` + `@(` + usernameRelaxed + `)`
	userWebPath       = userWebPathPrefix + `$`
	publicKeyPath     = userPathPrefix + `/` + publicKey + `$`
	inboxPath         = userPathPrefix + `/` + inbox + `$`
	outboxPath        = userPathPrefix + `/` + outbox + `$`
	followersPath     = userPathPrefix + `/` + followers + `$`
	followingPath     = userPathPrefix + `/` + following + `$`
	likedPath         = userPathPrefix + `/` + liked + `$`
	followPath        = userPathPrefix + `/` + follow + `/(` + ulid + `)$`
	likePath          = userPathPrefix + `/` + liked + `/(` + ulid + `)$`
	statusesPath      = userPathPrefix + `/` + statuses + `/(` + ulid + `)$`
	blockPath         = userPathPrefix + `/` + blocks + `/(` + ulid + `)$`
	reportPath        = `^/?` + reports + `/(` + ulid + `)$`
	filePath          = `^/?(` + ulid + `)/([a-z]+)/([a-z]+)/(` + ulid + `)\.([a-z0-9]+)$`
)
var (
2022-05-07 17:55:27 +02:00
// LinkScheme captures http/https schemes in URLs.
LinkScheme = func ( ) * regexp . Regexp {
rgx , err := xurls . StrictMatchingScheme ( schemes )
if err != nil {
panic ( err )
}
return rgx
} ( )
2023-05-07 19:53:21 +02:00
// MentionName captures the username and domain part from
// a mention string such as @whatever_user@example.org,
// returning whatever_user and example.org (without the @ symbols).
// Will also work for characters with umlauts and other accents.
// See: https://regex101.com/r/9tjNUy/1 for explanation and examples.
2021-09-01 18:29:25 +02:00
MentionName = regexp . MustCompile ( mentionName )
2023-05-07 19:53:21 +02:00
// MentionFinder extracts whole mentions from a piece of text.
2021-09-01 18:29:25 +02:00
MentionFinder = regexp . MustCompile ( mentionFinder )
2024-02-20 11:46:04 +01:00
// EmojiValidator validates an emoji shortcode.
EmojiValidator = regexp . MustCompile ( emojiValidator )
2021-09-01 18:29:25 +02:00
// EmojiFinder extracts emoji strings from a piece of text.
2023-05-07 19:53:21 +02:00
// See: https://regex101.com/r/478XGM/1
EmojiFinder = regexp . MustCompile ( emojiFinder )
2021-09-01 18:29:25 +02:00
2023-05-07 19:53:21 +02:00
// Username can be used to validate usernames of new signups on this instance.
Username = regexp . MustCompile ( usernameStrict )
2021-09-01 18:29:25 +02:00
2023-05-07 19:53:21 +02:00
// MisskeyReportNotes captures a list of Note URIs from report content created by Misskey.
// See: https://regex101.com/r/EnTOBV/1
MisskeyReportNotes = regexp . MustCompile ( misskeyReportNotesFinder )
2023-01-27 14:48:11 +01:00
2023-05-07 19:53:21 +02:00
// UserPath validates and captures the username part from eg /users/example_username.
UserPath = regexp . MustCompile ( userPath )
2021-09-01 18:29:25 +02:00
2024-01-26 14:17:10 +01:00
// UserWebPath validates and captures the username part from eg /@example_username.
UserWebPath = regexp . MustCompile ( userWebPath )
2021-09-01 18:29:25 +02:00
// PublicKeyPath parses a path that validates and captures the username part from eg /users/example_username/main-key
PublicKeyPath = regexp . MustCompile ( publicKeyPath )
// InboxPath parses a path that validates and captures the username part from eg /users/example_username/inbox
InboxPath = regexp . MustCompile ( inboxPath )
// OutboxPath parses a path that validates and captures the username part from eg /users/example_username/outbox
OutboxPath = regexp . MustCompile ( outboxPath )
// FollowersPath parses a path that validates and captures the username part from eg /users/example_username/followers
FollowersPath = regexp . MustCompile ( followersPath )
// FollowingPath parses a path that validates and captures the username part from eg /users/example_username/following
FollowingPath = regexp . MustCompile ( followingPath )
2023-05-07 19:53:21 +02:00
// LikedPath parses a path that validates and captures the username part from eg /users/example_username/liked
LikedPath = regexp . MustCompile ( likedPath )
2021-09-01 18:29:25 +02:00
// ULID parses and validate a ULID.
2023-05-07 19:53:21 +02:00
ULID = regexp . MustCompile ( ulidValidate )
2021-09-01 18:29:25 +02:00
2023-05-07 19:53:21 +02:00
// FollowPath parses a path that validates and captures the username part and the ulid part
// from eg /users/example_username/follow/01F7XT5JZW1WMVSW1KADS8PVDH
FollowPath = regexp . MustCompile ( followPath )
2021-09-01 18:29:25 +02:00
// LikePath parses a path that validates and captures the username part and the ulid part
2023-05-07 19:53:21 +02:00
// from eg /users/example_username/liked/01F7XT5JZW1WMVSW1KADS8PVDH
2021-09-01 18:29:25 +02:00
LikePath = regexp . MustCompile ( likePath )
// StatusesPath parses a path that validates and captures the username part and the ulid part
// from eg /users/example_username/statuses/01F7XT5JZW1WMVSW1KADS8PVDH
// The regex can be played with here: https://regex101.com/r/G9zuxQ/1
StatusesPath = regexp . MustCompile ( statusesPath )
// BlockPath parses a path that validates and captures the username part and the ulid part
// from eg /users/example_username/blocks/01F7XT5JZW1WMVSW1KADS8PVDH
BlockPath = regexp . MustCompile ( blockPath )
2022-11-25 18:23:42 +01:00
2023-01-10 15:19:05 +01:00
// ReportPath parses a path that validates and captures the ulid part
// from eg /reports/01GP3AWY4CRDVRNZKW0TEAMB5R
ReportPath = regexp . MustCompile ( reportPath )
2022-11-25 18:23:42 +01:00
// FilePath parses a file storage path of the form [ACCOUNT_ID]/[MEDIA_TYPE]/[MEDIA_SIZE]/[FILE_NAME]
// eg 01F8MH1H7YV1Z7D2C8K2730QBF/attachment/small/01F8MH8RMYQ6MSNY3JM2XT1CQ5.jpeg
// It captures the account id, media type, media size, file name, and file extension, eg
// `01F8MH1H7YV1Z7D2C8K2730QBF`, `attachment`, `small`, `01F8MH8RMYQ6MSNY3JM2XT1CQ5`, `jpeg`.
FilePath = regexp . MustCompile ( filePath )
2021-09-01 18:29:25 +02:00
)
2022-05-07 17:55:27 +02:00
// bufpool hands out reusable *bytes.Buffer scratch space to the
// regex helper functions in this package. A freshly created buffer
// is empty and starts with 512 bytes of capacity.
var bufpool = sync.Pool{
	New: func() any {
		return bytes.NewBuffer(make([]byte, 0, 512))
	},
}

// ReplaceAllStringFunc wraps rgx.ReplaceAllStringFunc, additionally
// handing repl a pooled byte buffer it can use to assemble each
// replacement string.
//
// The same buffer is passed for every match in src and is reset
// immediately before each call to repl, so repl always starts with
// an empty buffer. The buffer goes back to the pool when this
// function returns; repl must not retain it beyond its own call.
func ReplaceAllStringFunc(rgx *regexp.Regexp, src string, repl func(match string, buf *bytes.Buffer) string) string {
	scratch := bufpool.Get().(*bytes.Buffer) //nolint
	defer bufpool.Put(scratch)

	// Adapt the two-argument repl to the single-argument
	// callback signature expected by the regexp package.
	withScratch := func(found string) string {
		scratch.Reset() // drop whatever the previous match wrote
		return repl(found, scratch)
	}

	return rgx.ReplaceAllStringFunc(src, withScratch)
}