From 52109776f63ac59b2fef5cd7417becd9f0007acb Mon Sep 17 00:00:00 2001 From: ugla Date: Tue, 15 Nov 2022 16:05:34 +0100 Subject: [PATCH] [bugfix] Fix unicode-unaware word boundary check in hashtags (#1049) * [bugfix] Fix unicode-unaware word boundary check in hashtag regex Go `\b` does not care for Unicode, and without lookahead, the workarounds got very ugly. So I replaced the regex with a parser. The parser runs in O(n) time and performance should not be affected. * [bugfix] Add back hashtag max length and add tests for it --- internal/regexes/regexes.go | 9 +--- internal/text/common.go | 48 ++++++++++------- internal/util/statustools.go | 90 ++++++++++++++++++++++++++++--- internal/util/statustools_test.go | 44 +++++++++++---- 4 files changed, 146 insertions(+), 45 deletions(-) diff --git a/internal/regexes/regexes.go b/internal/regexes/regexes.go index dd3d9ce40..c9286611e 100644 --- a/internal/regexes/regexes.go +++ b/internal/regexes/regexes.go @@ -47,7 +47,6 @@ const ( const ( maximumUsernameLength = 64 maximumEmojiShortcodeLength = 30 - maximumHashtagLength = 30 ) var ( @@ -66,17 +65,11 @@ var ( // such as @whatever_user@example.org, returning whatever_user and example.org (without the @ symbols) MentionName = regexp.MustCompile(mentionName) - // mention regex can be played around with here: https://regex101.com/r/G1oGR0/1 + // mention regex can be played around with here: https://regex101.com/r/P0vpYG/1 mentionFinder = `(?:^|\s)(@\w+(?:@[a-zA-Z0-9_\-\.]+)?)` // MentionFinder extracts mentions from a piece of text. MentionFinder = regexp.MustCompile(mentionFinder) - // hashtag regex can be played with here: https://regex101.com/r/bpyGlj/1 - hashtagFinder = fmt.Sprintf(`(?:^|\s)(?:#*)(#[\p{L}\p{N}]{1,%d})(?:#|\b)`, maximumHashtagLength) - // HashtagFinder finds possible hashtags in a string. - // It returns just the string part of the hashtag, not the # symbol. - HashtagFinder = regexp.MustCompile(hashtagFinder) - emojiShortcode = fmt.Sprintf(`\w{2,%d}`, maximumEmojiShortcodeLength) // EmojiShortcode validates an emoji name. EmojiShortcode = regexp.MustCompile(fmt.Sprintf("^%s$", emojiShortcode)) diff --git a/internal/text/common.go b/internal/text/common.go index 005f9dfe1..ca4b97465 100644 --- a/internal/text/common.go +++ b/internal/text/common.go @@ -27,36 +27,46 @@ import ( "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" "github.com/superseriousbusiness/gotosocial/internal/log" "github.com/superseriousbusiness/gotosocial/internal/regexes" + "github.com/superseriousbusiness/gotosocial/internal/util" ) func (f *formatter) ReplaceTags(ctx context.Context, in string, tags []*gtsmodel.Tag) string { - return regexes.ReplaceAllStringFunc(regexes.HashtagFinder, in, func(match string, buf *bytes.Buffer) string { - // we have a match - matchTrimmed := strings.TrimSpace(match) - tagAsEntered := matchTrimmed[1:] + spans := util.FindHashtagSpansInText(in) + + if len(spans) == 0 { + return in + } + + var b strings.Builder + i := 0 + +spans: + for _, t := range spans { + b.WriteString(in[i:t.First]) + i = t.Second + tagAsEntered := in[t.First+1 : t.Second] - // check through the tags to find what we're matching for _, tag := range tags { if strings.EqualFold(tagAsEntered, tag.Name) { - // Add any dropped space from match - if unicode.IsSpace(rune(match[0])) { - buf.WriteByte(match[0]) - } - // replace the #tag with the formatted tag content // ` - buf.WriteString(``) - return buf.String() + b.WriteString(``) + continue spans } } - // the match wasn't in the list of tags for whatever reason, so just return the match as we found it so nothing changes - return match - }) + b.WriteString(in[t.First:t.Second]) + } + + // Get the last bits. + i = spans[len(spans)-1].Second + b.WriteString(in[i:]) + + return b.String() } func (f *formatter) ReplaceMentions(ctx context.Context, in string, mentions []*gtsmodel.Mention) string { diff --git a/internal/util/statustools.go b/internal/util/statustools.go index b2b7fffa1..b1fd7968b 100644 --- a/internal/util/statustools.go +++ b/internal/util/statustools.go @@ -19,11 +19,16 @@ package util import ( - "strings" + "unicode" + "unicode/utf8" "github.com/superseriousbusiness/gotosocial/internal/regexes" ) +const ( + maximumHashtagLength = 30 +) + // DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text, // and applies a regex to it to return a deduplicated list of account names // mentioned in that text, in the format "@user@example.org" or "@username" for @@ -36,16 +41,71 @@ func DeriveMentionNamesFromText(text string) []string { return UniqueStrings(mentionedAccounts) } -// DeriveHashtagsFromText takes a plaintext (ie., not html-formatted) text, -// and applies a regex to it to return a deduplicated list of hashtags -// used in that text, without the leading #. The case of the returned -// tags will be lowered, for consistency. +type Pair[A, B any] struct { + First A + Second B +} + +// Byte index in original string +// `First` includes `#`. +type Span = Pair[int, int] + +// Takes a plaintext (ie., not HTML-formatted) text, +// and returns a slice of unique hashtags. func DeriveHashtagsFromText(text string) []string { + tagsMap := make(map[string]bool) tags := []string{} - for _, m := range regexes.HashtagFinder.FindAllStringSubmatch(text, -1) { - tags = append(tags, strings.TrimPrefix(m[1], "#")) + + for _, v := range FindHashtagSpansInText(text) { + t := text[v.First+1 : v.Second] + if _, value := tagsMap[t]; !value { + tagsMap[t] = true + tags = append(tags, t) + } + } + + return tags +} + +// Takes a plaintext (ie., not HTML-formatted) text, +// and returns a list of pairs of indices into the original string, where +// hashtags are located. +func FindHashtagSpansInText(text string) []Span { + tags := []Span{} + start := 0 + // Keep one rune of lookbehind. + prev := ' ' + inTag := false + + for i, r := range text { + if r == '#' && isHashtagBoundary(prev) { + // Start of hashtag. + inTag = true + start = i + } else if inTag && !isPermittedInHashtag(r) && !isHashtagBoundary(r) { + // Inside the hashtag, but it was a phoney, gottem. + inTag = false + } else if inTag && isHashtagBoundary(r) { + // End of hashtag. + inTag = false + appendTag(&tags, text, start, i) + } else if irl := i + utf8.RuneLen(r); inTag && irl == len(text) { + // End of text. + appendTag(&tags, text, start, irl) + } + + prev = r + } + + return tags +} + +func appendTag(tags *[]Span, text string, start int, end int) { + l := end - start - 1 + // This check could be moved out into the parsing loop if necessary! + if 0 < l && l <= maximumHashtagLength { + *tags = append(*tags, Span{First: start, Second: end}) } - return UniqueStrings(tags) } // DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text, @@ -58,3 +118,17 @@ func DeriveEmojisFromText(text string) []string { } return UniqueStrings(emojis) } + +func isPermittedInHashtag(r rune) bool { + return unicode.IsLetter(r) || unicode.IsNumber(r) +} + +// Decides where to break before or after a hashtag. +func isHashtagBoundary(r rune) bool { + return r == '#' || // `###lol` should work + unicode.IsSpace(r) || // All kinds of Unicode whitespace. + unicode.IsControl(r) || // All kinds of control characters, like tab. + // Most kinds of punctuation except "Pc" ("Punctuation, connecting", like `_`). + // But `someurl/#fragment` should not match, neither should HTML entities like `#`. + ('/' != r && '&' != r && !unicode.Is(unicode.Categories["Pc"], r) && unicode.IsPunct(r)) +} diff --git a/internal/util/statustools_test.go b/internal/util/statustools_test.go index d9f344e4b..214fab553 100644 --- a/internal/util/statustools_test.go +++ b/internal/util/statustools_test.go @@ -77,26 +77,50 @@ func (suite *StatusTestSuite) TestDeriveHashtagsOK() { # testing this one shouldn't work - #thisshouldwork + #thisshouldwork #dupe #dupe!! #dupe here's a link with a fragment: https://example.org/whatever#ahhh + here's another link with a fragment: https://example.org/whatever/#ahhh -#ThisShouldAlsoWork #not_this_though +(#ThisShouldAlsoWork) #not_this_though #111111 thisalsoshouldn'twork#### ## -#alimentación, #saúde +#alimentación, #saúde, #lävistää, #ö, #네 +#ThisOneIsThirtyOneCharactersLon... ...ng +#ThisOneIsThirteyCharactersLong ` tags := util.DeriveHashtagsFromText(statusText) - assert.Len(suite.T(), tags, 7) + assert.Len(suite.T(), tags, 12) assert.Equal(suite.T(), "testing123", tags[0]) assert.Equal(suite.T(), "also", tags[1]) assert.Equal(suite.T(), "thisshouldwork", tags[2]) - assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[3]) - assert.Equal(suite.T(), "111111", tags[4]) - assert.Equal(suite.T(), "alimentación", tags[5]) - assert.Equal(suite.T(), "saúde", tags[6]) + assert.Equal(suite.T(), "dupe", tags[3]) + assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[4]) + assert.Equal(suite.T(), "111111", tags[5]) + assert.Equal(suite.T(), "alimentación", tags[6]) + assert.Equal(suite.T(), "saúde", tags[7]) + assert.Equal(suite.T(), "lävistää", tags[8]) + assert.Equal(suite.T(), "ö", tags[9]) + assert.Equal(suite.T(), "네", tags[10]) + assert.Equal(suite.T(), "ThisOneIsThirteyCharactersLong", tags[11]) + + statusText = `#올빼미 hej` + tags = util.DeriveHashtagsFromText(statusText) + assert.Equal(suite.T(), "올빼미", tags[0]) +} + +func (suite *StatusTestSuite) TestHashtagSpansOK() { + statusText := `#0 #3 #8aa` + + spans := util.FindHashtagSpansInText(statusText) + assert.Equal(suite.T(), 0, spans[0].First) + assert.Equal(suite.T(), 2, spans[0].Second) + assert.Equal(suite.T(), 3, spans[1].First) + assert.Equal(suite.T(), 5, spans[1].Second) + assert.Equal(suite.T(), 8, spans[2].First) + assert.Equal(suite.T(), 12, spans[2].Second) } func (suite *StatusTestSuite) TestDeriveEmojiOK() { @@ -127,7 +151,7 @@ Here's some normal text with an :emoji: at the end func (suite *StatusTestSuite) TestDeriveMultiple() { statusText := `Another test @foss_satan@fossbros-anonymous.io - #Hashtag + #HashTag Text` @@ -139,7 +163,7 @@ func (suite *StatusTestSuite) TestDeriveMultiple() { assert.Equal(suite.T(), "@foss_satan@fossbros-anonymous.io", ms[0]) assert.Len(suite.T(), hs, 1) - assert.Equal(suite.T(), "Hashtag", hs[0]) + assert.Contains(suite.T(), hs, "HashTag") assert.Len(suite.T(), es, 0) }