feat(code search): replace fuzzy search with union search for indexer (#6947)
Fuzzy searching for code has been known to be problematic (see #5264) and, in my personal opinion, isn't very useful. Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/6947 Reviewed-by: Gusted <gusted@noreply.codeberg.org> Co-authored-by: Shiny Nematoda <snematoda.751k2@aleeas.com> Co-committed-by: Shiny Nematoda <snematoda.751k2@aleeas.com>
This commit is contained in:
parent
cb46a036aa
commit
3816db68aa
|
@ -28,10 +28,10 @@ type GrepResult struct {
|
|||
HighlightedRanges [][3]int
|
||||
}
|
||||
|
||||
type grepMode int
|
||||
type GrepMode int
|
||||
|
||||
const (
|
||||
FixedGrepMode grepMode = iota
|
||||
FixedGrepMode GrepMode = iota
|
||||
FixedAnyGrepMode
|
||||
RegExpGrepMode
|
||||
)
|
||||
|
@ -43,7 +43,7 @@ type GrepOptions struct {
|
|||
MaxResultLimit int
|
||||
MatchesPerFile int // >= git 2.38
|
||||
ContextLineNumber int
|
||||
Mode grepMode
|
||||
Mode GrepMode
|
||||
Filename string
|
||||
}
|
||||
|
||||
|
|
|
@ -40,10 +40,6 @@ import (
|
|||
const (
|
||||
unicodeNormalizeName = "unicodeNormalize"
|
||||
maxBatchSize = 16
|
||||
// fuzzyDenominator determines the levenshtein distance per each character of a keyword
|
||||
fuzzyDenominator = 4
|
||||
// see https://github.com/blevesearch/bleve/issues/1563#issuecomment-786822311
|
||||
maxFuzziness = 2
|
||||
)
|
||||
|
||||
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
|
||||
|
@ -260,12 +256,14 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
|
|||
keywordQuery query.Query
|
||||
)
|
||||
|
||||
phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
|
||||
phraseQuery.FieldVal = "Content"
|
||||
phraseQuery.Analyzer = repoIndexerAnalyzer
|
||||
keywordQuery = phraseQuery
|
||||
if opts.IsKeywordFuzzy {
|
||||
phraseQuery.Fuzziness = min(maxFuzziness, len(opts.Keyword)/fuzzyDenominator)
|
||||
if opts.Mode == internal.CodeSearchModeUnion {
|
||||
query := bleve.NewDisjunctionQuery()
|
||||
for _, field := range strings.Fields(opts.Keyword) {
|
||||
query.AddQuery(inner_bleve.MatchPhraseQuery(field, "Content", repoIndexerAnalyzer, 0))
|
||||
}
|
||||
keywordQuery = query
|
||||
} else {
|
||||
keywordQuery = inner_bleve.MatchPhraseQuery(opts.Keyword, "Content", repoIndexerAnalyzer, 0)
|
||||
}
|
||||
|
||||
if len(opts.RepoIDs) > 0 {
|
||||
|
@ -325,13 +323,16 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
|
|||
for i, hit := range result.Hits {
|
||||
startIndex, endIndex := -1, -1
|
||||
for _, locations := range hit.Locations["Content"] {
|
||||
if startIndex != -1 && endIndex != -1 {
|
||||
break
|
||||
}
|
||||
location := locations[0]
|
||||
locationStart := int(location.Start)
|
||||
locationEnd := int(location.End)
|
||||
if startIndex < 0 || locationStart < startIndex {
|
||||
startIndex = locationStart
|
||||
}
|
||||
if endIndex < 0 || locationEnd > endIndex {
|
||||
if endIndex < 0 && locationEnd > endIndex {
|
||||
endIndex = locationEnd
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,8 +33,8 @@ const (
|
|||
esRepoIndexerLatestVersion = 2
|
||||
// multi-match-types, currently only 2 types are used
|
||||
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
|
||||
esMultiMatchTypeBestFields = "best_fields"
|
||||
esMultiMatchTypePhrasePrefix = "phrase_prefix"
|
||||
esMultiMatchTypeBestFields = "best_fields"
|
||||
esMultiMatchTypePhrase = "phrase"
|
||||
)
|
||||
|
||||
var _ internal.Indexer = &Indexer{}
|
||||
|
@ -334,8 +334,8 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan
|
|||
|
||||
// Search searches for codes and language stats by given conditions.
|
||||
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
|
||||
searchType := esMultiMatchTypePhrasePrefix
|
||||
if opts.IsKeywordFuzzy {
|
||||
searchType := esMultiMatchTypePhrase
|
||||
if opts.Mode == internal.CodeSearchModeUnion {
|
||||
searchType = esMultiMatchTypeBestFields
|
||||
}
|
||||
|
||||
|
|
|
@ -100,8 +100,8 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
|
|||
Page: 1,
|
||||
PageSize: 10,
|
||||
},
|
||||
Filename: kw.Filename,
|
||||
IsKeywordFuzzy: true,
|
||||
Filename: kw.Filename,
|
||||
Mode: SearchModeUnion,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, kw.IDs, int(total))
|
||||
|
|
|
@ -20,13 +20,27 @@ type Indexer interface {
|
|||
Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error)
|
||||
}
|
||||
|
||||
// CodeSearchMode selects how SearchOptions.Keyword is matched by a code indexer.
type CodeSearchMode int
|
||||
|
||||
const (
|
||||
// CodeSearchModeExact is the zero value; the bleve backend in this change
// matches the whole keyword as a single phrase for it.
CodeSearchModeExact CodeSearchMode = iota
|
||||
// CodeSearchModeUnion makes the bleve backend in this change split the
// keyword on whitespace and match content containing any of the terms.
CodeSearchModeUnion
|
||||
)
|
||||
|
||||
// String returns "union" for CodeSearchModeUnion and "exact" for every
// other value, including values outside the declared constants.
func (mode CodeSearchMode) String() string {
|
||||
if mode == CodeSearchModeUnion {
|
||||
return "union"
|
||||
}
|
||||
return "exact"
|
||||
}
|
||||
|
||||
type SearchOptions struct {
|
||||
RepoIDs []int64
|
||||
Keyword string
|
||||
Language string
|
||||
Filename string
|
||||
|
||||
IsKeywordFuzzy bool
|
||||
Mode CodeSearchMode
|
||||
|
||||
db.Paginator
|
||||
}
|
||||
|
|
|
@ -35,7 +35,14 @@ type SearchResultLanguages = internal.SearchResultLanguages
|
|||
|
||||
type SearchOptions = internal.SearchOptions
|
||||
|
||||
var CodeSearchOptions = [2]string{"exact", "fuzzy"}
|
||||
var CodeSearchOptions = [2]string{"exact", "union"}
|
||||
|
||||
type SearchMode = internal.CodeSearchMode
|
||||
|
||||
const (
|
||||
SearchModeExact = internal.CodeSearchModeExact
|
||||
SearchModeUnion = internal.CodeSearchModeUnion
|
||||
)
|
||||
|
||||
func indices(content string, selectionStartIndex, selectionEndIndex int) (int, int) {
|
||||
startIndex := selectionStartIndex
|
||||
|
@ -206,7 +213,6 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Res
|
|||
}
|
||||
|
||||
// PerformSearch perform a search on a repository
|
||||
// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2
|
||||
func PerformSearch(ctx context.Context, opts *SearchOptions) (int, []*Result, []*SearchResultLanguages, error) {
|
||||
if opts == nil || len(opts.Keyword) == 0 {
|
||||
return 0, nil, nil, nil
|
||||
|
|
|
@ -37,19 +37,17 @@ func Code(ctx *context.Context) {
|
|||
keyword := ctx.FormTrim("q")
|
||||
path := ctx.FormTrim("path")
|
||||
|
||||
isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true)
|
||||
if mode := ctx.FormTrim("mode"); len(mode) > 0 {
|
||||
isFuzzy = mode == "fuzzy"
|
||||
mode := code_indexer.SearchModeExact
|
||||
if m := ctx.FormTrim("mode"); m == "union" ||
|
||||
m == "fuzzy" ||
|
||||
ctx.FormBool("fuzzy") {
|
||||
mode = code_indexer.SearchModeUnion
|
||||
}
|
||||
|
||||
ctx.Data["Keyword"] = keyword
|
||||
ctx.Data["Language"] = language
|
||||
ctx.Data["CodeSearchOptions"] = []string{"exact", "fuzzy"}
|
||||
if isFuzzy {
|
||||
ctx.Data["CodeSearchMode"] = "fuzzy"
|
||||
} else {
|
||||
ctx.Data["CodeSearchMode"] = "exact"
|
||||
}
|
||||
ctx.Data["CodeSearchOptions"] = code_indexer.CodeSearchOptions
|
||||
ctx.Data["CodeSearchMode"] = mode.String()
|
||||
ctx.Data["PageIsViewCode"] = true
|
||||
|
||||
if keyword == "" {
|
||||
|
@ -88,11 +86,11 @@ func Code(ctx *context.Context) {
|
|||
|
||||
if (len(repoIDs) > 0) || isAdmin {
|
||||
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
|
||||
RepoIDs: repoIDs,
|
||||
Keyword: keyword,
|
||||
IsKeywordFuzzy: isFuzzy,
|
||||
Language: language,
|
||||
Filename: path,
|
||||
RepoIDs: repoIDs,
|
||||
Keyword: keyword,
|
||||
Mode: mode,
|
||||
Language: language,
|
||||
Filename: path,
|
||||
Paginator: &db.ListOptions{
|
||||
Page: page,
|
||||
PageSize: setting.UI.RepoSearchPagingNum,
|
||||
|
|
|
@ -21,14 +21,14 @@ type searchMode int
|
|||
|
||||
const (
|
||||
ExactSearchMode searchMode = iota
|
||||
FuzzySearchMode
|
||||
UnionSearchMode
|
||||
RegExpSearchMode
|
||||
)
|
||||
|
||||
func searchModeFromString(s string) searchMode {
|
||||
switch s {
|
||||
case "fuzzy", "union":
|
||||
return FuzzySearchMode
|
||||
return UnionSearchMode
|
||||
case "regexp":
|
||||
return RegExpSearchMode
|
||||
default:
|
||||
|
@ -40,8 +40,8 @@ func (m searchMode) String() string {
|
|||
switch m {
|
||||
case ExactSearchMode:
|
||||
return "exact"
|
||||
case FuzzySearchMode:
|
||||
return "fuzzy"
|
||||
case UnionSearchMode:
|
||||
return "union"
|
||||
case RegExpSearchMode:
|
||||
return "regexp"
|
||||
default:
|
||||
|
@ -49,6 +49,24 @@ func (m searchMode) String() string {
|
|||
}
|
||||
}
|
||||
|
||||
// ToIndexer converts m to the code indexer's search mode.
// ExactSearchMode maps to code_indexer.SearchModeExact; every other mode,
// RegExpSearchMode included, maps to code_indexer.SearchModeUnion.
func (m searchMode) ToIndexer() code_indexer.SearchMode {
|
||||
if m == ExactSearchMode {
|
||||
return code_indexer.SearchModeExact
|
||||
}
|
||||
return code_indexer.SearchModeUnion
|
||||
}
|
||||
|
||||
// ToGitGrep converts m to the grep mode used by git.GrepSearch.
// RegExpSearchMode maps to git.RegExpGrepMode, UnionSearchMode to
// git.FixedAnyGrepMode, and any other mode to git.FixedGrepMode.
func (m searchMode) ToGitGrep() git.GrepMode {
|
||||
switch m {
|
||||
case RegExpSearchMode:
|
||||
return git.RegExpGrepMode
|
||||
|
||||
case UnionSearchMode:
|
||||
return git.FixedAnyGrepMode
|
||||
|
||||
default:
|
||||
return git.FixedGrepMode
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// Search render repository search page
|
||||
func Search(ctx *context.Context) {
|
||||
language := ctx.FormTrim("l")
|
||||
|
@ -59,7 +77,7 @@ func Search(ctx *context.Context) {
|
|||
if modeStr := ctx.FormString("mode"); len(modeStr) > 0 {
|
||||
mode = searchModeFromString(modeStr)
|
||||
} else if ctx.FormOptionalBool("fuzzy").ValueOrDefault(true) { // for backward compatibility in links
|
||||
mode = FuzzySearchMode
|
||||
mode = UnionSearchMode
|
||||
}
|
||||
|
||||
ctx.Data["Keyword"] = keyword
|
||||
|
@ -90,11 +108,11 @@ func Search(ctx *context.Context) {
|
|||
if setting.Indexer.RepoIndexerEnabled {
|
||||
var err error
|
||||
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
|
||||
RepoIDs: []int64{ctx.Repo.Repository.ID},
|
||||
Keyword: keyword,
|
||||
IsKeywordFuzzy: mode == FuzzySearchMode,
|
||||
Language: language,
|
||||
Filename: path,
|
||||
RepoIDs: []int64{ctx.Repo.Repository.ID},
|
||||
Keyword: keyword,
|
||||
Mode: mode.ToIndexer(),
|
||||
Language: language,
|
||||
Filename: path,
|
||||
Paginator: &db.ListOptions{
|
||||
Page: page,
|
||||
PageSize: setting.UI.RepoSearchPagingNum,
|
||||
|
@ -110,19 +128,12 @@ func Search(ctx *context.Context) {
|
|||
ctx.Data["CodeIndexerUnavailable"] = !code_indexer.IsAvailable(ctx)
|
||||
}
|
||||
} else {
|
||||
grepOpt := git.GrepOptions{
|
||||
res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, keyword, git.GrepOptions{
|
||||
ContextLineNumber: 1,
|
||||
RefName: ctx.Repo.RefName,
|
||||
Filename: path,
|
||||
}
|
||||
switch mode {
|
||||
case FuzzySearchMode:
|
||||
grepOpt.Mode = git.FixedAnyGrepMode
|
||||
ctx.Data["CodeSearchMode"] = "union"
|
||||
case RegExpSearchMode:
|
||||
grepOpt.Mode = git.RegExpGrepMode
|
||||
}
|
||||
res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, keyword, grepOpt)
|
||||
Mode: mode.ToGitGrep(),
|
||||
})
|
||||
if err != nil {
|
||||
ctx.ServerError("GrepSearch", err)
|
||||
return
|
||||
|
|
|
@ -41,19 +41,17 @@ func CodeSearch(ctx *context.Context) {
|
|||
keyword := ctx.FormTrim("q")
|
||||
path := ctx.FormTrim("path")
|
||||
|
||||
isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true)
|
||||
if mode := ctx.FormTrim("mode"); len(mode) > 0 {
|
||||
isFuzzy = mode == "fuzzy"
|
||||
mode := code_indexer.SearchModeExact
|
||||
if m := ctx.FormTrim("mode"); m == "union" ||
|
||||
m == "fuzzy" ||
|
||||
ctx.FormBool("fuzzy") {
|
||||
mode = code_indexer.SearchModeUnion
|
||||
}
|
||||
|
||||
ctx.Data["Keyword"] = keyword
|
||||
ctx.Data["Language"] = language
|
||||
ctx.Data["CodeSearchOptions"] = []string{"exact", "fuzzy"}
|
||||
if isFuzzy {
|
||||
ctx.Data["CodeSearchMode"] = "fuzzy"
|
||||
} else {
|
||||
ctx.Data["CodeSearchMode"] = "exact"
|
||||
}
|
||||
ctx.Data["CodeSearchOptions"] = code_indexer.CodeSearchOptions
|
||||
ctx.Data["CodeSearchMode"] = mode.String()
|
||||
ctx.Data["IsCodePage"] = true
|
||||
|
||||
if keyword == "" {
|
||||
|
@ -85,11 +83,11 @@ func CodeSearch(ctx *context.Context) {
|
|||
|
||||
if len(repoIDs) > 0 {
|
||||
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
|
||||
RepoIDs: repoIDs,
|
||||
Keyword: keyword,
|
||||
IsKeywordFuzzy: isFuzzy,
|
||||
Language: language,
|
||||
Filename: path,
|
||||
RepoIDs: repoIDs,
|
||||
Keyword: keyword,
|
||||
Mode: mode,
|
||||
Language: language,
|
||||
Filename: path,
|
||||
Paginator: &db.ListOptions{
|
||||
Page: page,
|
||||
PageSize: setting.UI.RepoSearchPagingNum,
|
||||
|
|
|
@ -82,19 +82,10 @@ func testSearchRepo(t *testing.T, indexer bool) {
|
|||
testSearch(t, "/user2/glob/search?q=loren&page=1", []string{"a.txt"}, indexer)
|
||||
testSearch(t, "/user2/glob/search?q=loren&page=1&mode=exact", []string{"a.txt"}, indexer)
|
||||
|
||||
if indexer {
|
||||
// fuzzy search: matches both file3 (x/b.txt) and file1 (a.txt)
|
||||
// when indexer is enabled
|
||||
testSearch(t, "/user2/glob/search?q=file3&mode=fuzzy&page=1", []string{"x/b.txt", "a.txt"}, indexer)
|
||||
testSearch(t, "/user2/glob/search?q=file4&mode=fuzzy&page=1", []string{"x/b.txt", "a.txt"}, indexer)
|
||||
testSearch(t, "/user2/glob/search?q=file5&mode=fuzzy&page=1", []string{"x/b.txt", "a.txt"}, indexer)
|
||||
} else {
|
||||
// fuzzy search: Union/OR of all the keywords
|
||||
// when indexer is disabled
|
||||
testSearch(t, "/user2/glob/search?q=file3+file1&mode=union&page=1", []string{"a.txt", "x/b.txt"}, indexer)
|
||||
testSearch(t, "/user2/glob/search?q=file4&mode=union&page=1", []string{}, indexer)
|
||||
testSearch(t, "/user2/glob/search?q=file5&mode=union&page=1", []string{}, indexer)
|
||||
}
|
||||
// union search: Union/OR of all the keywords
|
||||
testSearch(t, "/user2/glob/search?q=file3+file1&mode=union&page=1", []string{"a.txt", "x/b.txt"}, indexer)
|
||||
testSearch(t, "/user2/glob/search?q=file4&mode=union&page=1", []string{}, indexer)
|
||||
testSearch(t, "/user2/glob/search?q=file5&mode=union&page=1", []string{}, indexer)
|
||||
|
||||
testSearch(t, "/user2/glob/search?q=file3&page=1&mode=exact", []string{"x/b.txt"}, indexer)
|
||||
testSearch(t, "/user2/glob/search?q=file4&page=1&mode=exact", []string{}, indexer)
|
||||
|
@ -121,11 +112,11 @@ func testSearch(t *testing.T, url string, expected []string, indexer bool) {
|
|||
})
|
||||
|
||||
if indexer {
|
||||
assert.EqualValues(t, []string{"exact", "fuzzy"}, dropdownOptions)
|
||||
assert.EqualValues(t, []string{"exact", "union"}, dropdownOptions)
|
||||
} else {
|
||||
assert.EqualValues(t, []string{"exact", "union", "regexp"}, dropdownOptions)
|
||||
}
|
||||
|
||||
filenames := resultFilenames(t, doc)
|
||||
assert.EqualValues(t, expected, filenames)
|
||||
assert.ElementsMatch(t, expected, filenames)
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue