From 544cbc6f01bd93858a112ada32a41080b1f36f77 Mon Sep 17 00:00:00 2001 From: Chl Date: Fri, 28 Jun 2024 05:11:57 +0000 Subject: [PATCH] Optimization of labels handling in issue_search (#4228) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR optimizes the SQL query and de-duplicate the labels' ids when generating the query string, on the issue page.
### Background Some time ago, BingBot and some other crawlers have been putting my instance on its knees with requests containing a lot of label ids, like this one : ``` [07/Aug/2023:11:28:37 +0200] "GET /Dolibarr/sendrecurringinvoicebymail/issues?q=&type=all&sort=&state=closed&labels=1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c2%2c10%2c2%2c1%2c1%2c10%2c10%2c7%2c6%2c10%2c10%2c3%2c2%2c1%2c5%2c10%2c1%2c6%2c2%2c7%2c3%2c7%2c6%2c10%2c1%2c10%2c1%2c1%2c7%2c7%2c1%2c1%2c1%2c1%2c10%2c10%2c1%2c2%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c1%2c2%2c1%2c12%2c6%2c6%2c10&milestone=0&project=-1&poster=0 HTTP/1.1" 499 0 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/103.0.5060.134 Safari/537.36" ``` Since each of the label ids implies a join, it grows exponentially expensive for the database engine (at least on PostgreSQL but SQLite suffers a little too). Thus, this PR proposes two enhancements: * rewrite the database query to use only one squashed condition, * deduplicate the label ids when generating the URL. ### Performance comparison Here are some timings on Postgresql-backed, Forgejo 7.0.4 instances : ```sh $ time curl -s -o /dev/null "http://localhost:3000/toto/tata/issues?q=&type=all&sort=&labels=19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25&state=open&milestone=0&project=0&assignee=0&poster=0" real 0m10,491s user 0m0,017s sys 0m0,008s ``` ...and with the patch: ```sh $ time curl -s -o /dev/null "http://localhost:3000/toto/tata/issues?q=&type=all&sort=&labels=19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25%2c19%2c25&state=open&milestone=0&project=0&assignee=0&poster=0" real 0m0,094s user 0m0,012s sys 0m0,013s ``` ### Annex This issue was originally proposed to [Gitea](https://github.com/go-gitea/gitea/pull/26460) but didn't get much attention, and I switched to Forgejo in the meantime :) Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/4228 Reviewed-by: Earl Warren Co-authored-by: Chl Co-committed-by: Chl --- models/issues/issue_search.go | 26 ++++++++++++++++++++++---- models/issues/issue_test.go | 13 +++++++++++++ models/issues/label.go | 35 +++++++++++++++++++++++------------ models/issues/label_test.go | 22 ++++++++++++++++++++++ 4 files changed, 80 insertions(+), 16 deletions(-) diff --git a/models/issues/issue_search.go b/models/issues/issue_search.go index c1d7d921a9..e9f116bfc6 100644 --- a/models/issues/issue_search.go +++ b/models/issues/issue_search.go @@ -6,6 +6,7 @@ package issues import ( "context" "fmt" + "strconv" "strings" "code.gitea.io/gitea/models/db" @@ -13,6 +14,7 @@ import ( repo_model "code.gitea.io/gitea/models/repo" "code.gitea.io/gitea/models/unit" user_model "code.gitea.io/gitea/models/user" + "code.gitea.io/gitea/modules/container" "code.gitea.io/gitea/modules/optional" "xorm.io/builder" @@ -116,14 +118,30 @@ func applyLabelsCondition(sess *xorm.Session, opts *IssuesOptions) { if opts.LabelIDs[0] == 0 { sess.Where("issue.id NOT IN (SELECT issue_id FROM issue_label)") } else { - for i, labelID := range opts.LabelIDs { + // deduplicate the label IDs for inclusion and exclusion + includedLabelIDs := make(container.Set[int64]) + excludedLabelIDs := make(container.Set[int64]) + for _, labelID := range opts.LabelIDs { if labelID > 0 { - sess.Join("INNER", fmt.Sprintf("issue_label il%d", i), - fmt.Sprintf("issue.id = il%[1]d.issue_id AND il%[1]d.label_id = %[2]d", i, labelID)) + includedLabelIDs.Add(labelID) } else if labelID < 0 { // 0 is not supported here, so just ignore it - sess.Where("issue.id not in (select issue_id from issue_label where label_id = ?)", -labelID) + excludedLabelIDs.Add(-labelID) } } + // ... and use them in a subquery of the form : + // where (select count(*) from issue_label where issue_id=issue.id and label_id in (2, 4, 6)) = 3 + // This equality is guaranteed thanks to unique index (issue_id,label_id) on table issue_label. + if len(includedLabelIDs) > 0 { + subQuery := builder.Select("count(*)").From("issue_label").Where(builder.Expr("issue_id = issue.id")). + And(builder.In("label_id", includedLabelIDs.Values())) + sess.Where(builder.Eq{strconv.Itoa(len(includedLabelIDs)): subQuery}) + } + // or (select count(*)...) = 0 for excluded labels + if len(excludedLabelIDs) > 0 { + subQuery := builder.Select("count(*)").From("issue_label").Where(builder.Expr("issue_id = issue.id")). + And(builder.In("label_id", excludedLabelIDs.Values())) + sess.Where(builder.Eq{"0": subQuery}) + } } } diff --git a/models/issues/issue_test.go b/models/issues/issue_test.go index 044666a3f0..bdab6bddc4 100644 --- a/models/issues/issue_test.go +++ b/models/issues/issue_test.go @@ -191,6 +191,19 @@ func TestIssues(t *testing.T) { }, []int64{}, // issues with **both** label 1 and 2, none of these issues matches, TODO: add more tests }, + { + issues_model.IssuesOptions{ + LabelIDs: []int64{-1, 2}, + }, + []int64{5}, // issue without label 1 but with label 2. + }, + { + issues_model.IssuesOptions{ + RepoCond: builder.In("repo_id", 1), + LabelIDs: []int64{0}, + }, + []int64{11, 3}, // issues without any label (ordered by creation date desc.)(note: 11 is a pull request) + }, { issues_model.IssuesOptions{ MilestoneIDs: []int64{1}, diff --git a/models/issues/label.go b/models/issues/label.go index 2397a29e35..61478e17ac 100644 --- a/models/issues/label.go +++ b/models/issues/label.go @@ -7,6 +7,7 @@ package issues import ( "context" "fmt" + "slices" "strconv" "strings" @@ -142,28 +143,38 @@ func (l *Label) CalOpenOrgIssues(ctx context.Context, repoID, labelID int64) { // LoadSelectedLabelsAfterClick calculates the set of selected labels when a label is clicked func (l *Label) LoadSelectedLabelsAfterClick(currentSelectedLabels []int64, currentSelectedExclusiveScopes []string) { - var labelQuerySlice []string + labelQuerySlice := []int64{} labelSelected := false - labelID := strconv.FormatInt(l.ID, 10) - labelScope := l.ExclusiveScope() - for i, s := range currentSelectedLabels { - if s == l.ID { + exclusiveScope := l.ExclusiveScope() + for i, curSel := range currentSelectedLabels { + if curSel == l.ID { labelSelected = true - } else if -s == l.ID { + } else if -curSel == l.ID { labelSelected = true l.IsExcluded = true - } else if s != 0 { + } else if curSel != 0 { // Exclude other labels in the same scope from selection - if s < 0 || labelScope == "" || labelScope != currentSelectedExclusiveScopes[i] { - labelQuerySlice = append(labelQuerySlice, strconv.FormatInt(s, 10)) + if curSel < 0 || exclusiveScope == "" || exclusiveScope != currentSelectedExclusiveScopes[i] { + labelQuerySlice = append(labelQuerySlice, curSel) } } } + if !labelSelected { - labelQuerySlice = append(labelQuerySlice, labelID) + labelQuerySlice = append(labelQuerySlice, l.ID) } l.IsSelected = labelSelected - l.QueryString = strings.Join(labelQuerySlice, ",") + + // Sort and deduplicate the ids to avoid the crawlers asking for the + // same thing with simply a different order of parameters + slices.Sort(labelQuerySlice) + labelQuerySlice = slices.Compact(labelQuerySlice) + // Quick conversion (strings.Join() doesn't accept slices of Int64) + labelQuerySliceStrings := make([]string, len(labelQuerySlice)) + for i, x := range labelQuerySlice { + labelQuerySliceStrings[i] = strconv.FormatInt(x, 10) + } + l.QueryString = strings.Join(labelQuerySliceStrings, ",") } // BelongsToOrg returns true if label is an organization label @@ -176,7 +187,7 @@ func (l *Label) BelongsToRepo() bool { return l.RepoID > 0 } -// Return scope substring of label name, or empty string if none exists +// ExclusiveScope returns scope substring of label name, or empty string if none exists func (l *Label) ExclusiveScope() string { if !l.Exclusive { return "" diff --git a/models/issues/label_test.go b/models/issues/label_test.go index 38e1560649..9934429748 100644 --- a/models/issues/label_test.go +++ b/models/issues/label_test.go @@ -23,6 +23,28 @@ func TestLabel_CalOpenIssues(t *testing.T) { assert.EqualValues(t, 2, label.NumOpenIssues) } +func TestLabel_LoadSelectedLabelsAfterClick(t *testing.T) { + assert.NoError(t, unittest.PrepareTestDatabase()) + // Loading the label id:8 (scope/label2) which have a scope and an + // exclusivity with id:7 (scope/label1) + label := unittest.AssertExistsAndLoadBean(t, &issues_model.Label{ID: 8}) + + // First test : with negative and scope + label.LoadSelectedLabelsAfterClick([]int64{1, -8}, []string{"", "scope"}) + assert.Equal(t, "1", label.QueryString) + assert.Equal(t, true, label.IsSelected) + + // Second test : with duplicates + label.LoadSelectedLabelsAfterClick([]int64{1, 7, 1, 7, 7}, []string{"", "scope", "", "scope", "scope"}) + assert.Equal(t, "1,8", label.QueryString) + assert.Equal(t, false, label.IsSelected) + + // Third test : empty set + label.LoadSelectedLabelsAfterClick([]int64{}, []string{}) + assert.False(t, label.IsSelected) + assert.Equal(t, "8", label.QueryString) +} + func TestLabel_ExclusiveScope(t *testing.T) { assert.NoError(t, unittest.PrepareTestDatabase()) label := unittest.AssertExistsAndLoadBean(t, &issues_model.Label{ID: 7})