mirror of
https://codeberg.org/forgejo/forgejo.git
synced 2025-01-15 16:19:04 -05:00
3c45cf8494
Move language detection to a separate module to be more reusable. Add option to disable vendored file exclusion from file search. Always show all language stats for search.
423 lines
12 KiB
Go
423 lines
12 KiB
Go
// Copyright 2019 The Gitea Authors. All rights reserved.
|
|
// Use of this source code is governed by a MIT-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package code
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"code.gitea.io/gitea/models"
|
|
"code.gitea.io/gitea/modules/analyze"
|
|
"code.gitea.io/gitea/modules/base"
|
|
"code.gitea.io/gitea/modules/charset"
|
|
"code.gitea.io/gitea/modules/git"
|
|
"code.gitea.io/gitea/modules/log"
|
|
"code.gitea.io/gitea/modules/setting"
|
|
"code.gitea.io/gitea/modules/timeutil"
|
|
|
|
"github.com/blevesearch/bleve"
|
|
analyzer_custom "github.com/blevesearch/bleve/analysis/analyzer/custom"
|
|
analyzer_keyword "github.com/blevesearch/bleve/analysis/analyzer/keyword"
|
|
"github.com/blevesearch/bleve/analysis/token/lowercase"
|
|
"github.com/blevesearch/bleve/analysis/token/unicodenorm"
|
|
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
|
|
"github.com/blevesearch/bleve/index/upsidedown"
|
|
"github.com/blevesearch/bleve/mapping"
|
|
"github.com/blevesearch/bleve/search/query"
|
|
"github.com/ethantkoenig/rupture"
|
|
"github.com/src-d/enry/v2"
|
|
)
|
|
|
|
const (
	// unicodeNormalizeName is the name under which the unicode normalization
	// token filter is registered on an index mapping.
	unicodeNormalizeName = "unicodeNormalize"

	// maxBatchSize is the batch size handed to rupture.NewFlushingBatch.
	maxBatchSize = 16
)
|
|
|
|
// indexerID returns a bleve-compatible unique identifier for an integer id:
// the id rendered as a base-36 string.
func indexerID(id int64) string {
	const base = 36
	return strconv.FormatInt(id, base)
}
|
|
|
|
// numericEqualityQuery a numeric equality query for the given value and field
|
|
func numericEqualityQuery(value int64, field string) *query.NumericRangeQuery {
|
|
f := float64(value)
|
|
tru := true
|
|
q := bleve.NewNumericRangeInclusiveQuery(&f, &f, &tru, &tru)
|
|
q.SetField(field)
|
|
return q
|
|
}
|
|
|
|
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
|
|
return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]interface{}{
|
|
"type": unicodenorm.Name,
|
|
"form": unicodenorm.NFC,
|
|
})
|
|
}
|
|
|
|
// openIndexer open the index at the specified path, checking for metadata
|
|
// updates and bleve version updates. If index needs to be created (or
|
|
// re-created), returns (nil, nil)
|
|
func openIndexer(path string, latestVersion int) (bleve.Index, error) {
|
|
_, err := os.Stat(path)
|
|
if err != nil && os.IsNotExist(err) {
|
|
return nil, nil
|
|
} else if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
metadata, err := rupture.ReadIndexMetadata(path)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if metadata.Version < latestVersion {
|
|
// the indexer is using a previous version, so we should delete it and
|
|
// re-populate
|
|
return nil, os.RemoveAll(path)
|
|
}
|
|
|
|
index, err := bleve.Open(path)
|
|
if err != nil && err == upsidedown.IncompatibleVersion {
|
|
// the indexer was built with a previous version of bleve, so we should
|
|
// delete it and re-populate
|
|
return nil, os.RemoveAll(path)
|
|
} else if err != nil {
|
|
return nil, err
|
|
}
|
|
return index, nil
|
|
}
|
|
|
|
// RepoIndexerData is the document stored in the repo indexer for one file.
type RepoIndexerData struct {
	// RepoID is the ID of the repository the file belongs to.
	RepoID int64
	// CommitID is the commit SHA the file was indexed at.
	CommitID string
	// Content is the file content converted to UTF-8.
	Content string
	// Language is the detected code language of the file.
	Language string
	// UpdatedAt is the UTC time at which the document was indexed.
	UpdatedAt time.Time
}
|
|
|
|
// Type returns the document type, for bleve's mapping.Classifier interface.
|
|
func (d *RepoIndexerData) Type() string {
|
|
return repoIndexerDocType
|
|
}
|
|
|
|
func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
|
|
// Ignore vendored files in code search
|
|
if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
|
|
return nil
|
|
}
|
|
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
|
|
RunInDir(repo.RepoPath())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
|
|
return fmt.Errorf("Misformatted git cat-file output: %v", err)
|
|
} else if int64(size) > setting.Indexer.MaxIndexerFileSize {
|
|
return addDelete(update.Filename, repo, batch)
|
|
}
|
|
|
|
fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
|
|
RunInDirBytes(repo.RepoPath())
|
|
if err != nil {
|
|
return err
|
|
} else if !base.IsTextFile(fileContents) {
|
|
// FIXME: UTF-16 files will probably fail here
|
|
return nil
|
|
}
|
|
|
|
id := filenameIndexerID(repo.ID, update.Filename)
|
|
return batch.Index(id, &RepoIndexerData{
|
|
RepoID: repo.ID,
|
|
CommitID: commitSha,
|
|
Content: string(charset.ToUTF8DropErrors(fileContents)),
|
|
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
|
|
UpdatedAt: time.Now().UTC(),
|
|
})
|
|
}
|
|
|
|
func addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
|
|
id := filenameIndexerID(repo.ID, filename)
|
|
return batch.Delete(id)
|
|
}
|
|
|
|
const (
	// repoIndexerLatestVersion is the metadata version written for newly
	// created indexes; an existing index with an older version is removed
	// and rebuilt.
	repoIndexerLatestVersion = 5

	// repoIndexerDocType is the document type name under which
	// RepoIndexerData documents are mapped.
	repoIndexerDocType = "repoIndexerDocType"

	// repoIndexerAnalyzer is the name of the custom analyzer used as the
	// index default and for content phrase queries.
	repoIndexerAnalyzer = "repoIndexerAnalyzer"
)
|
|
|
|
// createRepoIndexer create a repo indexer if one does not already exist
|
|
func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
|
|
docMapping := bleve.NewDocumentMapping()
|
|
numericFieldMapping := bleve.NewNumericFieldMapping()
|
|
numericFieldMapping.IncludeInAll = false
|
|
docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
|
|
|
|
textFieldMapping := bleve.NewTextFieldMapping()
|
|
textFieldMapping.IncludeInAll = false
|
|
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
|
|
|
|
termFieldMapping := bleve.NewTextFieldMapping()
|
|
termFieldMapping.IncludeInAll = false
|
|
termFieldMapping.Analyzer = analyzer_keyword.Name
|
|
docMapping.AddFieldMappingsAt("Language", termFieldMapping)
|
|
docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
|
|
|
|
timeFieldMapping := bleve.NewDateTimeFieldMapping()
|
|
timeFieldMapping.IncludeInAll = false
|
|
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
|
|
|
|
mapping := bleve.NewIndexMapping()
|
|
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
|
|
return nil, err
|
|
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]interface{}{
|
|
"type": analyzer_custom.Name,
|
|
"char_filters": []string{},
|
|
"tokenizer": unicode.Name,
|
|
"token_filters": []string{unicodeNormalizeName, lowercase.Name},
|
|
}); err != nil {
|
|
return nil, err
|
|
}
|
|
mapping.DefaultAnalyzer = repoIndexerAnalyzer
|
|
mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
|
|
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
|
|
|
|
indexer, err := bleve.New(path, mapping)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err = rupture.WriteIndexMetadata(path, &rupture.IndexMetadata{
|
|
Version: latestVersion,
|
|
}); err != nil {
|
|
return nil, err
|
|
}
|
|
return indexer, nil
|
|
}
|
|
|
|
func filenameIndexerID(repoID int64, filename string) string {
|
|
return indexerID(repoID) + "_" + filename
|
|
}
|
|
|
|
func filenameOfIndexerID(indexerID string) string {
|
|
index := strings.IndexByte(indexerID, '_')
|
|
if index == -1 {
|
|
log.Error("Unexpected ID in repo indexer: %s", indexerID)
|
|
}
|
|
return indexerID[index+1:]
|
|
}
|
|
|
|
var (
|
|
_ Indexer = &BleveIndexer{}
|
|
)
|
|
|
|
// BleveIndexer represents a bleve indexer implementation
|
|
type BleveIndexer struct {
|
|
indexDir string
|
|
indexer bleve.Index
|
|
}
|
|
|
|
// NewBleveIndexer creates a new bleve local indexer
|
|
func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) {
|
|
indexer := &BleveIndexer{
|
|
indexDir: indexDir,
|
|
}
|
|
created, err := indexer.init()
|
|
return indexer, created, err
|
|
}
|
|
|
|
// init init the indexer
|
|
func (b *BleveIndexer) init() (bool, error) {
|
|
var err error
|
|
b.indexer, err = openIndexer(b.indexDir, repoIndexerLatestVersion)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
if b.indexer != nil {
|
|
return false, nil
|
|
}
|
|
|
|
b.indexer, err = createRepoIndexer(b.indexDir, repoIndexerLatestVersion)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
|
|
return true, nil
|
|
}
|
|
|
|
// Close close the indexer
|
|
func (b *BleveIndexer) Close() {
|
|
log.Debug("Closing repo indexer")
|
|
if b.indexer != nil {
|
|
err := b.indexer.Close()
|
|
if err != nil {
|
|
log.Error("Error whilst closing the repository indexer: %v", err)
|
|
}
|
|
}
|
|
log.Info("PID: %d Repository Indexer closed", os.Getpid())
|
|
}
|
|
|
|
// Index indexes the data
|
|
func (b *BleveIndexer) Index(repoID int64) error {
|
|
repo, err := models.GetRepositoryByID(repoID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
sha, err := getDefaultBranchSha(repo)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
changes, err := getRepoChanges(repo, sha)
|
|
if err != nil {
|
|
return err
|
|
} else if changes == nil {
|
|
return nil
|
|
}
|
|
|
|
batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
|
|
for _, update := range changes.Updates {
|
|
if err := addUpdate(sha, update, repo, batch); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
for _, filename := range changes.RemovedFilenames {
|
|
if err := addDelete(filename, repo, batch); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
if err = batch.Flush(); err != nil {
|
|
return err
|
|
}
|
|
return repo.UpdateIndexerStatus(models.RepoIndexerTypeCode, sha)
|
|
}
|
|
|
|
// Delete deletes indexes by ids
|
|
func (b *BleveIndexer) Delete(repoID int64) error {
|
|
query := numericEqualityQuery(repoID, "RepoID")
|
|
searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false)
|
|
result, err := b.indexer.Search(searchRequest)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
|
|
for _, hit := range result.Hits {
|
|
if err = batch.Delete(hit.ID); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return batch.Flush()
|
|
}
|
|
|
|
// Search searches for files in the specified repo.
|
|
// Returns the matching file-paths
|
|
func (b *BleveIndexer) Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
|
|
phraseQuery := bleve.NewMatchPhraseQuery(keyword)
|
|
phraseQuery.FieldVal = "Content"
|
|
phraseQuery.Analyzer = repoIndexerAnalyzer
|
|
|
|
var indexerQuery query.Query
|
|
if len(repoIDs) > 0 {
|
|
var repoQueries = make([]query.Query, 0, len(repoIDs))
|
|
for _, repoID := range repoIDs {
|
|
repoQueries = append(repoQueries, numericEqualityQuery(repoID, "RepoID"))
|
|
}
|
|
|
|
indexerQuery = bleve.NewConjunctionQuery(
|
|
bleve.NewDisjunctionQuery(repoQueries...),
|
|
phraseQuery,
|
|
)
|
|
} else {
|
|
indexerQuery = phraseQuery
|
|
}
|
|
|
|
// Save for reuse without language filter
|
|
facetQuery := indexerQuery
|
|
if len(language) > 0 {
|
|
languageQuery := bleve.NewMatchQuery(language)
|
|
languageQuery.FieldVal = "Language"
|
|
languageQuery.Analyzer = analyzer_keyword.Name
|
|
|
|
indexerQuery = bleve.NewConjunctionQuery(
|
|
indexerQuery,
|
|
languageQuery,
|
|
)
|
|
}
|
|
|
|
from := (page - 1) * pageSize
|
|
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
|
|
searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
|
|
searchRequest.IncludeLocations = true
|
|
|
|
if len(language) == 0 {
|
|
searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
|
|
}
|
|
|
|
result, err := b.indexer.Search(searchRequest)
|
|
if err != nil {
|
|
return 0, nil, nil, err
|
|
}
|
|
|
|
total := int64(result.Total)
|
|
|
|
searchResults := make([]*SearchResult, len(result.Hits))
|
|
for i, hit := range result.Hits {
|
|
var startIndex, endIndex int = -1, -1
|
|
for _, locations := range hit.Locations["Content"] {
|
|
location := locations[0]
|
|
locationStart := int(location.Start)
|
|
locationEnd := int(location.End)
|
|
if startIndex < 0 || locationStart < startIndex {
|
|
startIndex = locationStart
|
|
}
|
|
if endIndex < 0 || locationEnd > endIndex {
|
|
endIndex = locationEnd
|
|
}
|
|
}
|
|
language := hit.Fields["Language"].(string)
|
|
var updatedUnix timeutil.TimeStamp
|
|
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
|
|
updatedUnix = timeutil.TimeStamp(t.Unix())
|
|
}
|
|
searchResults[i] = &SearchResult{
|
|
RepoID: int64(hit.Fields["RepoID"].(float64)),
|
|
StartIndex: startIndex,
|
|
EndIndex: endIndex,
|
|
Filename: filenameOfIndexerID(hit.ID),
|
|
Content: hit.Fields["Content"].(string),
|
|
CommitID: hit.Fields["CommitID"].(string),
|
|
UpdatedUnix: updatedUnix,
|
|
Language: language,
|
|
Color: enry.GetColor(language),
|
|
}
|
|
}
|
|
|
|
searchResultLanguages := make([]*SearchResultLanguages, 0, 10)
|
|
if len(language) > 0 {
|
|
// Use separate query to go get all language counts
|
|
facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
|
|
facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
|
|
facetRequest.IncludeLocations = true
|
|
facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
|
|
|
|
if result, err = b.indexer.Search(facetRequest); err != nil {
|
|
return 0, nil, nil, err
|
|
}
|
|
|
|
}
|
|
languagesFacet := result.Facets["languages"]
|
|
for _, term := range languagesFacet.Terms {
|
|
if len(term.Term) == 0 {
|
|
continue
|
|
}
|
|
searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{
|
|
Language: term.Term,
|
|
Color: enry.GetColor(term.Term),
|
|
Count: term.Count,
|
|
})
|
|
}
|
|
return total, searchResults, searchResultLanguages, nil
|
|
}
|