forgejo/vendor/github.com/src-d/enry/v2/classifier.go

package enry

import (
	"math"
	"sort"

	"github.com/src-d/enry/v2/internal/tokenizer"
)

// Classifier is the interface implemented by types that detect the possible languages of the given content based on
// a set of candidates. Candidates is a map which can be used to assign weights to languages dynamically.
type Classifier interface {
	// Classify takes the content to inspect and the candidate languages with their weights, and returns the
	// resulting language names.
	Classify(content []byte, candidates map[string]float64) (languages []string)
}

// classifier holds the log-probability tables that Classify reads when scoring languages.
type classifier struct {
	// languagesLogProbabilities is keyed by language name; its value is the starting score of that language.
	languagesLogProbabilities map[string]float64
	// tokensLogProbabilities is keyed by language name, then by token.
	tokensLogProbabilities    map[string]map[string]float64
	// tokensTotal is the divisor of the fallback log(1/tokensTotal) used for tokens missing from
	// tokensLogProbabilities. NOTE(review): presumably the total token count of the training data - confirm.
	tokensTotal               float64
}

// scoredLanguage pairs a language name with the score computed for it by Classify.
type scoredLanguage struct {
	language string
	score    float64
}

// Classify returns the candidate languages ordered by decreasing score. A
// language's score is its entry in languagesLogProbabilities (zero when the
// language has no entry) plus, for non-empty content, the summed log
// probability of every token of content under that language.
//
// When candidates is empty, every language known to the classifier is scored.
// Otherwise only the candidate keys are scored, each replaced by the name
// GetLanguageByAlias reports for it when there is one. The candidate weights
// are copied into the working set but are not read when computing the score.
//
// Languages are scored in lexicographic name order and then stably sorted by
// score, so languages with equal scores are always returned in lexicographic
// order rather than in Go's randomized map iteration order.
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
	var languages map[string]float64
	if len(candidates) == 0 {
		languages = c.knownLangs()
	} else {
		languages = make(map[string]float64, len(candidates))
		for candidate, weight := range candidates {
			if lang, ok := GetLanguageByAlias(candidate); ok {
				candidate = lang
			}

			languages[candidate] = weight
		}
	}

	// Fix the scoring order: the sort below is stable, so the order in which
	// entries are appended is what decides the order of equal scores.
	names := make([]string, 0, len(languages))
	for language := range languages {
		names = append(names, language)
	}
	sort.Strings(names)

	empty := len(content) == 0

	// Tokenize once and reuse the tokens for every language.
	var tokens []string
	if !empty {
		tokens = tokenizer.Tokenize(content)
	}

	scoredLangs := make([]*scoredLanguage, 0, len(names))
	for _, language := range names {
		score := c.languagesLogProbabilities[language]
		if !empty {
			score += c.tokensLogProbability(tokens, language)
		}
		scoredLangs = append(scoredLangs, &scoredLanguage{
			language: language,
			score:    score,
		})
	}

	return sortLanguagesByScore(scoredLangs)
}

// sortLanguagesByScore stably sorts scoredLangs in place using the byScore
// ordering and returns the language names in the resulting order. The
// returned slice is never nil.
func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
	sort.Stable(byScore(scoredLangs))

	names := make([]string, len(scoredLangs))
	for i, entry := range scoredLangs {
		names[i] = entry.language
	}

	return names
}

// knownLangs returns a fresh map holding every language that has an entry in
// languagesLogProbabilities, each mapped to the value 1.
func (c *classifier) knownLangs() map[string]float64 {
	known := make(map[string]float64, len(c.languagesLogProbabilities))
	for name := range c.languagesLogProbabilities {
		known[name] = 1
	}

	return known
}

// tokensLogProbability adds up, in order, the value tokenProbability reports
// for each of tokens under language. It returns 0 when tokens is empty.
func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
	total := 0.0
	for i := range tokens {
		total += c.tokenProbability(tokens[i], language)
	}

	return total
}

// tokenProbability returns the log probability stored for token under
// language. When there is no entry for that pair (including when language
// itself is absent) it falls back to log(1/tokensTotal).
func (c *classifier) tokenProbability(token, language string) float64 {
	if logProb, found := c.tokensLogProbabilities[language][token]; found {
		return logProb
	}

	return math.Log(1.000000 / c.tokensTotal)
}

// byScore implements sort.Interface over scored languages, placing higher
// scores before lower ones.
type byScore []*scoredLanguage

// Len reports the number of entries.
func (s byScore) Len() int {
	return len(s)
}

// Swap exchanges the entries at positions i and j.
func (s byScore) Swap(i, j int) {
	s[j], s[i] = s[i], s[j]
}

// Less reports whether entry i has a strictly higher score than entry j.
func (s byScore) Less(i, j int) bool {
	return s[i].score > s[j].score
}
Language statistics bar for repositories (#8037) * Implementation for calculating language statistics Implement saving code language statistics to database Implement rendering language stats Add primary language to show in repository list Implement repository stats indexer queue Add indexer test Refactor to use queue module * Do not timeout for queues 2020-02-11 04:34:17 -05:00			`package enry`

			`import (`
			`"math"`
			`"sort"`

			`"github.com/src-d/enry/v2/internal/tokenizer"`
			`)`

			`// Classifier is the interface in charge to detect the possible languages of the given content based on a set of`
			`// candidates. Candidates is a map which can be used to assign weights to languages dynamically.`
			`type Classifier interface {`
			`Classify(content []byte, candidates map[string]float64) (languages []string)`
			`}`

			`type classifier struct {`
			`languagesLogProbabilities map[string]float64`
			`tokensLogProbabilities map[string]map[string]float64`
			`tokensTotal float64`
			`}`

			`type scoredLanguage struct {`
			`language string`
			`score float64`
			`}`

			`// Classify returns a sorted slice of possible languages sorted by decreasing language's probability`
			`func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {`

			`var languages map[string]float64`
			`if len(candidates) == 0 {`
			`languages = c.knownLangs()`
			`} else {`
			`languages = make(map[string]float64, len(candidates))`
			`for candidate, weight := range candidates {`
			`if lang, ok := GetLanguageByAlias(candidate); ok {`
			`candidate = lang`
			`}`

			`languages[candidate] = weight`
			`}`
			`}`

			`empty := len(content) == 0`
			`scoredLangs := make([]*scoredLanguage, 0, len(languages))`

			`var tokens []string`
			`if !empty {`
			`tokens = tokenizer.Tokenize(content)`
			`}`

			`for language := range languages {`
			`score := c.languagesLogProbabilities[language]`
			`if !empty {`
			`score += c.tokensLogProbability(tokens, language)`
			`}`
			`scoredLangs = append(scoredLangs, &scoredLanguage{`
			`language: language,`
			`score: score,`
			`})`
			`}`

			`return sortLanguagesByScore(scoredLangs)`
			`}`

			`func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {`
			`sort.Stable(byScore(scoredLangs))`
			`sortedLanguages := make([]string, 0, len(scoredLangs))`
			`for _, scoredLang := range scoredLangs {`
			`sortedLanguages = append(sortedLanguages, scoredLang.language)`
			`}`

			`return sortedLanguages`
			`}`

			`func (c *classifier) knownLangs() map[string]float64 {`
			`langs := make(map[string]float64, len(c.languagesLogProbabilities))`
			`for lang := range c.languagesLogProbabilities {`
			`langs[lang]++`
			`}`

			`return langs`
			`}`

			`func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {`
			`var sum float64`
			`for _, token := range tokens {`
			`sum += c.tokenProbability(token, language)`
			`}`

			`return sum`
			`}`

			`func (c *classifier) tokenProbability(token, language string) float64 {`
			`tokenProb, ok := c.tokensLogProbabilities[language][token]`
			`if !ok {`
			`tokenProb = math.Log(1.000000 / c.tokensTotal)`
			`}`

			`return tokenProb`
			`}`

			`type byScore []*scoredLanguage`

			`func (b byScore) Len() int { return len(b) }`
			`func (b byScore) Swap(i, j int) { b[i], b[j] = b[j], b[i] }`
			`func (b byScore) Less(i, j int) bool { return b[j].score < b[i].score }`