forgejo/modules/git/repo_language_stats.go

// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package git

import (
	"bytes"
	"cmp"
	"io"
	"strings"
	"unicode"

	"code.gitea.io/gitea/modules/analyze"
	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/optional"

	"github.com/go-enry/go-enry/v2"
)

const (
	// fileSizeLimit caps how many bytes of a blob are read into memory for
	// content-based detection; anything beyond it is discarded unread.
	fileSizeLimit int64 = 16 * 1024   // 16 KiB
	// bigFileSize is the largest blob size for which content is read at all;
	// larger files are classified from their file name only.
	bigFileSize   int64 = 1024 * 1024 // 1 MiB
)

// mergeLanguageStats merges language names that differ only by letter case and
// sums their sizes under a single key.
//
// For every case-insensitive group the spelling with the most upper case
// letters is used as the resulting key. If several spellings have the same
// number of upper case letters, the lexicographically smallest one wins, so the
// result never depends on map iteration order.
//
// The input map is not modified; a new map is always returned (empty for an
// empty or nil input).
func mergeLanguageStats(stats map[string]int64) map[string]int64 {
	type canonical struct {
		uniqueName string
		upperCount int
	}
	// names maps the lower-cased language name to the spelling chosen for it.
	names := make(map[string]canonical, len(stats))

	countUpper := func(s string) (count int) {
		for _, r := range s {
			if unicode.IsUpper(r) {
				count++
			}
		}
		return count
	}

	for name := range stats {
		cnt := countUpper(name)
		lower := strings.ToLower(name)
		best, seen := names[lower]
		// Replace the current choice only on a strictly better upper case
		// count, or on a tie with a smaller name. Using ">=" here would let
		// the random map iteration order decide ties.
		if !seen || cnt > best.upperCount || (cnt == best.upperCount && name < best.uniqueName) {
			names[lower] = canonical{uniqueName: name, upperCount: cnt}
		}
	}

	res := make(map[string]int64, len(names))
	for name, num := range stats {
		res[names[strings.ToLower(name)].uniqueName] += num
	}
	return res
}

// GetLanguageStats calculates language stats for git repository at specified commit.
//
// It lists every tree entry of the commit recursively and attributes each
// file's size to a language. The returned map is keyed by language name with
// the summed file sizes as values; names differing only by case are merged.
//
// Per-file gitattributes are honoured:
//   - linguist-language / gitlab-language force the language, and such files
//     are always counted;
//   - linguist-detectable=false, linguist-vendored, linguist-documentation and
//     linguist-generated exclude a file; setting one of the latter three to
//     false disables the matching built-in heuristic instead.
//
// By default only programming and markup languages are counted. If nothing was
// counted at all, the first otherwise excluded programming or markup language
// is reported as a fallback.
//
// If repo.Ctx is cancelled while iterating, the sizes collected so far are
// returned together with the context error.
func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) {
	// We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary.
	// so let's create a batch stdin and stdout
	batchStdinWriter, batchReader, cancel, err := repo.CatFileBatch(repo.Ctx)
	if err != nil {
		return nil, err
	}
	defer cancel()

	// writeID requests one object from the batch process.
	writeID := func(id string) error {
		_, err := batchStdinWriter.Write([]byte(id + "\n"))
		return err
	}

	if err := writeID(commitID); err != nil {
		return nil, err
	}
	shaBytes, typ, size, err := ReadBatchLine(batchReader)
	if typ != "commit" {
		log.Debug("Unable to get commit for: %s. Err: %v", commitID, err)
		return nil, ErrNotExist{commitID, ""}
	}

	sha, err := NewIDFromString(string(shaBytes))
	if err != nil {
		log.Debug("Unable to get commit for: %s. Err: %v", commitID, err)
		return nil, ErrNotExist{commitID, ""}
	}

	commit, err := CommitFromReader(repo, sha, io.LimitReader(batchReader, size))
	if err != nil {
		log.Debug("Unable to get commit for: %s. Err: %v", commitID, err)
		return nil, err
	}
	// Skip the line feed that cat-file --batch emits after the object contents.
	if _, err = batchReader.Discard(1); err != nil {
		return nil, err
	}

	tree := commit.Tree

	entries, err := tree.ListEntriesRecursiveWithSize()
	if err != nil {
		return nil, err
	}

	checker, err := repo.GitAttributeChecker(commitID, LinguistAttributes...)
	if err != nil {
		return nil, err
	}
	defer checker.Close()

	// contentBuf is reused for every blob to avoid reallocating per file.
	contentBuf := bytes.Buffer{}
	var content []byte

	// sizes contains the current calculated size of all files by language
	sizes := make(map[string]int64)
	// by default we will only count the sizes of programming languages or markup languages
	// unless they are explicitly set using linguist-language
	includedLanguage := map[string]bool{}
	// or if there's only one language in the repository
	firstExcludedLanguage := ""
	firstExcludedLanguageSize := int64(0)

	// isTrue reports whether the attribute was explicitly set to true.
	isTrue := func(v optional.Option[bool]) bool {
		return v.ValueOrDefault(false)
	}
	// isFalse reports whether the attribute was explicitly set to false.
	isFalse := func(v optional.Option[bool]) bool {
		return !v.ValueOrDefault(true)
	}

	for _, f := range entries {
		select {
		case <-repo.Ctx.Done():
			return sizes, repo.Ctx.Err()
		default:
		}

		contentBuf.Reset()
		content = contentBuf.Bytes()

		if f.Size() == 0 {
			continue
		}

		isVendored := optional.None[bool]()
		isGenerated := optional.None[bool]()
		isDocumentation := optional.None[bool]()
		isDetectable := optional.None[bool]()

		// A failing attribute lookup is not fatal: the file is then treated
		// as having no attributes set.
		attrs, err := checker.CheckPath(f.Name())
		if err == nil {
			isVendored = attrs["linguist-vendored"].Bool()
			isGenerated = attrs["linguist-generated"].Bool()
			isDocumentation = attrs["linguist-documentation"].Bool()
			isDetectable = attrs["linguist-detectable"].Bool()
			if language := cmp.Or(
				attrs["linguist-language"].String(),
				attrs["gitlab-language"].Prefix(),
			); language != "" {
				// group languages, such as Pug -> HTML; SCSS -> CSS
				group := enry.GetLanguageGroup(language)
				if len(group) != 0 {
					language = group
				}

				// this language will always be added to the size
				sizes[language] += f.Size()
				continue
			}
		}

		// Exclusions that can be decided without reading the blob. An explicit
		// attribute always wins over the file name heuristic: "true" excludes
		// the file, "false" disables the corresponding heuristic.
		if isFalse(isDetectable) || isTrue(isVendored) || isTrue(isDocumentation) || isTrue(isGenerated) ||
			(!isFalse(isVendored) && analyze.IsVendor(f.Name())) ||
			enry.IsDotFile(f.Name()) ||
			enry.IsConfiguration(f.Name()) ||
			(!isFalse(isDocumentation) && enry.IsDocumentation(f.Name())) {
			continue
		}

		// If content can not be read or file is too big just do detection by filename

		if f.Size() <= bigFileSize {
			if err := writeID(f.ID.String()); err != nil {
				return nil, err
			}
			_, _, size, err := ReadBatchLine(batchReader)
			if err != nil {
				log.Debug("Error reading blob: %s Err: %v", f.ID.String(), err)
				return nil, err
			}

			// Read at most fileSizeLimit bytes; the rest of the blob plus the
			// trailing line feed must still be consumed to keep the batch
			// stream aligned for the next object.
			sizeToRead := size
			discard := int64(1)
			if size > fileSizeLimit {
				sizeToRead = fileSizeLimit
				discard = size - fileSizeLimit + 1
			}

			_, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead))
			if err != nil {
				return nil, err
			}
			content = contentBuf.Bytes()
			if err := DiscardFull(batchReader, discard); err != nil {
				return nil, err
			}
		}
		// Files explicitly marked linguist-generated were skipped above; only
		// run the content heuristic when the attribute does not explicitly
		// declare the file as not generated.
		if !isFalse(isGenerated) && enry.IsGenerated(f.Name(), content) {
			continue
		}

		// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
		// - eg. do the all the detection tests using filename first before reading content.
		language := analyze.GetCodeLanguage(f.Name(), content)
		if language == "" {
			continue
		}

		// group languages, such as Pug -> HTML; SCSS -> CSS
		group := enry.GetLanguageGroup(language)
		if group != "" {
			language = group
		}

		// The inclusion decision is cached per language, so it is taken from
		// the first file encountered for that language.
		included, checked := includedLanguage[language]
		langType := enry.GetLanguageType(language)
		if !checked {
			included = langType == enry.Programming || langType == enry.Markup
			if !included && (isTrue(isDetectable) || (langType == enry.Prose && isFalse(isDocumentation))) {
				included = true
			}
			includedLanguage[language] = included
		}
		if included {
			sizes[language] += f.Size()
		} else if len(sizes) == 0 && (firstExcludedLanguage == "" || firstExcludedLanguage == language) {
			// Only consider Programming or Markup languages as fallback
			if !(langType == enry.Programming || langType == enry.Markup) {
				continue
			}
			firstExcludedLanguage = language
			firstExcludedLanguageSize += f.Size()
		}
	}

	// If there are no included languages add the first excluded language
	if len(sizes) == 0 && firstExcludedLanguage != "" {
		sizes[firstExcludedLanguage] = firstExcludedLanguageSize
	}

	return mergeLanguageStats(sizes), nil
}
Language statistics bar for repositories (#8037) * Implementation for calculating language statistics Impement saving code language statistics to database Implement rendering langauge stats Add primary laguage to show in repository list Implement repository stats indexer queue Add indexer test Refactor to use queue module * Do not timeout for queues 2020-02-11 10:34:17 +01:00			`// Copyright 2020 The Gitea Authors. All rights reserved.`
Implement FSFE REUSE for golang files (#21840) Change all license headers to comply with REUSE specification. Fix #16132 Co-authored-by: flynnnnnnnnnn <flynnnnnnnnnn@github> Co-authored-by: John Olheiser <john.olheiser@gmail.com> 2022-11-27 19:20:29 +01:00			`// SPDX-License-Identifier: MIT`
Language statistics bar for repositories (#8037) * Implementation for calculating language statistics Impement saving code language statistics to database Implement rendering langauge stats Add primary laguage to show in repository list Implement repository stats indexer queue Add indexer test Refactor to use queue module * Do not timeout for queues 2020-02-11 10:34:17 +01:00
			`package git`

Merge different languages for language stats (#24900) Fix #24896 If users set different languages by `linguist-language`, the `stats` map could be: `java: 100, Java: 200`. Language stats are stored as case-insensitive in database and there is a unique key. So, the different language names should be merged to one unique name: `Java: 300` 2023-05-24 21:37:36 +02:00			`import (`
[CHORE] Drop `go-git` support See https://codeberg.org/forgejo/discussions/issues/164 for the rationale and discussion of this change. Everything related to the `go-git` dependency is dropped (Only a single instance is left in a test file to test for an XSS, it requires crafting an commit that Git itself refuses to craft). `_gogit` files have been removed entirely, `go:build: !gogit` is removed, `XXX_nogogit.go` files either have been renamed or had their code being merged into the `XXX.go` file. 2024-08-12 17:16:55 +02:00			`"bytes"`
			`"cmp"`
			`"io"`
Merge different languages for language stats (#24900) Fix #24896 If users set different languages by `linguist-language`, the `stats` map could be: `java: 100, Java: 200`. Language stats are stored as case-insensitive in database and there is a unique key. So, the different language names should be merged to one unique name: `Java: 300` 2023-05-24 21:37:36 +02:00			`"strings"`
			`"unicode"`
[CHORE] Drop `go-git` support See https://codeberg.org/forgejo/discussions/issues/164 for the rationale and discussion of this change. Everything related to the `go-git` dependency is dropped (Only a single instance is left in a test file to test for an XSS, it requires crafting an commit that Git itself refuses to craft). `_gogit` files have been removed entirely, `go:build: !gogit` is removed, `XXX_nogogit.go` files either have been renamed or had their code being merged into the `XXX.go` file. 2024-08-12 17:16:55 +02:00
			`"code.gitea.io/gitea/modules/analyze"`
			`"code.gitea.io/gitea/modules/log"`
			`"code.gitea.io/gitea/modules/optional"`

			`"github.com/go-enry/go-enry/v2"`
Merge different languages for language stats (#24900) Fix #24896 If users set different languages by `linguist-language`, the `stats` map could be: `java: 100, Java: 200`. Language stats are stored as case-insensitive in database and there is a unique key. So, the different language names should be merged to one unique name: `Java: 300` 2023-05-24 21:37:36 +02:00			`)`

format with gofumpt (#18184) * gofumpt -w -l . * gofumpt -w -l -extra . * Add linter * manual fix * change make fmt 2022-01-20 18:46:10 +01:00			`const (`
			`fileSizeLimit int64 = 16 * 1024 // 16 KiB`
			`bigFileSize int64 = 1024 * 1024 // 1 MiB`
			`)`
Merge different languages for language stats (#24900) Fix #24896 If users set different languages by `linguist-language`, the `stats` map could be: `java: 100, Java: 200`. Language stats are stored as case-insensitive in database and there is a unique key. So, the different language names should be merged to one unique name: `Java: 300` 2023-05-24 21:37:36 +02:00
			`// mergeLanguageStats mergers language names with different cases. The name with most upper case letters is used.`
			`func mergeLanguageStats(stats map[string]int64) map[string]int64 {`
			`names := map[string]struct {`
			`uniqueName string`
			`upperCount int`
			`}{}`

			`countUpper := func(s string) (count int) {`
			`for _, r := range s {`
			`if unicode.IsUpper(r) {`
			`count++`
			`}`
			`}`
			`return count`
			`}`

			`for name := range stats {`
			`cnt := countUpper(name)`
			`lower := strings.ToLower(name)`
			`if cnt >= names[lower].upperCount {`
			`names[lower] = struct {`
			`uniqueName string`
			`upperCount int`
			`}{uniqueName: name, upperCount: cnt}`
			`}`
			`}`

			`res := make(map[string]int64, len(names))`
			`for name, num := range stats {`
			`res[names[strings.ToLower(name)].uniqueName] += num`
			`}`
			`return res`
			`}`
[CHORE] Drop `go-git` support See https://codeberg.org/forgejo/discussions/issues/164 for the rationale and discussion of this change. Everything related to the `go-git` dependency is dropped (Only a single instance is left in a test file to test for an XSS, it requires crafting an commit that Git itself refuses to craft). `_gogit` files have been removed entirely, `go:build: !gogit` is removed, `XXX_nogogit.go` files either have been renamed or had their code being merged into the `XXX.go` file. 2024-08-12 17:16:55 +02:00
			`// GetLanguageStats calculates language stats for git repository at specified commit`
			`func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) {`
			`// We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary.`
			`// so let's create a batch stdin and stdout`
[PORT] Refactor the usage of batch catfile (gitea#31754) When opening a repository, it will call `ensureValidRepository` and also `CatFileBatch`. But sometimes these will not be used until repository closed. So it's a waste of CPU to invoke 3 times git command for every open repository. This PR removed all of these from `OpenRepository` but only kept checking whether the folder exists. When a batch is necessary, the necessary functions will be invoked. --- Conflict resolution: Because of the removal of go-git in (#4941) `_nogogit.go` files were either renamed or merged into the 'common' file. Git does handle the renames correctly, but for those that were merged has to be manually copied pasted over. The patch looks the same, 201 additions 90 deletions as the original patch. (cherry picked from commit c03baab678ba5b2e9d974aea147e660417f5d3f7) 2024-08-20 19:04:57 +02:00			`batchStdinWriter, batchReader, cancel, err := repo.CatFileBatch(repo.Ctx)`
			`if err != nil {`
			`return nil, err`
			`}`
[CHORE] Drop `go-git` support See https://codeberg.org/forgejo/discussions/issues/164 for the rationale and discussion of this change. Everything related to the `go-git` dependency is dropped (Only a single instance is left in a test file to test for an XSS, it requires crafting an commit that Git itself refuses to craft). `_gogit` files have been removed entirely, `go:build: !gogit` is removed, `XXX_nogogit.go` files either have been renamed or had their code being merged into the `XXX.go` file. 2024-08-12 17:16:55 +02:00			`defer cancel()`

			`writeID := func(id string) error {`
			`_, err := batchStdinWriter.Write([]byte(id + "\n"))`
			`return err`
			`}`

			`if err := writeID(commitID); err != nil {`
			`return nil, err`
			`}`
			`shaBytes, typ, size, err := ReadBatchLine(batchReader)`
			`if typ != "commit" {`
			`log.Debug("Unable to get commit for: %s. Err: %v", commitID, err)`
			`return nil, ErrNotExist{commitID, ""}`
			`}`

			`sha, err := NewIDFromString(string(shaBytes))`
			`if err != nil {`
			`log.Debug("Unable to get commit for: %s. Err: %v", commitID, err)`
			`return nil, ErrNotExist{commitID, ""}`
			`}`

			`commit, err := CommitFromReader(repo, sha, io.LimitReader(batchReader, size))`
			`if err != nil {`
			`log.Debug("Unable to get commit for: %s. Err: %v", commitID, err)`
			`return nil, err`
			`}`
			`if _, err = batchReader.Discard(1); err != nil {`
			`return nil, err`
			`}`

			`tree := commit.Tree`

			`entries, err := tree.ListEntriesRecursiveWithSize()`
			`if err != nil {`
			`return nil, err`
			`}`

			`checker, err := repo.GitAttributeChecker(commitID, LinguistAttributes...)`
			`if err != nil {`
			`return nil, err`
			`}`
			`defer checker.Close()`

			`contentBuf := bytes.Buffer{}`
			`var content []byte`

			`// sizes contains the current calculated size of all files by language`
			`sizes := make(map[string]int64)`
			`// by default we will only count the sizes of programming languages or markup languages`
			`// unless they are explicitly set using linguist-language`
			`includedLanguage := map[string]bool{}`
			`// or if there's only one language in the repository`
			`firstExcludedLanguage := ""`
			`firstExcludedLanguageSize := int64(0)`

			`isTrue := func(v optional.Option[bool]) bool {`
			`return v.ValueOrDefault(false)`
			`}`
			`isFalse := func(v optional.Option[bool]) bool {`
			`return !v.ValueOrDefault(true)`
			`}`

			`for _, f := range entries {`
			`select {`
			`case <-repo.Ctx.Done():`
			`return sizes, repo.Ctx.Err()`
			`default:`
			`}`

			`contentBuf.Reset()`
			`content = contentBuf.Bytes()`

			`if f.Size() == 0 {`
			`continue`
			`}`

			`isVendored := optional.None[bool]()`
			`isGenerated := optional.None[bool]()`
			`isDocumentation := optional.None[bool]()`
			`isDetectable := optional.None[bool]()`

			`attrs, err := checker.CheckPath(f.Name())`
			`if err == nil {`
			`isVendored = attrs["linguist-vendored"].Bool()`
			`isGenerated = attrs["linguist-generated"].Bool()`
			`isDocumentation = attrs["linguist-documentation"].Bool()`
			`isDetectable = attrs["linguist-detectable"].Bool()`
			`if language := cmp.Or(`
			`attrs["linguist-language"].String(),`
			`attrs["gitlab-language"].Prefix(),`
			`); language != "" {`
			`// group languages, such as Pug -> HTML; SCSS -> CSS`
			`group := enry.GetLanguageGroup(language)`
			`if len(group) != 0 {`
			`language = group`
			`}`

			`// this language will always be added to the size`
			`sizes[language] += f.Size()`
			`continue`
			`}`
			`}`

			`if isFalse(isDetectable) \|\| isTrue(isVendored) \|\| isTrue(isDocumentation) \|\|`
			`(!isFalse(isVendored) && analyze.IsVendor(f.Name())) \|\|`
			`enry.IsDotFile(f.Name()) \|\|`
			`enry.IsConfiguration(f.Name()) \|\|`
			`(!isFalse(isDocumentation) && enry.IsDocumentation(f.Name())) {`
			`continue`
			`}`

			`// If content can not be read or file is too big just do detection by filename`

			`if f.Size() <= bigFileSize {`
			`if err := writeID(f.ID.String()); err != nil {`
			`return nil, err`
			`}`
			`_, _, size, err := ReadBatchLine(batchReader)`
			`if err != nil {`
			`log.Debug("Error reading blob: %s Err: %v", f.ID.String(), err)`
			`return nil, err`
			`}`

			`sizeToRead := size`
			`discard := int64(1)`
			`if size > fileSizeLimit {`
			`sizeToRead = fileSizeLimit`
			`discard = size - fileSizeLimit + 1`
			`}`

			`_, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead))`
			`if err != nil {`
			`return nil, err`
			`}`
			`content = contentBuf.Bytes()`
			`if err := DiscardFull(batchReader, discard); err != nil {`
			`return nil, err`
			`}`
			`}`
			`if !isTrue(isGenerated) && enry.IsGenerated(f.Name(), content) {`
			`continue`
			`}`

			`// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?`
			`// - eg. do the all the detection tests using filename first before reading content.`
			`language := analyze.GetCodeLanguage(f.Name(), content)`
			`if language == "" {`
			`continue`
			`}`

			`// group languages, such as Pug -> HTML; SCSS -> CSS`
			`group := enry.GetLanguageGroup(language)`
			`if group != "" {`
			`language = group`
			`}`

			`included, checked := includedLanguage[language]`
			`langType := enry.GetLanguageType(language)`
			`if !checked {`
			`included = langType == enry.Programming \|\| langType == enry.Markup`
			`if !included && (isTrue(isDetectable) \|\| (langType == enry.Prose && isFalse(isDocumentation))) {`
			`included = true`
			`}`
			`includedLanguage[language] = included`
			`}`
			`if included {`
			`sizes[language] += f.Size()`
			`} else if len(sizes) == 0 && (firstExcludedLanguage == "" \|\| firstExcludedLanguage == language) {`
			`// Only consider Programming or Markup languages as fallback`
			`if !(langType == enry.Programming \|\| langType == enry.Markup) {`
			`continue`
			`}`
			`firstExcludedLanguage = language`
			`firstExcludedLanguageSize += f.Size()`
			`}`
			`}`

			`// If there are no included languages add the first excluded language`
			`if len(sizes) == 0 && firstExcludedLanguage != "" {`
			`sizes[firstExcludedLanguage] = firstExcludedLanguageSize`
			`}`

			`return mergeLanguageStats(sizes), nil`
			`}`