Files
infisical/cli/detect/detect.go
2023-05-18 12:04:17 -04:00

653 lines
18 KiB
Go

// MIT License
// Copyright (c) 2019 Zachary Rice
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
package detect
import (
"bufio"
"context"
"fmt"
"io"
"io/fs"
"os"
"path/filepath"
"regexp"
"strings"
"sync"
"github.com/h2non/filetype"
"github.com/Infisical/infisical-merge/config"
"github.com/Infisical/infisical-merge/detect/git"
"github.com/Infisical/infisical-merge/report"
"github.com/fatih/semgroup"
"github.com/gitleaks/go-gitdiff/gitdiff"
ahocorasick "github.com/petar-dambovaliev/aho-corasick"
"github.com/rs/zerolog/log"
"github.com/spf13/viper"
)
// Type used to differentiate between git scan types:
// $ gitleaks detect
// $ gitleaks protect
// $ gitleaks protect staged
type GitScanType int

const (
	// DetectType scans the full git history (`git log -p`).
	DetectType GitScanType = iota
	// ProtectType scans the uncommitted working-tree diff.
	ProtectType
	// ProtectStagedType scans only the staged diff.
	ProtectStagedType

	// gitleaksAllowSignature is an inline allow marker: if a matched line
	// contains this string, the finding on that line is suppressed
	// (see detectRule).
	gitleaksAllowSignature = "infisical-scan:ignore"
)
// Detector is the main detector struct
type Detector struct {
	// Config is the configuration for the detector
	Config config.Config

	// Redact is a flag to redact findings. This is exported
	// so users using gitleaks as a library can set this flag
	// without calling `detector.Start(cmd *cobra.Command)`
	Redact bool

	// Verbose is a flag to print findings as they are added
	Verbose bool

	// MaxTargetMegaBytes: fragments whose raw size (in MB) exceeds this
	// value are skipped; 0 disables the limit (see detectRule)
	MaxTargetMegaBytes int

	// FollowSymlinks is a flag to enable scanning symlink files
	FollowSymlinks bool

	// NoColor is a flag to disable color output
	NoColor bool

	// commitMap is used to keep track of commits that have been scanned.
	// This is only used for logging purposes and git scans.
	commitMap map[string]bool

	// findingMutex is to prevent concurrent access to the
	// findings slice when adding findings.
	findingMutex *sync.Mutex

	// findings is a slice of report.Findings. This is the result
	// of the detector's scan which can then be used to generate a
	// report.
	findings []report.Finding

	// prefilter is a ahocorasick struct used for doing efficient string
	// matching given a set of words (keywords from the rules in the config)
	prefilter ahocorasick.AhoCorasick

	// baseline is a list of known findings that should be ignored
	baseline []report.Finding

	// baselinePath is the path to the baseline file; after AddBaseline it is
	// stored relative to the scan source so the baseline file itself is skipped
	baselinePath string

	// gitleaksIgnore holds fingerprints loaded from a .gitleaksignore file;
	// findings with a matching fingerprint are dropped in addFinding
	gitleaksIgnore map[string]bool
}
// Fragment contains the data to be scanned
type Fragment struct {
	// Raw is the raw content of the fragment
	Raw string

	// FilePath is the path to the file if applicable
	FilePath string

	// SymlinkFile is the symlink through which the file was reached,
	// empty when the file was scanned directly
	SymlinkFile string

	// CommitSHA is the SHA of the commit if applicable
	CommitSHA string

	// newlineIndices is a list of indices of newlines in the raw content.
	// This is used to calculate the line location of a finding
	newlineIndices [][]int

	// keywords is a map of all the keywords contain within the contents
	// of this fragment (lowercased; populated by Detect for rule prefiltering)
	keywords map[string]bool
}
// NewDetector creates a new detector with the given config
func NewDetector(cfg config.Config) *Detector {
	// Build the Aho-Corasick automaton once from the config's keywords; it is
	// used to cheaply prefilter fragments before any rule regex is run.
	acBuilder := ahocorasick.NewAhoCorasickBuilder(ahocorasick.Opts{
		AsciiCaseInsensitive: true,
		MatchOnlyWholeWords:  false,
		MatchKind:            ahocorasick.LeftMostLongestMatch,
		DFA:                  true,
	})

	d := &Detector{
		Config:         cfg,
		commitMap:      map[string]bool{},
		gitleaksIgnore: map[string]bool{},
		findingMutex:   &sync.Mutex{},
		findings:       []report.Finding{},
		prefilter:      acBuilder.Build(cfg.Keywords),
	}
	return d
}
// NewDetectorDefaultConfig creates a new detector with the default config
func NewDetectorDefaultConfig() (*Detector, error) {
	// Parse the embedded default TOML configuration through viper.
	viper.SetConfigType("toml")
	if err := viper.ReadConfig(strings.NewReader(config.DefaultConfig)); err != nil {
		return nil, err
	}

	// Unmarshal into the intermediate viper representation, then translate
	// it into the runtime config used by the detector.
	var viperCfg config.ViperConfig
	if err := viper.Unmarshal(&viperCfg); err != nil {
		return nil, err
	}
	cfg, err := viperCfg.Translate()
	if err != nil {
		return nil, err
	}
	return NewDetector(cfg), nil
}
// AddGitleaksIgnore loads finding fingerprints from the .gitleaksignore file
// at gitleaksIgnorePath (one fingerprint per line) into the detector, so that
// matching findings are dropped in addFinding.
func (d *Detector) AddGitleaksIgnore(gitleaksIgnorePath string) error {
	log.Debug().Msg("found .gitleaksignore file")
	file, err := os.Open(gitleaksIgnorePath)
	if err != nil {
		return err
	}
	// https://github.com/securego/gosec/issues/512
	defer func() {
		if err := file.Close(); err != nil {
			log.Warn().Msgf("Error closing .gitleaksignore file: %s\n", err)
		}
	}()

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		d.gitleaksIgnore[scanner.Text()] = true
	}
	// Fix: surface scanner read errors. Previously they were silently
	// dropped, which could leave the ignore list partially loaded with no
	// indication anything went wrong.
	return scanner.Err()
}
// AddBaseline loads the baseline findings from baselinePath and records the
// baseline path relative to the scan source, so the baseline file itself is
// excluded from scanning. An empty baselinePath disables the baseline.
func (d *Detector) AddBaseline(baselinePath string, source string) error {
	// No baseline configured: just record the empty path and return.
	if baselinePath == "" {
		d.baselinePath = baselinePath
		return nil
	}

	// Resolve both paths to absolutes so a stable relative path can be
	// computed regardless of the working directory.
	absoluteSource, err := filepath.Abs(source)
	if err != nil {
		return err
	}
	absoluteBaseline, err := filepath.Abs(baselinePath)
	if err != nil {
		return err
	}
	relativeBaseline, err := filepath.Rel(absoluteSource, absoluteBaseline)
	if err != nil {
		return err
	}

	baseline, err := LoadBaseline(baselinePath)
	if err != nil {
		return err
	}
	d.baseline = baseline
	d.baselinePath = relativeBaseline
	return nil
}
// DetectBytes scans the given bytes and returns a list of findings
// (convenience wrapper around DetectString).
func (d *Detector) DetectBytes(content []byte) []report.Finding {
	return d.DetectString(string(content))
}
// DetectString scans the given string and returns a list of findings
// (convenience wrapper around Detect with a raw-content-only Fragment).
func (d *Detector) DetectString(content string) []report.Finding {
	return d.Detect(Fragment{
		Raw: content,
	})
}
// detectRule scans the given fragment for the given rule and returns a list of findings.
//
// The pipeline is: rule-level commit/path allowlist -> path matching ->
// size guard -> regex matching, then per match: inline allow signature,
// secret-group extraction, rule + global regex allowlists, stopwords, and
// finally an entropy threshold.
func (d *Detector) detectRule(fragment Fragment, rule config.Rule) []report.Finding {
	var findings []report.Finding

	// check if filepath or commit is allowed for this rule
	if rule.Allowlist.CommitAllowed(fragment.CommitSHA) ||
		rule.Allowlist.PathAllowed(fragment.FilePath) {
		return findings
	}

	if rule.Path != nil && rule.Regex == nil {
		// Path _only_ rule: the mere presence of a matching file path is a finding.
		if rule.Path.Match([]byte(fragment.FilePath)) {
			finding := report.Finding{
				Description: rule.Description,
				File:        fragment.FilePath,
				SymlinkFile: fragment.SymlinkFile,
				RuleID:      rule.RuleID,
				Match:       fmt.Sprintf("file detected: %s", fragment.FilePath),
				Tags:        rule.Tags,
			}
			return append(findings, finding)
		}
	} else if rule.Path != nil {
		// if path is set _and_ a regex is set, then we need to check both
		// so if the path does not match, then we should return early and not
		// consider the regex
		if !rule.Path.Match([]byte(fragment.FilePath)) {
			return findings
		}
	}

	// if path only rule, skip content checks
	if rule.Regex == nil {
		return findings
	}

	// If flag configure and raw data size bigger then the flag
	// (size is approximated as bytes / 1,000,000).
	if d.MaxTargetMegaBytes > 0 {
		rawLength := len(fragment.Raw) / 1000000
		if rawLength > d.MaxTargetMegaBytes {
			log.Debug().Msgf("skipping file: %s scan due to size: %d", fragment.FilePath, rawLength)
			return findings
		}
	}

	matchIndices := rule.Regex.FindAllStringIndex(fragment.Raw, -1)
	for _, matchIndex := range matchIndices {
		// extract secret from match
		secret := strings.Trim(fragment.Raw[matchIndex[0]:matchIndex[1]], "\n")

		// determine location of match. Note that the location
		// in the finding will be the line/column numbers of the _match_
		// not the _secret_, which will be different if the secretGroup
		// value is set for this rule
		loc := location(fragment, matchIndex)

		// NOTE(review): extends the reported line end to cover the full match
		// when the match runs past the computed line boundary — confirm
		// against the semantics of location().
		if matchIndex[1] > loc.endLineIndex {
			loc.endLineIndex = matchIndex[1]
		}

		finding := report.Finding{
			Description: rule.Description,
			File:        fragment.FilePath,
			SymlinkFile: fragment.SymlinkFile,
			RuleID:      rule.RuleID,
			StartLine:   loc.startLine,
			EndLine:     loc.endLine,
			StartColumn: loc.startColumn,
			EndColumn:   loc.endColumn,
			Secret:      secret,
			Match:       secret,
			Tags:        rule.Tags,
			Line:        fragment.Raw[loc.startLineIndex:loc.endLineIndex],
		}

		// Inline allow marker on the matched line suppresses the finding.
		if strings.Contains(fragment.Raw[loc.startLineIndex:loc.endLineIndex],
			gitleaksAllowSignature) {
			continue
		}

		// extract secret from secret group if set
		if rule.SecretGroup != 0 {
			groups := rule.Regex.FindStringSubmatch(secret)
			if len(groups) <= rule.SecretGroup || len(groups) == 0 {
				// Config validation should prevent this
				continue
			}
			secret = groups[rule.SecretGroup]
			finding.Secret = secret
		}

		// check if the regexTarget is defined in the allowlist "regexes" entry
		// (the allowlist regex can be applied to the secret, the match, or the line)
		allowlistTarget := finding.Secret
		switch rule.Allowlist.RegexTarget {
		case "match":
			allowlistTarget = finding.Match
		case "line":
			allowlistTarget = finding.Line
		}

		// same selection for the config-wide (global) allowlist
		globalAllowlistTarget := finding.Secret
		switch d.Config.Allowlist.RegexTarget {
		case "match":
			globalAllowlistTarget = finding.Match
		case "line":
			globalAllowlistTarget = finding.Line
		}
		if rule.Allowlist.RegexAllowed(allowlistTarget) ||
			d.Config.Allowlist.RegexAllowed(globalAllowlistTarget) {
			continue
		}

		// check if the secret is in the list of stopwords
		if rule.Allowlist.ContainsStopWord(finding.Secret) ||
			d.Config.Allowlist.ContainsStopWord(finding.Secret) {
			continue
		}

		// check entropy
		entropy := shannonEntropy(finding.Secret)
		finding.Entropy = float32(entropy)
		if rule.Entropy != 0.0 {
			if entropy <= rule.Entropy {
				// entropy is too low, skip this finding
				continue
			}
			// NOTE: this is a goofy hack to get around the fact there golang's regex engine
			// does not support positive lookaheads. Ideally we would want to add a
			// restriction on generic rules regex that requires the secret match group
			// contains both numbers and alphabetical characters, not just alphabetical characters.
			// What this bit of code does is check if the ruleid is prepended with "generic" and enforces the
			// secret contains both digits and alphabetical characters.
			// TODO: this should be replaced with stop words
			if strings.HasPrefix(rule.RuleID, "generic") {
				if !containsDigit(secret) {
					continue
				}
			}
		}
		findings = append(findings, finding)
	}
	return findings
}
// DetectGit accepts a source repo path, git log options, and a scan type, and
// scans the resulting git history or diff for findings. It consumes a
// *gitdiff.File channel whose contents are generated from the output of
// `git log -p ...` (or `git diff`), looking at each file (patch) in the
// history to determine if the patch contains any findings.
func (d *Detector) DetectGit(source string, logOpts string, gitScanType GitScanType) ([]report.Finding, error) {
	var (
		gitdiffFiles <-chan *gitdiff.File
		err          error
	)

	// Select the git data source based on the scan type:
	// full history, working-tree diff, or staged diff.
	switch gitScanType {
	case DetectType:
		gitdiffFiles, err = git.GitLog(source, logOpts)
		if err != nil {
			return d.findings, err
		}
	case ProtectType:
		gitdiffFiles, err = git.GitDiff(source, false)
		if err != nil {
			return d.findings, err
		}
	case ProtectStagedType:
		gitdiffFiles, err = git.GitDiff(source, true)
		if err != nil {
			return d.findings, err
		}
	}

	// Scan up to 4 patches concurrently; findings are collected via the
	// mutex-guarded addFinding.
	s := semgroup.NewGroup(context.Background(), 4)
	for gitdiffFile := range gitdiffFiles {
		gitdiffFile := gitdiffFile // capture loop variable for the goroutine below

		// skip binary files
		if gitdiffFile.IsBinary || gitdiffFile.IsDelete {
			continue
		}

		// Check if commit is allowed
		commitSHA := ""
		if gitdiffFile.PatchHeader != nil {
			commitSHA = gitdiffFile.PatchHeader.SHA
			if d.Config.Allowlist.CommitAllowed(gitdiffFile.PatchHeader.SHA) {
				continue
			}
		}
		// commitMap is written only from this (single) loop goroutine.
		d.addCommit(commitSHA)

		s.Go(func() error {
			for _, textFragment := range gitdiffFile.TextFragments {
				if textFragment == nil {
					return nil
				}

				// Only the added lines of the patch are scanned.
				fragment := Fragment{
					Raw:       textFragment.Raw(gitdiff.OpAdd),
					CommitSHA: commitSHA,
					FilePath:  gitdiffFile.NewName,
				}

				for _, finding := range d.Detect(fragment) {
					d.addFinding(augmentGitFinding(finding, textFragment, gitdiffFile))
				}
			}
			return nil
		})
	}

	if err := s.Wait(); err != nil {
		return d.findings, err
	}
	log.Info().Msgf("%d commits scanned.", len(d.commitMap))
	log.Debug().Msg("Note: this number might be smaller than expected due to commits with no additions")
	if git.ErrEncountered {
		return d.findings, fmt.Errorf("%s", "git error encountered, see logs")
	}
	return d.findings, nil
}
// scanTarget pairs a readable file path with the symlink it was reached
// through; Symlink is empty when the file was found directly.
type scanTarget struct {
	Path    string
	Symlink string
}
// DetectFiles accepts a path to a source directory or file and begins a scan of the
// file or directory. It walks the tree with one producer goroutine and scans
// files with up to 4 concurrent workers, skipping .git directories, empty
// files, and binary ("application" MIME type) files.
func (d *Detector) DetectFiles(source string) ([]report.Finding, error) {
	s := semgroup.NewGroup(context.Background(), 4)
	paths := make(chan scanTarget)

	// Producer: walk the tree and feed scan targets into the channel.
	s.Go(func() error {
		defer close(paths)
		return filepath.Walk(source,
			func(path string, fInfo os.FileInfo, err error) error {
				if err != nil {
					return err
				}
				if fInfo.Name() == ".git" && fInfo.IsDir() {
					return filepath.SkipDir
				}
				if fInfo.Size() == 0 {
					return nil
				}
				if fInfo.Mode().IsRegular() {
					paths <- scanTarget{
						Path:    path,
						Symlink: "",
					}
				}
				if fInfo.Mode().Type() == fs.ModeSymlink && d.FollowSymlinks {
					realPath, err := filepath.EvalSymlinks(path)
					if err != nil {
						return err
					}
					// Fix: the Stat error was previously discarded, which could
					// cause a nil-pointer dereference below if the symlink
					// target disappears between EvalSymlinks and Stat.
					realPathFileInfo, err := os.Stat(realPath)
					if err != nil {
						return err
					}
					if realPathFileInfo.IsDir() {
						log.Debug().Msgf("found symlinked directory: %s -> %s [skipping]", path, realPath)
						return nil
					}
					paths <- scanTarget{
						Path:    realPath,
						Symlink: path,
					}
				}
				return nil
			})
	})

	// Consumers: read each file, skip binaries, and scan the contents.
	for pa := range paths {
		p := pa // capture loop variable for the goroutine below
		s.Go(func() error {
			b, err := os.ReadFile(p.Path)
			if err != nil {
				return err
			}

			mimetype, err := filetype.Match(b)
			if err != nil {
				return err
			}
			if mimetype.MIME.Type == "application" {
				return nil // skip binary files
			}

			fragment := Fragment{
				Raw:      string(b),
				FilePath: p.Path,
			}
			if p.Symlink != "" {
				fragment.SymlinkFile = p.Symlink
			}
			for _, finding := range d.Detect(fragment) {
				// need to add 1 since line counting starts at 1
				finding.EndLine++
				finding.StartLine++
				d.addFinding(finding)
			}
			return nil
		})
	}

	if err := s.Wait(); err != nil {
		return d.findings, err
	}
	return d.findings, nil
}
// DetectReader accepts an io.Reader and a buffer size for the reader in KB,
// scanning the stream chunk by chunk. Note that findings are not redacted and
// secrets spanning a chunk boundary may be missed.
func (d *Detector) DetectReader(r io.Reader, bufSize int) ([]report.Finding, error) {
	reader := bufio.NewReader(r)
	buf := make([]byte, 0, 1000*bufSize)
	findings := []report.Finding{}

	for {
		n, err := reader.Read(buf[:cap(buf)])
		buf = buf[:n]
		// Fix: per the io.Reader contract a read may return n > 0 together
		// with io.EOF. The previous code checked the error first and broke
		// out of the loop, silently dropping the final chunk of data.
		if n > 0 {
			fragment := Fragment{
				Raw: string(buf),
			}
			for _, finding := range d.Detect(fragment) {
				findings = append(findings, finding)
				if d.Verbose {
					printFinding(finding, d.NoColor)
				}
			}
		}
		if err != nil {
			if err != io.EOF {
				return findings, err
			}
			break
		}
	}

	return findings, nil
}
// newlineRegexp is compiled once at package scope; Detect runs once per
// fragment and previously recompiled this pattern on every call.
var newlineRegexp = regexp.MustCompile("\n")

// Detect scans the given fragment and returns a list of findings. It first
// applies the global path allowlist, then uses the Aho-Corasick prefilter to
// run only the rules whose keywords actually appear in the fragment.
func (d *Detector) Detect(fragment Fragment) []report.Finding {
	var findings []report.Finding

	// initiate fragment keywords
	fragment.keywords = make(map[string]bool)

	// check if filepath is allowed (global allowlist, the config file itself,
	// and the baseline file are all skipped)
	if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
		fragment.FilePath == d.Config.Path || (d.baselinePath != "" && fragment.FilePath == d.baselinePath)) {
		return findings
	}

	// add newline indices for location calculation in detectRule
	fragment.newlineIndices = newlineRegexp.FindAllStringIndex(fragment.Raw, -1)

	// build keyword map for prefiltering rules
	normalizedRaw := strings.ToLower(fragment.Raw)
	matches := d.prefilter.FindAll(normalizedRaw)
	for _, m := range matches {
		fragment.keywords[normalizedRaw[m.Start():m.End()]] = true
	}

	for _, rule := range d.Config.Rules {
		if len(rule.Keywords) == 0 {
			// if no keywords are associated with the rule always scan the
			// fragment using the rule
			findings = append(findings, d.detectRule(fragment, rule)...)
			continue
		}
		fragmentContainsKeyword := false
		// check if keywords are in the fragment
		for _, k := range rule.Keywords {
			if _, ok := fragment.keywords[strings.ToLower(k)]; ok {
				fragmentContainsKeyword = true
				// Fix: stop after the first hit instead of scanning the
				// remaining keywords for nothing.
				break
			}
		}
		if fragmentContainsKeyword {
			findings = append(findings, d.detectRule(fragment, rule)...)
		}
	}
	return filter(findings, d.Redact)
}
// addFinding synchronously adds a finding to the findings slice, unless its
// fingerprint is listed in .gitleaksignore or it already exists in the baseline.
func (d *Detector) addFinding(finding report.Finding) {
	// Build a stable fingerprint; commit-scoped findings include the SHA.
	fingerprint := fmt.Sprintf("%s:%s:%d", finding.File, finding.RuleID, finding.StartLine)
	if finding.Commit != "" {
		fingerprint = fmt.Sprintf("%s:%s:%s:%d", finding.Commit, finding.File, finding.RuleID, finding.StartLine)
	}
	finding.Fingerprint = fingerprint

	// Drop findings explicitly ignored via .gitleaksignore.
	if d.gitleaksIgnore[finding.Fingerprint] {
		log.Debug().Msgf("ignoring finding with Fingerprint %s",
			finding.Fingerprint)
		return
	}

	// Drop findings already present in the baseline.
	if d.baseline != nil && !IsNew(finding, d.baseline) {
		log.Debug().Msgf("baseline duplicate -- ignoring finding with Fingerprint %s", finding.Fingerprint)
		return
	}

	d.findingMutex.Lock()
	defer d.findingMutex.Unlock()
	d.findings = append(d.findings, finding)
	if d.Verbose {
		printFinding(finding, d.NoColor)
	}
}
// addCommit synchronously adds a commit to the commit slice
func (d *Detector) addCommit(commit string) {
	// commitMap is only used to count scanned commits for logging.
	// NOTE(review): this write is not mutex-guarded; in this file it is only
	// called from DetectGit's single range loop (not from the spawned
	// goroutines) — confirm before calling concurrently.
	d.commitMap[commit] = true
}