Files
infisical/cli/detect/detect.go
2023-05-18 12:04:17 -04:00

653 lines
18 KiB
Go

// MIT License
// Copyright (c) 2019 Zachary Rice
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
package detect
import (
"bufio"
"context"
"fmt"
"io"
"io/fs"
"os"
"path/filepath"
"regexp"
"strings"
"sync"
"github.com/h2non/filetype"
"github.com/Infisical/infisical-merge/config"
"github.com/Infisical/infisical-merge/detect/git"
"github.com/Infisical/infisical-merge/report"
"github.com/fatih/semgroup"
"github.com/gitleaks/go-gitdiff/gitdiff"
ahocorasick "github.com/petar-dambovaliev/aho-corasick"
"github.com/rs/zerolog/log"
"github.com/spf13/viper"
)
// Type used to differentiate between git scan types:
// $ gitleaks detect
// $ gitleaks protect
// $ gitleaks protect staged
type GitScanType int

const (
	// DetectType scans the full git history (`git log -p`).
	DetectType GitScanType = iota
	// ProtectType scans the uncommitted working-tree diff.
	ProtectType
	// ProtectStagedType scans only the staged diff.
	ProtectStagedType

	// gitleaksAllowSignature is an inline allow marker: if a matched line
	// contains this string, the finding on that line is suppressed
	// (see detectRule).
	gitleaksAllowSignature = "infisical-scan:ignore"
)
// Detector is the main detector struct
type Detector struct {
	// Config is the configuration for the detector
	Config config.Config

	// Redact is a flag to redact findings. This is exported
	// so users using gitleaks as a library can set this flag
	// without calling `detector.Start(cmd *cobra.Command)`
	Redact bool

	// Verbose is a flag to print findings as they are added
	Verbose bool

	// MaxTargetMegaBytes: fragments whose raw size (in MB) exceeds this
	// value are skipped; 0 disables the limit (see detectRule)
	MaxTargetMegaBytes int

	// FollowSymlinks is a flag to enable scanning symlink files
	FollowSymlinks bool

	// NoColor is a flag to disable color output
	NoColor bool

	// commitMap is used to keep track of commits that have been scanned.
	// This is only used for logging purposes and git scans.
	commitMap map[string]bool

	// findingMutex is to prevent concurrent access to the
	// findings slice when adding findings.
	findingMutex *sync.Mutex

	// findings is a slice of report.Findings. This is the result
	// of the detector's scan which can then be used to generate a
	// report.
	findings []report.Finding

	// prefilter is a ahocorasick struct used for doing efficient string
	// matching given a set of words (keywords from the rules in the config)
	prefilter ahocorasick.AhoCorasick

	// baseline is a list of known findings that should be ignored
	baseline []report.Finding

	// baselinePath is the path to the baseline file; after AddBaseline it is
	// stored relative to the scan source so the baseline file itself is skipped
	baselinePath string

	// gitleaksIgnore holds fingerprints loaded from a .gitleaksignore file;
	// findings with a matching fingerprint are dropped in addFinding
	gitleaksIgnore map[string]bool
}
// Fragment contains the data to be scanned
type Fragment struct {
	// Raw is the raw content of the fragment
	Raw string

	// FilePath is the path to the file if applicable
	FilePath string

	// SymlinkFile is the symlink through which the file was reached,
	// empty when the file was scanned directly
	SymlinkFile string

	// CommitSHA is the SHA of the commit if applicable
	CommitSHA string

	// newlineIndices is a list of indices of newlines in the raw content.
	// This is used to calculate the line location of a finding
	newlineIndices [][]int

	// keywords is a map of all the keywords contain within the contents
	// of this fragment (lowercased; populated by Detect for rule prefiltering)
	keywords map[string]bool
}
// NewDetector creates a new detector with the given config
func NewDetector(cfg config.Config) *Detector {
	// Build the Aho-Corasick automaton once from the config's keywords; it is
	// used to cheaply prefilter fragments before any rule regex is run.
	acBuilder := ahocorasick.NewAhoCorasickBuilder(ahocorasick.Opts{
		AsciiCaseInsensitive: true,
		MatchOnlyWholeWords:  false,
		MatchKind:            ahocorasick.LeftMostLongestMatch,
		DFA:                  true,
	})

	d := &Detector{
		Config:         cfg,
		commitMap:      map[string]bool{},
		gitleaksIgnore: map[string]bool{},
		findingMutex:   &sync.Mutex{},
		findings:       []report.Finding{},
		prefilter:      acBuilder.Build(cfg.Keywords),
	}
	return d
}
// NewDetectorDefaultConfig creates a new detector with the default config
func NewDetectorDefaultConfig() (*Detector, error) {
	// Parse the embedded default TOML configuration through viper.
	viper.SetConfigType("toml")
	if err := viper.ReadConfig(strings.NewReader(config.DefaultConfig)); err != nil {
		return nil, err
	}

	// Unmarshal into the intermediate viper representation, then translate
	// it into the runtime config used by the detector.
	var viperCfg config.ViperConfig
	if err := viper.Unmarshal(&viperCfg); err != nil {
		return nil, err
	}
	cfg, err := viperCfg.Translate()
	if err != nil {
		return nil, err
	}
	return NewDetector(cfg), nil
}
// AddGitleaksIgnore loads finding fingerprints from the .gitleaksignore file
// at gitleaksIgnorePath (one fingerprint per line) into the detector, so that
// matching findings are dropped in addFinding.
func (d *Detector) AddGitleaksIgnore(gitleaksIgnorePath string) error {
	log.Debug().Msg("found .gitleaksignore file")
	file, err := os.Open(gitleaksIgnorePath)
	if err != nil {
		return err
	}
	// https://github.com/securego/gosec/issues/512
	defer func() {
		if err := file.Close(); err != nil {
			log.Warn().Msgf("Error closing .gitleaksignore file: %s\n", err)
		}
	}()

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		d.gitleaksIgnore[scanner.Text()] = true
	}
	// Fix: surface scanner read errors. Previously they were silently
	// dropped, which could leave the ignore list partially loaded with no
	// indication anything went wrong.
	return scanner.Err()
}
// AddBaseline loads the baseline findings from baselinePath and records the
// baseline path relative to the scan source, so the baseline file itself is
// excluded from scanning. An empty baselinePath disables the baseline.
func (d *Detector) AddBaseline(baselinePath string, source string) error {
	// No baseline configured: just record the empty path and return.
	if baselinePath == "" {
		d.baselinePath = baselinePath
		return nil
	}

	// Resolve both paths to absolutes so a stable relative path can be
	// computed regardless of the working directory.
	absoluteSource, err := filepath.Abs(source)
	if err != nil {
		return err
	}
	absoluteBaseline, err := filepath.Abs(baselinePath)
	if err != nil {
		return err
	}
	relativeBaseline, err := filepath.Rel(absoluteSource, absoluteBaseline)
	if err != nil {
		return err
	}

	baseline, err := LoadBaseline(baselinePath)
	if err != nil {
		return err
	}
	d.baseline = baseline
	d.baselinePath = relativeBaseline
	return nil
}
// DetectBytes scans the given bytes and returns a list of findings
// (convenience wrapper around DetectString).
func (d *Detector) DetectBytes(content []byte) []report.Finding {
	return d.DetectString(string(content))
}
// DetectString scans the given string and returns a list of findings
// (convenience wrapper around Detect with a raw-content-only Fragment).
func (d *Detector) DetectString(content string) []report.Finding {
	return d.Detect(Fragment{
		Raw: content,
	})
}
// detectRule scans the given fragment for the given rule and returns a list of findings.
//
// The pipeline is: rule-level commit/path allowlist -> path matching ->
// size guard -> regex matching, then per match: inline allow signature,
// secret-group extraction, rule + global regex allowlists, stopwords, and
// finally an entropy threshold.
func (d *Detector) detectRule(fragment Fragment, rule config.Rule) []report.Finding {
	var findings []report.Finding

	// check if filepath or commit is allowed for this rule
	if rule.Allowlist.CommitAllowed(fragment.CommitSHA) ||
		rule.Allowlist.PathAllowed(fragment.FilePath) {
		return findings
	}

	if rule.Path != nil && rule.Regex == nil {
		// Path _only_ rule: the mere presence of a matching file path is a finding.
		if rule.Path.Match([]byte(fragment.FilePath)) {
			finding := report.Finding{
				Description: rule.Description,
				File:        fragment.FilePath,
				SymlinkFile: fragment.SymlinkFile,
				RuleID:      rule.RuleID,
				Match:       fmt.Sprintf("file detected: %s", fragment.FilePath),
				Tags:        rule.Tags,
			}
			return append(findings, finding)
		}
	} else if rule.Path != nil {
		// if path is set _and_ a regex is set, then we need to check both
		// so if the path does not match, then we should return early and not
		// consider the regex
		if !rule.Path.Match([]byte(fragment.FilePath)) {
			return findings
		}
	}

	// if path only rule, skip content checks
	if rule.Regex == nil {
		return findings
	}

	// If flag configure and raw data size bigger then the flag
	// (size is approximated as bytes / 1,000,000).
	if d.MaxTargetMegaBytes > 0 {
		rawLength := len(fragment.Raw) / 1000000
		if rawLength > d.MaxTargetMegaBytes {
			log.Debug().Msgf("skipping file: %s scan due to size: %d", fragment.FilePath, rawLength)
			return findings
		}
	}

	matchIndices := rule.Regex.FindAllStringIndex(fragment.Raw, -1)
	for _, matchIndex := range matchIndices {
		// extract secret from match
		secret := strings.Trim(fragment.Raw[matchIndex[0]:matchIndex[1]], "\n")

		// determine location of match. Note that the location
		// in the finding will be the line/column numbers of the _match_
		// not the _secret_, which will be different if the secretGroup
		// value is set for this rule
		loc := location(fragment, matchIndex)

		// NOTE(review): extends the reported line end to cover the full match
		// when the match runs past the computed line boundary — confirm
		// against the semantics of location().
		if matchIndex[1] > loc.endLineIndex {
			loc.endLineIndex = matchIndex[1]
		}

		finding := report.Finding{
			Description: rule.Description,
			File:        fragment.FilePath,
			SymlinkFile: fragment.SymlinkFile,
			RuleID:      rule.RuleID,
			StartLine:   loc.startLine,
			EndLine:     loc.endLine,
			StartColumn: loc.startColumn,
			EndColumn:   loc.endColumn,
			Secret:      secret,
			Match:       secret,
			Tags:        rule.Tags,
			Line:        fragment.Raw[loc.startLineIndex:loc.endLineIndex],
		}

		// Inline allow marker on the matched line suppresses the finding.
		if strings.Contains(fragment.Raw[loc.startLineIndex:loc.endLineIndex],
			gitleaksAllowSignature) {
			continue
		}

		// extract secret from secret group if set
		if rule.SecretGroup != 0 {
			groups := rule.Regex.FindStringSubmatch(secret)
			if len(groups) <= rule.SecretGroup || len(groups) == 0 {
				// Config validation should prevent this
				continue
			}
			secret = groups[rule.SecretGroup]
			finding.Secret = secret
		}

		// check if the regexTarget is defined in the allowlist "regexes" entry
		// (the allowlist regex can be applied to the secret, the match, or the line)
		allowlistTarget := finding.Secret
		switch rule.Allowlist.RegexTarget {
		case "match":
			allowlistTarget = finding.Match
		case "line":
			allowlistTarget = finding.Line
		}

		// same selection for the config-wide (global) allowlist
		globalAllowlistTarget := finding.Secret
		switch d.Config.Allowlist.RegexTarget {
		case "match":
			globalAllowlistTarget = finding.Match
		case "line":
			globalAllowlistTarget = finding.Line
		}
		if rule.Allowlist.RegexAllowed(allowlistTarget) ||
			d.Config.Allowlist.RegexAllowed(globalAllowlistTarget) {
			continue
		}

		// check if the secret is in the list of stopwords
		if rule.Allowlist.ContainsStopWord(finding.Secret) ||
			d.Config.Allowlist.ContainsStopWord(finding.Secret) {
			continue
		}

		// check entropy
		entropy := shannonEntropy(finding.Secret)
		finding.Entropy = float32(entropy)
		if rule.Entropy != 0.0 {
			if entropy <= rule.Entropy {
				// entropy is too low, skip this finding
				continue
			}
			// NOTE: this is a goofy hack to get around the fact there golang's regex engine
			// does not support positive lookaheads. Ideally we would want to add a
			// restriction on generic rules regex that requires the secret match group
			// contains both numbers and alphabetical characters, not just alphabetical characters.
			// What this bit of code does is check if the ruleid is prepended with "generic" and enforces the
			// secret contains both digits and alphabetical characters.
			// TODO: this should be replaced with stop words
			if strings.HasPrefix(rule.RuleID, "generic") {
				if !containsDigit(secret) {
					continue
				}
			}
		}
		findings = append(findings, finding)
	}
	return findings
}
// DetectGit accepts a source repo path, git log options, and a scan type, and
// scans the resulting git history or diff for findings. It consumes a
// *gitdiff.File channel whose contents are generated from the output of
// `git log -p ...` (or `git diff`), looking at each file (patch) in the
// history to determine if the patch contains any findings.
func (d *Detector) DetectGit(source string, logOpts string, gitScanType GitScanType) ([]report.Finding, error) {
	var (
		gitdiffFiles <-chan *gitdiff.File
		err          error
	)

	// Select the git data source based on the scan type:
	// full history, working-tree diff, or staged diff.
	switch gitScanType {
	case DetectType:
		gitdiffFiles, err = git.GitLog(source, logOpts)
		if err != nil {
			return d.findings, err
		}
	case ProtectType:
		gitdiffFiles, err = git.GitDiff(source, false)
		if err != nil {
			return d.findings, err
		}
	case ProtectStagedType:
		gitdiffFiles, err = git.GitDiff(source, true)
		if err != nil {
			return d.findings, err
		}
	}

	// Scan up to 4 patches concurrently; findings are collected via the
	// mutex-guarded addFinding.
	s := semgroup.NewGroup(context.Background(), 4)
	for gitdiffFile := range gitdiffFiles {
		gitdiffFile := gitdiffFile // capture loop variable for the goroutine below

		// skip binary files
		if gitdiffFile.IsBinary || gitdiffFile.IsDelete {
			continue
		}

		// Check if commit is allowed
		commitSHA := ""
		if gitdiffFile.PatchHeader != nil {
			commitSHA = gitdiffFile.PatchHeader.SHA
			if d.Config.Allowlist.CommitAllowed(gitdiffFile.PatchHeader.SHA) {
				continue
			}
		}
		// commitMap is written only from this (single) loop goroutine.
		d.addCommit(commitSHA)

		s.Go(func() error {
			for _, textFragment := range gitdiffFile.TextFragments {
				if textFragment == nil {
					return nil
				}

				// Only the added lines of the patch are scanned.
				fragment := Fragment{
					Raw:       textFragment.Raw(gitdiff.OpAdd),
					CommitSHA: commitSHA,
					FilePath:  gitdiffFile.NewName,
				}

				for _, finding := range d.Detect(fragment) {
					d.addFinding(augmentGitFinding(finding, textFragment, gitdiffFile))
				}
			}
			return nil
		})
	}

	if err := s.Wait(); err != nil {
		return d.findings, err
	}
	log.Info().Msgf("%d commits scanned.", len(d.commitMap))
	log.Debug().Msg("Note: this number might be smaller than expected due to commits with no additions")
	if git.ErrEncountered {
		return d.findings, fmt.Errorf("%s", "git error encountered, see logs")
	}
	return d.findings, nil
}
// scanTarget pairs a readable file path with the symlink it was reached
// through; Symlink is empty when the file was found directly.
type scanTarget struct {
	Path    string
	Symlink string
}
// DetectFiles accepts a path to a source directory or file and begins a scan of the
// file or directory. It walks the tree with one producer goroutine and scans
// files with up to 4 concurrent workers, skipping .git directories, empty
// files, and binary ("application" MIME type) files.
func (d *Detector) DetectFiles(source string) ([]report.Finding, error) {
	s := semgroup.NewGroup(context.Background(), 4)
	paths := make(chan scanTarget)

	// Producer: walk the tree and feed scan targets into the channel.
	s.Go(func() error {
		defer close(paths)
		return filepath.Walk(source,
			func(path string, fInfo os.FileInfo, err error) error {
				if err != nil {
					return err
				}
				if fInfo.Name() == ".git" && fInfo.IsDir() {
					return filepath.SkipDir
				}
				if fInfo.Size() == 0 {
					return nil
				}
				if fInfo.Mode().IsRegular() {
					paths <- scanTarget{
						Path:    path,
						Symlink: "",
					}
				}
				if fInfo.Mode().Type() == fs.ModeSymlink && d.FollowSymlinks {
					realPath, err := filepath.EvalSymlinks(path)
					if err != nil {
						return err
					}
					// Fix: the Stat error was previously discarded, which could
					// cause a nil-pointer dereference below if the symlink
					// target disappears between EvalSymlinks and Stat.
					realPathFileInfo, err := os.Stat(realPath)
					if err != nil {
						return err
					}
					if realPathFileInfo.IsDir() {
						log.Debug().Msgf("found symlinked directory: %s -> %s [skipping]", path, realPath)
						return nil
					}
					paths <- scanTarget{
						Path:    realPath,
						Symlink: path,
					}
				}
				return nil
			})
	})

	// Consumers: read each file, skip binaries, and scan the contents.
	for pa := range paths {
		p := pa // capture loop variable for the goroutine below
		s.Go(func() error {
			b, err := os.ReadFile(p.Path)
			if err != nil {
				return err
			}

			mimetype, err := filetype.Match(b)
			if err != nil {
				return err
			}
			if mimetype.MIME.Type == "application" {
				return nil // skip binary files
			}

			fragment := Fragment{
				Raw:      string(b),
				FilePath: p.Path,
			}
			if p.Symlink != "" {
				fragment.SymlinkFile = p.Symlink
			}
			for _, finding := range d.Detect(fragment) {
				// need to add 1 since line counting starts at 1
				finding.EndLine++
				finding.StartLine++
				d.addFinding(finding)
			}
			return nil
		})
	}

	if err := s.Wait(); err != nil {
		return d.findings, err
	}
	return d.findings, nil
}
// DetectReader accepts an io.Reader and a buffer size for the reader in KB,
// scanning the stream chunk by chunk. Note that findings are not redacted and
// secrets spanning a chunk boundary may be missed.
func (d *Detector) DetectReader(r io.Reader, bufSize int) ([]report.Finding, error) {
	reader := bufio.NewReader(r)
	buf := make([]byte, 0, 1000*bufSize)
	findings := []report.Finding{}

	for {
		n, err := reader.Read(buf[:cap(buf)])
		buf = buf[:n]
		// Fix: per the io.Reader contract a read may return n > 0 together
		// with io.EOF. The previous code checked the error first and broke
		// out of the loop, silently dropping the final chunk of data.
		if n > 0 {
			fragment := Fragment{
				Raw: string(buf),
			}
			for _, finding := range d.Detect(fragment) {
				findings = append(findings, finding)
				if d.Verbose {
					printFinding(finding, d.NoColor)
				}
			}
		}
		if err != nil {
			if err != io.EOF {
				return findings, err
			}
			break
		}
	}

	return findings, nil
}
// newlineRegexp is compiled once at package scope; Detect runs once per
// fragment and previously recompiled this pattern on every call.
var newlineRegexp = regexp.MustCompile("\n")

// Detect scans the given fragment and returns a list of findings. It first
// applies the global path allowlist, then uses the Aho-Corasick prefilter to
// run only the rules whose keywords actually appear in the fragment.
func (d *Detector) Detect(fragment Fragment) []report.Finding {
	var findings []report.Finding

	// initiate fragment keywords
	fragment.keywords = make(map[string]bool)

	// check if filepath is allowed (global allowlist, the config file itself,
	// and the baseline file are all skipped)
	if fragment.FilePath != "" && (d.Config.Allowlist.PathAllowed(fragment.FilePath) ||
		fragment.FilePath == d.Config.Path || (d.baselinePath != "" && fragment.FilePath == d.baselinePath)) {
		return findings
	}

	// add newline indices for location calculation in detectRule
	fragment.newlineIndices = newlineRegexp.FindAllStringIndex(fragment.Raw, -1)

	// build keyword map for prefiltering rules
	normalizedRaw := strings.ToLower(fragment.Raw)
	matches := d.prefilter.FindAll(normalizedRaw)
	for _, m := range matches {
		fragment.keywords[normalizedRaw[m.Start():m.End()]] = true
	}

	for _, rule := range d.Config.Rules {
		if len(rule.Keywords) == 0 {
			// if no keywords are associated with the rule always scan the
			// fragment using the rule
			findings = append(findings, d.detectRule(fragment, rule)...)
			continue
		}
		fragmentContainsKeyword := false
		// check if keywords are in the fragment
		for _, k := range rule.Keywords {
			if _, ok := fragment.keywords[strings.ToLower(k)]; ok {
				fragmentContainsKeyword = true
				// Fix: stop after the first hit instead of scanning the
				// remaining keywords for nothing.
				break
			}
		}
		if fragmentContainsKeyword {
			findings = append(findings, d.detectRule(fragment, rule)...)
		}
	}
	return filter(findings, d.Redact)
}
// addFinding synchronously adds a finding to the findings slice, unless its
// fingerprint is listed in .gitleaksignore or it already exists in the baseline.
func (d *Detector) addFinding(finding report.Finding) {
	// Build a stable fingerprint; commit-scoped findings include the SHA.
	fingerprint := fmt.Sprintf("%s:%s:%d", finding.File, finding.RuleID, finding.StartLine)
	if finding.Commit != "" {
		fingerprint = fmt.Sprintf("%s:%s:%s:%d", finding.Commit, finding.File, finding.RuleID, finding.StartLine)
	}
	finding.Fingerprint = fingerprint

	// Drop findings explicitly ignored via .gitleaksignore.
	if d.gitleaksIgnore[finding.Fingerprint] {
		log.Debug().Msgf("ignoring finding with Fingerprint %s",
			finding.Fingerprint)
		return
	}

	// Drop findings already present in the baseline.
	if d.baseline != nil && !IsNew(finding, d.baseline) {
		log.Debug().Msgf("baseline duplicate -- ignoring finding with Fingerprint %s", finding.Fingerprint)
		return
	}

	d.findingMutex.Lock()
	defer d.findingMutex.Unlock()
	d.findings = append(d.findings, finding)
	if d.Verbose {
		printFinding(finding, d.NoColor)
	}
}
// addCommit synchronously adds a commit to the commit slice
func (d *Detector) addCommit(commit string) {
	// commitMap is only used to count scanned commits for logging.
	// NOTE(review): this write is not mutex-guarded; in this file it is only
	// called from DetectGit's single range loop (not from the spawned
	// goroutines) — confirm before calling concurrently.
	d.commitMap[commit] = true
}