Skip to content

Commit

Permalink
feat(engine): warn if overlap cancels verification
Browse files Browse the repository at this point in the history
  • Loading branch information
rgmz committed Sep 13, 2024
1 parent dc9c9a3 commit c3e49e8
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 11 deletions.
58 changes: 49 additions & 9 deletions pkg/engine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -867,12 +867,22 @@ type chunkSecretKey struct {
detectorKey ahocorasick.DetectorKey
}

func likelyDuplicate(ctx context.Context, val chunkSecretKey, dupes map[chunkSecretKey]struct{}) bool {
const similarityThreshold = 0.9
const similarityThreshold = 0.9

valStr := val.secret
func likelyDuplicate(ctx context.Context, val chunkSecretKey, dupes map[chunkSecretKey]struct{}) (bool, detectorspb.DetectorType) {
var (
detectorType detectorspb.DetectorType
valStr = val.secret
valRedacted string
)
if len(valStr) < 3 {
return false, detectorType
} else {
valRedacted = valStr[:3] + "..."
}
for dupeKey := range dupes {
dupe := dupeKey.secret
detectorType = dupeKey.detectorKey.Type()
// Avoid comparing strings of vastly different lengths.
if len(dupe)*10 < len(valStr)*9 || len(dupe)*10 > len(valStr)*11 {
continue
Expand All @@ -885,25 +895,42 @@ func likelyDuplicate(ctx context.Context, val chunkSecretKey, dupes map[chunkSec
}

if valStr == dupe {
ctx.Logger().V(2).Info(
ctx.Logger().V(1).Info(
"found exact duplicate",
"val", valRedacted,
"val_detector", val.detectorKey.Type(),
"dupe_detector", dupeKey.detectorKey.Type(),
)
return true
return true, detectorType
}

similarity := strutil.Similarity(valStr, dupe, metrics.NewLevenshtein())

// close enough
if similarity > similarityThreshold {
ctx.Logger().V(2).Info(
ctx.Logger().V(1).Info(
"found similar duplicate",
"val", valRedacted,
"val_detector", val.detectorKey.Type(),
"dupe_detector", dupeKey.detectorKey.Type(),
)
return true
return true, detectorType
}
}
return false
return false, detectorType
}

// detectorOverlapKey names a pair of detector types whose results overlapped.
// It is used as the key type of the detectorOverlaps set.
type detectorOverlapKey struct {
	// DetectorA and DetectorB are the two detector types in the pair.
	// Field order is significant for positional (unkeyed) struct literals.
	DetectorA, DetectorB detectorspb.DetectorType
}

// Equal reports whether d and other hold the same two detector types,
// treating the pair as unordered: (A, B) equals both (A, B) and (B, A).
//
// Go map lookups compare struct keys with ==, which is order-sensitive, so
// this method is not consulted when a detectorOverlapKey is used as a map key.
func (d detectorOverlapKey) Equal(other detectorOverlapKey) bool {
	// Same order.
	if d.DetectorA == other.DetectorA && d.DetectorB == other.DetectorB {
		return true
	}
	// Swapped order.
	return d.DetectorA == other.DetectorB && d.DetectorB == other.DetectorA
}

// detectorOverlaps is the set of detector pairs for which an overlap warning
// has already been logged, so each pair is reported only once.
//
// Keys are compared with ==, so (A, B) and (B, A) are stored as distinct
// entries.
//
// NOTE(review): this package-level map is read and written without any
// locking. Confirm that only one goroutine touches it at a time; otherwise
// guard it with a sync.Mutex.
var detectorOverlaps = map[detectorOverlapKey]struct{}{}

func (e *Engine) verificationOverlapWorker(ctx context.Context) {
var wgDetect sync.WaitGroup

Expand Down Expand Up @@ -962,7 +989,20 @@ func (e *Engine) verificationOverlapWorker(ctx context.Context) {
continue
}

if likelyDuplicate(ctx, key, chunkSecrets) {
if ok, t := likelyDuplicate(ctx, key, chunkSecrets); ok {
// Record the overlap between detectors.
overlapKey := detectorOverlapKey{
key.detectorKey.Type(),
t,
}
if _, ok := detectorOverlaps[overlapKey]; !ok {
detectorOverlaps[overlapKey] = struct{}{}
ctx.Logger().Info(
"WARNING: A result will not be verified because more than one detector matches. "+
"You can override this behavior by using the --allow-verification-overlap flag",
"detectors", []string{overlapKey.DetectorA.String(), overlapKey.DetectorB.String()})
}

// This indicates that the same secret was found by multiple detectors.
// We should NOT VERIFY this chunk's data.
if e.verificationOverlapTracker != nil {
Expand Down
4 changes: 2 additions & 2 deletions pkg/engine/engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -818,7 +818,7 @@ func TestLikelyDuplicate(t *testing.T) {
name: "empty strings",
val: chunkSecretKey{"", detectorA.Key},
dupes: map[chunkSecretKey]struct{}{{"", detectorB.Key}: {}},
expected: true,
expected: false,
},
{
name: "similar within threshold same detector",
Expand All @@ -833,7 +833,7 @@ func TestLikelyDuplicate(t *testing.T) {
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
ctx := context.Background()
result := likelyDuplicate(ctx, tc.val, tc.dupes)
result, _ := likelyDuplicate(ctx, tc.val, tc.dupes)
if result != tc.expected {
t.Errorf("expected %v, got %v", tc.expected, result)
}
Expand Down

0 comments on commit c3e49e8

Please sign in to comment.