Skip to content

Commit

Permalink
deduplicate lines in files in store-field-dir
Browse files Browse the repository at this point in the history
  • Loading branch information
dogancanbakir committed May 9, 2024
1 parent 4e8cfd9 commit 7c1e113
Showing 1 changed file with 51 additions and 1 deletion.
52 changes: 51 additions & 1 deletion cmd/katana/main.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package main

import (
"bufio"
"fmt"
"math"
"os"
Expand Down Expand Up @@ -68,13 +69,62 @@ func main() {
gologger.Fatal().Msgf("could not execute crawling: %s", err)
}

// on successful execution remove the resume file in case it exists
// on successful execution:

// deduplicate the lines in each file in the store-field-dir
//use options.StoreFieldDir once https://github.com/projectdiscovery/katana/pull/877 is merged
storeFieldDir := "katana_field"
_ = deduplicateLinesInFilesInDir(storeFieldDir)

// remove the resume file in case it exists
if fileutil.FileExists(resumeFilename) {
os.Remove(resumeFilename)
}

}

// deduplicateLinesInFilesInDir walks dir recursively and runs
// deduplicateLinesInFile on every entry that is not a directory.
//
// The walk stops at the first error (either reported by the walk itself or
// returned while processing a file); that error is returned wrapped with the
// name of the directory being processed. It returns nil when every file was
// processed successfully.
func deduplicateLinesInFilesInDir(dir string) error {
	visit := func(path string, info os.FileInfo, walkErr error) error {
		switch {
		case walkErr != nil:
			// The entry could not be inspected; info must not be used here.
			return walkErr
		case info.IsDir():
			// Directories are only descended into, never rewritten.
			return nil
		default:
			return deduplicateLinesInFile(path)
		}
	}

	if walkErr := filepath.Walk(dir, visit); walkErr != nil {
		return errorutil.NewWithErr(walkErr).Msgf("error processing directory %s", dir)
	}
	return nil
}

// deduplicateLinesInFile rewrites filename in place so that every distinct
// line appears only once. The first occurrence of each line is kept and the
// original order of the surviving lines is preserved.
//
// A rewritten file always ends with a newline. A file that contains no lines
// at all (an empty file) is left untouched instead of being turned into a
// file holding a single blank line.
//
// It returns an error when the file cannot be opened, read, or written back.
func deduplicateLinesInFile(filename string) error {
	// bufio.Scanner refuses lines longer than bufio.MaxScanTokenSize (64 KiB)
	// by default; allow much longer lines so one oversized entry does not
	// abort the deduplication of the whole file.
	const maxLineSize = 16 * 1024 * 1024

	file, err := os.Open(filename)
	if err != nil {
		return errorutil.NewWithErr(err).Msgf("could not open file: %s", filename)
	}

	seenLines := make(map[string]struct{})
	var deduplicatedLines []string

	scanner := bufio.NewScanner(file)
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineSize)
	for scanner.Scan() {
		line := scanner.Text()
		if _, exists := seenLines[line]; exists {
			continue
		}
		seenLines[line] = struct{}{}
		deduplicatedLines = append(deduplicatedLines, line)
	}
	scanErr := scanner.Err()

	// Release the read handle before the same path is truncated and rewritten
	// below. The handle was only read from, so a close error carries no
	// information about lost data and is deliberately ignored.
	_ = file.Close()

	if scanErr != nil {
		return errorutil.NewWithErr(scanErr).Msgf("could not read file: %s", filename)
	}

	// Nothing was read: keep the empty file as is rather than writing "\n".
	if len(deduplicatedLines) == 0 {
		return nil
	}

	content := strings.Join(deduplicatedLines, "\n") + "\n"
	if err := os.WriteFile(filename, []byte(content), 0644); err != nil {
		return errorutil.NewWithErr(err).Msgf("could not write file: %s", filename)
	}
	return nil
}

func readFlags() (*goflags.FlagSet, error) {
flagSet := goflags.NewFlagSet()
flagSet.SetDescription(`Katana is a fast crawler focused on execution in automation
Expand Down

0 comments on commit 7c1e113

Please sign in to comment.