Skip to content

Commit

Permalink
Added -xhr-extraction option
Browse files Browse the repository at this point in the history
  • Loading branch information
aristosMiliaressis committed Jun 17, 2023
1 parent ef6fb76 commit 9b587d2
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 1 deletion.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ CONFIGURATION:
-mrs, -max-response-size int maximum response size to read (default 9223372036854775807)
-timeout int time to wait for request in seconds (default 10)
-aff, -automatic-form-fill enable automatic form filling (experimental)
-fx, -form-extraction enable extraction of form, input, textarea & select elements
-fx, -form-extraction enable extraction of form, input, textarea & select elements
-retry int number of times to retry the request (default 1)
-proxy string http/socks5 proxy to use
-H, -headers string[] custom header/cookie to include in all http request in header:value format (file)
Expand All @@ -148,6 +148,7 @@ HEADLESS:
-cdd, -chrome-data-dir string path to store chrome browser data
-scp, -system-chrome-path string use specified chrome browser for headless crawling
-noi, -no-incognito start headless chrome without incognito mode
-xhr, -xhr-extraction extract xhr requests

SCOPE:
-cs, -crawl-scope string[] in scope url regex to be followed by crawler
Expand Down Expand Up @@ -310,6 +311,7 @@ HEADLESS:
-cdd, -chrome-data-dir string path to store chrome browser data
-scp, -system-chrome-path string use specified chrome browser for headless crawling
-noi, -no-incognito start headless chrome without incognito mode
-xhr, -xhr-extraction extract xhr requests
```

*`-no-sandbox`*
Expand Down
1 change: 1 addition & 0 deletions cmd/katana/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ pipelines offering both headless and non-headless crawling.`)
flagSet.StringVarP(&options.ChromeDataDir, "chrome-data-dir", "cdd", "", "path to store chrome browser data"),
flagSet.StringVarP(&options.SystemChromePath, "system-chrome-path", "scp", "", "use specified chrome browser for headless crawling"),
flagSet.BoolVarP(&options.HeadlessNoIncognito, "no-incognito", "noi", false, "start headless chrome without incognito mode"),
flagSet.BoolVarP(&options.XhrExtraction, "xhr-extraction", "xhr", false, "extract xhr requests"),
)

flagSet.CreateGroup("scope", "Scope",
Expand Down
18 changes: 18 additions & 0 deletions pkg/engine/hybrid/crawl.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ func (c *Crawler) navigateRequest(s *common.CrawlSession, request *navigation.Re
URLPattern: "*",
RequestStage: proto.FetchRequestStageResponse,
})

xhrRequests := []navigation.XhrRequest{}
go pageRouter.Start(func(e *proto.FetchRequestPaused) error {
URL, _ := urlutil.Parse(e.Request.URL)
body, _ := FetchGetResponseBody(page, e)
Expand Down Expand Up @@ -104,6 +106,19 @@ func (c *Crawler) navigateRequest(s *common.CrawlSession, request *navigation.Re
Raw: string(rawBytesResponse),
}

if e.ResourceType == "XHR" && c.Options.Options.XhrExtraction {
xhr := navigation.XhrRequest{}
xhr.Url = URL.String()
xhr.Method = e.Request.Method
if !e.Request.Headers["Content-Type"].Nil() {
xhr.Enctype = e.Request.Headers["Content-Type"].Str()
}
if e.Request.HasPostData {
xhr.Body = e.Request.PostData
}
xhrRequests = append(xhrRequests, xhr)
}

// trim trailing /
normalizedheadlessURL := strings.TrimSuffix(e.Request.URL, "/")
matchOriginalURL := stringsutil.EqualFoldAny(request.URL, e.Request.URL, normalizedheadlessURL)
Expand Down Expand Up @@ -188,6 +203,9 @@ func (c *Crawler) navigateRequest(s *common.CrawlSession, request *navigation.Re
if err != nil {
return nil, errorutil.NewWithTag("hybrid", "could not parse html").Wrap(err)
}

response.XhrRequests = xhrRequests

return response, nil
}

Expand Down
8 changes: 8 additions & 0 deletions pkg/navigation/response.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@ type Form struct {
Parameters []string `json:"parameters,omitempty"`
}

// XhrRequest captures a single XHR request observed by the headless
// browser during crawling (populated from intercepted fetch events when
// the -xhr-extraction option is enabled).
type XhrRequest struct {
	// Url is the full request URL of the XHR call.
	Url string `json:"url,omitempty"`
	// Method is the HTTP method (GET, POST, ...) of the request.
	Method string `json:"method,omitempty"`
	// Enctype is taken from the request's Content-Type header, when present.
	Enctype string `json:"enctype,omitempty"`
	// Body is the raw POST data, set only when the request carried a body.
	Body string `json:"body,omitempty"`
}

func (h *Headers) MarshalJSON() ([]byte, error) {
hCopy := make(Headers)
for k, v := range *h {
Expand All @@ -38,6 +45,7 @@ type Response struct {
Technologies []string `json:"technologies,omitempty"`
Raw string `json:"raw,omitempty"`
Forms []Form `json:"forms,omitempty"`
XhrRequests []XhrRequest `json:"xhr_requests,omitempty"`
}

func (n Response) AbsoluteURL(path string) string {
Expand Down
2 changes: 2 additions & 0 deletions pkg/types/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ type Options struct {
ChromeDataDir string
// HeadlessNoIncognito specifies if chrome should be started without incognito mode
HeadlessNoIncognito bool
// XhrExtraction extract xhr requests
XhrExtraction bool
// HealthCheck determines if a self-healthcheck should be performed
HealthCheck bool
// ErrorLogFile specifies a file to write with the errors of all requests
Expand Down

0 comments on commit 9b587d2

Please sign in to comment.