WBot is a configurable, thread-safe web crawler that provides a minimal interface for crawling and downloading web pages.
- API
- Examples
  - Install
  - Example: default options
## API

The core structure and interfaces of WBot are defined in the `pkg/api` package.
```go
// Crawler methods
New(opts ...Option) *WBot
// ...
Run(links ...string) error
OnReponse(fn func(*wbot.Response))
Metrics() map[string]int64
Shutdown()

// Request and URL helpers
(r *Request) ResolveURL(u string) (*url.URL, error)
(u *ParsedURL) String() string
FindLinks(body []byte) (hrefs []string)
NewURL(raw string) (*ParsedURL, error)
Hostname(link string) (string, error)

// Pluggable components
Fetcher interface {
	Fetch(ctx context.Context, req *Request) (*Response, error)
	Close() error
}

Store interface {
	HasVisited(ctx context.Context, u *ParsedURL) (bool, error)
	Close() error
}

Queue interface {
	Push(ctx context.Context, req *Request) error
	Pop(ctx context.Context) (*Request, error)
	Len() int32
	Close() error
}

MetricsMonitor interface {
	IncTotalRequests()
	IncSuccessfulRequests()
	IncFailedRequests()
	IncTotalLink()
	IncCrawledLink()
	IncSkippedLink()
	IncDuplicatedLink()
	Metrics() map[string]int64
}
```
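
The `Fetcher`, `Store`, `Queue`, and `MetricsMonitor` interfaces suggest that each component can be swapped out. As an illustration only, below is a minimal sketch of an in-memory `Store`. The exact semantics of `HasVisited` (whether it also marks the URL as visited) and the mechanism for plugging a custom store into `wbot.New` are not documented on this page, so both are assumptions here.

```go
package store

import (
	"context"
	"sync"

	"github.com/twiny/wbot/pkg/api"
)

// MemoryStore is a sketch of an in-memory deduplication store intended to
// satisfy the api.Store interface shown above.
type MemoryStore struct {
	mu      sync.Mutex
	visited map[string]struct{}
}

// NewMemoryStore returns an empty store.
func NewMemoryStore() *MemoryStore {
	return &MemoryStore{visited: make(map[string]struct{})}
}

// HasVisited reports whether the URL has been seen before and records it.
// Treating the call as "check and mark" is an assumption; adjust to match
// the crawler's actual contract.
func (s *MemoryStore) HasVisited(_ context.Context, u *api.ParsedURL) (bool, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	key := u.String()
	if _, seen := s.visited[key]; seen {
		return true, nil
	}
	s.visited[key] = struct{}{}
	return false, nil
}

// Close is a no-op for an in-memory map.
func (s *MemoryStore) Close() error { return nil }
```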
## Examples

### Install

Requires Go 1.22:

```sh
go get github.com/twiny/wbot
```
### Example: default options

```go
package main

import (
	"fmt"
	"log"

	"github.com/rs/zerolog"

	"github.com/twiny/wbot"
	"github.com/twiny/wbot/pkg/api"
)

func main() {
	bot := wbot.New(
		wbot.WithParallel(50), // up to 50 concurrent workers
		wbot.WithMaxDepth(5),  // follow links up to 5 levels deep
		wbot.WithRateLimit(&api.RateLimit{
			Hostname: "*",      // applies to all hostnames
			Rate:     "10/1s",  // 10 requests per second
		}),
		wbot.WithLogLevel(zerolog.DebugLevel),
	)
	defer bot.Shutdown()

	// read responses
	bot.OnReponse(func(resp *api.Response) {
		fmt.Printf("crawled: %s\n", resp.URL.String())
	})

	if err := bot.Run(
		"https://go.dev/",
	); err != nil {
		log.Fatal(err)
	}

	log.Printf("finished crawling\n")
}
```
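
Once `Run` returns, the counters collected by the `MetricsMonitor` can be read through `Metrics()` from the API listing above. The key names are not documented on this page, so this short sketch simply prints whatever is present:

```go
// Hypothetical follow-up to the example above: dump all collected counters.
// Key names come from the MetricsMonitor implementation and are not listed
// on this page, so they are printed generically.
for name, value := range bot.Metrics() {
	fmt.Printf("%s = %d\n", name, value)
}
```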