Merge branch 'credo'
Fabrizio Monti committed Dec 10, 2017
2 parents 5560416 + c35a790 commit 244dbe1
Showing 20 changed files with 231 additions and 39 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -19,6 +19,8 @@ erl_crash.dump
 # Also ignore archive artifacts (built via "mix archive.build").
 *.ez

+.DS_Store
+
 sputnik

 static/report_data.js
14 changes: 14 additions & 0 deletions README.md
@@ -71,3 +71,17 @@ To run tests:
 ```bash
 $ mix test --cover
 ```
+
+To run credo:
+
+```bash
+$ mix credo
+```
+
+## Documentation
+
+To generate the documentation:
+
+```bash
+$ mix docs && open doc/index.html
+```
38 changes: 35 additions & 3 deletions lib/crawl.ex
@@ -1,31 +1,63 @@
 defmodule Crawl do
+  @moduledoc """
+  This module exposes a sync and an async way to find all hrefs
+  in an HTML body string.
+  """
+
+  @doc """
+  Spawns a new process that finds all links in the given HTML body
+  string. It sends a message back to the given `pid` with the links it found.
+  It automatically converts relative URLs to absolute URLs.
+  ## Parameters
+  - `body`: HTML page as string
+  - `request_url`: the page URL. Needed for relative -> absolute URL conversion
+  - `pid`: the pid which will receive a message with the found links
+  """
   def start(body, request_url, pid) do
     spawn __MODULE__, :parse, [body, request_url, pid]
   end

+  @doc """
+  Finds all links in the given HTML body string.
+  It automatically converts relative URLs to absolute URLs.
+  ## Parameters
+  - `body`: HTML page as string
+  - `request_url`: the page URL. Needed for relative -> absolute URL conversion
+  """
   def start(body, request_url) do
     parse(body, request_url)
   end

+  @doc false
   def parse(body, request_url, pid) do
     links = parse(body, request_url)
     send pid, {:ok, links}
   end

   defp parse(body, request_url) do
-    find_links(body)
+    body
+    |> find_links
     |> Enum.map(fn (link) -> parse_url(request_url, link) end)
     |> Enum.filter(fn (item) -> item != nil end)
     |> Enum.uniq
   end

   defp find_links(body) do
-    Floki.find(body, "a")
+    body
+    |> Floki.find("a")
     |> Floki.attribute("href")
   end

   defp parse_url(request_url, link) do
-    URI.merge(request_url, link)
+    request_url
+    |> URI.merge(link)
     |> uri_to_string
   end
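For reference, a minimal sketch of the two `Crawl` entry points above; the HTML string is illustrative, and the `{:ok, links}` shape matches what `parse/3` sends:

```elixir
body = ~s(<a href="/about">About</a> <a href="https://example.com">Home</a>)

# Synchronous: returns the list of absolute URLs directly.
sync_links = Crawl.start(body, "https://example.com")

# Asynchronous: spawns a process that sends {:ok, links} back to the caller.
Crawl.start(body, "https://example.com", self())

receive do
  {:ok, async_links} -> IO.inspect(async_links)
end
```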
2 changes: 2 additions & 0 deletions lib/greetings.ex
@@ -1,4 +1,6 @@
 defmodule Greetings do
+  @moduledoc false
+
   def start do
     IO.puts(
       "
25 changes: 22 additions & 3 deletions lib/page.ex
@@ -1,8 +1,25 @@
 defmodule Page do
+  @moduledoc """
+  This module fetches and parses a given URL.
+  """
+
+  @doc """
+  Asynchronously fetches a given URL, parses the body, sends the list of
+  links to follow back to Queue, and counts how many CSS selectors
+  there are in the page.
+  ## Parameters
+  - `url`: the initial URL to crawl
+  - `query`: list of valid CSS selectors as strings
+  - `queue_pid`: the pid which will receive the output
+  """
   def start(url, query, queue_pid) do
     spawn __MODULE__, :init, [url, query, queue_pid]
   end

+  @doc false
   def init(url, query, queue_pid) do
     Request.start(url, self())
     loop(query, queue_pid)
@@ -27,9 +44,11 @@ defmodule Page do
   end

   defp header_location(headers) do
-    {_, location} = Enum.find(headers, (fn(item) ->
-      Tuple.to_list(item) |> Enum.member?("Location")
-    end))
+    {_, location} =
+      headers
+      |> Enum.find(
+        (fn(item) -> item |> Tuple.to_list |> Enum.member?("Location") end)
+      )
     location
   end
 end
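A quick illustration of the `header_location/1` lookup refactored above, run on a header list in the usual `{name, value}` tuple shape (the values here are illustrative):

```elixir
headers = [
  {"Content-Type", "text/html"},
  {"Location", "https://example.com/new"}
]

# Find the first header tuple containing "Location" and extract its value.
{_, location} =
  headers
  |> Enum.find(fn item -> item |> Tuple.to_list() |> Enum.member?("Location") end)

IO.puts(location)
# => https://example.com/new
```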
35 changes: 30 additions & 5 deletions lib/parse.ex
@@ -1,20 +1,45 @@
 defmodule Parse do
+  @moduledoc """
+  This module parses the given HTML string and counts how many CSS selectors
+  there are in it.
+  """
+
+  @doc """
+  Asynchronously returns a map of the given CSS selectors with their count.
+  ## Parameters
+  - `body`: HTML page as string
+  - `queries`: a list of valid CSS selectors as strings
+  - `pid`: the pid which will receive the output
+  """
   def start(body, queries, pid) do
     spawn __MODULE__, :parse, [body, queries, pid]
   end

+  @doc """
+  Returns a map of the given CSS selectors with their count.
+  ## Parameters
+  - `body`: HTML page as string
+  - `queries`: a list of valid CSS selectors as strings
+  """
   def start(body, queries) do
     parse(body, queries)
   end

-  def parse(body, queries) do
+  @doc false
+  def parse(body, queries, pid) do
+    send pid, parse(body, queries)
+  end
+
+  defp parse(body, queries) do
     Enum.reduce(queries, %{}, fn(q, acc) ->
       items = Floki.find(body, q)
       Map.put(acc, q, Enum.count(items))
     end)
   end
-
-  def parse(body, queries, pid) do
-    send pid, parse(body, queries)
-  end
 end
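A minimal sketch of the synchronous `Parse` API above; the HTML string and the expected counts are illustrative:

```elixir
body = "<h1>Title</h1><p>One</p><p>Two</p>"

# Each selector is counted via Floki.find/2 and collected into a map.
Parse.start(body, ["h1", "p"])
# => %{"h1" => 1, "p" => 2}
```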
24 changes: 23 additions & 1 deletion lib/queue.ex
@@ -1,9 +1,31 @@
 defmodule Queue do
+  @moduledoc """
+  This module crawls all pages and returns a list of pages as tuples.
+  The crawler will never go outside of the given URL host.
+  """
+
+  @doc """
+  Asynchronously crawls all pages linked from the initial URL.
+  It returns a list of tuples, each tuple containing:
+  - status code
+  - page url
+  - map with CSS selectors and their count
+  ## Parameters
+  - `url`: the initial URL to crawl
+  - `query`: list of valid CSS selectors as strings
+  - `sputnik_pid`: the pid which will receive the output
+  """
   def start(url, query, sputnik_pid) do
     spawn __MODULE__, :init, [url, query, sputnik_pid]
   end

+  @doc false
   def init(url, query, sputnik_pid) do
     Page.start(url, query, self())
     %URI{host: host} = URI.parse(url)
@@ -21,7 +43,7 @@ defmodule Queue do
       IO.write "."
       loop(domain, processing, done, query)
     {:error, error} ->
-      IO.puts "Error: #{IO.inspect(error)}"
+      IO.puts "Error!: #{error}"
       Greetings.error
     _ ->
       raise "Unknown message"
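For orientation, a hedged sketch of driving `Queue` directly, the same way `Sputnik.start/2` does further down. The shape of the reply message is not visible in this diff, so the `{:done, results}` pattern is purely an assumption:

```elixir
# Assumption: Queue reports back with a {:done, results} message; the real
# tag lives in the part of this module's receive loop not shown in the hunk.
Queue.start("https://spawnfest.github.io", ["h1,h2,h3"], self())

receive do
  {:done, results} ->
    # Per the @doc above, each tuple holds: status code, page url,
    # and a map of CSS selectors to their counts.
    Enum.each(results, fn {status, url, counts} ->
      IO.puts("#{status} #{url} #{inspect(counts)}")
    end)
end
```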
45 changes: 39 additions & 6 deletions lib/request.ex
@@ -1,22 +1,55 @@
 defmodule Request do
+  @moduledoc """
+  This module wraps an HTTP client.
+  """
+
+  @doc """
+  Asynchronously sends the following information from the given url to the pid:
+  - page body
+  - request status code
+  - request url
+  - headers
+  ## Parameters
+  - `url`: the URL to fetch via HTTP client
+  - `pid`: the pid which will receive the output
+  """
   def start(url, pid) do
     spawn __MODULE__, :get, [url, pid]
   end

+  @doc """
+  Returns the following information for the given url:
+  - page body
+  - request status code
+  - request url
+  - headers
+  ## Parameters
+  - `url`: the URL to fetch via HTTP client
+  """
   def start(url) do
     get(url)
   end

-  def get(url) do
-    start_http_client()
-    get_url_content(url)
-    |> parse_content
-  end
-
+  @doc false
   def get(url, pid) do
     send pid, get(url)
   end

+  defp get(url) do
+    start_http_client()
+    url
+    |> get_url_content
+    |> parse_content
+  end
+
   defp start_http_client do
     HTTPoison.start
   end
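A minimal sketch of driving `Request` asynchronously. The exact shape of the reply is built by `parse_content`, which is not shown in this hunk, so the example only inspects whatever arrives:

```elixir
Request.start("https://spawnfest.github.io", self())

receive do
  reply ->
    # Per the @doc above: page body, status code, request url, and headers.
    IO.inspect(reply)
end
```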
34 changes: 30 additions & 4 deletions lib/sputnik.ex
@@ -1,4 +1,21 @@
 defmodule Sputnik do
+  @moduledoc """
+  This is the main entrance for the Sputnik program.
+  """
+
+  @doc """
+  Crawls a url and prints out the report.
+  ## Parameters
+  - `url`: String that represents the initial url to crawl
+  - `queries`: List of valid CSS selectors as strings
+  ## Examples
+      iex> Sputnik.start("https://spawnfest.github.io", ["a", "h1,h2,h3"])
+  """
   def start(url, queries \\ []) do
     Greetings.start
     Queue.start(url, queries, self())
@@ -10,15 +27,24 @@ defmodule Sputnik do
     end
   end

+  @doc """
+  This function is the main entrance for the CLI and it is not
+  meant to be used directly.
+  ```bash
+  # inside the project folder
+  $ mix escript.build
+  $ ./sputnik "http://spawnfest.github.io" --query "a" --query "h1,h2,h3"
+  ```
+  """
   def main(args) do
     {url, queries} = parse_args(args)
     start(url, queries)
   end

   defp find_queries(collection) do
-    Enum.filter(collection, fn(element) ->
-      match?({:query, _}, element)
-    end)
+    collection
+    |> Enum.filter(fn(element) -> match?({:query, _}, element) end)
     |> Enum.map(fn({_, query}) -> query end)
   end

@@ -27,7 +53,7 @@ defmodule Sputnik do
     exit(1)
   end

-  def parse_args(args) do
+  defp parse_args(args) do
     parsed = OptionParser.parse(args,
       strict: [
         query: [:string, :keep]
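As a quick illustration of the `find_queries/1` pipeline refactored above, applied to typical `OptionParser` output (the keyword list is illustrative):

```elixir
parsed = [query: "a", query: "h1,h2,h3"]

# Keep only the :query options, then unwrap each tuple down to its value.
parsed
|> Enum.filter(fn element -> match?({:query, _}, element) end)
|> Enum.map(fn {_, query} -> query end)
# => ["a", "h1,h2,h3"]
```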