Merge branch 'credo'
Fabrizio Monti committed Dec 10, 2017
2 parents 5560416 + c35a790 commit 244dbe1
Showing 20 changed files with 231 additions and 39 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -19,6 +19,8 @@ erl_crash.dump
 # Also ignore archive artifacts (built via "mix archive.build").
 *.ez

+.DS_Store
+
 sputnik

 static/report_data.js
14 changes: 14 additions & 0 deletions README.md
@@ -71,3 +71,17 @@ To run tests:
 ```bash
 $ mix test --cover
 ```
+
+To run credo:
+
+```bash
+$ mix credo
+```
+
+## Documentation
+
+To generate the documentation:
+
+```bash
+$ mix docs && open doc/index.html
+```
38 changes: 35 additions & 3 deletions lib/crawl.ex
@@ -1,31 +1,63 @@
 defmodule Crawl do
+  @moduledoc """
+  This module exposes a sync and an async way to find all hrefs
+  in an HTML body string.
+  """
+
+  @doc """
+  Spawns a new process that finds all links in the given HTML body
+  string. It sends a message back to the given `pid` with the links it found.
+  It automatically converts relative URLs to absolute URLs.
+  ## Parameters
+  - `body`: HTML page as string
+  - `request_url`: the page URL. Needed for relative -> absolute URL conversion
+  - `pid`: the pid which will receive a message with the found links
+  """
   def start(body, request_url, pid) do
     spawn __MODULE__, :parse, [body, request_url, pid]
   end

+  @doc """
+  Finds all links in the given HTML body string.
+  It automatically converts relative URLs to absolute URLs.
+  ## Parameters
+  - `body`: HTML page as string
+  - `request_url`: the page URL. Needed for relative -> absolute URL conversion
+  """
   def start(body, request_url) do
     parse(body, request_url)
   end

+  @doc false
   def parse(body, request_url, pid) do
     links = parse(body, request_url)
     send pid, {:ok, links}
   end

   defp parse(body, request_url) do
-    find_links(body)
+    body
+    |> find_links
     |> Enum.map(fn (link) -> parse_url(request_url, link) end)
     |> Enum.filter(fn (item) -> item != nil end)
     |> Enum.uniq
   end

   defp find_links(body) do
-    Floki.find(body, "a")
+    body
+    |> Floki.find("a")
     |> Floki.attribute("href")
   end

   defp parse_url(request_url, link) do
-    URI.merge(request_url, link)
+    request_url
+    |> URI.merge(link)
     |> uri_to_string
   end
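For reference, a minimal sketch of the two `Crawl` entry points above; the HTML string is illustrative, and the `{:ok, links}` shape matches what `parse/3` sends:

```elixir
body = ~s(<a href="/about">About</a> <a href="https://example.com">Home</a>)

# Synchronous: returns the list of absolute URLs directly.
sync_links = Crawl.start(body, "https://example.com")

# Asynchronous: spawns a process that sends {:ok, links} back to the caller.
Crawl.start(body, "https://example.com", self())

receive do
  {:ok, async_links} -> IO.inspect(async_links)
end
```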
2 changes: 2 additions & 0 deletions lib/greetings.ex
@@ -1,4 +1,6 @@
 defmodule Greetings do
+  @moduledoc false
+
   def start do
     IO.puts(
       "
25 changes: 22 additions & 3 deletions lib/page.ex
@@ -1,8 +1,25 @@
 defmodule Page do
+  @moduledoc """
+  This module fetches and parses a given URL.
+  """
+
+  @doc """
+  Asynchronously fetches a given URL, parses the body, sends the list of
+  links to follow back to Queue, and counts how many CSS selectors
+  there are in the page.
+  ## Parameters
+  - `url`: the initial URL to crawl
+  - `query`: list of valid CSS selectors as strings
+  - `queue_pid`: the pid which will receive the output
+  """
   def start(url, query, queue_pid) do
     spawn __MODULE__, :init, [url, query, queue_pid]
   end

+  @doc false
   def init(url, query, queue_pid) do
     Request.start(url, self())
     loop(query, queue_pid)
@@ -27,9 +44,11 @@ defmodule Page do
   end

   defp header_location(headers) do
-    {_, location} = Enum.find(headers, (fn(item) ->
-      Tuple.to_list(item) |> Enum.member?("Location")
-    end))
+    {_, location} =
+      headers
+      |> Enum.find(
+        (fn(item) -> item |> Tuple.to_list |> Enum.member?("Location") end)
+      )
     location
   end
 end
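A quick illustration of the `header_location/1` lookup refactored above, run on a header list in the usual `{name, value}` tuple shape (the values here are illustrative):

```elixir
headers = [
  {"Content-Type", "text/html"},
  {"Location", "https://example.com/new"}
]

# Find the first header tuple containing "Location" and extract its value.
{_, location} =
  headers
  |> Enum.find(fn item -> item |> Tuple.to_list() |> Enum.member?("Location") end)

IO.puts(location)
# => https://example.com/new
```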
35 changes: 30 additions & 5 deletions lib/parse.ex
@@ -1,20 +1,45 @@
 defmodule Parse do
+  @moduledoc """
+  This module parses the given HTML string and counts how many CSS selectors
+  there are in it.
+  """
+
+  @doc """
+  Asynchronously returns a map of the given CSS selectors with their count.
+  ## Parameters
+  - `body`: HTML page as string
+  - `queries`: a list of valid CSS selectors as strings
+  - `pid`: the pid which will receive the output
+  """
   def start(body, queries, pid) do
     spawn __MODULE__, :parse, [body, queries, pid]
   end

+  @doc """
+  Returns a map of the given CSS selectors with their count.
+  ## Parameters
+  - `body`: HTML page as string
+  - `queries`: a list of valid CSS selectors as strings
+  """
   def start(body, queries) do
     parse(body, queries)
   end

-  def parse(body, queries) do
+  @doc false
+  def parse(body, queries, pid) do
+    send pid, parse(body, queries)
+  end
+
+  defp parse(body, queries) do
     Enum.reduce(queries, %{}, fn(q, acc) ->
       items = Floki.find(body, q)
       Map.put(acc, q, Enum.count(items))
     end)
   end
-
-  def parse(body, queries, pid) do
-    send pid, parse(body, queries)
-  end
 end
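A minimal sketch of the synchronous `Parse` API above; the HTML string and the expected counts are illustrative:

```elixir
body = "<h1>Title</h1><p>One</p><p>Two</p>"

# Each selector is counted via Floki.find/2 and collected into a map.
Parse.start(body, ["h1", "p"])
# => %{"h1" => 1, "p" => 2}
```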
24 changes: 23 additions & 1 deletion lib/queue.ex
@@ -1,9 +1,31 @@
 defmodule Queue do
+  @moduledoc """
+  This module crawls all pages and returns a list of pages as tuples.
+  The crawler will never go outside of the given URL host.
+  """
+
+  @doc """
+  Asynchronously crawls all pages linked from the initial URL.
+  It returns a list of tuples, each tuple containing:
+  - status code
+  - page url
+  - map with CSS selectors and their count
+  ## Parameters
+  - `url`: the initial URL to crawl
+  - `query`: list of valid CSS selectors as strings
+  - `sputnik_pid`: the pid which will receive the output
+  """
   def start(url, query, sputnik_pid) do
     spawn __MODULE__, :init, [url, query, sputnik_pid]
   end

+  @doc false
   def init(url, query, sputnik_pid) do
     Page.start(url, query, self())
     %URI{host: host} = URI.parse(url)
@@ -21,7 +43,7 @@ defmodule Queue do
       IO.write "."
       loop(domain, processing, done, query)
     {:error, error} ->
-      IO.puts "Error: #{IO.inspect(error)}"
+      IO.puts "Error!: #{error}"
       Greetings.error
     _ ->
       raise "Unknown message"
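For orientation, a hedged sketch of driving `Queue` directly, the same way `Sputnik.start/2` does further down. The shape of the reply message is not visible in this diff, so the `{:done, results}` pattern is purely an assumption:

```elixir
# Assumption: Queue reports back with a {:done, results} message; the real
# tag lives in the part of this module's receive loop not shown in the hunk.
Queue.start("https://spawnfest.github.io", ["h1,h2,h3"], self())

receive do
  {:done, results} ->
    # Per the @doc above, each tuple holds: status code, page url,
    # and a map of CSS selectors to their counts.
    Enum.each(results, fn {status, url, counts} ->
      IO.puts("#{status} #{url} #{inspect(counts)}")
    end)
end
```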
45 changes: 39 additions & 6 deletions lib/request.ex
@@ -1,22 +1,55 @@
 defmodule Request do
+  @moduledoc """
+  This module wraps an HTTP client.
+  """
+
+  @doc """
+  Asynchronously sends the following information from the given url to the pid:
+  - page body
+  - request status code
+  - request url
+  - headers
+  ## Parameters
+  - `url`: the URL to fetch via HTTP client
+  - `pid`: the pid which will receive the output
+  """
   def start(url, pid) do
     spawn __MODULE__, :get, [url, pid]
   end

+  @doc """
+  Returns the following information for the given url:
+  - page body
+  - request status code
+  - request url
+  - headers
+  ## Parameters
+  - `url`: the URL to fetch via HTTP client
+  """
   def start(url) do
     get(url)
   end

-  def get(url) do
-    start_http_client()
-    get_url_content(url)
-    |> parse_content
-  end
-
+  @doc false
   def get(url, pid) do
     send pid, get(url)
   end

+  defp get(url) do
+    start_http_client()
+    url
+    |> get_url_content
+    |> parse_content
+  end
+
   defp start_http_client do
     HTTPoison.start
   end
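A minimal sketch of driving `Request` asynchronously. The exact shape of the reply is built by `parse_content`, which is not shown in this hunk, so the example only inspects whatever arrives:

```elixir
Request.start("https://spawnfest.github.io", self())

receive do
  reply ->
    # Per the @doc above: page body, status code, request url, and headers.
    IO.inspect(reply)
end
```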
34 changes: 30 additions & 4 deletions lib/sputnik.ex
@@ -1,4 +1,21 @@
 defmodule Sputnik do
+  @moduledoc """
+  This is the main entrance for the Sputnik program.
+  """
+
+  @doc """
+  Crawls a url and prints out the report.
+  ## Parameters
+  - `url`: String that represents the initial url to crawl
+  - `queries`: List of valid CSS selectors as strings
+  ## Examples
+      iex> Sputnik.start("https://spawnfest.github.io", ["a", "h1,h2,h3"])
+  """
   def start(url, queries \\ []) do
     Greetings.start
     Queue.start(url, queries, self())
@@ -10,15 +27,24 @@ defmodule Sputnik do
     end
   end

+  @doc """
+  This function is the main entrance for the CLI and it is not
+  meant to be used directly.
+  ```bash
+  # inside the project folder
+  $ mix escript.build
+  $ ./sputnik "http://spawnfest.github.io" --query "a" --query "h1,h2,h3"
+  ```
+  """
   def main(args) do
     {url, queries} = parse_args(args)
     start(url, queries)
   end

   defp find_queries(collection) do
-    Enum.filter(collection, fn(element) ->
-      match?({:query, _}, element)
-    end)
+    collection
+    |> Enum.filter(fn(element) -> match?({:query, _}, element) end)
     |> Enum.map(fn({_, query}) -> query end)
   end

@@ -27,7 +53,7 @@ defmodule Sputnik do
     exit(1)
   end

-  def parse_args(args) do
+  defp parse_args(args) do
     parsed = OptionParser.parse(args,
       strict: [
         query: [:string, :keep]
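As a quick illustration of the `find_queries/1` pipeline refactored above, applied to typical `OptionParser` output (the keyword list is illustrative):

```elixir
parsed = [query: "a", query: "h1,h2,h3"]

# Keep only the :query options, then unwrap each tuple down to its value.
parsed
|> Enum.filter(fn element -> match?({:query, _}, element) end)
|> Enum.map(fn {_, query} -> query end)
# => ["a", "h1,h2,h3"]
```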