
Commit

Removed calls to unsafe procedures through the Subprocess library; cleaned and scrubbed the code base a little.
sebastian-code committed Apr 11, 2020
1 parent 0d81d77 commit 008ffcc
Showing 1 changed file with 17 additions and 7 deletions.
24 changes: 17 additions & 7 deletions bootcamp/news/metadatareader.py
@@ -39,18 +39,28 @@ def get_metadata_from_url_in_text(text):

    @staticmethod
    def get_urls_from_text(text):
        # look for all urls in text
        # and convert it to an array of urls
        """Look for all URLs in a given block of text and return them as a list.

        :param text: A valid block of text of any length.
        :returns:
            A list of the URLs extracted from the text.
        """
        regex = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
        return re.findall(regex, text)

    @staticmethod
    def get_url_metadata(url):
        # get final url after all redirections
        # then get html of the final url
        # fill the meta data with the info available
        # url = Metadatareader.get_final_url(url)
        # url_content = Metadatareader.get_url_content(url)
        """Fetch the page at the given URL, parse its content with BeautifulSoup,
        and return any metadata found in the page's meta tags.

        :param url: Any valid URL to fetch.
        :returns:
            Metadata information extracted from the web page.
        """
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        metadata = Metadata()
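For context, a minimal usage sketch of the two methods touched by this diff; the import path and the sample text are illustrative assumptions, not part of the commit:

from bootcamp.news.metadatareader import Metadatareader  # assumed module path

text = "Read this: https://example.com/some-article and tell me what you think."
urls = Metadatareader.get_urls_from_text(text)
# -> ["https://example.com/some-article"]

for url in urls:
    # Fetches the page with requests and parses its meta tags via BeautifulSoup.
    metadata = Metadatareader.get_url_metadata(url)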
