Skip to content

Commit

Permalink
Merge branch 'hotfix/2.1.2'
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastian-code committed Apr 15, 2020
2 parents 95afc2d + defaab8 commit 5f6d84c
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 10 deletions.
32 changes: 27 additions & 5 deletions bootcamp/helpers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import re
from urllib.parse import urljoin
from urllib.parse import urljoin, urlparse

from django.core.exceptions import PermissionDenied
from django.core.paginator import EmptyPage, PageNotAnInteger, Paginator
from django.http import HttpResponseBadRequest
from django.http import HttpResponseBadRequest, JsonResponse
from django.utils.translation import ugettext_lazy as _
from django.views.generic import View

import bs4
Expand All @@ -29,7 +30,7 @@ def paginate_data(qs, page_size, page, paginated_type, **kwargs):
has_next=page_obj.has_next(),
has_prev=page_obj.has_previous(),
objects=page_obj.object_list,
**kwargs
**kwargs,
)


Expand Down Expand Up @@ -111,7 +112,9 @@ def get_urls(text):
:returns:
A tuple of valid URLs extracted from the text.
"""
regex = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    regex = r"(?i)\b((?:https?:(?:\/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b\/?(?!@))"
return re.findall(regex, text)


Expand All @@ -126,7 +129,26 @@ def get_metadata(url):
:returns:
A dictionary with metadata from a given webpage.
"""
response = requests.get(url)
parsed_url = urlparse(url)
if not parsed_url.scheme:
url = f"http://{parsed_url.path}"

try:
response = requests.get(url, timeout=0.9)
response.raise_for_status()

except requests.exceptions.ConnectionError:
return JsonResponse(
{"message": _(f"We detected the url {url} but it appears to be invalid.")}
)

except requests.exceptions.Timeout as e:
return JsonResponse(
_(
f"We found an error trying to connect to the site {url}, please find more info here:{e}"
)
)

soup = bs4.BeautifulSoup(response.content)
ogs = soup.html.head.find_all(property=re.compile(r"^og"))
data = {og.get("property")[3:]: og.get("content") for og in ogs}
Expand Down
11 changes: 6 additions & 5 deletions bootcamp/news/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,12 @@ def __str__(self):
def save(self, *args, **kwargs):
        # extract metadata from content url
data = fetch_metadata(self.content)
self.meta_url = data.get("url")
self.meta_type = data.get("type", "website")
self.meta_title = data.get("title")
self.meta_description = data.get("description")
self.meta_image = data.get("image")
if data:
self.meta_url = data.get("url")
self.meta_type = data.get("type", "website")
self.meta_title = data.get("title")
self.meta_description = data.get("description")
self.meta_image = data.get("image")

super().save(*args, **kwargs)
if not self.reply:
Expand Down

0 comments on commit 5f6d84c

Please sign in to comment.