-
Notifications
You must be signed in to change notification settings - Fork 3
/
poc.py
46 lines (40 loc) · 1.78 KB
/
poc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from pprint import pprint
from lxml import html
import requests
import dateutil.parser
# Exact href values that the script discards when scanning the archive page
# (an href equal to any entry here is never treated as a paste link).
# Only membership matters, so each value is listed once.
KNOWN_HREFS = [
    '/favicon.ico',
    '/i/pastebin.min.v9.css?1575389335',
    '/pro',
    '/api',
    '/faq',
    '/login',
    '/signup',
    '/archive',
    '/tools#chrome',
    '/languages',
    '/night_mode',
    '/dmca',
    '/contact',
]
# An href that starts with any of these prefixes is also discarded.
KNOWN_PREFIXES = ['http://', 'https://', '/tools', '/doc_', '/archive/']
ARCHIVE_URL = 'https://pastebin.com/archive'
PASTE_URL_TEMPLATE = 'https://pastebin.com{}'
# Seconds to wait for the server before giving up; without a timeout
# requests.get() may block indefinitely.
REQUEST_TIMEOUT = 10
# Number of archive entries that are downloaded and printed.
MAX_PASTES = 10


def filter_paste_hrefs(hrefs, known_hrefs, known_prefixes, min_length=6):
    """Return the hrefs that are kept as candidate paste links.

    An href is kept when it is not equal to any entry of *known_hrefs*, does
    not start with any entry of *known_prefixes*, and is at least
    *min_length* characters long (the default keeps hrefs longer than five
    characters). Input order is preserved.

    Args:
        hrefs: Iterable of href values (strings or str-like objects).
        known_hrefs: Collection of exact href values to discard.
        known_prefixes: Iterable of string prefixes to discard.
        min_length: Minimum length an href must have to be kept.

    Returns:
        A list with the surviving hrefs, unchanged.
    """
    # str.startswith accepts a tuple, so one call covers every prefix.
    prefixes = tuple(known_prefixes)
    return [
        href
        for href in hrefs
        if href not in known_hrefs
        and not str(href).startswith(prefixes)
        and len(href) >= min_length
    ]


def fetch_tree(url):
    """Download *url* and return its content parsed as an lxml HTML tree.

    Raises:
        requests.RequestException: On connection problems, on timeout, or
            (via raise_for_status) when the server answers with an HTTP
            error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of trying to parse an error page.
    response.raise_for_status()
    return html.fromstring(response.content)


def print_paste(href):
    """Fetch the paste page at *href* and print its title, author, date and text.

    Args:
        href: Site-relative link of the paste, appended to the site root.

    Raises:
        IndexError: If the page lacks the title, date or code element the
            XPath expressions look for.
        requests.RequestException: If the page cannot be downloaded.
    """
    tree = fetch_tree(PASTE_URL_TEMPLATE.format(href))

    # The title is the text of the first child of the 'paste_box_line1' div.
    title = tree.xpath("//div[contains(@class, 'paste_box_line1')]")[0][0].text

    info_xpath = "//div[contains(@class, 'paste_box_line2')]"
    # When the info line holds no link the author falls back to "Guest"
    # (presumably an anonymous paste - confirm against the page layout).
    author_links = tree.xpath(info_xpath + "/a")
    author = author_links[0].text if author_links else "Guest"
    date = dateutil.parser.parse(tree.xpath(info_xpath + "/span")[0].text)
    code = tree.xpath("//textarea[contains(@id, 'paste_code')]")[0].text

    print("href: ", href)
    print("Title: ", title)
    print("Author: ", author)
    print("date: ", date)
    print(code)
    # Two rows of asterisks separate consecutive pastes in the output.
    print("*"*20)
    print("*"*20)


def main():
    """Print the first MAX_PASTES pastes linked from the Pastebin archive page."""
    archive_tree = fetch_tree(ARCHIVE_URL)
    paste_hrefs = filter_paste_hrefs(
        archive_tree.xpath("//@href"), KNOWN_HREFS, KNOWN_PREFIXES
    )
    for href in paste_hrefs[:MAX_PASTES]:
        print_paste(href)


if __name__ == '__main__':
    main()