poc.py

from pprint import pprint

from lxml import html
import requests
import dateutil.parser

KNOWN_HREFS = ['/favicon.ico', '/i/pastebin.min.v9.css?1575389335', '/pro', '/api', '/faq', '/login', '/signup',
               '/archive', '/tools#chrome', '/languages', '/archive', '/faq', '/night_mode', '/dmca', '/contact']

KNOWN_PREFIXES = ['http://', 'https://', '/tools', '/doc_', '/archive/']

if __name__ == '__main__':
    page = requests.get('https://pastebin.com/archive')
    tree = html.fromstring(page.content)
    unfiltered_hrefs = [ref for ref in tree.xpath("//@href") if ref not in KNOWN_HREFS]
    hrefs = []
    for href in unfiltered_hrefs:
        is_pastebin_href = True
        for prefix in KNOWN_PREFIXES:
            if str(href).startswith(prefix):
                is_pastebin_href = False
                break
        if is_pastebin_href:
            hrefs.append(href)

    hrefs = [href for href in hrefs if len(href) > 5]

    for href in hrefs[:10]:
        page = requests.get('https://pastebin.com{}'.format(href))
        tree = html.fromstring(page.content)
        # title = tree.xpath("//h1")[0].text
        # title = tree.xpath("//div[@class='paste_box_line1']/h1")[0].text
        title = tree.xpath("//div[contains(@class, 'paste_box_line1')]")[0].getchildren()[0].text
        try:
            author = tree.xpath("//div[contains(@class, 'paste_box_line2')]/a")[0].text
        except:
            author = "Guest"
        date = dateutil.parser.parse(tree.xpath("//div[contains(@class, 'paste_box_line2')]/span")[0].text)
        code = tree.xpath("//textarea[contains(@id, 'paste_code')]")[0].text
        print("href: ", href)
        print("Title: ", title)
        print("Author: ", author)
        print("date: ", date)
        print(code)
        print("*"*20)
        print("*"*20)