-
Notifications
You must be signed in to change notification settings - Fork 15
/
shopifyscraper.py
92 lines (87 loc) · 3.55 KB
/
shopifyscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding: utf-8 -*-
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import random
import sys
from fake_useragent import UserAgent
def scrapeUrlsFromFile(urls, keyword, outputlist):
    """Scrape each Shopify sitemap URL in *urls* for product links whose
    slug contains *keyword*, writing every match to *outputlist*.

    urls       -- iterable of sitemap URLs; blank entries are skipped
    keyword    -- substring searched for in the product name (hyphens in
                  the URL slug are treated as spaces before matching)
    outputlist -- writable file object; one matching link per line
    """
    ua = UserAgent()
    for url in urls:
        if not url:
            continue
        matches = []
        session = requests.session()
        print("[" + str(datetime.now()) + "]Scraping " + str(url))
        try:
            # Randomized User-Agent to look like an ordinary browser.
            content = session.get(url, headers={'User-Agent': str(ua.random)}).content
            soup = BeautifulSoup(content, 'html.parser')
        except Exception:
            print("[" + str(datetime.now()) + "]Could not scrape " + str(url))
            # BUG FIX: the original fell through here and then called
            # soup.find_all on an undefined (first iteration) or stale
            # (later iterations) `soup`. Skip this URL instead.
            continue
        # Shopify product sitemaps list every URL inside <loc> tags.
        for loc in soup.find_all('loc'):
            link = loc.text
            try:
                # Product URLs look like .../products/<slug>; slugs use
                # hyphens where the product name has spaces.
                itemname = link.split('/products/')[1].replace('-', ' ')
            except IndexError:
                continue  # not a product URL (e.g. the sitemap index itself)
            if keyword in itemname:
                matches.append(link)
        if not matches:
            print("[-]No Keyword Matches Found")
        else:
            for match in matches:
                print("[" + str(datetime.now()) + "]Keyword Matches: " + str(match))
                outputlist.write(match + "\n")
            print("\n====================================================================================")
def scrapeUrl(url, keyword, outputlist):
    """Scrape a single Shopify product sitemap *url* for product links
    whose slug contains *keyword*, writing each match to *outputlist*.

    Exits the process with an error message when *url* is empty or the
    request fails.
    """
    if not url:
        sys.exit('[-]Invalid url')
    ua = UserAgent()
    session = requests.session()
    print("[" + str(datetime.now()) + "]Scraping " + str(url))
    try:
        content = session.get(url, headers={'User-Agent': str(ua.random)}).content
    except Exception:
        # FIX: the original had no error handling here and died with a raw
        # traceback on any network failure; exit cleanly like the rest of
        # the script does on bad input.
        sys.exit("[" + str(datetime.now()) + "]Could not scrape " + str(url))
    soup = BeautifulSoup(content, 'html.parser')
    matches = []
    # Shopify product sitemaps list every URL inside <loc> tags.
    for loc in soup.find_all('loc'):
        link = loc.text
        try:
            # Product URLs look like .../products/<slug>; slugs use
            # hyphens where the product name has spaces.
            itemname = link.split('/products/')[1].replace('-', ' ')
        except IndexError:
            continue  # not a product URL
        if keyword in itemname:
            matches.append(link)
    if not matches:
        print("[-]No Keyword Matches Found")
    else:
        for match in matches:
            print("[" + str(datetime.now()) + "]Keyword Matches: " + str(match))
            outputlist.write(match + "\n")
        print("\n====================================================================================")
def main():
    """Interactive entry point.

    Prompts for an output filename, a keyword, and either a file of
    sitemap URLs or a single sitemap URL, then delegates to
    scrapeUrlsFromFile / scrapeUrl. Exits on an unwritable/unreadable
    filename.
    """
    outputfile = raw_input("[+]Enter the filename to save the links to (IE links.txt): ")
    try:
        outputlist = open(outputfile, 'w+')
    except IOError:
        sys.exit("[-]Invalid filename!")
    # FIX: the original never closed its file handles; ensure the output
    # file is flushed and closed on every exit path.
    try:
        keyword = raw_input("[+]Enter keyword to search for: (IE \'yeezy\'): ")
        choice = raw_input("[+]Use xml links from file? (y or n): ").lower()
        if choice == "y":
            inputfile = raw_input("[+]Enter the filename containing the xml links (IE xmlinput.txt): ")
            try:
                inputlist = open(inputfile, 'r')
            except IOError:
                sys.exit("[-]Invalid filename!")
            try:
                urls = inputlist.read().split('\n')
            finally:
                inputlist.close()
            scrapeUrlsFromFile(urls, keyword, outputlist)
        elif choice == 'n':
            url = raw_input('[+]Enter sitemap url to scrape: (should end in \'sitemap_products_1.xml\') ')
            scrapeUrl(url, keyword, outputlist)
        else:
            print("[-]Invalid choice")
            main()  # re-prompt from the top (matches original behavior)
    finally:
        outputlist.close()
# Run the interactive scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()