#!/usr/bin/env python
####################################
#
# Module to create JSON files with lists of
# restricted-content sites
#
# To run this:
# python create.py
####################################
#Types of restricted sites
# - Adult / Pornographic
# - Weapons
# - Drugs
# - Gambling
# - Gore/Violence
# - Alcohol
# - Cult (e.g. Scientology)
# - Terrorism recruitment (e.g. Al-Qaeda)
####################################
# Sources:
# - Alexa content ratings
# - DMOZ categories
# - JTerry Content Verification List
# - DomainAnalysis
# - TLDs (.xxx)
# - UNT List
# - Domain name matching
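####################################
# Output files (written to the current directory by __main__ below):
# - sites.json    - plaintext mapping of category -> [domains]
# - sitesb64.json - flat list of base64-encoded domains
# - md5.json      - flat list of md5 hex digests
# - md5_b64.json  - flat list of base64-encoded raw md5 digests
####################################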
from json import dumps
from datetime import datetime
from os import listdir
from base64 import b64encode
from hashlib import md5 as md5new #the old stdlib md5 module is deprecated; hashlib.md5 has the same interface
from pymongo import MongoClient
from tldextract import extract
#Accessing particular data sources
def category_chunk(c, chunks, negative=False):
    """Searches for domains by matching specific chunks in their
    DMOZ categories.
    Accepts a Connection (c), an iterable of category chunks, and
    optionally a set of negative chunks (domains whose categories
    contain any negative chunk are skipped).
    Returns a list of matching domain names."""
    chunks = set(chunks)
    domains = []
    query = {'alexa.DMOZ.SITE.CATS.CAT': {'$exists': True}}
    requirement = {'domain': 1, 'alexa.DMOZ.SITE.CATS.CAT': 1}
    for domain in c['domains'].find(query, requirement):
        negative_flag = False
        try:
            cat_container = domain['alexa']['DMOZ']['SITE']['CATS']['CAT'] #urgh this API
            if cat_container != {}:
                if type(cat_container) == list:
                    cats = [x['@ID'] for x in cat_container] #data consistency, anyone?
                else:
                    cats = [cat_container['@ID']]
                if negative:
                    for cat in cats: #pretty inefficient but gets the job done
                        cat = set(cat.split('/'))
                        if negative.intersection(cat):
                            negative_flag = True
                if not negative_flag:
                    for cat in cats:
                        cat = cat.split('/')
                        for chunk in cat:
                            if chunk in chunks:
                                domain_name = domain['domain'].replace('#', '.')
                                domains.append(domain_name)
                                break
        except KeyError:
            continue
    return domains
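#Example (minimal sketch; the category path is illustrative): a domain document
#whose DMOZ CAT '@ID' is u'Top/Games/Gambling/Poker' would be returned by
#category_chunk(c, ['Poker']), since 'Poker' is one of its '/'-separated chunks.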
def check_domain_analysis(category):
    """Domain Analysis is a large spreadsheet of roughly 1,000 hand-classified domains"""
    domains = []
    with open('sources/hand_classified/domain_analysis.tsv') as f:
        for line in f:
            line = line.split('\t')
            domain = line[0]
            categories = line[1]
            if category in categories:
                domains.append(domain)
    return domains
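#Assumed TSV layout (one record per line): domain<TAB>categories, where the
#categories field is a string of codes; get_adult_sites() below looks for '18'.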
def load_alexa():
    """Returns a set of all the domains in the latest Alexa top 1m list"""
    timestamp = datetime.strftime(datetime.now(), '%Y-%m-%d')
    top_1m_location = "/Users/mruttley/Documents/2015-04-22 AdGroups/Bucketerer/data_crunching/ranking_files/" + timestamp + "top-1m.csv"
    alexa = set()
    with open(top_1m_location) as f:
        for n, line in enumerate(f):
            if len(line) > 4:
                if line.endswith('\n'):
                    line = line[:-1]
                domain = line.lower().split(',')[1]
                alexa.add(domain)
    return alexa
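#Each row of the Alexa top-1m CSV is "rank,domain" (e.g. "1,google.com"),
#hence the split(',')[1] above.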
def prepare_comscore_lists():
    """Cleans and prepares comscore lists, writing a <category>.dump file
    that contains only the domains present in the latest Alexa top 1m"""
    #setup
    directory = 'sources/comscore/'
    alexa = load_alexa() #import alexa
    #import each list
    for filename in listdir(directory):
        if not filename.endswith("dump"):
            print "Working on {0}".format(filename)
            domains = set()
            category = filename.split(".")[0] #filenames are in the format: category.txt
            exists = 0
            with open(directory + filename) as f:
                for n, line in enumerate(f):
                    line = line.lower()
                    if len(line) > 4:
                        if line.endswith('\n'):
                            line = line[:-1]
                        if line.endswith("*"):
                            line = line[:-1]
                        if " " not in line:
                            domains.add(line)
            print "Checking against Alexa"
            with open(directory + category + '.dump', 'w') as g:
                for domain in domains:
                    if domain in alexa:
                        exists += 1
                        g.write(domain + "\n")
            print "Wrote {0} domains to {1}{2}.dump".format(exists, directory, category)
#Checkers
def check_toulouse_list():
    """A university in Toulouse provides a gigantic blacklist: http://dsi.ut-capitole.fr/blacklists/index_en.php.
    This checks the latest alexa top 1m against it. Requires two files (see first few lines)
    """
    payload_directory = "sources/toulouse/adult/"
    payload_fn = "domain"
    alexa = load_alexa()
    exists = 0
    with open(payload_directory + payload_fn) as f:
        with open('toulouse_check.dump', 'w') as g:
            print "Importing Toulouse payload"
            for n, line in enumerate(f):
                if len(line) > 4: #some weird line ending stuff
                    if line.endswith('\n'):
                        line = line[:-1]
                    domain_info = extract(line)
                    if domain_info.subdomain == "":
                        domain_name = domain_info.domain + "." + domain_info.suffix
                        if domain_name in alexa:
                            g.write(domain_name + "\n")
                            exists += 1
    print "{0} found in Alexa. Written to toulouse_check.dump".format(exists)
#Handlers for each genre
def get_adult_sites():
    """Gets adult sites from various data sources"""
    domains = set()
    #Get sites from bucketerer db
    db_sites = category_chunk(c, ["Adult"])
    domains.update(db_sites)
    #get sites from DomainAnalysis
    domain_analysis = check_domain_analysis('18')
    domains.update(domain_analysis)
    #get sites by tld
    for domain in c['domains'].find({}, {'domain': 1}):
        if domain['domain'].endswith('xxx'):
            domains.add(domain['domain'].replace('#', '.'))
    #get comscore sites
    with open('sources/comscore/adult.dump') as f:
        for line in f:
            if len(line) > 4:
                if line.endswith('\n'):
                    line = line[:-1]
                domains.add(line)
    return sorted(list(domains))
def get_gambling_sites():
    """Gets gambling sites"""
    domains = set()
    #get domains from the bucketerer database
    matchers = [
        'Poker', 'Gambling', 'Blackjack'
    ]
    dbdomains = category_chunk(c, matchers)
    domains.update(dbdomains)
    return sorted(list(domains))
def get_drugs_sites():
    """Gets drugs sites"""
    domains = set()
    #get domains from the bucketerer database
    matchers = [
        "Drugs"
    ]
    dbdomains = category_chunk(c, matchers)
    domains.update(dbdomains)
    with open("sources/suggested/drugs.txt") as f:
        for line in f:
            if len(line) > 4:
                if line.endswith('\n'):
                    line = line[:-1]
                domains.add(line)
    #remove known false positives
    fps = ["fungi.com"]
    for x in fps:
        if x in domains:
            domains.remove(x)
    return sorted(list(domains))
def get_alcohol_sites():
    """Gets alcohol related sites"""
    domains = set()
    #get domains from the bucketerer database
    matchers = [
        "Wine", "Beer", "Liquor"
    ]
    negative = ["DOS_and_Windows"]
    negative = set([unicode(x) for x in negative])
    dbdomains = category_chunk(c, matchers, negative=negative)
    domains.update(dbdomains)
    return sorted(list(domains))
def create_base64_version(sites):
    """Creates a base64 version"""
    blacklist = []
    for category, domains in sites.iteritems():
        for domain in domains:
            blacklist.append(b64encode(domain))
    blacklist.append(b64encode('example.com')) #specific request
    return {'domains': blacklist}
def create_md5_version(sites):
    """Creates an md5 version"""
    blacklist = []
    for category, domains in sites.iteritems():
        for domain in domains:
            blacklist.append(md5new(domain).hexdigest())
    blacklist.append(md5new('example.com').hexdigest()) #specific request
    return {'domains': blacklist}
def create_md5_b64_version(sites):
    """Creates a version with both hashing methods"""
    blacklist = []
    for category, domains in sites.iteritems():
        for domain in domains:
            blacklist.append(b64encode(md5new(domain).digest()))
    blacklist.append(b64encode(md5new('example.com').digest())) #specific request
    return {'domains': blacklist}
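#For a rough sense of the three encodings (b64 value worked out by hand; md5
#shapes described rather than hardcoded):
#   b64encode('example.com')                  -> 'ZXhhbXBsZS5jb20='
#   md5new('example.com').hexdigest()         -> 32-char hex string
#   b64encode(md5new('example.com').digest()) -> 24-char base64 string ('==' padded)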
#Main Handler
if __name__ == "__main__":
    #Set up database connection
    c = MongoClient()['bucketerer']
    #container
    sites = {}
    #prepare comscore stuff
    prepare_comscore_lists()
    #get sites from each genre we're concerned about
    print "Processing Adult Sites"; sites['adult'] = get_adult_sites()
    print "Processing Gambling Sites"; sites['gambling'] = get_gambling_sites()
    print "Processing Drugs Sites"; sites['drugs'] = get_drugs_sites()
    print "Processing Alcohol Sites"; sites['alcohol'] = get_alcohol_sites()
    #dump b64 encoded version to file
    with open('sitesb64.json', 'w') as f:
        b64 = dumps(create_base64_version(sites), indent=4)
        f.write(b64)
    #dump md5 encoded version to file
    with open('md5.json', 'w') as f:
        md5 = dumps(create_md5_version(sites), indent=4)
        f.write(md5)
    #dump double encoded version to file
    with open('md5_b64.json', 'w') as f:
        both = dumps(create_md5_b64_version(sites), indent=4)
        f.write(both)
    #dump plaintext version to json file
    with open('sites.json', 'w') as f:
        plaintext = dumps(sites, indent=4)
        f.write(plaintext)