forked from sherlock-project/sherlock
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdate_site_data.py
executable file
·118 lines (96 loc) · 4.01 KB
/
update_site_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python3
"""Maigret: Supported Site Listing with Alexa ranking and country tags
This module generates the listing of supported sites in file `SITES.md`
and pretty prints file with sites data.
"""
import json
import sys
import requests
import logging
import threading
import xml.etree.ElementTree as ET
from datetime import datetime
from argparse import ArgumentParser, RawDescriptionHelpFormatter
RANKS = {str(i):str(i) for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 500]}
RANKS.update({
'1000': '1K',
'5000': '5K',
'10000': '10K',
'100000': '100K',
'10000000': '1M',
})
def get_rank(domain_to_query, dest, print_errors=True):
#Retrieve ranking data via alexa API
url = f"http://data.alexa.com/data?cli=10&url={domain_to_query}"
xml_data = requests.get(url).text
root = ET.fromstring(xml_data)
try:
#Get ranking for this site.
dest['rank'] = int(root.find('.//REACH').attrib['RANK'])
country = root.find('.//COUNTRY')
if not country is None and country.attrib:
country_code = country.attrib['CODE']
tags = set(dest.get('tags', []))
if country_code:
tags.add(country_code.lower())
dest['tags'] = sorted(list(tags))
except Exception as e:
if print_errors:
logging.error(e)
# We did not find the rank for some reason.
print(f"Error retrieving rank information for '{domain_to_query}'")
print(f" Returned XML is |{xml_data}|")
return
def get_step_rank(rank):
def get_readable_rank(r):
return RANKS[str(r)]
valid_step_ranks = sorted(map(int, RANKS.keys()))
if rank == 0:
return get_readable_rank(valid_step_ranks[-1])
else:
return get_readable_rank(list(filter(lambda x: x >= rank, valid_step_ranks))[0])
if __name__ == '__main__':
parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter
)
parser.add_argument("--base","-b", metavar="BASE_FILE",
dest="base_file", default="maigret/resources/data.json",
help="JSON file with sites data to update.")
pool = list()
args = parser.parse_args()
with open(args.base_file, "r", encoding="utf-8") as data_file:
data = json.load(data_file)
with open("sites.md", "w") as site_file:
data_length = len(data)
site_file.write(f"""
## List of supported sites: total {data_length}\n
Rank data fetched from Alexa by domains.
""")
for social_network in data:
url_main = data.get(social_network).get("urlMain")
data.get(social_network)["rank"] = 0
th = threading.Thread(target=get_rank, args=(url_main, data.get(social_network)))
pool.append((social_network, url_main, th))
th.start()
index = 1
for social_network, url_main, th in pool:
th.join()
sys.stdout.write("\r{0}".format(f"Updated {index} out of {data_length} entries"))
sys.stdout.flush()
index = index + 1
sites_full_list = [(site, site_data['rank']) for site, site_data in data.items()]
sites_full_list.sort(reverse=False, key=lambda x: x[1])
while sites_full_list[0][1] == 0:
site = sites_full_list.pop(0)
sites_full_list.append(site)
for num, site_tuple in enumerate(sites_full_list):
site, rank = site_tuple
url_main = data[site]['urlMain']
valid_rank = get_step_rank(rank)
all_tags = data[site].get('tags', [])
tags = ', ' + ', '.join(all_tags) if all_tags else ''
site_file.write(f'1. [{site}]({url_main})*: top {valid_rank}{tags}*\n')
site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n')
sorted_json_data = json.dumps(data, indent=2, sort_keys=True)
with open(args.base_file, "w") as data_file:
data_file.write(sorted_json_data)
print("\nFinished updating supported site listing!")