step-6.py
import asyncio
import itertools
import json
import string

import aiohttp
from pyquery import PyQuery


async def scrape_page(session, url):
    # Download the page, then pull the company listings out of its table.
    async with session.get(url) as resp:
        content = await resp.text()

    print('parsing url: {}'.format(url))
    doc = PyQuery(content)
    doc.make_links_absolute(base_url=url)

    table = doc('#rz-main-container section:eq(1) .WriteSmallTableTop table:eq(1)')

    results = []
    for row in table.items('tr:gt(0)'):  # skip the header row
        company_col = row('td').eq(0)
        phone_col = row('td').eq(1)
        website_col = row('td').eq(2)

        company = {
            'name': company_col.text(),
            'phone': phone_col.text(),
            'url': website_col('a').attr('href'),
            'details_url': company_col('a').attr('href'),
        }
        results.append(company)

    return results


async def scrape_urls():
    # One index page for each of the letters a-v, plus a combined 'wxyz' page.
    letters_and_nums = list(string.ascii_lowercase[:22]) + ['wxyz']
    print(letters_and_nums)

    urls = ['https://www.rigzone.com/search/alpha/{}/'.format(x) for x in letters_and_nums]

    # The session must be entered with 'async with' and is shared by all scrapers.
    async with aiohttp.ClientSession() as session:
        scrapers = [scrape_page(session, url) for url in urls]
        done, pending = await asyncio.wait(scrapers)
        # Flatten the per-page result lists into a single list of companies.
        return list(itertools.chain.from_iterable(task.result() for task in done))


def save_to_file(fname, data):
    with open(fname, 'w') as fh:
        json.dump(data, fh, indent=4, sort_keys=True)


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    data = loop.run_until_complete(scrape_urls())
    print(data[0])
    save_to_file('companies-6.json', data)