forked from ring04h/wydomain
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcrawlurl.py
37 lines (35 loc) · 1.37 KB
/
crawlurl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from scrapy.selector import Selector
import re,urllib2,requests
#import chardet,codecs
def crawl_chinaz_url():
    """Scrape top.chinaz.com Baidu-rank listings and save the site URLs.

    For each Baidu-rank bucket 4..9, fetches every listing page and appends
    the listed site names/URLs, one per line, to a local file named
    "<rank>.txt".

    Side effects: HTTP requests to top.chinaz.com; appends to local files.
    Raises: propagates urllib2/URLError on network failure and UnicodeDecodeError
    if the page is not gb2312-encoded.
    """
    for rank_num in range(4, 10):
        rank_num_url = "http://top.chinaz.com/list.aspx?baidu=%d" % rank_num
        req = urllib2.Request(rank_num_url)
        # The site serves gb2312-encoded pages; decode before parsing.
        data = urllib2.urlopen(req, timeout=30).read().decode('gb2312')
        hxs = Selector(text=data)
        # The status span looks like "current/total"; keep the part after '/'
        # to get the total page count for this rank bucket.
        pagenum = ''.join(hxs.xpath('//span[@class="status"]/text()').extract())
        pagenum = int(re.sub('(.*)/', "", pagenum))
        # Output file is per rank bucket — invariant across the page loop.
        fname = str(rank_num) + ".txt"
        for i in range(1, pagenum + 1):
            targetUrl = "http://top.chinaz.com/list.aspx?p=%d" % i + "&baidu=%d" % rank_num
            req = urllib2.Request(targetUrl)
            detail_data = urllib2.urlopen(req, timeout=30).read()
            detail_hxs = Selector(text=detail_data)
            rank_url = detail_hxs.xpath('//div[@class="webItemList"]/ul/li/div[@class="info"]/h3/span/text()').extract()
            # Open once per page and close deterministically; the original
            # re-opened the file for every URL and never closed any handle.
            with open(fname, 'a+') as f:
                for url in rank_url:
                    f.write(url)
                    f.write("\n")
def check_url(domain):
    """Return True iff "http://<domain>" answers a HEAD request with HTTP 200.

    Args:
        domain: bare host name (no scheme), e.g. "example.com".
    Returns:
        True on an HTTP 200 response; False on any other status code or on
        any network-level failure (DNS error, refused connection, timeout).
    """
    httpurl = "http://" + domain
    try:
        # Timeout so an unresponsive host cannot hang the caller forever
        # (the original HEAD request had no timeout).
        httpresp = requests.head(httpurl, timeout=10)
    except requests.RequestException:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; only network errors mean "unreachable".
        return False
    return httpresp.status_code == 200