## V1.0 author:QT
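"""Scrape second-hand housing listings for one Beijing district from lianjia.com,
geocode each listing through the Baidu map API, and write the results to a CSV file.
"""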
import requests
from bs4 import BeautifulSoup
import csv
import os
import re
from datetime import datetime
## settings
TARGETCITY = '北京市'
TARGETDIST = '海淀区'  # the district you want to crawl (must be a key of bjdistricts)
bjdistricts = {'东城区': 'dongcheng', '西城区': 'xicheng', '朝阳区': 'chaoyang', '海淀区': 'haidian',
               '丰台区': 'fengtai', '石景山区': 'shijingshan', '通州区': 'tongzhou', '昌平区': 'changping',
               '大兴区': 'daxing', '亦庄开发区': 'yizhuangkaifaqu', '顺义区': 'shunyi', '房山区': 'fangshan',
               '门头沟区': 'mentougou', '平谷区': 'pinggu', '怀柔区': 'huairou', '密云区': 'miyun',
               '延庆区': 'yanqing', '燕郊': 'yanjiao', '香河': 'xianghe'}
PAGES = 50  # number of result pages to crawl; check the page count shown at
            # https://bj.lianjia.com/ershoufang/<bjdistricts[TARGETDIST]>/ and adjust
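# e.g. with the defaults above, page 1 of the listings fetched in main() is
# https://bj.lianjia.com/ershoufang/haidian/pg1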
#-----------------------------------------------------------------------------------------------------------------------------------------------##
def getlocation(name):  # query coordinates via the Baidu geocoding API
    bdurl = 'http://api.map.baidu.com/geocoder/v2/?address='
    output = 'xml'  # the <lng>/<lat> tags parsed below only appear in the XML response
    ak = ''  # your Baidu API key
    url = bdurl + name + '&output=' + output + '&ak=' + ak
    res = requests.get(url)
    s = BeautifulSoup(res.text, 'html.parser')
    lng = s.find('lng')
    lat = s.find('lat')
    if lng:
        return lng.get_text() + ',' + lat.get_text()
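# Example (assuming a valid ak is set above; '某小区' stands in for a real estate name):
#   getlocation('北京市 海淀区 某小区') -> a 'lng,lat' string such as '116.29,39.95', or None on failure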
#-----------------------------------------------------------------------------------------------------------------------------------------------##
def getHTMLText(url):
    mheader = {'User-Agent': 'Baiduspider+(+http://www.baidu.com/search/spider.htm)'}
    # mheader = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}  # alternative: mimic a desktop browser
    try:
        r = requests.get(url, headers=mheader, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return None
#-----------------------------------------------------------------------------------------------------------------------------------------------##
def fillHomeList(hlist, html):
    soup = BeautifulSoup(html, 'html.parser')
    for ele in soup.find_all('li', class_='clear'):
        houseInfoTag = ele.find('div', class_='houseInfo')
        str_houseInfo = houseInfoTag.get_text().replace(' ', '')
        district_number = houseInfoTag.a.get('href').split('/')[-2]
        house_code = ele.a.get('data-housecode')
        name_houseInfo = houseInfoTag.a.string
        style_houseInfo = re.search(r'(\d室\d厅)|(\d房间\d卫)', str_houseInfo).group()
        area_houseInfo = re.search(r'/[0-9.]*平米/', str_houseInfo).group()[1:-3]
        dire_houseInfo = re.search(r'/[南北东西]+/', str_houseInfo).group()[1:-1]
        decro_houseInfo = re.search(r'(精装|简装|毛坯|其他)(/[有无]电梯)?', str_houseInfo).group()
        app_positionInfo = ele.find('div', class_='positionInfo').get_text().split('/')
        # add longitude/latitude from the Baidu API
        lnglat = getlocation(TARGETCITY + ' ' + TARGETDIST + ' ' + app_positionInfo[2] + ' ' + name_houseInfo)
        totalPrice = ele.find('div', class_='priceInfo').span.string
        fol_followInfo = ele.find('div', class_='followInfo').get_text().split('/')[0][:-3]
        visit_followInfo = ele.find('div', class_='followInfo').get_text().split('/')[1].split('次')[0]
        avgPrice = round(10000 * float(totalPrice) / float(area_houseInfo))
        # row layout: district No., house code, houseInfo [name, style, area(m2), direction, decoration],
        # positionInfo [floor, year, address], total price, average price, follows, visits, 'lng,lat'
        hlist.append([district_number, house_code, name_houseInfo, style_houseInfo, area_houseInfo,
                      dire_houseInfo, decro_houseInfo] + app_positionInfo +
                     [totalPrice, avgPrice, fol_followInfo, visit_followInfo, lnglat])
    for li in hlist:
        print(li)
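# Note: the regexes in fillHomeList assume houseInfo text shaped roughly like
# '小区名/2室1厅/89.5平米/南北/精装/有电梯' (the listing markup lianjia.com served when this
# was written); re.search(...).group() raises AttributeError if the page layout changes.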
#-----------------------------------------------------------------------------------------------------------------------------------------------##
def main():
    homelist = []
    CSVheaders = ['小区编号', '房屋编号', '小区名', '户型', '面积(m2)', '朝向', '装修', '楼层',
                  '年代', '区位', '总价(万)', '均价(元/m2)', '关注(人)', '带看(次)', '坐标']
    myroot = './csv/'
    # no ':' in the timestamp, so the file name is also valid on Windows
    path = myroot + datetime.now().strftime('%Y%m%d_%H%M%S') + '_' + bjdistricts[TARGETDIST] + '.csv'
    try:
        if not os.path.exists(myroot):
            os.mkdir(myroot)
        if not os.path.exists(path):
            with open(path, 'w', encoding='utf-8', newline='') as cf:  # use encoding='gb18030' for Excel on Chinese Windows
                writers = csv.DictWriter(cf, CSVheaders)
                writers.writeheader()
                print("start to connect 'lianjia.com'")
                for i in range(1, PAGES + 1):
                    url = 'https://bj.lianjia.com/ershoufang/' + bjdistricts[TARGETDIST] + '/pg' + str(i)
                    html = getHTMLText(url)
                    if html is None:
                        raise NameError  # reused as a sentinel for a failed download
                    fillHomeList(homelist, html)
                    for row in homelist:
                        writers.writerow(dict(zip(CSVheaders, row)))  # write each row dict into the CSV file
                    homelist = []
                    print("{0}/{1} completed!".format(i, PAGES))
                print("Done!")
        else:
            print("Already exists.")
    except NameError:
        print('getHTMLText returned an empty page')
    except Exception as e:
        print("Error!", e)

if __name__ == '__main__':
    main()
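# Usage: set TARGETDIST and PAGES above, fill in your Baidu ak in getlocation(),
# then run `python spider_lianjia.py`; results are written to ./csv/<timestamp>_<district>.csv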