Finish extracting group info (group name, URL, member count, intro) from the specified page.
ysh329 committed Aug 9, 2016
1 parent 94bb0e0 commit 5641da0
Showing 1 changed file with 35 additions and 8 deletions.
43 changes: 35 additions & 8 deletions spider/crawler.py
@@ -80,21 +80,48 @@ def filterBasicInfoList(basicInfoList):
    return basicInfoList

# Find the top-topNGroup groups matching the given keywords and return their info [name, URL, member count, intro]
def getGroupsInfoList(queryKeywordsList, topNGroup=10, findGroupUrl="https://www.douban.com/group/search?cat=1019&q=[REPLACEBYQUERY]&sort=relevance", maxGroupsNumForEachPage=20):
    # Build the query URL for the search keywords
    queryString = "+".join(queryKeywordsList)
    queryUrl = findGroupUrl.replace("[REPLACEBYQUERY]", queryString)
    logging.info("queryUrl:{0}".format(queryUrl))

    # Send the request and parse the response
    try:
        # The response body can only be read once, so issue the request,
        # read it, and build the soup object all inside the try block
        request = urllib2.Request(queryUrl)
        response = urllib2.urlopen(request)
        html = response.read()
        soup = BeautifulSoup(html, "lxml")
    except urllib2.HTTPError, e:
        logging.error("HTTPError code:{0}, URL:{1}".format(e.code, queryUrl))
        return []
    except Exception, e:
        logging.error(e)
        return []

    # Pull out the result blocks, one per matching group
    #print soup.find("div", id="wrapper").find("div", id="content")
    resultOfSoup = soup.find_all("div", attrs={'class':"result"})

    # Walk the results and extract the key fields [name, URL, member count, intro]
    groupInfoList = []
    # Draft extraction loop, still commented out in this commit:
    # for resultIdx in xrange(len(resultOfSoup)):
    #     try:
    #         result = resultOfSoup[resultIdx]
    #         groupName = "".join(list(result.find("div", attrs={'class':"title"}).strings)).strip()
    #         groupHref = result.a.get("href")
    #         groupMemberPlain = result.find("div", attrs={'class':"info"}).string
    #         groupMemberNum = int(re.findall('^[0-9]\d*', groupMemberPlain)[0])
    #         groupIntro = result.p.string.strip().replace(" ", "")
    #         groupInfoList.append((groupName, groupHref, groupMemberNum, groupIntro))
    #     except Exception, e:
    #         logging.error(e)
    # return groupInfoList
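    # A sketch of the member-count parse above, assuming a hypothetical info
    # string of the form "12345 人聚集在这个小组":
    #   re.findall('^[0-9]\d*', groupMemberPlain)[0] -> "12345", int(...) -> 12345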
    # Total result count drives how many pages must be crawled
    countPlain = soup.find("span", attrs={"class":"count"}).string
    countNum = int(re.findall('\d+', countPlain)[0])
    import math  # better hoisted to the module-level imports
    print countNum, math.ceil(float(countNum)/maxGroupsNumForEachPage)
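    # Worked example of the page math (numbers hypothetical): with
    # countNum = 95 and maxGroupsNumForEachPage = 20,
    # math.ceil(float(95)/20) = 5.0, i.e. five result pages to crawl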


################################### PART3 TEST #######################################
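# A hedged usage sketch for getGroupsInfoList; in this commit the function
# does not yet return groupInfoList, so the unpacking below assumes the
# commented-out return is restored:
#   groupsInfoList = getGroupsInfoList(["python"], topNGroup=10)
#   for groupName, groupHref, groupMemberNum, groupIntro in groupsInfoList:
#       print groupName, groupHref, groupMemberNum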

