Finish the function getGroupCategoryUrlList (still to be tested) and part of the function getGroupInfoListForEachPage.
ysh329 committed Aug 12, 2016
1 parent 731f6d8 commit 527d31e
161 changes: 86 additions & 75 deletions spider/crawler.py
@@ -27,11 +27,11 @@
 ################################### PART2 CLASS && FUNCTION ###########################

 # Get the list of post info entries from a group's discussion index page
-def getBasicInfoList(listUrl):
+def getPostsBasicInfoList(discussionUrlOfGroup):
     basicInfoList = []

     # Send the request, receive the response, convert it to a soup object, then extract and save the results
-    request = urllib2.Request(listUrl)
+    request = urllib2.Request(discussionUrlOfGroup)
     try:
         # Issue the request, receive the response, and convert it to a soup object
         response = urllib2.urlopen(request)
@@ -76,103 +76,114 @@ def getBasicInfoList(listUrl):
     return basicInfoList

 # Filter useless entries out of the basic info list
-def filterBasicInfoList(basicInfoList):
+def filterPostsBasicInfoList(basicInfoList):
     pass
     return basicInfoList


+# Compute the group-info listing page URLs from the number of groups needed
+def getGroupCategoryUrlList(queryKeywordsList, topNGroup=10,
+                            findGroupUrl="https://www.douban.com/group/search?start=0&cat=1019&q=[REPLACEBYQUERY]&sort=relevance",
+                            maxGroupsNumForEachPage=20):
+    # Build the query result URL
+    queryString = "+".join(queryKeywordsList)
+    queryUrl = findGroupUrl.replace("[REPLACEBYQUERY]", queryString)
+    logging.info("queryUrl:{0}".format(queryUrl))
+
+    # Send the request and receive the response; bail out early on failure
+    try:
+        request = urllib2.Request(queryUrl)
+        response = urllib2.urlopen(request)
+    except urllib2.HTTPError, e:
+        logging.error("HTTPError code:{0}, URL:{1}".format(e.code, queryUrl))
+        return []
+    except Exception, e:
+        logging.error(e)
+        return []
+
+    # Read the response body and convert it to a soup object
+    html = response.read()
+    soup = BeautifulSoup(html, "lxml")
+
+    # Parse the total result count and derive the number of listing pages
+    countPlain = soup.find("span", attrs={"class": "count"}).string
+    countNum = int(re.findall(r'\d+', countPlain)[0])
+    maxPageNum = int(math.ceil(float(countNum) / maxGroupsNumForEachPage))
+    pageStartGroupIdxList = map(lambda pageIdx: str(pageIdx * maxGroupsNumForEachPage), xrange(maxPageNum))
+    groupCategoryUrlList = map(lambda start: queryUrl.replace("start=0", "start=" + start), pageStartGroupIdxList)
+    # Keep only as many pages as are needed to cover topNGroup groups
+    if topNGroup < len(groupCategoryUrlList) * maxGroupsNumForEachPage:
+        groupCategoryUrlList = groupCategoryUrlList[:int(math.ceil(float(topNGroup) / maxGroupsNumForEachPage))]
+    return groupCategoryUrlList


+# Extract the key tag sections
+def getGroupInfoListForEachPage(groupCategoryUrl):
+    # Send the request and receive the response; bail out early on failure
+    try:
+        request = urllib2.Request(groupCategoryUrl)
+        response = urllib2.urlopen(request)
+    except urllib2.HTTPError, e:
+        logging.error("HTTPError code:{0}, URL:{1}".format(e.code, groupCategoryUrl))
+        return []
+    except Exception, e:
+        logging.error(e)
+        return []
+
+    # Read the response body and convert it to a soup object
+    html = response.read()
+    soup = BeautifulSoup(html, "lxml")
+
+    # print soup.find("div", id="wrapper").find("div", id="content")
+    resultOfSoup = soup.find_all("div", attrs={'class': "result"})
+
+    # Iterate and extract the key info [group name, link, member count, intro]
+    groupInfoList = []
+    # for resultIdx in xrange(len(resultOfSoup)):
+    #     try:
+    #         result = resultOfSoup[resultIdx]
+    #         groupName = "".join(list(result.find("div", attrs={'class': "title"}).strings)).strip()
+    #         groupHref = result.a.get("href")
+    #         groupMemberPlain = result.find("div", attrs={'class': "info"}).string
+    #         groupMemberNum = int(re.findall(r'^\d+', groupMemberPlain)[0])  # findall returns a list
+    #         groupIntro = result.p.string.strip().replace(" ", "")
+    #         groupInfoList.append((groupName, groupHref, groupMemberNum, groupIntro))  # append a single tuple
+    #     except Exception, e:
+    #         logging.error(e)
+    # return groupInfoList

 # Look up info [group name, URL, member count, intro] for the top topNGroup groups matching the query keywords
 def getGroupsInfoList(queryKeywordsList, topNGroup=10, findGroupUrl="https://www.douban.com/group/search?start=0&cat=1019&q=[REPLACEBYQUERY]&sort=relevance", maxGroupsNumForEachPage=20):

-    # Compute the group-info listing page URLs from the number of groups needed
-    def getGroupCategoryUrlList(queryKeywordsList, topNGroup=10, findGroupUrl="https://www.douban.com/group/search?start=0&cat=1019&q=[REPLACEBYQUERY]&sort=relevance", maxGroupsNumForEachPage=20):
-        # Build the query result URL
-        queryString = "+".join(queryKeywordsList)
-        queryUrl = findGroupUrl.replace("[REPLACEBYQUERY]", queryString)
-        logging.info("queryUrl:{0}".format(queryUrl))
-
-        # Send the request and receive the response
-        try:
-            request = urllib2.Request(queryUrl)
-            response = urllib2.urlopen(request)
-        except urllib2.HTTPError, e:
-            logging.error("HTTPError code:{0}, URL:{1}".format(e.code, queryUrl))
-        except Exception, e:
-            logging.error(e)
-
-        # Read the response body and convert it to a soup object
-        html = response.read()
-        soup = BeautifulSoup(html, "lxml")
+    groupCategoryUrlList = getGroupCategoryUrlList(queryKeywordsList, topNGroup, findGroupUrl, maxGroupsNumForEachPage)
+    for groupCategoryUrlIdx in xrange(len(groupCategoryUrlList)):
+        groupCategoryUrl = groupCategoryUrlList[groupCategoryUrlIdx]
+        getGroupInfoListForEachPage(groupCategoryUrl)

-        countPlain = soup.find("span", attrs={"class":"count"}).string
-        countNum = int(re.findall('[0-9]\d', countPlain)[0])
-        maxPageNum = int(math.ceil(float(countNum)/maxGroupsNumForEachPage))
-        pageStartGroupIdxList = map(lambda pageIdx: str(pageIdx*10), xrange(maxPageNum))
-        groupCategoryUrlList = map(lambda start: findGroupUrl.replace("start=0", "start="+start), pageStartGroupIdxList)
-        if topNGroup < len(groupCategoryUrlList)*maxGroupsNumForEachPage:
-            groupCategoryUrlList = groupCategoryUrlList[:topNGroup/maxGroupsNumForEachPage+1]
-        return groupCategoryUrlList


-    # Extract the key tag sections
-    def getGroupInfoListForEachPage(groupCategoryUrl):
-        # Send the request and receive the response
-        try:
-            request = urllib2.Request(groupCategoryUrl)
-            response = urllib2.urlopen(request)
-        except urllib2.HTTPError, e:
-            logging.error("HTTPError code:{0}, URL:{1}".format(e.code, queryUrl))
-        except Exception, e:
-            logging.error(e)
-
-        # Read the response body and convert it to a soup object
-        html = response.read()
-        soup = BeautifulSoup(html, "lxml")
-
-        #print soup.find("div", id="wrapper").find("div", id="content")
-        resultOfSoup = soup.find_all("div", attrs={'class':"result"})
-
-        # Iterate and extract the key info [group name, link, member count, intro]
-        groupInfoList = []
-        # for resultIdx in xrange(len(resultOfSoup)):
-        #     try:
-        #         result = resultOfSoup[resultIdx]
-        #         groupName = "".join(list(result.find("div", attrs={'class':"title"}).strings)).strip()
-        #         groupHref = result.a.get("href")
-        #         groupMemberPlain = result.find("div", attrs={'class':"info"}).string
-        #         groupMemberNum = int(re.findall('^[0-9]\d*', groupMemberPlain))
-        #         groupIntro = result.p.string.strip().replace(" ", "").replace("", "")
-        #         groupInfoList.append(groupName, groupHref, groupMemberNum, groupIntro)
-        #     except Exception, e:
-        #         logging.error(e)
-        # return groupInfoList



################################### PART3 TEST #######################################

 # Initialize parameters
-listUrl = "https://www.douban.com/group/HZhome/discussion?start=0"
+discussionUrlOfGroup = "https://www.douban.com/group/HZhome/discussion?start=0"
 queryKeywordsList = ["杭州", "租房"]  # query keywords: "Hangzhou", "apartment rental"

 # Initialize the environment
 logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

+'''

 # Run the crawl
-basicInfoList = getBasicInfoList(listUrl)
-filteredBasicInfoList = filterBasicInfoList(basicInfoList)
+postsBasicInfoList = getPostsBasicInfoList(discussionUrlOfGroup)
+#filteredBasicInfoList = filterPostsBasicInfoList(postsBasicInfoList)

 # Print the results
-for basicInfoIdx in xrange(len(filteredBasicInfoList)):
-    title = basicInfoList[basicInfoIdx][0]
-    href = basicInfoList[basicInfoIdx][1]
-    userName = basicInfoList[basicInfoIdx][2]
-    doubanId = basicInfoList[basicInfoIdx][3]
-    userLink = basicInfoList[basicInfoIdx][4]
-    commentNum = basicInfoList[basicInfoIdx][5]
-    lastTime = basicInfoList[basicInfoIdx][6]
+for basicInfoIdx in xrange(len(postsBasicInfoList)):
+    title = postsBasicInfoList[basicInfoIdx][0]
+    href = postsBasicInfoList[basicInfoIdx][1]
+    userName = postsBasicInfoList[basicInfoIdx][2]
+    doubanId = postsBasicInfoList[basicInfoIdx][3]
+    userLink = postsBasicInfoList[basicInfoIdx][4]
+    commentNum = postsBasicInfoList[basicInfoIdx][5]
+    lastTime = postsBasicInfoList[basicInfoIdx][6]

     logging.info("idx:{0}, title:{1}, href:{2}, userName:{3}, doubanId:{4}, userLink:{5}, commentNum:{6}, lastTime:{7}".format(basicInfoIdx+1, title, href, userName, doubanId, userLink, commentNum, lastTime))
+'''


+getGroupsInfoList(queryKeywordsList)
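
For a quick sanity check of the page arithmetic inside getGroupCategoryUrlList, the sketch below runs it in isolation; the result count of 53 and the query string "test" are made-up example values, not data fetched from Douban.

# Minimal standalone sketch of getGroupCategoryUrlList's pagination arithmetic.
# countNum = 53 is a hypothetical result count, not taken from the live site.
import math

queryUrl = "https://www.douban.com/group/search?start=0&cat=1019&q=test&sort=relevance"
maxGroupsNumForEachPage = 20
countNum = 53

maxPageNum = int(math.ceil(float(countNum) / maxGroupsNumForEachPage))  # ceil(53/20) = 3
for pageIdx in xrange(maxPageNum):
    start = str(pageIdx * maxGroupsNumForEachPage)
    print(queryUrl.replace("start=0", "start=" + start))
# -> the same query URL with start=0, start=20, start=40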

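Similarly, the extraction loop that is still commented out in getGroupInfoListForEachPage can be dry-run against a static snippet. In the sketch below the HTML is fabricated for illustration, and the class names (result, title, info) mirror the selectors in the commented code rather than verified live Douban markup.

# Dry-run of the intended [group name, href, member count, intro] extraction.
# sampleHtml is made up for illustration; the real Douban markup may differ.
import re
from bs4 import BeautifulSoup

sampleHtml = """
<div class="result">
  <div class="title"><a href="https://www.douban.com/group/HZhome/">HZhome</a></div>
  <div class="info">12345 members</div>
  <p>A hypothetical one-line group introduction.</p>
</div>
"""

soup = BeautifulSoup(sampleHtml, "lxml")
groupInfoList = []
for result in soup.find_all("div", attrs={'class': "result"}):
    groupName = "".join(result.find("div", attrs={'class': "title"}).strings).strip()
    groupHref = result.a.get("href")                               # first <a> inside the result
    groupMemberPlain = result.find("div", attrs={'class': "info"}).string
    groupMemberNum = int(re.findall(r'\d+', groupMemberPlain)[0])  # findall returns a list of strings
    groupIntro = result.p.string.strip()
    groupInfoList.append((groupName, groupHref, groupMemberNum, groupIntro))

print(groupInfoList)
# -> [('HZhome', 'https://www.douban.com/group/HZhome/', 12345, 'A hypothetical one-line group introduction.')]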