From 5641da02dbfde6e72b9b67a726ca6a3cb01d3e96 Mon Sep 17 00:00:00 2001
From: yuens
Date: Wed, 10 Aug 2016 00:38:36 +0800
Subject: [PATCH] Finish extracting group info (group name, URL, member count,
 intro) from the specified page.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 spider/crawler.py | 43 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/spider/crawler.py b/spider/crawler.py
index 99505b6..a8cf5b6 100755
--- a/spider/crawler.py
+++ b/spider/crawler.py
@@ -80,21 +80,48 @@ def filterBasicInfoList(basicInfoList):
     return basicInfoList
 
 # Find the info [group name, URL, member count, intro] of the top topNGroup groups for the given keywords
-def getGroupsInfoList(queryKeywordsList, topNGroup=10, findGroupUrl="https://www.douban.com/group/search?cat=1019&q=[REPLACEBYQUERY]&sort=relevance"):
+def getGroupsInfoList(queryKeywordsList, topNGroup=10, findGroupUrl="https://www.douban.com/group/search?cat=1019&q=[REPLACEBYQUERY]&sort=relevance", maxGroupsNumForEachPage=20):
+    # Build the search-result URL from the query keywords
     queryString = "+".join(queryKeywordsList)
     queryUrl = findGroupUrl.replace("[REPLACEBYQUERY]", queryString)
-    print queryUrl
+    logging.info("queryUrl:{0}".format(queryUrl))
 
-    request = urllib2.Request(queryUrl)
+    # Send the request and receive the response
     try:
-        # Send the request, receive the response, and convert it to a soup object
+        request = urllib2.Request(queryUrl)
         response = urllib2.urlopen(request)
-        html = response.read()
-        soup = BeautifulSoup(html, "lxml")
-        soupOfGroups = soup.div.div.div["class"]
-        for i in soupOfGroups: print i
     except urllib2.HTTPError, e:
         logging.error("HTTPError code:{0}, URL:{1}".format(e.code, queryUrl))
+        return []
+    except Exception, e:
+        logging.error(e)
+        return []
+
+    # Read the response body and convert it to a soup object
+    html = response.read()
+    soup = BeautifulSoup(html, "lxml")
+
+    # Pick out the key tags that hold the search results
+    #print soup.find("div", id="wrapper").find("div", id="content")
+    resultOfSoup = soup.find_all("div", attrs={'class':"result"})
+
+    # Walk the results and extract the key info [group name, link, member count, intro]
+    groupInfoList = []
+    # for resultIdx in xrange(len(resultOfSoup)):
+    #     try:
+    #         result = resultOfSoup[resultIdx]
+    #         groupName = "".join(list(result.find("div", attrs={'class':"title"}).strings)).strip()
+    #         groupHref = result.a.get("href")
+    #         groupMemberPlain = result.find("div", attrs={'class':"info"}).string
+    #         groupMemberNum = int(re.findall('^[0-9]\d*', groupMemberPlain)[0])  # findall returns a list; take the first match
+    #         groupIntro = result.p.string.strip().replace(" ", "")
+    #         groupInfoList.append((groupName, groupHref, groupMemberNum, groupIntro))  # append a single tuple
+    #     except Exception, e:
+    #         logging.error(e)
+    # return groupInfoList
+
+    # Total hit count, used to compute the number of result pages
+    countPlain = soup.find("span", attrs={"class":"count"}).string
+    countNum = int(re.findall('[0-9]\d*', countPlain)[0])  # first run of digits, not just the first two
+    import math  # belongs with the module-level imports
+    print countNum, math.ceil(float(countNum)/maxGroupsNumForEachPage)
 ################################### PART3 TEST #######################################
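
A minimal sketch of how the countNum computed above could drive per-page crawling. It assumes Douban's group search paginates with a "start" offset query parameter (start=0, 20, 40, ...); that parameter and the helper name buildPageUrls are illustrative assumptions, not anything this patch defines:

    # Sketch only: the "start" pagination parameter and buildPageUrls
    # are assumptions, not part of crawler.py.
    import math

    def buildPageUrls(queryUrl, countNum, maxGroupsNumForEachPage=20):
        # One URL per result page, stepping the offset by the page size
        pageNum = int(math.ceil(float(countNum) / maxGroupsNumForEachPage))
        return [queryUrl + "&start={0}".format(pageIdx * maxGroupsNumForEachPage)
                for pageIdx in xrange(pageNum)]

For example, buildPageUrls(queryUrl, 123) would yield 7 URLs for 123 hits at 20 groups per page, matching the math.ceil computation printed at the end of the hunk.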