From 5641da02dbfde6e72b9b67a726ca6a3cb01d3e96 Mon Sep 17 00:00:00 2001
From: yuens
Date: Wed, 10 Aug 2016 00:38:36 +0800
Subject: [PATCH] Finish extracting group info (group name, URL, member count,
 intro) from the specified page.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 spider/crawler.py | 43 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/spider/crawler.py b/spider/crawler.py
index 99505b6..a8cf5b6 100755
--- a/spider/crawler.py
+++ b/spider/crawler.py
@@ -80,21 +80,48 @@ def filterBasicInfoList(basicInfoList):
     return basicInfoList
 
 # Find the info [group name, URL, member count, intro] of the top topNGroup groups for the given keywords
-def getGroupsInfoList(queryKeywordsList, topNGroup=10, findGroupUrl="https://www.douban.com/group/search?cat=1019&q=[REPLACEBYQUERY]&sort=relevance"):
+def getGroupsInfoList(queryKeywordsList, topNGroup=10, findGroupUrl="https://www.douban.com/group/search?cat=1019&q=[REPLACEBYQUERY]&sort=relevance", maxGroupsNumForEachPage=20):
+    # Build the search-result URL from the query keywords
     queryString = "+".join(queryKeywordsList)
     queryUrl = findGroupUrl.replace("[REPLACEBYQUERY]", queryString)
-    print queryUrl
+    logging.info("queryUrl:{0}".format(queryUrl))
 
-    request = urllib2.Request(queryUrl)
+    # Send the request and receive the response
     try:
-        # Send the request, receive the response, and convert it to a soup object
+        request = urllib2.Request(queryUrl)
         response = urllib2.urlopen(request)
-        html = response.read()
-        soup = BeautifulSoup(html, "lxml")
-        soupOfGroups = soup.div.div.div["class"]
-        for i in soupOfGroups: print i
     except urllib2.HTTPError, e:
         logging.error("HTTPError code:{0}, URL:{1}".format(e.code, queryUrl))
+        return []
+    except Exception, e:
+        logging.error(e)
+        return []
+
+    # Read the response body and convert it to a soup object
+    html = response.read()
+    soup = BeautifulSoup(html, "lxml")
+
+    # Pick out the key tags that hold the search results
+    #print soup.find("div", id="wrapper").find("div", id="content")
+    resultOfSoup = soup.find_all("div", attrs={'class':"result"})
+
+    # Walk the results and extract the key info [group name, link, member count, intro]
+    groupInfoList = []
+    # for resultIdx in xrange(len(resultOfSoup)):
+    #     try:
+    #         result = resultOfSoup[resultIdx]
+    #         groupName = "".join(list(result.find("div", attrs={'class':"title"}).strings)).strip()
+    #         groupHref = result.a.get("href")
+    #         groupMemberPlain = result.find("div", attrs={'class':"info"}).string
+    #         groupMemberNum = int(re.findall('^[0-9]\d*', groupMemberPlain)[0])  # findall returns a list; take the first match
+    #         groupIntro = result.p.string.strip().replace(" ", "")
+    #         groupInfoList.append((groupName, groupHref, groupMemberNum, groupIntro))  # append a single tuple
+    #     except Exception, e:
+    #         logging.error(e)
+    # return groupInfoList
+
+    # Total hit count, used to compute the number of result pages
+    countPlain = soup.find("span", attrs={"class":"count"}).string
+    countNum = int(re.findall('[0-9]\d*', countPlain)[0])  # first run of digits, not just the first two
+    import math  # belongs with the module-level imports
+    print countNum, math.ceil(float(countNum)/maxGroupsNumForEachPage)
 ################################### PART3 TEST #######################################
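
A minimal sketch of how the countNum computed above could drive per-page crawling. It assumes Douban's group search paginates with a "start" offset query parameter (start=0, 20, 40, ...); that parameter and the helper name buildPageUrls are illustrative assumptions, not anything this patch defines:

    # Sketch only: the "start" pagination parameter and buildPageUrls
    # are assumptions, not part of crawler.py.
    import math

    def buildPageUrls(queryUrl, countNum, maxGroupsNumForEachPage=20):
        # One URL per result page, stepping the offset by the page size
        pageNum = int(math.ceil(float(countNum) / maxGroupsNumForEachPage))
        return [queryUrl + "&start={0}".format(pageIdx * maxGroupsNumForEachPage)
                for pageIdx in xrange(pageNum)]

For example, buildPageUrls(queryUrl, 123) would yield 7 URLs for 123 hits at 20 groups per page, matching the math.ceil computation printed at the end of the hunk.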