Finish extracting group info (group name, URL, member count, intro) from the specified page.
ysh329 committed Aug 9, 2016
1 parent 94bb0e0 commit 5641da0
Showing 1 changed file with 35 additions and 8 deletions.
43 changes: 35 additions & 8 deletions spider/crawler.py
@@ -80,21 +80,48 @@ def filterBasicInfoList(basicInfoList):
    return basicInfoList

# Find the top-topNGroup groups matching the given keywords and return their info [name, URL, member count, intro]
def getGroupsInfoList(queryKeywordsList, topNGroup=10, findGroupUrl="https://www.douban.com/group/search?cat=1019&q=[REPLACEBYQUERY]&sort=relevance", maxGroupsNumForEachPage=20):
    # Build the query URL for the search keywords
    queryString = "+".join(queryKeywordsList)
    queryUrl = findGroupUrl.replace("[REPLACEBYQUERY]", queryString)
    logging.info("queryUrl:{0}".format(queryUrl))

    # Send the request and parse the response
    try:
        # The response body can only be read once, so issue the request,
        # read it, and build the soup object all inside the try block
        request = urllib2.Request(queryUrl)
        response = urllib2.urlopen(request)
        html = response.read()
        soup = BeautifulSoup(html, "lxml")
    except urllib2.HTTPError, e:
        logging.error("HTTPError code:{0}, URL:{1}".format(e.code, queryUrl))
        return []
    except Exception, e:
        logging.error(e)
        return []

    # Pull out the result blocks, one per matching group
    #print soup.find("div", id="wrapper").find("div", id="content")
    resultOfSoup = soup.find_all("div", attrs={'class':"result"})

    # Walk the results and extract the key fields [name, URL, member count, intro]
    groupInfoList = []
    # Draft extraction loop, still commented out in this commit:
    # for resultIdx in xrange(len(resultOfSoup)):
    #     try:
    #         result = resultOfSoup[resultIdx]
    #         groupName = "".join(list(result.find("div", attrs={'class':"title"}).strings)).strip()
    #         groupHref = result.a.get("href")
    #         groupMemberPlain = result.find("div", attrs={'class':"info"}).string
    #         groupMemberNum = int(re.findall('^[0-9]\d*', groupMemberPlain)[0])
    #         groupIntro = result.p.string.strip().replace(" ", "")
    #         groupInfoList.append((groupName, groupHref, groupMemberNum, groupIntro))
    #     except Exception, e:
    #         logging.error(e)
    # return groupInfoList
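    # A sketch of the member-count parse above, assuming a hypothetical info
    # string of the form "12345 人聚集在这个小组":
    #   re.findall('^[0-9]\d*', groupMemberPlain)[0] -> "12345", int(...) -> 12345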
    # Total result count drives how many pages must be crawled
    countPlain = soup.find("span", attrs={"class":"count"}).string
    countNum = int(re.findall('\d+', countPlain)[0])
    import math  # better hoisted to the module-level imports
    print countNum, math.ceil(float(countNum)/maxGroupsNumForEachPage)
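    # Worked example of the page math (numbers hypothetical): with
    # countNum = 95 and maxGroupsNumForEachPage = 20,
    # math.ceil(float(95)/20) = 5.0, i.e. five result pages to crawl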


################################### PART3 TEST #######################################
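# A hedged usage sketch for getGroupsInfoList; in this commit the function
# does not yet return groupInfoList, so the unpacking below assumes the
# commented-out return is restored:
#   groupsInfoList = getGroupsInfoList(["python"], topNGroup=10)
#   for groupName, groupHref, groupMemberNum, groupIntro in groupsInfoList:
#       print groupName, groupHref, groupMemberNum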

