Work in progress: fetching data for POST-table fields such as [author comments, author comment count].
ysh329 committed Aug 14, 2016
1 parent 9efea23 commit 15cf432
Showing 2 changed files with 151 additions and 46 deletions.
27 changes: 19 additions & 8 deletions README.md
100644 → 100755
@@ -44,36 +44,46 @@
##### Source and group basics
|Field|Type|Meaning|Example|
|:-|:-:|:-|:-|
|GROUP_SOURCE|VARCHAR(10)|Group source|"douban" or "tieba"|
|GROUP_URL|TEXT|Group URL|"https://www.douban.com/group/551307/"|
|GROUP_ID|VARCHAR(20)|Site-wide unique group ID at the source|"hangzhougonglue"|
|GROUP_NAME|VARCHAR(30)|Group name|"杭州旅游"|

##### Post basics
|Field|Type|Meaning|Example|
|:-|:-:|:-|:-|
|POST_URL|TEXT|Post URL|"https://www.douban.com/group/topic/88272843/"|
|POST_TITLE|TEXT|Post title|"This is a title"|
|POST_ID|VARCHAR(10)|Unique post ID|"850407300"|
|POST_CREATE_DATE|VARCHAR(19)|Post creation time|"2014-08-10 16:58:21"|
|POST_LAST_COMMENT_DATE|VARCHAR(16)|Time of the last comment on the post|"2015-08-13 15:22"|
|POST_COMMENT_NUM|INT|Number of comments|10|
|POST_LIKE_NUM|INT|Number of likes|10|

##### Post author basics
|Field|Type|Meaning|Example|
|:-|:-:|:-|:-|
|POST_AUTHOR_NAME|VARCHAR(50)|Post author's name|"章小希"|
|POST_AUTHOR_ID|VARCHAR(10)|Author's site-wide unique ID|"148647315"|
|POST_AUTHOR_SIGNATURE|TEXT|Signature|"目标,前进;一切只为生活"|
|POST_AUTHOR_URL|TEXT|Author's profile URL|"https://www.douban.com/people/148647315/"|

##### Content and comments
|Field|Type|Meaning|Example|
|:-|:-:|:-|:-|
|POST_CONTENT|TEXT|Post body|"This is the post content"|
|POST_IMG_NUM|INT|Number of images|3|
|POST_IMG_URL_LIST|TEXT|All image URLs joined into one string with "\t"|"www.1.com/1.png\twww.1.com/2.jpg"|
|POST_AUTHOR_COMMENT|TEXT|All of the post author's comments, concatenated|"Comments 1+2+3 joined into one string"|
|POST_AUTHOR_COMMENT_NUM|INT|Number of the post author's comments|10|
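
A minimal sketch of how the two concatenated fields above are assembled, mirroring the joining logic in spider/crawler.py (the variable names here are illustrative only):

```python
# POST_IMG_URL_LIST: all image URLs joined with "\t"
postImgUrlList = ["http://www.1.com/1.png", "http://www.1.com/2.jpg"]
postImgUrlListStr = "\t".join(postImgUrlList)

# POST_AUTHOR_COMMENT / POST_AUTHOR_COMMENT_NUM: the author's comments, concatenated and counted
authorCommentList = ["comment 1", "comment 2", "comment 3"]
postAuthorComment = "".join(authorCommentList)
postAuthorCommentNum = len(authorCommentList)
```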

##### Fields of interest (require extraction)
|Field|Type|Meaning|Example|
|:-|:-:|:-|:-|
|POST_CONTENT_QQ|VARCHAR(12)|QQ number found in the post body|"12345"|
|POST_CONTENT_WECHAT|VARCHAR(16)|WeChat ID found in the post body|"12345"|
|POST_CONTENT_TEL|VARCHAR(15)|Phone number found in the post body|"13312345678"|
|POST_CONTENT_ADDRESS|VARCHAR(30)|Address found in the post body|"北京市海淀区"|
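
These fields do not appear verbatim on the page; they have to be pulled out of POST_CONTENT. A rough sketch of what that extraction could look like (the patterns below are illustrative assumptions, not this project's actual rules; address extraction would also need a place-name dictionary, so it is omitted):

```python
# -*- coding: utf-8 -*-
import re

def extractContactInfo(postContent):
    """Best-effort extraction of contact fields from a post body (unicode in, dict out)."""
    info = {}
    # QQ number: 5-12 digits after a "QQ" marker (half- or full-width colon)
    qq = re.findall(u'[Qq]{2}[:：\s]*(\d{5,12})', postContent)
    info['POST_CONTENT_QQ'] = qq[0] if qq else ''
    # WeChat ID: letter-led word after a WeChat marker
    wechat = re.findall(u'(?:微信|weixin|wx)[:：\s]*([a-zA-Z][\w-]{5,19})', postContent)
    info['POST_CONTENT_WECHAT'] = wechat[0] if wechat else ''
    # Mainland mobile number: 11 digits starting with 1
    tel = re.findall(u'(1[3-9]\d{9})', postContent)
    info['POST_CONTENT_TEL'] = tel[0] if tel else ''
    return info

print extractContactInfo(u"租房加微信: zhang_xiaoxi88,电话13312345678")
```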

### USER table
Note: mainly records profile information for group (or tieba) admins and post authors
@@ -93,7 +103,8 @@
|POST_NUM|INT|Total number of posts|32|
|POST_LAST_CREATE_DATE|VARCHAR(16)|Date of the latest post on page 1 of the user's post list|"2015-01-01 11:11"|
|POST_MIDDLE_CREATE_DATE|VARCHAR(16)|Date of the middle post on page 1 of the user's post list|"2015-01-01 11:11"|
|POST_FIRST_CREATE_DATE|VARCHAR(16)|Date of the earliest post on page 1 of the user's post list|"2015-01-01 11:11"|

##### Activity level (updated periodically)
|Field|Type|Meaning|Example|
|:-|:-:|:-|:-|
170 changes: 132 additions & 38 deletions spider/crawler.py
@@ -217,6 +217,7 @@ def getGroupsInfoDictList(self, queryKeywordsList,
logging.info("成功获取有关【{0}】 总计 {1} 个小组的详细信息.".format(",".join(queryKeywordsList), len(groupsInfoDictList)))
return groupsInfoDictList


# Fetch all of today's post URLs for a group
def getTodayPostUrlListOfGroup(self, groupUrl):
# Send the request and receive the response
@@ -239,11 +240,22 @@ def getTodayPostUrlListOfGroup(self, groupUrl):
logging.info("成功获得 {0} 个帖子链接.".format(len(postUrlList)))
return postUrlList


def getPostDetailInfoDict(self, postUrl):
logging.info("postUrl:{0}".format(postUrl))
# Send the request and receive the response
try:
# Spoofed browser headers; note the hard-coded session cookie, which will expire
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",\
'Referer': postUrl,\
'Accept':'*/*',\
#'Accept-Encoding':'gzip, deflate, sdch',\
'Accept-Encoding':'utf8',\
'Accept-Language':'zh-CN,zh;q=0.8',\
'Connection':'keep-alive',\
'Cookie':'bid=7xWCDDoU6pk; gr_user_id=78b1ca83-183c-49ee-b79c-c7db9f211ad4; viewed="26308725_25753386"; ps=y; ll="118172"; ct=y; ap=1; as="https://www.douban.com/group/topic/85508155/"; _vwo_uuid_v2=4967D4925ED4A7F3DAB6ACBF56139020|ea6d135f73d347c2b2bd567e10a10b4b; __utmt=1; _ga=GA1.2.1582242860.1470147765; _gat=1; __utma=30149280.1582242860.1470147765.1471179208.1471182104.19; __utmb=30149280.18.5.1471182180158; __utmc=30149280; __utmz=30149280.1470802677.13.8.utmcsr=121.42.47.99|utmccn=(referral)|utmcmd=referral|utmcct=/yuenshome/wordpress/',\
#'Host':'erebor.douban.com',
}
request = urllib2.Request(postUrl, headers=headers, origin_req_host='erebor.douban.com')
response = urllib2.urlopen(request)
except urllib2.HTTPError, e:
logging.error("HTTPError code:{0}, URL:{1}".format(e.code, postUrl))
@@ -256,47 +268,126 @@ def getPostDetailInfoDict(self, postUrl):
postContent = soup.find("div", attrs={"class":"topic-content clearfix"})
postComment = soup.find("ul", attrs={"id":"comments", "class":"topic-reply"})

# Fields collected below:
# [4] group URL, source, site-wide unique group ID, group name
# [7] post URL, title, ID, creation time, last comment time, comment count, like count
# [4] author name, ID, signature, profile URL
# [5] content, image count, image URL list string, author comments, author comment count
# [4] QQ, WeChat ID, phone number, place names mentioned
postDetailInfoDict = {}

# Group URL, source, site-wide unique group ID, group name
try:
postDetailInfoDict['groupUrl'] = re.findall('<a href="(.*)\?ref=sidebar">', html)[0].encode("utf8")
except:
postDetailInfoDict['groupUrl'] = re.findall('<a href="(.*)#topics', str(soup))[0].encode("utf8")
postDetailInfoDict['groupSource'] = re.findall('www\.(.*)\.com', postDetailInfoDict['groupUrl'])[0].encode("utf8")
postDetailInfoDict['groupId'] = re.findall('group/(.*)/', postDetailInfoDict['groupUrl'])[0].encode("utf8")
postDetailInfoDict['groupName'] = re.findall('/\?ref=sidebar">(.*)</a>', html)[1].encode("utf8")

logging.info("postDetailInfoDict['groupUrl']:{0}".format(postDetailInfoDict['groupUrl']))
logging.info("postDetailInfoDict['groupSource']:{0}".format(postDetailInfoDict['groupSource']))
logging.info("postDetailInfoDict['groupId']:{0}".format(postDetailInfoDict['groupId']))
logging.info("postDetailInfoDict['groupName']:{0}".format(postDetailInfoDict['groupName']))

# Post URL, title, ID, creation time, last comment time, comment count, like count
postDetailInfoDict['postUrl'] = postUrl
postDetailInfoDict['postTitle'] = soup.title.text.strip().encode('utf8')
postDetailInfoDict['postCreateDate'] = str(postContent.find("span", attrs={"class":"color-green"}).string).strip().encode("utf8")
# The comments <ul> holds n <li> tags interleaved with n+1 newline text nodes, so n = (len-1)/2
postDetailInfoDict['postCommentNum'] = (len(postComment)-1)/2
# postLastCommentDate
if postDetailInfoDict['postCommentNum'] > 0:
postDetailInfoDict['postLastCommentDate'] = re.findall('<span class="pubtime">(.*)</span>', html)[-1].encode("utf8") if html.count('paginator') == 0 else self.getPostLastCommentDate(soup).encode("utf8")
else:
postDetailInfoDict['postLastCommentDate'] = postDetailInfoDict['postCreateDate']  # already UTF-8 encoded above

try:
postDetailInfoDict["postLikeNum"] = int(re.findall(u'type=like#sep">(\d*).*</a>', str(postContent))[0])
except:
postDetailInfoDict["postLikeNum"] = 0

logging.info("postDetailInfoDict['postUrl']:{0}".format(postDetailInfoDict['postUrl']))
logging.info("postDetailInfoDict['postTitle']:{0}".format(postDetailInfoDict['postTitle']))
logging.info("postDetailInfoDict['postCreateDate']:{0}".format(postDetailInfoDict['postCreateDate']))
logging.info("postDetailInfoDict['postCommentNum']):{0}".format(postDetailInfoDict['postCommentNum']))
logging.info("postDetailInfoDict['postLastCommentDate']:{0}".format(postDetailInfoDict['postLastCommentDate']))
logging.info("postDetailInfoDict['postLikeNum']:{0}".format(postDetailInfoDict['postLikeNum']))

# Author name, ID, signature, profile URL
postDetailInfoDict['postAuthorName'] = re.findall('alt="(.*)" class="pil"', str(postContent))[0].encode("utf8")
postDetailInfoDict['postAuthorUrl'] = re.findall('(https://www\.douban\.com/people/.*/)"><img', str(postContent))[0].encode("utf8")
postDetailInfoDict['postAuthorId'] = re.findall('https://www\.douban\.com/people/(.*)/"><img', str(postContent))[0].encode("utf8")
try:
postDetailInfoDict['postAuthorSignature'] = re.findall('</a>\((.*)\)</span>', str(postContent))[0].encode("utf8")
except:
postDetailInfoDict['postAuthorSignature'] = "".encode("utf8")

logging.info("postDetailInfoDict['postAuthorName']:{0}".format(postDetailInfoDict['postAuthorName']))
logging.info("postDetailInfoDict['postAuthorUrl']:{0}".format(postDetailInfoDict['postAuthorUrl']))
logging.info("postDetailInfoDict['postAuthorId']:{0}".format(postDetailInfoDict['postAuthorId']))
logging.info("postDetailInfoDict['postAuthorSignature']:{0}".format(postDetailInfoDict['postAuthorSignature']))

# Content, image count, image URL list string, author comments, author comment count
postDetailInfoDict['postContent'] = postContent.find("div", attrs={"class":"topic-content"}).text.replace("\r", "").replace("\n", "").replace(" ", "").encode("utf8")
postImgTags = postContent.find_all("img", attrs={"class":""})
postDetailInfoDict['postImgNum'] = len(postImgTags)
if postDetailInfoDict['postImgNum'] > 0:
postImgUrlList = map(lambda tag: tag['src'].encode("utf8"), postImgTags)
postDetailInfoDict['postImgUrlList'] = "\t".join(postImgUrlList)
else:
postImgUrlList = []  # defined even when empty, for the logging below
postDetailInfoDict['postImgUrlList'] = ""

if postDetailInfoDict['postCommentNum'] >= 1:
commentUserNameList = re.findall('<a href="https://www\.douban\.com/people/.*/" class="">(.*)</a>', str(postComment))
# Comment bodies live in the reply list (postComment), not the post body
commentContentList = re.findall('<p class="">(.*)</p>', str(postComment))
userNameAndCommentContentList = zip(commentUserNameList, commentContentList)

authorCommentList = filter(lambda (name, comment): name == postDetailInfoDict['postAuthorName'], userNameAndCommentContentList)
postDetailInfoDict['postAuthorCommentNum'] = len(authorCommentList)
postDetailInfoDict['postAuthorComment'] = "".join(map(lambda (name, comment): comment, authorCommentList))
else:
commentUserNameList, commentContentList = [], []
postDetailInfoDict['postAuthorCommentNum'] = 0
postDetailInfoDict['postAuthorComment'] = ""


logging.info("postDetailInfoDict['postContent']:{0}".format(postDetailInfoDict['postContent']))
#logging.info("len(postDetailInfoDict['postContent']):{0}".format(len(postDetailInfoDict['postContent'])))

print "============================="
# print postDetailInfoDict['postTitle']
# print postDetailInfoDict['postCreateDate']
# print type(postDetailInfoDict['postCreateDate'])
# print len(postComment)
print postDetailInfoDict['postCommentNum']
print postDetailInfoDict['postLastCommentDate']
logging.info("postDetailInfoDict['postImgNum']:{0}".format(postDetailInfoDict['postImgNum']))
logging.info("postImgUrlList:{0}".format(postImgUrlList))
logging.info("postDetailInfoDict['postImgUrlList']:{0}".format(postDetailInfoDict['postImgUrlList']))

logging.info("str(postContent):{0}".format(str(postComment)))
logging.info("commentUserNameList:{0}".format(commentUserNameList))
logging.info("len(commentUserNameList):{0}".format(len(commentUserNameList)))
logging.info("commentContentList:{0}".format(commentContentList))
logging.info("len(commentContentList):{0}".format(len(commentContentList)))

logging.info("postDetailInfoDict['postAuthorComment']:{0}".format(postDetailInfoDict['postAuthorComment']))
logging.info("postDetailInfoDict['postAuthorCommentNum']:{0}".format(postDetailInfoDict['postAuthorCommentNum']))




logging.info("================================================================")




# Get the date of the last comment (work in progress)
def getPostLastCommentDate(self, soup):
pageContent = soup.find_all('div', attrs={'class': "paginator"})
pageUrlList = re.findall('<a href="(.*)">\d*', str(pageContent))
logging.info("len(pageUrlList):{0}".format(len(pageUrlList)))
for idx in xrange(len(pageUrlList)): logging.info("{0}:{1}".format(idx+1, pageUrlList[idx]))
# WIP stub: should follow the last paginator page; until then, fall back to
# the newest comment date visible on the current page so callers get a value
return re.findall('<span class="pubtime">(.*)</span>', str(soup))[-1]
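
# A fuller version would follow the last paginator page before reading the
# final pubtime. A hypothetical sketch, not part of this commit (the index of
# the last numbered page link depends on the paginator markup):
#
# def getPostLastCommentDate(self, soup):
#     pageContent = soup.find_all('div', attrs={'class': "paginator"})
#     pageUrlList = re.findall('<a href="(.*)">\d*', str(pageContent))
#     lastPageHtml = urllib2.urlopen(urllib2.Request(pageUrlList[-1])).read()
#     return re.findall('<span class="pubtime">(.*)</span>', lastPageHtml)[-1]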




################################### PART3 TEST #######################################

# Initialize parameters
queryKeywordsList = ["杭州", "租房"]
topNGroup = 1
maxGroupsNumForEachPage = 20
findGroupUrl = "https://www.douban.com/group/search?start=0&cat=1019&q=[REPLACEBYQUERY]&sort=relevance"
@@ -305,20 +396,23 @@
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
crawler = Crawler()

#crawler.getPostDetailInfoDict("https://www.douban.com/group/topic/83083253/")
crawler.getPostDetailInfoDict("https://www.douban.com/group/topic/88272843/")

# # Fetch detailed group info for the given keywords
# groupsInfoDictList = crawler.getGroupsInfoDictList(queryKeywordsList,\
# topNGroup,\
# maxGroupsNumForEachPage,\
# findGroupUrl)
#
# # Fetch today's post URLs from each group
# postUrl2DList = map(lambda groupInfoDict:\
# crawler.getTodayPostUrlListOfGroup(groupInfoDict['groupUrl']),\
# groupsInfoDictList)
# postUrlList = flatten(postUrl2DList)
#
# # Fetch detailed post info for each post URL
# postsDetailInfoDictList = map(lambda postUrl:\
# crawler.getPostDetailInfoDict(postUrl),\
# postUrlList)
#
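
# getPostDetailInfoDict returns one record per post. Below is a hypothetical
# sketch of persisting that record into the POST table described in README.md;
# sqlite3 is used only for brevity, and the column subset, table and file
# names are illustrative rather than this project's actual storage code.
#
# import sqlite3
#
# def savePostDetailInfoDict(postDetailInfoDict, dbPath="douban_posts.db"):
#     conn = sqlite3.connect(dbPath)
#     conn.text_factory = str  # the crawler stores UTF-8 byte strings, not unicode
#     conn.execute("CREATE TABLE IF NOT EXISTS POST ("
#                  "POST_URL TEXT PRIMARY KEY, POST_TITLE TEXT, "
#                  "POST_AUTHOR_COMMENT TEXT, POST_AUTHOR_COMMENT_NUM INTEGER)")
#     conn.execute("INSERT OR REPLACE INTO POST VALUES (?, ?, ?, ?)",
#                  (postDetailInfoDict['postUrl'], postDetailInfoDict['postTitle'],
#                   postDetailInfoDict['postAuthorComment'], postDetailInfoDict['postAuthorCommentNum']))
#     conn.commit()
#     conn.close()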
