diff --git a/spider/crawler.py b/spider/crawler.py
index 11f52c6..6802e79 100755
--- a/spider/crawler.py
+++ b/spider/crawler.py
@@ -262,33 +262,41 @@ def getPostDetailInfoDict(self, postUrl):
         # QQ, WeChat ID, phone number, place names mentioned in the post
         postDetailInfoDict = {}
-        postDetailInfoDict['postTitle'] = re.findall('...')
+        try:
+            postDetailInfoDict['postTitle'] = re.findall('...')
+        postDetailInfoDict['postCommentNum'] = soup.find_all('p', attrs={'class':""})
+        print str(postDetailInfoDict['postCommentNum']).count('...')
         if postDetailInfoDict['postCommentNum'] > 0:
-            postDetailInfoDict['postLastCommentDate'] = re.findall('(.*)')[-1] if html.count('paginator') == 0 else self.getPostLastCommentDate(soup)
+            postDetailInfoDict['postLastCommentDate'] = re.findall('(.*)', html)[-1] if html.count('paginator') == 0 else self.getPostLastCommentDate(soup)
         else:
             postDetailInfoDict['postLastCommentDate'] = postDetailInfoDict['postCreateDate']
         print "============================="
-        print postDetailInfoDict['postTitle']
-        print postDetailInfoDict['postCreateDate']
-        print type(postDetailInfoDict['postCreateDate'])
-        print len(postComment)
+        # print postDetailInfoDict['postTitle']
+        # print postDetailInfoDict['postCreateDate']
+        # print type(postDetailInfoDict['postCreateDate'])
+        # print len(postComment)
         print postDetailInfoDict['postCommentNum']
         print postDetailInfoDict['postLastCommentDate']

     def getPostLastCommentDate(self, soup):
         pageContent = soup.find_all('div', attrs={'class': "paginator"})
+        pageUrlList = re.findall('\d*', str(pageContent))
+        print pageUrlList
+

 ################################### PART3 TEST #######################################
 # Initialize parameters
-queryKeywordsList = ["杭州", "租房"]
+queryKeywordsList = ["摄影"]  # ["杭州", "租房"]
 topNGroup = 1
 maxGroupsNumForEachPage = 20
 findGroupUrl = "https://www.douban.com/group/search?start=0&cat=1019&q=[REPLACEBYQUERY]&sort=relevance"
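Note on the new postCommentNum assignment: soup.find_all() returns a ResultSet (a list), so the later "if postDetailInfoDict['postCommentNum'] > 0" check compares a list against an integer, which in Python 2 is always true whether or not the post has any comments. A minimal sketch of storing an integer count instead, reusing the same find_all('p', attrs={'class': ""}) filter the hunk introduces (the bs4 import path is an assumption; crawler.py may import BeautifulSoup differently):

# -*- coding: utf-8 -*-
# Sketch only: count the comment paragraphs as an int so the "> 0" branch
# actually depends on whether the post has comments.
from bs4 import BeautifulSoup

def countPostComments(html):
    soup = BeautifulSoup(html, 'html.parser')
    return len(soup.find_all('p', attrs={'class': ""}))

With a helper like this, the hunk's assignment would become postDetailInfoDict['postCommentNum'] = countPostComments(html), and the > 0 branch behaves as intended.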
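getPostLastCommentDate is still a stub after this change: the new pageUrlList line matches '\d*', which yields mostly empty strings, and the method never fetches the last comment page. One way to finish it, sketched under assumptions that crawler.py does not confirm (paginator links carry a start= offset, comment timestamps sit in a span with class "pubtime", and urllib2 is acceptable for fetching):

# -*- coding: utf-8 -*-
# Sketch only, not the repo's implementation. Assumptions: page links inside
# the paginator div carry a start= offset, and comment timestamps are wrapped
# in a span with class "pubtime" on Douban topic pages.
import re
import urllib2
from bs4 import BeautifulSoup

def getPostLastCommentDate(soup):
    paginator = soup.find_all('div', attrs={'class': "paginator"})
    pageLinks = re.findall(r'href="([^"]*start=\d+[^"]*)"', str(paginator))
    if not pageLinks:
        return None  # single-page thread; the caller keeps its fallback
    # The link with the largest start= offset points at the last comment page.
    lastPageUrl = max(pageLinks,
                      key=lambda u: int(re.search(r'start=(\d+)', u).group(1)))
    lastPageSoup = BeautifulSoup(urllib2.urlopen(lastPageUrl).read(), 'html.parser')
    dates = lastPageSoup.find_all('span', attrs={'class': "pubtime"})
    return dates[-1].get_text().strip() if dates else None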
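The PART3 TEST block only sets parameters; the call that actually uses them falls outside this hunk. A sketch of how they might feed the group search, with the keyword URL-encoded into the [REPLACEBYQUERY] slot (the keyword joining, quoting, and User-Agent header are illustrative assumptions, not crawler.py's code):

# -*- coding: utf-8 -*-
# Sketch of a PART3 TEST driver: substitute the keyword into findGroupUrl and
# fetch the first page of group search results. Header choice is an assumption.
import urllib
import urllib2

queryKeywordsList = ["摄影"]
findGroupUrl = ("https://www.douban.com/group/search?start=0&cat=1019"
                "&q=[REPLACEBYQUERY]&sort=relevance")

query = urllib.quote(" ".join(queryKeywordsList))
searchUrl = findGroupUrl.replace("[REPLACEBYQUERY]", query)
request = urllib2.Request(searchUrl, headers={'User-Agent': 'Mozilla/5.0'})
searchHtml = urllib2.urlopen(request).read()
print searchUrl
print len(searchHtml)  # rough sanity check that a results page came back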