From ace0d11a560cf06dc72bcc04475d4eedf95b9584 Mon Sep 17 00:00:00 2001 From: yuens Date: Tue, 16 Aug 2016 23:57:51 +0800 Subject: [PATCH] =?UTF-8?q?=E6=AD=A3=E5=9C=A8=E8=BF=9B=E8=A1=8C=E3=80=90?= =?UTF-8?q?=E4=BD=9C=E8=80=85=E8=AF=84=E8=AE=BA=E3=80=81=E4=BD=9C=E8=80=85?= =?UTF-8?q?=E8=AF=84=E8=AE=BA=E4=B8=AA=E6=95=B0=E3=80=91=E7=AD=89POST?= =?UTF-8?q?=E8=A1=A8=E7=9A=84=E6=95=B0=E6=8D=AE=E8=8E=B7=E5=8F=96=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- spider/crawler.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/spider/crawler.py b/spider/crawler.py index 039bef7..c650bbb 100755 --- a/spider/crawler.py +++ b/spider/crawler.py @@ -341,7 +341,10 @@ def getPostDetailInfoDict(self, postUrl): postDetailInfoDict['postImgUrlList'] = u''.encode("utf8") if postDetailInfoDict['postCommentNum'] >= 1: - commentUserNameList = re.findall('(.*)', str(postComment)) + #commentUserNameList = re.findall(r'(.*)', str(postComment)) + commentUserNameTagList = postComment.find_all('a', attrs={"href":postDetailInfoDict['postAuthorUrl'], 'class':''}) + print len(commentUserNameTagList) + commentUserNameList = commentUserNameTagList#map(lambda tag: tag.a, commentUserNameTagList) commentContentList = re.findall('

(.*)

', str(postContent)) userNameAndCommentContentList = map(lambda name, comment: (name, comment), commentUserNameList, commentContentList) @@ -357,7 +360,7 @@ def getPostDetailInfoDict(self, postUrl): logging.info("postImgUrlList:{0}".format(postImgUrlList)) logging.info("postDetailInfoDict['postImgUrlList']:{0}".format(postDetailInfoDict['postImgUrlList'])) - logging.info("str(postContent):{0}".format(str(postComment))) + #logging.info("str(postContent):{0}".format(str(postComment))) logging.info("commentUserNameList:{0}".format(commentUserNameList)) logging.info("len(commentUserNameList):{0}".format(len(commentUserNameList))) logging.info("commentContentList:{0}".format(commentContentList))