Skip to content

Commit

Permalink
正在进行【作者评论、作者评论个数】等POST表的数据获取。
Browse files: browse the repository at this point in the history
  • Loading branch information
ysh329 committed Aug 16, 2016
1 parent 15cf432 commit ace0d11
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions spider/crawler.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -341,7 +341,10 @@ def getPostDetailInfoDict(self, postUrl):
postDetailInfoDict['postImgUrlList'] = u''.encode("utf8")

if postDetailInfoDict['postCommentNum'] >= 1:
commentUserNameList = re.findall('<a href="https://www\.douban\.com/people/.*/" class="">(.*)</a>', str(postComment))
#commentUserNameList = re.findall(r'<a href="https://www\.douban\.com/people/135813880/" class="">(.*)</a>', str(postComment))
commentUserNameTagList = postComment.find_all('a', attrs={"href":postDetailInfoDict['postAuthorUrl'], 'class':''})
print len(commentUserNameTagList)
commentUserNameList = commentUserNameTagList#map(lambda tag: tag.a, commentUserNameTagList)
commentContentList = re.findall('<p class="">(.*)</p>', str(postContent))
userNameAndCommentContentList = map(lambda name, comment: (name, comment), commentUserNameList, commentContentList)

Expand All @@ -357,7 +360,7 @@ def getPostDetailInfoDict(self, postUrl):
logging.info("postImgUrlList:{0}".format(postImgUrlList))
logging.info("postDetailInfoDict['postImgUrlList']:{0}".format(postDetailInfoDict['postImgUrlList']))

logging.info("str(postContent):{0}".format(str(postComment)))
#logging.info("str(postContent):{0}".format(str(postComment)))
logging.info("commentUserNameList:{0}".format(commentUserNameList))
logging.info("len(commentUserNameList):{0}".format(len(commentUserNameList)))
logging.info("commentContentList:{0}".format(commentContentList))
Expand Down

0 comments on commit ace0d11

Please sign in to comment.