Skip to content

Commit

Permalink
正在进行getPostDetailInfoDict。
Browse files Browse the repository at this point in the history
  • Loading branch information
ysh329 committed Aug 13, 2016
1 parent d608fd2 commit fd0d792
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 9 deletions.
12 changes: 7 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Douban-Crawler
爬豆瓣小组的帖子信息
指定小组查询关键词,完全傻瓜式获取指定个数豆瓣小组详细信息,以及小组下指定个数帖子信息

## CRAWLER库表结构
三个数据表:小组表,用户表,帖子表。
Expand Down Expand Up @@ -53,19 +53,20 @@
|:-|:-:|:-:|:-|
|POST_TITLE|TEXT|帖子标题|"这是标题"|
|POST_CREATE_DATE|VARCHAR(19)|帖子创建时间|"2014-08-10 16:58:21"|
|POST_LAST_REPLY_DATE|VARCHAR(16)|帖子最后回复时间|"2015-08-13 15:22"|
|POST_REPLY_NUM|INT|帖子回复个数|10|
|POST_LAST_COMMENT_DATE|VARCHAR(16)|帖子最后回复时间|"2015-08-13 15:22"|
|POST_COMMENT_NUM|INT|帖子回复个数|10|
##### 帖子创建者基本信息
|字段名|类型|含义|举例|
|:-|:-:|:-:|:-|
|POST_AUTHOR_NAME|VARCHAR(50)|帖子创建者名称|"章小希"|
|POST_AUTHOR_ID|VARCHAR(10)|帖子创建者全站唯一性ID|"148647315"|
|POST_AUTHOR_SIGNATURE|TEXT|签名|"目标,前进;一切只为生活"|
|POST_AUTHOR_URL|TEXT|帖子创建者个人页面地址|"https://www.douban.com/people/148647315/"|
##### 内容和评论
|字段名|类型|含义|举例|
|:-|:-:|:-:|:-|
|POST_CONTENT|TEXT|帖子内容|"这是帖子内容"|
|POST_CREATE_USER_COMMENT|TEXT|帖子创建者的所有评论|"这是评论1+2+3拼接起来的结果"|
|POST_AUTHOR_COMMENT|TEXT|帖子创建者的所有评论|"这是评论1+2+3拼接起来的结果"|
##### 感兴趣信息(需要提取/抽取)
|字段名|类型|含义|举例|
|:-|:-:|:-:|:-|
Expand All @@ -82,8 +83,9 @@
|USER_SOURCE|VARCHAR(10)|用户来源|"douban"或"tieba"|
|USER_NAME|TEXT|用户名、昵称|"小豆芽"|
|USER_ID|VARCHAR(20)|全站唯一性ID|"yncyd"|
|USER_SEX|INT|性别|0(未知)或1(男)或2(女)|
|USER_SIGNATURE|TEXT|签名|"目标,前进;一切只为生活"|
|USER_URL|TEXT|个人页面|"https://www.douban.com/people/yncyd/"|
|USER_SEX|INT|性别|1(男)或0(女)|

##### 发帖情况(定期更新)
|字段名|类型|含义|举例|
Expand Down
66 changes: 62 additions & 4 deletions spider/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,6 @@ def getGroupInfoDict(self, groupUrl, queryKeywordsList):
# 小组活跃信息[3项]当天帖子总数、当天帖子历史累计回复数、平均帖子回复数
# 组长信息[3项]管理员姓名、全站唯一性ID、个人页面地址
# 系统信息[1项]表更新时间

groupInfoDict = {}

# 小组基本信息
Expand Down Expand Up @@ -237,13 +236,58 @@ def getTodayPostUrlListOfGroup(self, groupUrl):
# 解析soup对象获取所有帖子链接
postTitlesOfSoup = soup.find_all("td", attrs={"class": "title"})
postUrlList = map(lambda tag: tag.a['href'], postTitlesOfSoup)
logging.info("成功获得 {0} 个帖子链接.".format(len(postUrlList)))
return postUrlList

def getPostDetailInfoDict(self, postUrl):
    """Fetch a single post page and parse it into a detail-info dict.

    :param postUrl: absolute URL of a Douban group post.
    :returns: dict with keys 'postTitle', 'postCreateDate',
              'postCommentNum', 'postLastCommentDate',
              or None when the page could not be fetched.
    """
    logging.info("postUrl:{0}".format(postUrl))
    # Send the HTTP request; bail out early on failure so the parsing
    # code below never touches an unbound `response` (original fell
    # through and raised NameError after a failed fetch).
    try:
        request = urllib2.Request(postUrl)
        response = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        logging.error("HTTPError code:{0}, URL:{1}".format(e.code, postUrl))
        return None
    except Exception as e:
        logging.error(e)
        return None

    # Read the response body and build a soup object.
    html = response.read()
    soup = BeautifulSoup(html, "lxml")
    postContent = soup.find("div", attrs={"class":"topic-content clearfix"})
    postComment = soup.find("ul", attrs={"id":"comments", "class":"topic-reply"})

    # Collected fields: title, create date, comment count, last comment date.
    postDetailInfoDict = {}

    # Title sits between the <h1> tags on its own line.
    postDetailInfoDict['postTitle'] = re.findall('<h1>\n(.*)\n</h1>', str(soup.h1))[0].strip()
    postDetailInfoDict['postCreateDate'] = str(postContent.find("span", attrs={"class":"color-green"}).string).strip()
    # Each comment is rendered with a <p class=""> element; counting them
    # gives the number of comments on this (first) page.
    postDetailInfoDict['postCommentNum'] = str(postComment).count('<p class="">')
    if postDetailInfoDict['postCommentNum'] > 0:
        if html.count('paginator') == 0:
            # Single page: the last "pubtime" span in the comment list is the
            # newest comment. BUGFIX: the original re.findall call was missing
            # the string argument to search.
            postDetailInfoDict['postLastCommentDate'] = re.findall('<span class="pubtime">(.*)</span>', str(postComment))[-1]
        else:
            # Multiple pages: delegate to the paginator-aware helper.
            postDetailInfoDict['postLastCommentDate'] = self.getPostLastCommentDate(soup)
    else:
        # No comments yet: the create date doubles as the last-activity date.
        postDetailInfoDict['postLastCommentDate'] = postDetailInfoDict['postCreateDate']

    # BUGFIX: callers (map over post URLs) expect this dict to be returned;
    # the original only printed debug output and returned None.
    return postDetailInfoDict

def getPostLastCommentDate(self, soup):
    """Work-in-progress stub (commit message: getPostDetailInfoDict in progress).

    Intended to find the date of the last comment when a post's comments
    span multiple pages. Currently it only locates the paginator div and
    implicitly returns None — callers will receive None until this is
    finished. TODO: follow the last paginator link and extract the final
    "pubtime" span.
    """
    # `pageContent` is computed but not yet used — incomplete implementation.
    pageContent = soup.find_all('div', attrs={'class': "paginator"})



################################### PART3 TEST #######################################

# Initialize smoke-test parameters.
#discussionUrlOfGroup = "https://www.douban.com/group/HZhome/discussion?start=0"
queryKeywordsList = ["杭州", "租房"]  # group search keywords ("Hangzhou", "rent")
topNGroup = 1  # take only the top-ranked matching group
maxGroupsNumForEachPage = 20  # groups listed per search-result page
Expand All @@ -253,6 +297,20 @@ def getTodayPostUrlListOfGroup(self, groupUrl):
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
crawler = Crawler()

# NOTE(review): the next two lines duplicate the expanded calls below; this
# text is a rendered diff, so both the removed (old) and added (new) lines
# appear — in the actual post-commit file only the later block exists. Verify
# against the repository before running.
groupsInfoDictList = crawler.getGroupsInfoDictList(queryKeywordsList, topNGroup, maxGroupsNumForEachPage, findGroupUrl)
crawler.getTodayPostUrlListOfGroup(groupsInfoDictList[0]['groupUrl'])
# Fetch detailed info for the groups matching the query keywords.
groupsInfoDictList = crawler.getGroupsInfoDictList(queryKeywordsList,\
topNGroup,\
maxGroupsNumForEachPage,\
findGroupUrl)

# Collect today's post URLs for every matched group (list of lists),
# then flatten into a single URL list.
postUrl2DList = map(lambda groupInfoDict:\
crawler.getTodayPostUrlListOfGroup(groupInfoDict['groupUrl']),\
groupsInfoDictList)
postUrlList = flatten(postUrl2DList)

# Fetch detailed info for each post URL.
postsDetailInfoDictList = map(lambda postUrl:\
crawler.getPostDetailInfoDict(postUrl),\
postUrlList)

0 comments on commit fd0d792

Please sign in to comment.