Work in progress: fetching data for POST-table fields such as [author comments, author comment count].
ysh329 committed Aug 14, 2016
1 parent 9efea23 commit 15cf432
Showing 2 changed files with 151 additions and 46 deletions.
27 changes: 19 additions & 8 deletions README.md
100644 → 100755
@@ -44,36 +44,46 @@
##### Source and group basics
|Field|Type|Meaning|Example|
|:-|:-:|:-|:-|
|GROUP_SOURCE|VARCHAR(10)|Group source|"douban" or "tieba"|
|GROUP_URL|TEXT|Group URL|"https://www.douban.com/group/551307/"|
|GROUP_ID|VARCHAR(20)|Site-wide unique group ID at the source|"hangzhougonglue"|
|GROUP_NAME|VARCHAR(30)|Group name|"杭州旅游"|

##### Post basics
|Field|Type|Meaning|Example|
|:-|:-:|:-|:-|
|POST_URL|TEXT|Post URL|"https://www.douban.com/group/topic/88272843/"|
|POST_TITLE|TEXT|Post title|"This is a title"|
|POST_ID|VARCHAR(10)|Unique post ID|"850407300"|
|POST_CREATE_DATE|VARCHAR(19)|Post creation time|"2014-08-10 16:58:21"|
|POST_LAST_COMMENT_DATE|VARCHAR(16)|Time of the last comment on the post|"2015-08-13 15:22"|
|POST_COMMENT_NUM|INT|Number of comments|10|
|POST_LIKE_NUM|INT|Number of likes|10|

##### Post author basics
|Field|Type|Meaning|Example|
|:-|:-:|:-|:-|
|POST_AUTHOR_NAME|VARCHAR(50)|Post author's name|"章小希"|
|POST_AUTHOR_ID|VARCHAR(10)|Author's site-wide unique ID|"148647315"|
|POST_AUTHOR_SIGNATURE|TEXT|Signature|"目标,前进;一切只为生活"|
|POST_AUTHOR_URL|TEXT|Author's profile URL|"https://www.douban.com/people/148647315/"|

##### Content and comments
|Field|Type|Meaning|Example|
|:-|:-:|:-|:-|
|POST_CONTENT|TEXT|Post body|"This is the post content"|
|POST_IMG_NUM|INT|Number of images|3|
|POST_IMG_URL_LIST|TEXT|All image URLs joined into one string with "\t"|"www.1.com/1.png\twww.1.com/2.jpg"|
|POST_AUTHOR_COMMENT|TEXT|All of the post author's comments, concatenated|"Comments 1+2+3 joined into one string"|
|POST_AUTHOR_COMMENT_NUM|INT|Number of the post author's comments|10|
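
A minimal sketch of how the two concatenated fields above are assembled, mirroring the joining logic in spider/crawler.py (the variable names here are illustrative only):

```python
# POST_IMG_URL_LIST: all image URLs joined with "\t"
postImgUrlList = ["http://www.1.com/1.png", "http://www.1.com/2.jpg"]
postImgUrlListStr = "\t".join(postImgUrlList)

# POST_AUTHOR_COMMENT / POST_AUTHOR_COMMENT_NUM: the author's comments, concatenated and counted
authorCommentList = ["comment 1", "comment 2", "comment 3"]
postAuthorComment = "".join(authorCommentList)
postAuthorCommentNum = len(authorCommentList)
```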

##### Fields of interest (require extraction)
|Field|Type|Meaning|Example|
|:-|:-:|:-|:-|
|POST_CONTENT_QQ|VARCHAR(12)|QQ number found in the post body|"12345"|
|POST_CONTENT_WECHAT|VARCHAR(16)|WeChat ID found in the post body|"12345"|
|POST_CONTENT_TEL|VARCHAR(15)|Phone number found in the post body|"13312345678"|
|POST_CONTENT_ADDRESS|VARCHAR(30)|Address found in the post body|"北京市海淀区"|
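
These fields do not appear verbatim on the page; they have to be pulled out of POST_CONTENT. A rough sketch of what that extraction could look like (the patterns below are illustrative assumptions, not this project's actual rules; address extraction would also need a place-name dictionary, so it is omitted):

```python
# -*- coding: utf-8 -*-
import re

def extractContactInfo(postContent):
    """Best-effort extraction of contact fields from a post body (unicode in, dict out)."""
    info = {}
    # QQ number: 5-12 digits after a "QQ" marker (half- or full-width colon)
    qq = re.findall(u'[Qq]{2}[:：\s]*(\d{5,12})', postContent)
    info['POST_CONTENT_QQ'] = qq[0] if qq else ''
    # WeChat ID: letter-led word after a WeChat marker
    wechat = re.findall(u'(?:微信|weixin|wx)[:：\s]*([a-zA-Z][\w-]{5,19})', postContent)
    info['POST_CONTENT_WECHAT'] = wechat[0] if wechat else ''
    # Mainland mobile number: 11 digits starting with 1
    tel = re.findall(u'(1[3-9]\d{9})', postContent)
    info['POST_CONTENT_TEL'] = tel[0] if tel else ''
    return info

print extractContactInfo(u"租房加微信: zhang_xiaoxi88,电话13312345678")
```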

### USER table
Note: mainly records profile information for group (or tieba) admins and post authors
@@ -93,7 +103,8 @@
|POST_NUM|INT|Total number of posts|32|
|POST_LAST_CREATE_DATE|VARCHAR(16)|Date of the latest post on page 1 of the user's post list|"2015-01-01 11:11"|
|POST_MIDDLE_CREATE_DATE|VARCHAR(16)|Date of the middle post on page 1 of the user's post list|"2015-01-01 11:11"|
|POST_FIRST_CREATE_DATE|VARCHAR(16)|Date of the earliest post on page 1 of the user's post list|"2015-01-01 11:11"|

##### Activity level (updated periodically)
|Field|Type|Meaning|Example|
|:-|:-:|:-|:-|
170 changes: 132 additions & 38 deletions spider/crawler.py
@@ -217,6 +217,7 @@ def getGroupsInfoDictList(self, queryKeywordsList,
logging.info("成功获取有关【{0}】 总计 {1} 个小组的详细信息.".format(",".join(queryKeywordsList), len(groupsInfoDictList)))
return groupsInfoDictList


# Fetch all of today's post URLs for a group
def getTodayPostUrlListOfGroup(self, groupUrl):
# Send the request and receive the response
@@ -239,11 +240,22 @@ def getTodayPostUrlListOfGroup(self, groupUrl):
logging.info("成功获得 {0} 个帖子链接.".format(len(postUrlList)))
return postUrlList


def getPostDetailInfoDict(self, postUrl):
logging.info("postUrl:{0}".format(postUrl))
# Send the request and receive the response
try:
# Spoofed browser headers; note the hard-coded session cookie, which will expire
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",\
'Referer': postUrl,\
'Accept':'*/*',\
#'Accept-Encoding':'gzip, deflate, sdch',\
'Accept-Encoding':'utf8',\
'Accept-Language':'zh-CN,zh;q=0.8',\
'Connection':'keep-alive',\
'Cookie':'bid=7xWCDDoU6pk; gr_user_id=78b1ca83-183c-49ee-b79c-c7db9f211ad4; viewed="26308725_25753386"; ps=y; ll="118172"; ct=y; ap=1; as="https://www.douban.com/group/topic/85508155/"; _vwo_uuid_v2=4967D4925ED4A7F3DAB6ACBF56139020|ea6d135f73d347c2b2bd567e10a10b4b; __utmt=1; _ga=GA1.2.1582242860.1470147765; _gat=1; __utma=30149280.1582242860.1470147765.1471179208.1471182104.19; __utmb=30149280.18.5.1471182180158; __utmc=30149280; __utmz=30149280.1470802677.13.8.utmcsr=121.42.47.99|utmccn=(referral)|utmcmd=referral|utmcct=/yuenshome/wordpress/',\
#'Host':'erebor.douban.com',
}
request = urllib2.Request(postUrl, headers=headers, origin_req_host='erebor.douban.com')
response = urllib2.urlopen(request)
except urllib2.HTTPError, e:
logging.error("HTTPError code:{0}, URL:{1}".format(e.code, postUrl))
@@ -256,47 +268,126 @@ def getPostDetailInfoDict(self, postUrl):
postContent = soup.find("div", attrs={"class":"topic-content clearfix"})
postComment = soup.find("ul", attrs={"id":"comments", "class":"topic-reply"})

# Fields collected below:
# [4] group URL, source, site-wide unique group ID, group name
# [7] post URL, title, ID, creation time, last comment time, comment count, like count
# [4] author name, ID, signature, profile URL
# [5] content, image count, image URL list string, author comments, author comment count
# [4] QQ, WeChat ID, phone number, place names mentioned
postDetailInfoDict = {}

# Group URL, source, site-wide unique group ID, group name
try:
postDetailInfoDict['groupUrl'] = re.findall('<a href="(.*)\?ref=sidebar">', html)[0].encode("utf8")
except:
postDetailInfoDict['groupUrl'] = re.findall('<a href="(.*)#topics', str(soup))[0].encode("utf8")
postDetailInfoDict['groupSource'] = re.findall('www\.(.*)\.com', postDetailInfoDict['groupUrl'])[0].encode("utf8")
postDetailInfoDict['groupId'] = re.findall('group/(.*)/', postDetailInfoDict['groupUrl'])[0].encode("utf8")
postDetailInfoDict['groupName'] = re.findall('/\?ref=sidebar">(.*)</a>', html)[1].encode("utf8")

logging.info("postDetailInfoDict['groupUrl']:{0}".format(postDetailInfoDict['groupUrl']))
logging.info("postDetailInfoDict['groupSource']:{0}".format(postDetailInfoDict['groupSource']))
logging.info("postDetailInfoDict['groupId']:{0}".format(postDetailInfoDict['groupId']))
logging.info("postDetailInfoDict['groupName']:{0}".format(postDetailInfoDict['groupName']))

# Post URL, title, ID, creation time, last comment time, comment count, like count
postDetailInfoDict['postUrl'] = postUrl
postDetailInfoDict['postTitle'] = soup.title.text.strip().encode('utf8')
postDetailInfoDict['postCreateDate'] = str(postContent.find("span", attrs={"class":"color-green"}).string).strip().encode("utf8")
# The comments <ul> holds n <li> tags interleaved with n+1 newline text nodes, so n = (len-1)/2
postDetailInfoDict['postCommentNum'] = (len(postComment)-1)/2
# postLastCommentDate
if postDetailInfoDict['postCommentNum'] > 0:
postDetailInfoDict['postLastCommentDate'] = re.findall('<span class="pubtime">(.*)</span>', html)[-1].encode("utf8") if html.count('paginator') == 0 else self.getPostLastCommentDate(soup).encode("utf8")
else:
postDetailInfoDict['postLastCommentDate'] = postDetailInfoDict['postCreateDate']  # already UTF-8 encoded above

try:
postDetailInfoDict["postLikeNum"] = int(re.findall(u'type=like#sep">(\d*).*</a>', str(postContent))[0])
except:
postDetailInfoDict["postLikeNum"] = 0

logging.info("postDetailInfoDict['postUrl']:{0}".format(postDetailInfoDict['postUrl']))
logging.info("postDetailInfoDict['postTitle']:{0}".format(postDetailInfoDict['postTitle']))
logging.info("postDetailInfoDict['postCreateDate']:{0}".format(postDetailInfoDict['postCreateDate']))
logging.info("postDetailInfoDict['postCommentNum']):{0}".format(postDetailInfoDict['postCommentNum']))
logging.info("postDetailInfoDict['postLastCommentDate']:{0}".format(postDetailInfoDict['postLastCommentDate']))
logging.info("postDetailInfoDict['postLikeNum']:{0}".format(postDetailInfoDict['postLikeNum']))

# Author name, ID, signature, profile URL
postDetailInfoDict['postAuthorName'] = re.findall('alt="(.*)" class="pil"', str(postContent))[0].encode("utf8")
postDetailInfoDict['postAuthorUrl'] = re.findall('(https://www\.douban\.com/people/.*/)"><img', str(postContent))[0].encode("utf8")
postDetailInfoDict['postAuthorId'] = re.findall('https://www\.douban\.com/people/(.*)/"><img', str(postContent))[0].encode("utf8")
try:
postDetailInfoDict['postAuthorSignature'] = re.findall('</a>\((.*)\)</span>', str(postContent))[0].encode("utf8")
except:
postDetailInfoDict['postAuthorSignature'] = "".encode("utf8")

logging.info("postDetailInfoDict['postAuthorName']:{0}".format(postDetailInfoDict['postAuthorName']))
logging.info("postDetailInfoDict['postAuthorUrl']:{0}".format(postDetailInfoDict['postAuthorUrl']))
logging.info("postDetailInfoDict['postAuthorId']:{0}".format(postDetailInfoDict['postAuthorId']))
logging.info("postDetailInfoDict['postAuthorSignature']:{0}".format(postDetailInfoDict['postAuthorSignature']))

# Content, image count, image URL list string, author comments, author comment count
postDetailInfoDict['postContent'] = postContent.find("div", attrs={"class":"topic-content"}).text.replace("\r", "").replace("\n", "").replace(" ", "").encode("utf8")
postImgTags = postContent.find_all("img", attrs={"class":""})
postDetailInfoDict['postImgNum'] = len(postImgTags)
if postDetailInfoDict['postImgNum'] > 0:
postImgUrlList = map(lambda tag: tag['src'].encode("utf8"), postImgTags)
postDetailInfoDict['postImgUrlList'] = "\t".join(postImgUrlList)
else:
postImgUrlList = []  # defined even when empty, for the logging below
postDetailInfoDict['postImgUrlList'] = ""

if postDetailInfoDict['postCommentNum'] >= 1:
commentUserNameList = re.findall('<a href="https://www\.douban\.com/people/.*/" class="">(.*)</a>', str(postComment))
# Comment bodies live in the reply list (postComment), not the post body
commentContentList = re.findall('<p class="">(.*)</p>', str(postComment))
userNameAndCommentContentList = zip(commentUserNameList, commentContentList)

authorCommentList = filter(lambda (name, comment): name == postDetailInfoDict['postAuthorName'], userNameAndCommentContentList)
postDetailInfoDict['postAuthorCommentNum'] = len(authorCommentList)
postDetailInfoDict['postAuthorComment'] = "".join(map(lambda (name, comment): comment, authorCommentList))
else:
commentUserNameList, commentContentList = [], []
postDetailInfoDict['postAuthorCommentNum'] = 0
postDetailInfoDict['postAuthorComment'] = ""


logging.info("postDetailInfoDict['postContent']:{0}".format(postDetailInfoDict['postContent']))
#logging.info("len(postDetailInfoDict['postContent']):{0}".format(len(postDetailInfoDict['postContent'])))

print "============================="
# print postDetailInfoDict['postTitle']
# print postDetailInfoDict['postCreateDate']
# print type(postDetailInfoDict['postCreateDate'])
# print len(postComment)
print postDetailInfoDict['postCommentNum']
print postDetailInfoDict['postLastCommentDate']
logging.info("postDetailInfoDict['postImgNum']:{0}".format(postDetailInfoDict['postImgNum']))
logging.info("postImgUrlList:{0}".format(postImgUrlList))
logging.info("postDetailInfoDict['postImgUrlList']:{0}".format(postDetailInfoDict['postImgUrlList']))

logging.info("str(postContent):{0}".format(str(postComment)))
logging.info("commentUserNameList:{0}".format(commentUserNameList))
logging.info("len(commentUserNameList):{0}".format(len(commentUserNameList)))
logging.info("commentContentList:{0}".format(commentContentList))
logging.info("len(commentContentList):{0}".format(len(commentContentList)))

logging.info("postDetailInfoDict['postAuthorComment']:{0}".format(postDetailInfoDict['postAuthorComment']))
logging.info("postDetailInfoDict['postAuthorCommentNum']:{0}".format(postDetailInfoDict['postAuthorCommentNum']))




logging.info("================================================================")




# Get the date of the last comment (work in progress)
def getPostLastCommentDate(self, soup):
pageContent = soup.find_all('div', attrs={'class': "paginator"})
pageUrlList = re.findall('<a href="(.*)">\d*', str(pageContent))
logging.info("len(pageUrlList):{0}".format(len(pageUrlList)))
for idx in xrange(len(pageUrlList)): logging.info("{0}:{1}".format(idx+1, pageUrlList[idx]))
# WIP stub: should follow the last paginator page; until then, fall back to
# the newest comment date visible on the current page so callers get a value
return re.findall('<span class="pubtime">(.*)</span>', str(soup))[-1]
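
# A fuller version would follow the last paginator page before reading the
# final pubtime. A hypothetical sketch, not part of this commit (the index of
# the last numbered page link depends on the paginator markup):
#
# def getPostLastCommentDate(self, soup):
#     pageContent = soup.find_all('div', attrs={'class': "paginator"})
#     pageUrlList = re.findall('<a href="(.*)">\d*', str(pageContent))
#     lastPageHtml = urllib2.urlopen(urllib2.Request(pageUrlList[-1])).read()
#     return re.findall('<span class="pubtime">(.*)</span>', lastPageHtml)[-1]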




################################### PART3 TEST #######################################

# Initialize parameters
queryKeywordsList = ["杭州", "租房"]
topNGroup = 1
maxGroupsNumForEachPage = 20
findGroupUrl = "https://www.douban.com/group/search?start=0&cat=1019&q=[REPLACEBYQUERY]&sort=relevance"
@@ -305,20 +396,23 @@
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
crawler = Crawler()

#crawler.getPostDetailInfoDict("https://www.douban.com/group/topic/83083253/")
crawler.getPostDetailInfoDict("https://www.douban.com/group/topic/88272843/")

# # Fetch detailed group info for the given keywords
# groupsInfoDictList = crawler.getGroupsInfoDictList(queryKeywordsList,\
# topNGroup,\
# maxGroupsNumForEachPage,\
# findGroupUrl)
#
# # Fetch today's post URLs from each group
# postUrl2DList = map(lambda groupInfoDict:\
# crawler.getTodayPostUrlListOfGroup(groupInfoDict['groupUrl']),\
# groupsInfoDictList)
# postUrlList = flatten(postUrl2DList)
#
# # Fetch detailed post info for each post URL
# postsDetailInfoDictList = map(lambda postUrl:\
# crawler.getPostDetailInfoDict(postUrl),\
# postUrlList)
#
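
# getPostDetailInfoDict returns one record per post. Below is a hypothetical
# sketch of persisting that record into the POST table described in README.md;
# sqlite3 is used only for brevity, and the column subset, table and file
# names are illustrative rather than this project's actual storage code.
#
# import sqlite3
#
# def savePostDetailInfoDict(postDetailInfoDict, dbPath="douban_posts.db"):
#     conn = sqlite3.connect(dbPath)
#     conn.text_factory = str  # the crawler stores UTF-8 byte strings, not unicode
#     conn.execute("CREATE TABLE IF NOT EXISTS POST ("
#                  "POST_URL TEXT PRIMARY KEY, POST_TITLE TEXT, "
#                  "POST_AUTHOR_COMMENT TEXT, POST_AUTHOR_COMMENT_NUM INTEGER)")
#     conn.execute("INSERT OR REPLACE INTO POST VALUES (?, ?, ?, ?)",
#                  (postDetailInfoDict['postUrl'], postDetailInfoDict['postTitle'],
#                   postDetailInfoDict['postAuthorComment'], postDetailInfoDict['postAuthorCommentNum']))
#     conn.commit()
#     conn.close()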
