From 7794b4a19e3f2e7916670ed634fee34832479f1e Mon Sep 17 00:00:00 2001 From: yuens Date: Sat, 13 Aug 2016 20:41:12 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90=E8=8E=B7=E5=8F=96=E5=B0=8F?= =?UTF-8?q?=E7=BB=84=E4=BF=A1=E6=81=AF=E9=83=A8=E5=88=86=E7=9A=84=E5=87=BD?= =?UTF-8?q?=E6=95=B0=EF=BC=8C=E5=B9=B6=E5=AF=B9group=E8=A1=A8=E7=BB=93?= =?UTF-8?q?=E6=9E=84=E5=81=9A=E4=BA=86=E8=B0=83=E6=95=B4=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 41 +++--- spider/crawler.py | 356 ++++++++++++++++++++++++++++++---------------- 2 files changed, 251 insertions(+), 146 deletions(-) diff --git a/README.md b/README.md index cdc87dc..4c1e237 100644 --- a/README.md +++ b/README.md @@ -13,33 +13,31 @@ |GROUP_SOURCE|VARCHAR(10)|数据来源|"douban"或"tieba"| |GROUP_QUERY|VARCHAR(20)|查询query(类似GROUP_TAG)|"北京,IT"| |GROUP_NAME|VARCHAR(30)|组名、吧名|"北京读书交友会"| -|GROUP_ID|VARCHAR(10)|全站唯一性ID|"576850"| +|GROUP_ID|VARCHAR(20)|全站唯一性ID|"576850"| |GROUP_MEMBER_NUM|INT|小组人数|300| -|GROUP_URL|TEXT|地址|"https://www.douban.com/group/10274/"| +|GROUP_URL|TEXT|地址|"https://www.douban.com/group/10274/"| +|GROUP_INTRO|TEXT|介绍|"小组介绍内容"| |GROUP_CREATE_DATE|VARCHAR(10)|小组创建时间|2010-10-10| |GROUP_TAG|VARCHAR(20)|小组标签|"北京,读书,交友"| ##### 活跃度基本信息(每天字段更新) |字段名|类型|含义|举例| |-|:-:|::|:| -|POST_FIRST_REPLY_DATE|VARCHAR(16)|小组首页第一条最后回复时间|"2015-11-19 21:04"| -|POST_FIRST_CREATE_DATE|VARCHAR(16)|小组首页第一条创建时间|"2015-11-19 21:04"| -|POST_MIDDLE_REPLY_DATE|VARCHAR(16)|小组首页中间一条最后回复时间|"2015-11-19 21:04"| -|POST_MIDDLE_CREATE_DATE|VARCHAR(16)|小组首页中间一条创建时间|"2015-11-19 21:04"| -|POST_LAST_REPLY_DATE|VARCHAR(16)|小组首页第一条最后回复时间|"2015-11-19 21:04"| -|POST_LAST_CREATE_DATE|VARCHAR(16)|小组首页最后一条创建时间|"2015-11-19 21:04"| - -##### 表更新时间(定期更新) -|字段名|类型|含义|举例| -|-| :-:|::|:| -|TABLE_UPDATE_DATE|VARCHAR(16)|最后一次表更新时间|"2015-11-19 21:04:48"| +|CURRENT_DAY_POST_NUM|INT|小组当天总帖数|100| +|CURRENT_DAY_COMMENT_NUM|INT|当天帖子累计总回复数|5000| +|CURRENT_DAY_AVE_COMMENT_NUM|INT|当天帖子累计总回复数|50| ##### 管理员基本信息 |字段名|类型|含义|举例| |-| :-:|::|:| |ADMIN_NAME|VARCHAR(50)|管理员姓名|"章小希"| -|ADMIN_ID|VARCHAR(10)|全站唯一性ID(豆瓣唯一ID、贴吧唯一ID)|"148647315"| -|ADMIN_URL|TEXT|管理员账号页面|"https://www.douban.com/people/148647315/"| +|ADMIN_ID|VARCHAR(20)|全站唯一性ID(豆瓣唯一ID、贴吧唯一ID)|"148647315"| +|ADMIN_URL|TEXT|管理员账号页面|"https://www.douban.com/people/148647315/"| + +##### 表更新时间(定期更新) +|字段名|类型|含义|举例| +|-| :-:|::|:| +|TABLE_UPDATE_DATE|VARCHAR(16)|最后一次表更新时间|"2015-11-19 21:04:48"| ### POST表 @@ -83,7 +81,7 @@ |-| :-:|::|:| |USER_SOURCE|VARCHAR(10)|用户来源|"douban"或"tieba"| |USER_NAME|TEXT|用户名、昵称|"小豆芽"| -|USER_ID|VARCHAR(10)|全站唯一性ID|"yncyd"| +|USER_ID|VARCHAR(20)|全站唯一性ID|"yncyd"| |USER_URL|TEXT|个人页面|"https://www.douban.com/people/yncyd/"| |USER_SEX|INT|性别|1(男)或0(女)| @@ -98,11 +96,7 @@ |字段名|类型|含义|举例| |-|:-:|::|:| |USER_LAST_LOGIN|VARCHAR(16)|上次登陆时间|"2015-01-01 11:11"| -|USER_CREATE_DATE|VARCHAR(12)|用户创建日期|"2015-01-01"| -##### 表更新时间(定期更新) -|字段名|类型|含义|举例| -|-| -|TABLE_UPDATE_DATE|VARCHAR(16)|最后一次表更新时间|"2015-11-19 21:04:48"| +|USER_CREATE_DATE|VARCHAR(12)|用户创建日期|"2015-01-01"| ##### 感兴趣信息(需要提取/抽取) |字段名|类型|含义|举例| @@ -112,3 +106,8 @@ |USER_TEL|VARCHAR(15)|手机号|"13311111111"| |USER_MAIL|TEXT|邮箱|"zhangsan0912@gmail.com"| |USER_ADDRESS|TEXT|所在位置|"北京市海淀区XXX路XXX号"或"XXX区"或"南京"| + +##### 表更新时间(定期更新) +|字段名|类型|含义|举例| +|-| +|TABLE_UPDATE_DATE|VARCHAR(16)|最后一次表更新时间|"2015-11-19 21:04:48"| diff --git a/spider/crawler.py b/spider/crawler.py index a002e31..e02820a 100755 --- a/spider/crawler.py +++ b/spider/crawler.py @@ -22,153 +22,259 @@ import re import time import math +from compiler.ast import 
flatten ################################### PART2 CLASS && FUNCTION ########################### +class Crawler(object): + # 通过目录页获取列表信息列表 + def getPostsBasicInfoList(self, discussionUrlOfGroup): + basicInfoList = [] + + # 发送请求接受响应并转换为soup对象后抽出结果保存 + request = urllib2.Request(discussionUrlOfGroup) + try: + # 发出请求接收响应转换为soup对象 + response = urllib2.urlopen(request) + html = response.read() + soup = BeautifulSoup(html, "lxml") + soupOfAllTitles = soup.find_all("tr") + + #for i in soupOfAllTitles: logging.info(i) + + # 统计结果 + failureNum = 0 + successNum = 0 + for titleIdx in xrange(len(soupOfAllTitles)): + titleDetailInfo = soupOfAllTitles[titleIdx] + try: + # 获取每条信息的[链接、标题、用户名、用户名链接、回帖人数、回帖日期] + title = titleDetailInfo.a.get("title") + href = titleDetailInfo.a.get("href") + + userLink = re.compile('').findall(str(titleDetailInfo))[0] + userName = re.findall(userLink+'">(.*)', str(titleDetailInfo))[0] + doubanId = re.findall(r"\d+", userLink)[0] + + commentNumStr = re.findall('(.*)', str(titleDetailInfo))[0] + commentNum = 0 if commentNumStr=="" else int(commentNumStr) + + lastTime = str(time.localtime(time.time())[0]) +\ + "-" +\ + re.findall('(.*)', str(titleDetailInfo))[0] + + #print commentNum, lastTime, type(lastTime) + + basicInfoList.append( (title, href, userName, doubanId, userLink, commentNum, lastTime) ) + successNum += 1 + except Exception, e: + logging.error("ExceptionError:{0}".format(e)) + failureNum += 1 + logging.info("title:{0},successNum:{1},failureNum:{2}".format(soup.title.string.strip(), successNum, failureNum)) + except urllib2.HTTPError, e: + logging.error("HTTPError code:{0}, URL:{1}".format(e.code, discussionUrlOfGroup)) + + return basicInfoList + + # 过滤基本信息列表中的无用条目 + def filterPostsBasicInfoList(self, basicInfoList): + pass + return basicInfoList + + + # 根据需要的小组数目计算小组信息的目录页 + def getGroupsCategoryUrlList(self, queryKeywordsList, topNGroup=10, + findGroupUrl="https://www.douban.com/group/search?start=0&cat=1019&q=[REPLACEBYQUERY]&sort=relevance", + maxGroupsNumForEachPage=20): + # 获取查询结果地址 + queryString = "+".join(queryKeywordsList) + queryUrl = findGroupUrl.replace("[REPLACEBYQUERY]", queryString) + logging.info("queryUrl:{0}".format(queryUrl)) + + # 发送访问请求和接收 + try: + request = urllib2.Request(queryUrl) + response = urllib2.urlopen(request) + except urllib2.HTTPError, e: + logging.error("HTTPError code:{0}, URL:{1}".format(e.code, queryUrl)) + except Exception, e: + logging.error(e) + + # 读取响应内容并转换为soup对象 + html = response.read() + soup = BeautifulSoup(html, "lxml") -# 通过目录页获取列表信息列表 -def getPostsBasicInfoList(discussionUrlOfGroup): - basicInfoList = [] + countPlain = soup.find("span", attrs={"class": "count"}).string + countNum = int(re.findall('[0-9]\d', countPlain)[0]) + maxPageNum = int(math.ceil(float(countNum) / maxGroupsNumForEachPage)) + + pageStartGroupIdxList = map(lambda pageIdx: str(pageIdx * 10), xrange(maxPageNum)) + groupCategoryUrlList = map(lambda start: queryUrl.replace("start=0", "start=" + start),\ + pageStartGroupIdxList) + if topNGroup < len(groupCategoryUrlList) * maxGroupsNumForEachPage: + groupCategoryUrlList = groupCategoryUrlList[:topNGroup / maxGroupsNumForEachPage + 1] + logging.info("topNGroup:{0},needPageNum:{1},countNum:{2},maxPageNum:{3},maxGroupsNumForEachPage:{4}"\ + .format(topNGroup, len(groupCategoryUrlList), countNum, maxPageNum, maxGroupsNumForEachPage)\ + ) + return groupCategoryUrlList + + + # 取出关键标签部分 + def getGroupInfo(self, groupUrl, queryKeywordsList): + # 发送访问请求和接收 + try: + request = urllib2.Request(groupUrl) + response = 
urllib2.urlopen(request) + except urllib2.HTTPError, e: + logging.error("HTTPError code:{0}, URL:{1}".format(e.code, groupUrl)) + except Exception, e: + logging.error(e) + + # 读取响应内容并转换为soup对象 + html = response.read() + soup = BeautifulSoup(html, "lxml") - # 发送请求接受响应并转换为soup对象后抽出结果保存 - request = urllib2.Request(discussionUrlOfGroup) - try: - # 发出请求接收响应转换为soup对象 - response = urllib2.urlopen(request) + # 获取一个小组的所有信息 + # 小组基本信息[9项]组来源、查询、组名、组ID、成员数、组地址、介绍、创建时间、组标签 + # 小组活跃信息[3项]当天帖子总数、当天帖子历史累计回复数、平均帖子回复数 + # 组长信息[3项]管理员姓名、全站唯一性ID、个人页面地址 + # 系统信息[1项]表更新时间 + + # 小组基本信息 + groupSource = re.findall(r"www\.(.*)\.com", groupUrl)[0] + groupQuery = ",".join(queryKeywordsList) + groupName = soup.title.string.strip() + groupId = re.findall(r'group/(.*)/', groupUrl)[0] + try: + groupMemberNum = re.findall(r'members">浏览所有.* \((.*)\)', html)[0] + except Exception, e: + groupMemberNum = '0' + logging.error(e) + #groupUrl = groupUrl + groupIntro = str(soup.findAll("div", attrs={"class": "group-intro"})[0]) + groupBoard = soup.find("div", attrs={"class": "group-board"}).p + groupCreateDate = re.findall(r"\d{4}-\d{2}-\d{2}", str(groupBoard))[0] + groupTagList = re.findall('(\d*)', str(allPostInfoList)) + currentDayCommentNumIntList = map(lambda s: 0 if s == '' else int(s), currentDayCommentNumStrList) + currentDayCommentNum = sum(currentDayCommentNumIntList) + currentDayPostNum = len(currentDayCommentNumStrList) + currentDayAveCommentNum = int(float(currentDayCommentNum)/currentDayPostNum) + + # 组长信息 + adminName = str(groupBoard.a.string) + adminUrl = groupBoard.a['href'] + adminId = re.findall('people/(.*)/', adminUrl)[0] + + # 表更新时间 + tableUpdateDate = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) + + # print "==============" + # print groupSource + # print groupQuery + # print groupName + # print groupId + # print groupMemberNum + # print groupUrl + # print groupIntro + # print groupCreateDate + # print groupTag + # + # print currentDayCommentNum + # print currentDayPostNum + # print currentDayAveCommentNum + # + # print adminName + # print adminUrl + # print adminId + # + # print tableUpdateDate + return (groupSource, groupQuery, groupName, groupId, groupMemberNum,\ + groupUrl, groupIntro, groupCreateDate, groupTag,\ + currentDayPostNum, currentDayCommentNum, currentDayAveCommentNum,\ + adminName, adminId, adminUrl,\ + tableUpdateDate) + + + def getGroupUrl(self, groupCategoryUrl): + # 发送访问请求和接收 + try: + request = urllib2.Request(groupCategoryUrl) + response = urllib2.urlopen(request) + except urllib2.HTTPError, e: + logging.error("HTTPError code:{0}, URL:{1}".format(e.code, groupCategoryUrl)) + except Exception, e: + logging.error(e) + + # 读取响应内容并转换为soup对象 html = response.read() soup = BeautifulSoup(html, "lxml") - soupOfAllTitles = soup.find_all("tr") + resultOfSoup = soup.find_all("div", attrs={'class': "result"}) - #for i in soupOfAllTitles: logging.info(i) + # 遍历抽取重要信息[组名、链接、人数、介绍] + groupUrlList = [] - # 统计结果 - failureNum = 0 - successNum = 0 - for titleIdx in xrange(len(soupOfAllTitles)): - titleDetailInfo = soupOfAllTitles[titleIdx] + for resultIdx in xrange(len(resultOfSoup)): try: - # 获取每条信息的[链接、标题、用户名、用户名链接、回帖人数、回帖日期] - title = titleDetailInfo.a.get("title") - href = titleDetailInfo.a.get("href") + result = resultOfSoup[resultIdx] + #groupName = "".join(list(result.find("div", attrs={'class':"title"}).strings)).strip() + groupUrl = result.a.get("href") + #groupId = re.findall(r'group/(.*)/', groupHref)[0] + #print groupId + #groupMemberPlain = result.find("div", 
attrs={'class':"info"}).string + #groupMemberNum = int(re.findall('^[0-9]\d*', groupMemberPlain)[0]) + #groupIntro = result.p.string.strip().replace(" ", "").replace("", "") + #print groupHref + #groupInfoList.append(groupName, groupHref, groupMemberNum, groupIntro) + groupUrlList.append(groupUrl) + except Exception, e: + logging.error(e) + return groupUrlList - userLink = re.compile('').findall(str(titleDetailInfo))[0] - userName = re.findall(userLink+'">(.*)', str(titleDetailInfo))[0] - doubanId = re.findall(r"\d+", userLink)[0] - commentNumStr = re.findall('(.*)', str(titleDetailInfo))[0] - commentNum = 0 if commentNumStr=="" else int(commentNumStr) + # 查找指定关键词的前topNGroup个群组的信息[组名、地址、人数、介绍] + def getGroupsInfoList(self, queryKeywordsList, + topNGroup=10, + maxGroupsNumForEachPage=20, + findGroupUrl="https://www.douban.com/group/search?start=0&cat=1019&q=[REPLACEBYQUERY]&sort=relevance"): + # 目录页地址列表 + groupsCategoryUrlList = self.getGroupsCategoryUrlList(queryKeywordsList, topNGroup, findGroupUrl, maxGroupsNumForEachPage) - lastTime = str(time.localtime(time.time())[0]) +\ - "-" +\ - re.findall('(.*)', str(titleDetailInfo))[0] + # 获取group地址并取topNGroup + groupsUrl2DList = map(lambda groupCategoryUrl: self.getGroupUrl(groupCategoryUrl), groupsCategoryUrlList) + groupsUrlList = flatten(groupsUrl2DList) + groupsUrlList = groupsUrlList[:topNGroup] if len(groupsUrlList) > topNGroup else groupsUrlList - #print commentNum, lastTime, type(lastTime) + # 获取group详细信息并取topNGroup + groupsInfoTupleList = map(lambda groupUrl: self.getGroupInfo(groupUrl, queryKeywordsList), groupsUrlList) + logging.info("成功获取有关【{0}】 总计 {1} 个小组的详细信息.".format(",".join(queryKeywordsList), len(groupsInfoTupleList))) - basicInfoList.append( (title, href, userName, doubanId, userLink, commentNum, lastTime) ) - successNum += 1 - except Exception, e: - logging.error("ExceptionError:{0}".format(e)) - failureNum += 1 - logging.info("title:{0},successNum:{1},failureNum:{2}".format(soup.title.string.strip(), successNum, failureNum)) - except urllib2.HTTPError, e: - logging.error("HTTPError code:{0}, URL:{1}".format(e.code, listUrl)) - - return basicInfoList - -# 过滤基本信息列表中的无用条目 -def filterPostsBasicInfoList(basicInfoList): - pass - return basicInfoList - - -# 根据需要的小组数目计算小组信息的目录页 -def getGroupCategoryUrlList(queryKeywordsList, topNGroup=10, - findGroupUrl="https://www.douban.com/group/search?start=0&cat=1019&q=[REPLACEBYQUERY]&sort=relevance", - maxGroupsNumForEachPage=20): - # 获取查询结果地址 - queryString = "+".join(queryKeywordsList) - queryUrl = findGroupUrl.replace("[REPLACEBYQUERY]", queryString) - logging.info("queryUrl:{0}".format(queryUrl)) - - # 发送访问请求和接收 - try: - request = urllib2.Request(queryUrl) - response = urllib2.urlopen(request) - except urllib2.HTTPError, e: - logging.error("HTTPError code:{0}, URL:{1}".format(e.code, queryUrl)) - except Exception, e: - logging.error(e) - - # 读取响应内容并转换为soup对象 - html = response.read() - soup = BeautifulSoup(html, "lxml") - - countPlain = soup.find("span", attrs={"class": "count"}).string - countNum = int(re.findall('[0-9]\d', countPlain)[0]) - maxPageNum = int(math.ceil(float(countNum) / maxGroupsNumForEachPage)) - pageStartGroupIdxList = map(lambda pageIdx: str(pageIdx * 10), xrange(maxPageNum)) - groupCategoryUrlList = map(lambda start: findGroupUrl.replace("start=0", "start=" + start), pageStartGroupIdxList) - if topNGroup < len(groupCategoryUrlList) * maxGroupsNumForEachPage: - groupCategoryUrlList = groupCategoryUrlList[:topNGroup / maxGroupsNumForEachPage + 1] - return 
groupCategoryUrlList - - -# 取出关键标签部分 -def getGroupInfoListForEachPage(groupCategoryUrl): - # 发送访问请求和接收 - try: - request = urllib2.Request(groupCategoryUrl) - response = urllib2.urlopen(request) - except urllib2.HTTPError, e: - logging.error("HTTPError code:{0}, URL:{1}".format(e.code, groupCategoryUrl)) - except Exception, e: - logging.error(e) - - # 读取响应内容并转换为soup对象 - html = response.read() - soup = BeautifulSoup(html, "lxml") - - # print soup.find("div", id="wrapper").find("div", id="content") - resultOfSoup = soup.find_all("div", attrs={'class': "result"}) - - # 遍历抽取重要信息[组名、链接、人数、介绍] - groupInfoList = [] - # for resultIdx in xrange(len(resultOfSoup)): - # try: - # result = resultOfSoup[resultIdx] - # groupName = "".join(list(result.find("div", attrs={'class':"title"}).strings)).strip() - # groupHref = result.a.get("href") - # groupMemberPlain = result.find("div", attrs={'class':"info"}).string - # groupMemberNum = int(re.findall('^[0-9]\d*', groupMemberPlain)) - # groupIntro = result.p.string.strip().replace(" ", "").replace("", "") - # groupInfoList.append(groupName, groupHref, groupMemberNum, groupIntro) - # except Exception, e: - # logging.error(e) - # return groupInfoList - -# 查找指定关键词的前topNGroup个群组的信息[组名、地址、人数、介绍] -def getGroupsInfoList(queryKeywordsList, topNGroup=10, findGroupUrl="https://www.douban.com/group/search?start=0&cat=1019&q=[REPLACEBYQUERY]&sort=relevance", maxGroupsNumForEachPage=20): - - groupCategoryUrlList = getGroupCategoryUrlList(queryKeywordsList, topNGroup, findGroupUrl, maxGroupsNumForEachPage) - for groupCategoryUrlIdx in xrange(len(groupCategoryUrlList)): - groupCategoryUrl = groupCategoryUrlList[groupCategoryUrlIdx] - getGroupInfoListForEachPage(groupCategoryUrl) - - - - - - -################################### PART3 TEST ####################################### + # 获取groupIntroUrl页面的小组详细信息 + return groupsInfoTupleList + + +################################### PART3 TEST ####################################### # 初始化参数 -discussionUrlOfGroup = "https://www.douban.com/group/HZhome/discussion?start=0" +#discussionUrlOfGroup = "https://www.douban.com/group/HZhome/discussion?start=0" queryKeywordsList = ["杭州", "租房"] +topNGroup = 5 +maxGroupsNumForEachPage = 20 +findGroupUrl = "https://www.douban.com/group/search?start=0&cat=1019&q=[REPLACEBYQUERY]&sort=relevance" # 初始化环境 logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) +crawler = Crawler() + +groupsInfoList = crawler.getGroupsInfoList(queryKeywordsList, topNGroup, maxGroupsNumForEachPage, findGroupUrl) + +""" # 执行爬取 postsBasicInfoList = getPostsBasicInfoList(discussionUrlOfGroup) #filteredBasicInfoList = filterPostsBasicInfoList(postsBasicInfoList) @@ -184,6 +290,6 @@ def getGroupsInfoList(queryKeywordsList, topNGroup=10, findGroupUrl="https://www lastTime = postsBasicInfoList[basicInfoIdx][6] logging.info("idx:{0}, title:{1}, href:{2}, userName:{3}, doubanId:{4}, userLink:{5}, commentNum:{6}, lastTime:{7}".format(basicInfoIdx+1, title, href, userName, doubanId, userLink, commentNum, lastTime)) +""" -getGroupsInfoList(queryKeywordsList)
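
The README changes above widen the ID columns to VARCHAR(20), add GROUP_INTRO, and swap the per-day activity fields for CURRENT_DAY_POST_NUM, CURRENT_DAY_COMMENT_NUM and CURRENT_DAY_AVE_COMMENT_NUM; the 16-tuple returned by Crawler.getGroupInfo lines up with that layout. The sketch below shows one way such a row could be persisted. It is illustrative only: the patch does not show the project's storage backend, so SQLite and the table name douban_group are assumptions, while the column names and types come straight from the README.

```python
# -*- coding: utf-8 -*-
# Hypothetical persistence sketch: stores one 16-tuple returned by
# Crawler.getGroupInfo() into a GROUP-like table. SQLite and the table
# name "douban_group" are assumptions (GROUP itself is a reserved word);
# the patch does not show which database the project actually uses.
import sqlite3

CREATE_GROUP_TABLE = """
CREATE TABLE IF NOT EXISTS douban_group (
    GROUP_SOURCE VARCHAR(10), GROUP_QUERY VARCHAR(20), GROUP_NAME VARCHAR(30),
    GROUP_ID VARCHAR(20), GROUP_MEMBER_NUM INT, GROUP_URL TEXT, GROUP_INTRO TEXT,
    GROUP_CREATE_DATE VARCHAR(10), GROUP_TAG VARCHAR(20),
    CURRENT_DAY_POST_NUM INT, CURRENT_DAY_COMMENT_NUM INT, CURRENT_DAY_AVE_COMMENT_NUM INT,
    ADMIN_NAME VARCHAR(50), ADMIN_ID VARCHAR(20), ADMIN_URL TEXT,
    TABLE_UPDATE_DATE VARCHAR(16)
)
"""

def save_group_info(db_path, group_info_tuple):
    # group_info_tuple is expected to be the 16-element tuple produced by getGroupInfo().
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(CREATE_GROUP_TABLE)
        conn.execute(
            "INSERT INTO douban_group VALUES (" + ",".join(["?"] * 16) + ")",
            group_info_tuple,
        )
        conn.commit()
    finally:
        conn.close()
```

A caller could iterate over the list returned by getGroupsInfoList and call save_group_info once per tuple.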
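
The new crawler code is Python 2 only: it relies on urllib2, xrange, the `except Exception, e` syntax, and `from compiler.ast import flatten` (the compiler package was removed in Python 3). As a minimal sketch of how the same search-page flow could look on Python 3, the snippet below uses the third-party requests library (not used in this patch) together with itertools; it mirrors the logic of getGroupUrl and the flattening step in getGroupsInfoList, but it is not a drop-in replacement for the Crawler class.

```python
# -*- coding: utf-8 -*-
# Minimal Python 3 sketch of the group-search scraping flow used by
# Crawler.getGroupUrl / getGroupsInfoList. Assumes the third-party
# "requests" package; the patch itself uses urllib2 (Python 2 only).
import logging
from itertools import chain

import requests
from bs4 import BeautifulSoup

def get_group_urls(category_url):
    # Fetch one search-result page and collect the group links it contains.
    resp = requests.get(category_url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")
    urls = []
    for result in soup.find_all("div", attrs={"class": "result"}):
        try:
            urls.append(result.a.get("href"))
        except Exception as exc:  # keep the patch's broad error handling
            logging.error(exc)
    return urls

def flatten_urls(url_lists):
    # compiler.ast.flatten does not exist on Python 3; itertools.chain
    # covers the one-level flattening that getGroupsInfoList needs.
    return list(chain.from_iterable(url_lists))
```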
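
One edge case in getGroupInfo worth flagging: CURRENT_DAY_AVE_COMMENT_NUM is computed by dividing the comment total by the number of parsed front-page posts, so a group page that yields no rows would raise ZeroDivisionError. A small guard, sketched below with hypothetical names, keeps the average at 0 in that case.

```python
def average_comment_num(current_day_comment_num, current_day_post_num):
    # Mirrors currentDayAveCommentNum in getGroupInfo, but tolerates a
    # front page with zero parsable posts instead of raising ZeroDivisionError.
    if current_day_post_num <= 0:
        return 0
    return int(float(current_day_comment_num) / current_day_post_num)
```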