Commit

Add files.
ysh329 committed Aug 7, 2016
1 parent bea1b45 commit 94bb0e0
Showing 4 changed files with 132 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .idea/vcs.xml


Empty file added Douban-Crawler.py
Empty file.
Empty file added spider/__init__.py
Empty file.
126 changes: 126 additions & 0 deletions spider/crawler.py
@@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
################################### PART0 DESCRIPTION #################################
# Filename: crawler.py
# Description: Douban group crawler: fetches discussion-topic info from a
#              group's list pages and searches for groups by keyword.
#
# E-mail: ysh329@sina.com
# Create: 2016-8-7 14:22:51
# Last:
__author__ = 'yuens'


################################### PART1 IMPORT ######################################

import sys
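# Python 2 workaround: reload(sys) restores setdefaultencoding(), which
# site.py removes at startup, so the default string encoding can be set to UTF-8.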
reload(sys)
sys.setdefaultencoding('utf-8')
import logging

import urllib2
from bs4 import BeautifulSoup
import re
import time


################################### PART2 CLASS && FUNCTION ###########################

# Fetch the basic info of every discussion topic on a group list page.
# Returns a list of 7-tuples:
# (title, href, userName, doubanId, userLink, commentNum, lastTime).
def getBasicInfoList(listUrl):
    basicInfoList = []

    # Send the request, receive the response, parse it into a soup object,
    # then extract and save the results
    request = urllib2.Request(listUrl)
    try:
        # Issue the request, read the response, and parse it with BeautifulSoup
        response = urllib2.urlopen(request)
        html = response.read()
        soup = BeautifulSoup(html, "lxml")
        soupOfAllTitles = soup.find_all("tr")
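        # Each discussion topic on the list page is rendered as one <tr> row
        # of the topics table.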

        #for i in soupOfAllTitles: logging.info(i)

        # Success/failure counters
        failureNum = 0
        successNum = 0
        for titleIdx in xrange(len(soupOfAllTitles)):
            titleDetailInfo = soupOfAllTitles[titleIdx]
            try:
                # Extract each topic's [title, link, user name, user link,
                # reply count, last-reply date]
                title = titleDetailInfo.a.get("title")
                href = titleDetailInfo.a.get("href")

                userLink = re.compile('<td nowrap="nowrap"><a class="" href="(.*)">').findall(str(titleDetailInfo))[0]
                userName = re.findall(userLink+'">(.*)</a></td>', str(titleDetailInfo))[0]
                doubanId = re.findall(r"\d+", userLink)[0]
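                # NOTE: this takes the first digit run in the profile URL as the
                # Douban id; links without digits raise IndexError, caught below.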

                commentNumStr = re.findall('<td class="" nowrap="nowrap">(.*)</td>', str(titleDetailInfo))[0]
                commentNum = 0 if commentNumStr=="" else int(commentNumStr)

                lastTime = str(time.localtime(time.time())[0]) +\
                           "-" +\
                           re.findall('<td class="time" nowrap="nowrap">(.*)</td>', str(titleDetailInfo))[0]
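                # The list page shows only "MM-DD", so the current year is
                # prefixed to form "YYYY-MM-DD".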

                #print commentNum, lastTime, type(lastTime)

                basicInfoList.append( (title, href, userName, doubanId, userLink, commentNum, lastTime) )
                successNum += 1
            except Exception, e:
                logging.error("ExceptionError:{0}".format(e))
                failureNum += 1
        logging.info("title:{0},successNum:{1},failureNum:{2}".format(soup.title.string.strip(), successNum, failureNum))
    except urllib2.HTTPError, e:
        logging.error("HTTPError code:{0}, URL:{1}".format(e.code, listUrl))

    return basicInfoList

# Filter useless entries out of the basic info list
# (placeholder for now: no filtering is done, the list is returned unchanged).
def filterBasicInfoList(basicInfoList):
    return basicInfoList
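# One possible filter (an assumption, not part of this commit): drop rows whose
# title could not be parsed, e.g.
#     return [info for info in basicInfoList if info[0] is not None]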

# Find info for the top-N groups matching the query keywords
# [group name, URL, member count, introduction].
def getGroupsInfoList(queryKeywordsList, topNGroup=10, findGroupUrl="https://www.douban.com/group/search?cat=1019&q=[REPLACEBYQUERY]&sort=relevance"):
    queryString = "+".join(queryKeywordsList)
    queryUrl = findGroupUrl.replace("[REPLACEBYQUERY]", queryString)
    print queryUrl
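    # Keywords are joined with "+" and substituted for the [REPLACEBYQUERY]
    # placeholder in the search URL; topNGroup is accepted but not used yet.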

    request = urllib2.Request(queryUrl)
    try:
        # Issue the request, read the response, and parse it with BeautifulSoup
        response = urllib2.urlopen(request)
        html = response.read()
        soup = BeautifulSoup(html, "lxml")
        # TODO: result parsing is not implemented yet; the two lines below only
        # probe the page structure during development.
        soupOfGroups = soup.div.div.div["class"]
        for i in soupOfGroups: print i
    except urllib2.HTTPError, e:
        logging.error("HTTPError code:{0}, URL:{1}".format(e.code, queryUrl))

################################### PART3 TEST #######################################

# Initialize parameters
listUrl = "https://www.douban.com/group/HZhome/discussion?start=0"
queryKeywordsList = ["杭州", "租房"]

# Initialize the logging environment
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

'''
# Run the crawl
basicInfoList = getBasicInfoList(listUrl)
filteredBasicInfoList = filterBasicInfoList(basicInfoList)
# Print the results
for basicInfoIdx in xrange(len(filteredBasicInfoList)):
    title = filteredBasicInfoList[basicInfoIdx][0]
    href = filteredBasicInfoList[basicInfoIdx][1]
    userName = filteredBasicInfoList[basicInfoIdx][2]
    doubanId = filteredBasicInfoList[basicInfoIdx][3]
    userLink = filteredBasicInfoList[basicInfoIdx][4]
    commentNum = filteredBasicInfoList[basicInfoIdx][5]
    lastTime = filteredBasicInfoList[basicInfoIdx][6]
    logging.info("idx:{0}, title:{1}, href:{2}, userName:{3}, doubanId:{4}, userLink:{5}, commentNum:{6}, lastTime:{7}".format(basicInfoIdx+1, title, href, userName, doubanId, userLink, commentNum, lastTime))
'''

getGroupsInfoList(queryKeywordsList)
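# A possible next step (an assumption, not part of this commit): crawl several
# list pages by stepping the "start" query parameter (page size assumed to be
# 25 topics) and pausing between requests:
#
#     for start in xrange(0, 100, 25):
#         pageUrl = "https://www.douban.com/group/HZhome/discussion?start={0}".format(start)
#         basicInfoList = getBasicInfoList(pageUrl)
#         time.sleep(1)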
