Skip to content

Commit

Permalink
[update] 代理校验修改
Browse files Browse the repository at this point in the history
  • Loading branch information
jhao104 committed Apr 3, 2018
1 parent 24a9f09 commit ae09f0d
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 24 deletions.
2 changes: 1 addition & 1 deletion DB/SsdbClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def put(self, proxy, num=1):
:param num:
:return:
"""
data = self.__conn.hincrby(self.name, proxy, num)
data = self.__conn.hset(self.name, proxy, num)
return data

def delete(self, key):
Expand Down
22 changes: 8 additions & 14 deletions Schedule/ProxyCheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from Manager.ProxyManager import ProxyManager
from Util.LogHandler import LogHandler

FAIL_COUNT = 1 # 校验失败次数, 超过次数删除代理
FAIL_COUNT = 2 # 校验失败次数, 超过次数删除代理


class ProxyCheck(ProxyManager, Thread):
Expand All @@ -34,27 +34,21 @@ def __init__(self):
def run(self):
self.db.changeTable(self.useful_proxy_queue)
while True:
proxy_item = self.db.pop()
while proxy_item:
proxy = proxy_item.get('proxy')
counter = proxy_item.get('value', 1)
for proxy, count in self.db.getAll().items():
if validUsefulProxy(proxy):
# 验证通过计数器加1
if counter and int(counter) < 1:
self.db.put(proxy, num=int(counter) + 1)
# 验证通过计数器减1
if count and int(count) > 0:
self.db.put(proxy, num=int(count) - 1)
else:
self.db.put(proxy)
pass
self.log.info('ProxyCheck: {} validation pass'.format(proxy))
else:
self.log.info('ProxyCheck: {} validation fail'.format(proxy))
# 验证失败,计数器加1
if counter and int(counter) <= FAIL_COUNT:
if count and int(count) > FAIL_COUNT:
self.log.info('ProxyCheck: {} fail too many, delete!'.format(proxy))
self.db.delete(proxy)
else:
self.db.put(proxy, num=int(counter) - 1)

proxy_item = self.db.pop()
self.db.put(proxy, num=int(count) + 1)
sleep(60 * 5)


Expand Down
6 changes: 3 additions & 3 deletions Schedule/ProxyRefreshSchedule.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,9 @@ def main(process_num=30):

def run():
main()
sched = BlockingScheduler()
sched.add_job(main, 'interval', minutes=10) # 每10分钟抓取一次
sched.start()
sch = BlockingScheduler()
sch.add_job(main, 'interval', minutes=10) # 每10分钟抓取一次
sch.start()


if __name__ == '__main__':
Expand Down
13 changes: 7 additions & 6 deletions Util/utilFunction.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from Util.LogHandler import LogHandler
from Util.WebRequest import WebRequest

logger = LogHandler(__name__, stream=False)
# logger = LogHandler(__name__, stream=False)


# noinspection PyPep8Naming
Expand All @@ -27,8 +27,9 @@ def decorate(*args, **kwargs):
try:
return func(*args, **kwargs)
except Exception as e:
logger.info(u"sorry, 抓取出错。错误原因:")
logger.info(e)
pass
# logger.info(u"sorry, 抓取出错。错误原因:")
# logger.info(e)

return decorate

Expand Down Expand Up @@ -98,10 +99,10 @@ def validUsefulProxy(proxy):
proxies = {"http": "http://{proxy}".format(proxy=proxy)}
try:
# 超过10秒的代理就不要了
r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=20, verify=False)
r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10, verify=False)
if r.status_code == 200:
logger.info('%s is ok' % proxy)
# logger.info('%s is ok' % proxy)
return True
except Exception as e:
logger.debug(e)
# logger.error(str(e))
return False

0 comments on commit ae09f0d

Please sign in to comment.