Skip to content

Commit

Permalink
[update] 修改验证计数,放松验证
Browse files Browse the repository at this point in the history
  • Loading branch information
jinghao_wb committed Sep 21, 2017
1 parent d13f41f commit c590e64
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 21 deletions.
1 change: 1 addition & 0 deletions Config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@ freeProxyFourth = 1
freeProxyFifth = 1

[HOST]
; API接口配置 http://127.0.0.1:5010
ip = 0.0.0.0
port = 5010
3 changes: 2 additions & 1 deletion Manager/ProxyManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def refresh(self):
proxy_set = set()
# fetch raw proxy
for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
if proxy.strip():
if proxy:
self.log.info('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy))
proxy_set.add(proxy.strip())

Expand Down Expand Up @@ -76,6 +76,7 @@ def getAll(self):
return self.db.getAll()

def get_status(self):
# TODO: rename this method to get_count
self.db.changeTable(self.raw_proxy_queue)
total_raw_proxy = self.db.get_status()
self.db.changeTable(self.useful_proxy_queue)
Expand Down
2 changes: 1 addition & 1 deletion Run/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from Api.ProxyApi import run as ProxyApiRun
from Schedule.ProxyValidSchedule import run as ValidRun
from Schedule.ProxyRefreshSchedule import run as RefreshRun
from Util.GetConfig import GetConfig


def run():
p_list = list()
Expand Down
16 changes: 10 additions & 6 deletions Schedule/ProxyRefreshSchedule.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,20 +47,24 @@ def validProxy(self):
"""
self.db.changeTable(self.raw_proxy_queue)
raw_proxy = self.db.pop()
self.log.info('%s start validProxy_a' % time.ctime())
self.log.info('ProxyRefreshSchedule: %s start validProxy' % time.ctime())
# 计算剩余代理,用来减少重复计算
remaining_proxies = self.db.getAll()
while raw_proxy:
if isinstance(raw_proxy, bytes):
# 兼容Py3
raw_proxy = raw_proxy.decode('utf8')

if (raw_proxy not in remaining_proxies) and validUsefulProxy(raw_proxy):
self.db.changeTable(self.useful_proxy_queue)
self.db.put(raw_proxy)
self.log.info('validProxy_a: %s validation pass' % raw_proxy)
self.log.info('ProxyRefreshSchedule: %s validation pass' % raw_proxy)
else:
self.log.debug('validProxy_a: %s validation fail' % raw_proxy)
self.log.info('ProxyRefreshSchedule: %s validation fail' % raw_proxy)
self.db.changeTable(self.raw_proxy_queue)
raw_proxy = self.db.pop()
remaining_proxies = self.db.getAll()
self.log.info('%s validProxy_a complete' % time.ctime())
self.log.info('ProxyRefreshSchedule: %s validProxy complete' % time.ctime())


def refreshPool():
Expand Down Expand Up @@ -88,9 +92,9 @@ def main(process_num=30):


def run():
# main()
main()
sched = BlockingScheduler()
sched.add_job(main, 'interval', minutes=5)
sched.add_job(main, 'interval', minutes=10) # 每10分钟抓取一次
sched.start()


Expand Down
23 changes: 13 additions & 10 deletions Schedule/ProxyValidSchedule.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,25 @@ def __validProxy(self):
self.db.changeTable(self.useful_proxy_queue)
for each_proxy in self.db.getAll():
if isinstance(each_proxy, bytes):
# 兼容PY3
each_proxy = each_proxy.decode('utf-8')

value = self.db.getvalue(each_proxy)
if validUsefulProxy(each_proxy):
# 成功计数器加1
self.db.inckey(each_proxy, 1)
self.log.debug('validProxy_b: {} validation pass'.format(each_proxy))
if value and int(value) < 1:
self.db.inckey(each_proxy, 1)
self.log.info('ProxyValidSchedule: {} validation pass'.format(each_proxy))
else:
# 失败计数器减一
self.db.inckey(each_proxy, -1)
# self.db.delete(each_proxy)
self.log.info('validProxy_b: {} validation fail'.format(each_proxy))
value = self.db.getvalue(each_proxy)
if value and int(value) < -5:
# 计数器小于-5删除该代理
self.db.delete(each_proxy)
self.log.info('validProxy_a running normal')
if value and int(value) < -5:
# 计数器小于-5删除该代理
self.db.delete(each_proxy)
else:
self.db.inckey(each_proxy, -1)
self.log.info('ProxyValidSchedule: {} validation fail'.format(each_proxy))

self.log.info('ProxyValidSchedule running normal')
sleep(60 * 1)

def main(self):
Expand Down
6 changes: 3 additions & 3 deletions Util/utilFunction.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from Util.LogHandler import LogHandler
from Util.WebRequest import WebRequest

logger = LogHandler(__name__)
logger = LogHandler(__name__, stream=False)


# noinspection PyPep8Naming
Expand Down Expand Up @@ -81,8 +81,8 @@ def validUsefulProxy(proxy):
# 超过40秒的代理就不要了
r = requests.get('https://www.baidu.com', proxies=proxies, timeout=40, verify=False)
if r.status_code == 200:
logger.debug('%s is ok' % proxy)
logger.info('%s is ok' % proxy)
return True
except Exception as e:
logger.info(e)
logger.debug(e)
return False

0 comments on commit c590e64

Please sign in to comment.