Commit: struct
jhao104 committed Jun 24, 2020
1 parent 12477f8 commit 8791f86

Showing 11 changed files with 161 additions and 95 deletions.
24 changes: 23 additions & 1 deletion test/testDbClient.py
@@ -10,4 +10,26 @@
2020/6/23:
-------------------------------------------------
"""
__author__ = 'JHao'
__author__ = 'JHao'

from db.DbClient import DbClient

if __name__ == '__main__':
# ############### ssdb ###############
ssdb_uri = "ssdb://:password@127.0.0.1:8888"
s = DbClient.parseDbConn(ssdb_uri)
assert s.db_type == "SSDB"
assert s.db_pwd == "password"
assert s.db_host == "127.0.0.1"
assert s.db_port == 8888

# ############### redis ###############
redis_uri = "redis://:password@127.0.0.1:6379/1"
r = DbClient.parseDbConn(redis_uri)
assert r.db_type == "REDIS"
assert r.db_pwd == "password"
assert r.db_host == "127.0.0.1"
assert r.db_port == 6379
assert r.db_name == "1"

print("DbClient ok!")
4 changes: 2 additions & 2 deletions test/testGetFreeProxy.py
@@ -13,8 +13,8 @@
__author__ = 'J_hao'


from ProxyGetter.getFreeProxy import GetFreeProxy
from Config.ConfigGetter import config
from fetcher.getFreeProxy import GetFreeProxy
from config.ConfigGetter import config


def testGetFreeProxy():
2 changes: 1 addition & 1 deletion test/testLogHandler.py
@@ -12,7 +12,7 @@
"""
__author__ = 'J_hao'

from Util.LogHandler import LogHandler
from util.LogHandler import LogHandler


# noinspection PyPep8Naming
2 changes: 1 addition & 1 deletion test/testProxyClass.py
@@ -13,7 +13,7 @@
__author__ = 'JHao'

import json
from ProxyHelper import Proxy
from helper import Proxy


def testProxyClass():
2 changes: 1 addition & 1 deletion test/testWebRequest.py
@@ -12,7 +12,7 @@
"""
__author__ = 'J_hao'

from Util.WebRequest import WebRequest
from util.WebRequest import WebRequest


# noinspection PyPep8Naming
124 changes: 62 additions & 62 deletions util/LogHandler.py
@@ -11,23 +11,23 @@
2017/9/21: stream/file output selectable (both enabled by default)
-------------------------------------------------
"""
__author__ = 'JHao'
# __author__ = 'JHao'

import os

import logging

from logging.handlers import TimedRotatingFileHandler

# log levels
CRITICAL = 50
FATAL = CRITICAL
ERROR = 40
WARNING = 30
WARN = WARNING
INFO = 20
DEBUG = 10
NOTSET = 0
# # log levels
# CRITICAL = 50
# FATAL = CRITICAL
# ERROR = 40
# WARNING = 30
# WARN = WARNING
# INFO = 20
# DEBUG = 10
# NOTSET = 0

CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
ROOT_PATH = os.path.join(CURRENT_PATH, os.pardir)
@@ -44,59 +44,59 @@ class LogHandler(logging.Logger):
LogHandler
"""

def __init__(self, name, level=DEBUG, stream=True, file=True):
self.name = name
self.level = level
logging.Logger.__init__(self, self.name, level=level)
if stream:
self.__setStreamHandler__()
if file:
self.__setFileHandler__()
# def __init__(self, name, level=DEBUG, stream=True, file=True):
# self.name = name
# self.level = level
# logging.Logger.__init__(self, self.name, level=level)
# if stream:
# self.__setStreamHandler__()
# if file:
# self.__setFileHandler__()

def __setFileHandler__(self, level=None):
"""
set file handler
:param level:
:return:
"""
file_name = os.path.join(LOG_PATH, '{name}.log'.format(name=self.name))
# configure log rotation: stored in the log directory, one file per day, keep 15 days
file_handler = TimedRotatingFileHandler(filename=file_name, when='D', interval=1, backupCount=15)
file_handler.suffix = '%Y%m%d.log'
if not level:
file_handler.setLevel(self.level)
else:
file_handler.setLevel(level)
formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')

file_handler.setFormatter(formatter)
self.file_handler = file_handler
self.addHandler(file_handler)

def __setStreamHandler__(self, level=None):
"""
set stream handler
:param level:
:return:
"""
stream_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
stream_handler.setFormatter(formatter)
if not level:
stream_handler.setLevel(self.level)
else:
stream_handler.setLevel(level)
self.addHandler(stream_handler)

def resetName(self, name):
"""
reset name
:param name:
:return:
"""
self.name = name
self.removeHandler(self.file_handler)
self.__setFileHandler__()
# def __setFileHandler__(self, level=None):
# """
# set file handler
# :param level:
# :return:
# """
# file_name = os.path.join(LOG_PATH, '{name}.log'.format(name=self.name))
# # configure log rotation: stored in the log directory, one file per day, keep 15 days
# file_handler = TimedRotatingFileHandler(filename=file_name, when='D', interval=1, backupCount=15)
# file_handler.suffix = '%Y%m%d.log'
# if not level:
# file_handler.setLevel(self.level)
# else:
# file_handler.setLevel(level)
# formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
#
# file_handler.setFormatter(formatter)
# self.file_handler = file_handler
# self.addHandler(file_handler)
#
# def __setStreamHandler__(self, level=None):
# """
# set stream handler
# :param level:
# :return:
# """
# stream_handler = logging.StreamHandler()
# formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
# stream_handler.setFormatter(formatter)
# if not level:
# stream_handler.setLevel(self.level)
# else:
# stream_handler.setLevel(level)
# self.addHandler(stream_handler)
#
# def resetName(self, name):
# """
# reset name
# :param name:
# :return:
# """
# self.name = name
# self.removeHandler(self.file_handler)
# self.__setFileHandler__()


if __name__ == '__main__':
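
For context, the rotation settings shown above come straight from the standard library's TimedRotatingFileHandler: one file per day, the 15 most recent files kept, rotated files suffixed with a date. A minimal standalone sketch of the same policy (the file name and logger name are placeholders):

import logging
from logging.handlers import TimedRotatingFileHandler

# One log file per day, keep the 15 most recent, suffix rotated files like 20200624.log.
logger = logging.getLogger("demo")
logger.setLevel(logging.DEBUG)

file_handler = TimedRotatingFileHandler("demo.log", when="D", interval=1, backupCount=15)
file_handler.suffix = "%Y%m%d.log"
file_handler.setFormatter(logging.Formatter(
    "%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s"))
logger.addHandler(file_handler)

logger.info("rotating file logging configured")
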
6 changes: 3 additions & 3 deletions util/__init__.py
@@ -11,6 +11,6 @@
-------------------------------------------------
"""

from Util.utilFunction import validUsefulProxy
from Util.LogHandler import LogHandler
from Util.utilClass import Singleton
# from util.utilFunction import validUsefulProxy
# from util.LogHandler import LogHandler
# from util.utilClass import Singleton
19 changes: 16 additions & 3 deletions util/singleton.py
@@ -4,10 +4,23 @@
File Name: singleton
Description :
Author : JHao
date: 2020/6/22
date: 2016/12/3
-------------------------------------------------
Change Activity:
2020/6/22:
2016/12/3:
-------------------------------------------------
"""
__author__ = 'JHao'
__author__ = 'JHao'


class Singleton(type):
"""
Singleton Metaclass
"""

_inst = {}

def __call__(cls, *args, **kwargs):
if cls not in cls._inst:
cls._inst[cls] = super(Singleton, cls).__call__(*args)
return cls._inst[cls]
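
A short usage sketch of the metaclass above (the Config class and its argument are made up for illustration): every construction after the first returns the cached instance. Note that __call__ forwards only positional arguments to the real constructor, so keyword arguments are accepted but not passed through.

class Config(metaclass=Singleton):
    def __init__(self, path="config.ini"):
        self.path = path


first = Config("settings.ini")
second = Config()            # constructor body is skipped; the cached instance is returned
assert first is second
assert second.path == "settings.ini"
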
4 changes: 2 additions & 2 deletions util/utilFunction.py
@@ -14,9 +14,9 @@
from lxml import etree
import requests

from Util.WebRequest import WebRequest
from util.WebRequest import WebRequest
from .validators import validators
from Config.ConfigGetter import config
from config.ConfigGetter import config


def robustCrawl(func):
28 changes: 24 additions & 4 deletions util/validators.py
@@ -1,5 +1,10 @@
# -*- coding: utf-8 -*-

import requests
from re import findall
from handler.configHandler import ConfigHandler

conf = ConfigHandler()
validators = []


@@ -8,18 +8,33 @@ def validator(func):
return func


@validator
def formatValidator(proxy):
"""
check proxy format
:param proxy:
:return:
"""
verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}"
_proxy = findall(verify_regex, proxy)
return True if len(_proxy) == 1 and _proxy[0] == proxy else False


@validator
def timeOutValidator(proxy):
"""
check timeout
:param proxy:
:return:
"""
if isinstance(proxy, bytes):
proxy = proxy.decode("utf8")
proxies = {"http": "http://{proxy}".format(proxy=proxy)}

proxies = {"http": "http://{proxy}".format(proxy=proxy), "https": "https://{proxy}".format(proxy=proxy)}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
'Accept': '*/*',
'Connection': 'keep-alive',
'Accept-Language': 'zh-CN,zh;q=0.8'}
try:
r = requests.get('http://www.baidu.com', proxies=proxies, timeout=10, verify=False)
r = requests.head(conf.verifyUrl, headers=headers, proxies=proxies, timeout=conf.verifyTimeout, verify=False)
if r.status_code == 200:
return True
except Exception as e:
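
The @validator decorator above simply appends each check to the module-level validators list, so other code can iterate over every registered check. A self-contained sketch of that registry pattern (function and variable names here are illustrative, not the project's API):

import re

validators = []

def validator(func):
    # Register the decorated check and return it unchanged.
    validators.append(func)
    return func

@validator
def format_check(proxy):
    # Same idea as formatValidator above: accept only an "ip:port" shaped string.
    return re.fullmatch(r"\d{1,3}(\.\d{1,3}){3}:\d{1,5}", proxy) is not None

def verify(proxy):
    # A proxy is considered usable only if every registered check passes.
    return all(check(proxy) for check in validators)

print(verify("127.0.0.1:8080"))  # True
print(verify("not a proxy"))     # False
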
41 changes: 26 additions & 15 deletions util/webRequest.py
@@ -13,14 +13,22 @@
__author__ = 'J_hao'

from requests.models import Response
from lxml import etree
import requests
import random
import time

from handler.logHandler import LogHandler

requests.packages.urllib3.disable_warnings()


class WebRequest(object):
name = "web_request"

def __init__(self, *args, **kwargs):
pass
self.log = LogHandler(self.name, file=False)
self.response = Response()

@property
def user_agent(self):
@@ -51,35 +51,38 @@ def header(self):
'Connection': 'keep-alive',
'Accept-Language': 'zh-CN,zh;q=0.8'}

def get(self, url, header=None, retry_time=5, timeout=30,
retry_flag=list(), retry_interval=5, *args, **kwargs):
def get(self, url, header=None, retry_time=5, retry_interval=5, timeout=30, *args, **kwargs):
"""
get method
:param url: target url
:param header: headers
:param retry_time: retry time when network error
:param retry_time: retry time
:param retry_interval: retry interval
:param timeout: network timeout
:param retry_flag: if retry_flag in content. do retry
:param retry_interval: retry interval(second)
:param args:
:param kwargs:
:return:
"""
headers = self.header
if header and isinstance(header, dict):
headers.update(header)
while True:
try:
html = requests.get(url, headers=headers, timeout=timeout, **kwargs)
if any(f in html.content for f in retry_flag):
raise Exception
return html
self.response = requests.get(url, headers=headers, timeout=timeout, *args, **kwargs)
return self
except Exception as e:
print(e)
self.log.error("requests: %s error: %s" % (url, str(e)))
retry_time -= 1
if retry_time <= 0:
# all retries failed
resp = Response()
resp.status_code = 200
return resp
return self
self.log.info("retry %s second after" % retry_interval)
time.sleep(retry_interval)

@property
def tree(self):
return etree.HTML(self.response.content)

@property
def text(self):
return self.response.text

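
With this change, get() returns the WebRequest instance itself instead of a bare requests response, so fetcher code can chain into the new tree and text properties. A rough usage sketch, assuming the repository's packages (util.webRequest, handler.logHandler) are importable and using an illustrative URL:

from util.webRequest import WebRequest

wr = WebRequest()
resp = wr.get("http://example.com", retry_time=3, retry_interval=2, timeout=10)

print(resp.response.status_code)      # underlying requests.Response object
print(resp.text[:80])                 # body text via the new text property
links = resp.tree.xpath("//a/@href")  # lxml element tree via the new tree property
print(links)
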