Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
Spark3122 authored Feb 11, 2019
1 parent 27712bf commit f186b14
Show file tree
Hide file tree
Showing 20 changed files with 23,176 additions and 0 deletions.
123 changes: 123 additions & 0 deletions multifactor/multifactor/crawler/crawFinance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import pymysql.cursors
from tools import HttpApi
import json
import logging
import time
# Connect to the database
# NOTE(review): credentials are hardcoded; move them to config/env vars
# before sharing or deploying this script.
connection = pymysql.connect(host='localhost',
user='root',
password='123456',
db='finance',
charset='utf8mb4')


def insert(args):
    """Insert one financial-report row into finance_rp.

    args: a 35-tuple — stock_code, date, then the 33 main-indicator
          columns in exactly the order listed in the SQL below.
    """
    try:
        with connection.cursor() as cursor:
            # Create a new record
            sql = "INSERT INTO finance_rp (stock_code, date, jbmgsy, kfmgsy, xsmgsy, mgjzc,mggjj, mgwfply,mgjyxjl, yyzsr, mlr, gsjlr,kfjlr ,yyzsrtbzz, gsjlrtbzz, kfjlrtbzz, yyzsrgdhbzz, gsjlrgdhbzz, kfjlrgdhbzz, jqjzcsyl, tbjzcsyl, tbzzcsyl, mll, jll, sjsl, yskyysr, xsxjlyysr, jyxjlyysr, zzczzy, yszkzzts, chzzts, zcfzl, ldzczfz, ldbl, sdbl) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            cursor.execute(sql, args)
        # pymysql connections are not autocommit by default, so commit
        # explicitly to persist the row.
        connection.commit()
    except Exception:
        # Was a bare `except:` that also caught KeyboardInterrupt/SystemExit
        # and logged no detail; log the full traceback instead.
        logging.exception("error insert")
    # The shared module-level connection is intentionally left open for
    # subsequent inserts.


def query(idarg):
    """Fetch and print the finance_rp row with the given id.

    NOTE: closes the shared module-level connection in `finally`, so no
    further database calls can be made on `connection` afterwards.
    """
    try:
        with connection.cursor() as cursor:
            # MySQL identifiers must be quoted with backticks, not single
            # quotes: 'finance_rp' / 'id' were string literals, making the
            # WHERE clause compare the constant string 'id' instead of the
            # id column.
            sql = "SELECT * FROM `finance_rp` WHERE `id`=%s"
            cursor.execute(sql, (idarg,))
            result = cursor.fetchone()
            print(result)
    finally:
        connection.close()

def queryStocks():
    """Return every (code, name) pair from the stocks table."""
    try:
        with connection.cursor() as cursor:
            cursor.execute("SELECT code,name FROM stocks")
            return cursor.fetchall()
    finally:
        # The shared module-level connection is deliberately kept open
        # for later calls.
        pass


def crawFinance2DbYear(code):
    """Crawl the yearly (按年) main financial indicators for one stock
    from eastmoney and insert one finance_rp row per reporting period.
    """
    # Column order must match the INSERT statement in insert().
    keys = ['date', 'jbmgsy', 'kfmgsy', 'xsmgsy', 'mgjzc', 'mggjj',
            'mgwfply', 'mgjyxjl', 'yyzsr', 'mlr', 'gsjlr', 'kfjlr',
            'yyzsrtbzz', 'gsjlrtbzz', 'kfjlrtbzz', 'yyzsrgdhbzz',
            'gsjlrgdhbzz', 'kfjlrgdhbzz', 'jqjzcsyl', 'tbjzcsyl',
            'tbzzcsyl', 'mll', 'jll', 'sjsl', 'yskyysr', 'xsxjlyysr',
            'jyxjlyysr', 'zzczzy', 'yszkzzts', 'chzzts', 'zcfzl',
            'ldzczfz', 'ldbl', 'sdbl']
    # Codes whose first digit is 0-3 trade on Shenzhen, the rest Shanghai.
    market = "SZ" if int(code[0:1]) <= 3 else "SH"
    content = HttpApi.httpGet2(u'http://emweb.securities.eastmoney.com/NewFinanceAnalysis/MainTargetAjax?ctype=4&type=1&code='+market+code)
    for record in content:
        row = [str(code)] + [str(record[key]) for key in keys]
        insert(tuple(row))

def crawFinance2Db(code):
    """Crawl main financial indicators (type=0 variant of the eastmoney
    endpoint) for one stock and insert one finance_rp row per period.

    NOTE(review): original comment said 按年维度 (yearly), same as
    crawFinance2DbYear, but the URL uses type=0 instead of type=1 —
    presumably a different period granularity; confirm.
    """
    # Column order must match the INSERT statement in insert().
    keys = ['date', 'jbmgsy', 'kfmgsy', 'xsmgsy', 'mgjzc', 'mggjj',
            'mgwfply', 'mgjyxjl', 'yyzsr', 'mlr', 'gsjlr', 'kfjlr',
            'yyzsrtbzz', 'gsjlrtbzz', 'kfjlrtbzz', 'yyzsrgdhbzz',
            'gsjlrgdhbzz', 'kfjlrgdhbzz', 'jqjzcsyl', 'tbjzcsyl',
            'tbzzcsyl', 'mll', 'jll', 'sjsl', 'yskyysr', 'xsxjlyysr',
            'jyxjlyysr', 'zzczzy', 'yszkzzts', 'chzzts', 'zcfzl',
            'ldzczfz', 'ldbl', 'sdbl']
    # Codes whose first digit is 0-3 trade on Shenzhen, the rest Shanghai.
    market = "SZ" if int(code[0:1]) <= 3 else "SH"
    content = HttpApi.httpGet2(u'http://emweb.securities.eastmoney.com/NewFinanceAnalysis/MainTargetAjax?ctype=4&type=0&code='+market+code)
    for record in content:
        row = [str(code)] + [str(record[key]) for key in keys]
        insert(tuple(row))

if __name__ == '__main__':
    # stocks is a sequence of (code, name) rows,
    # e.g. (('000001', '平安银行'), ('000002', '万 科A'), ...)
    stocks = queryStocks()
    codes = [stock[0] for stock in stocks]

    for count, stock in enumerate(stocks, start=1):
        time.sleep(0.1)  # throttle requests to the remote endpoint
        crawFinance2Db(stock[0])
        logging.debug("-------------crawFinance2Db-end------------"+stock[0]+" count:"+str(count))

    print(codes)
    logging.debug("--------------end------------")
154 changes: 154 additions & 0 deletions multifactor/multifactor/crawler/crawSinaIndustry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# -*- coding: utf-8 -*-

# 导入必要模块
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import VARCHAR
import json as json
from pandas.core.frame import DataFrame
from tools import HttpApi
import json
import logging
import time
import pymysql.cursors

# Initialize the SQLAlchemy engine (via pymysql) used by pandas.to_sql.
# NOTE(review): the original comment described root/147369@3306/mydb, which
# does not match the URL below (root/123456, db "finance") — it was stale.
engine = create_engine('mysql+pymysql://root:123456@localhost:3306/finance')

# Connect to the database — raw pymysql connection used by insert() and
# the query helpers.
# NOTE(review): this password ('dai693122') differs from the engine URL
# above ('123456') — confirm which credential set is current.
connection = pymysql.connect(host='localhost',
user='root',
password='dai693122',
db='finance',
charset='utf8mb4')

def makeTopNode(jnode,allNodes,industryName):
    """Flatten one branch of Sina's nested industry config into rows.

    Appends to allNodes rows of the form
    [indcode, indname, level, parent_indcode, parent_indname, industryName].
    Top-level nodes get level 0 with "-1" parents; children of a length-4
    node get level 1 with that node as parent.

    NOTE(review): node layouts are inferred from the indexing below
    (len<=3 -> code at [2], name at [0]; len==4 -> code at [3], child list
    at [1]) — confirm against sina_config_data.txt.
    """
    curLevel=0
    for childNode in jnode:
        if(len(childNode)<=3):
            # Leaf-style top node: no child list present.
            curCode ,curName = childNode[2],childNode[0]
            allNodes.append([curCode,curName,curLevel,"-1","-1", industryName])
        elif (len(childNode)==4):
            # Top node with a child list at index 1.
            curCode ,curName = childNode[3],childNode[0]
            allNodes.append([curCode,curName,curLevel,"-1","-1", industryName])
            for childNode2 in childNode[1]:
                if(isinstance(childNode2,list) and len(childNode2)<=3):
                    allNodes.append([childNode2[2],childNode2[0],curLevel+1,curCode ,curName, industryName])
    # makeNode(curCode,curName,childNode,curLevel+1,allNodes)

def save2DB2():
    """Parse Sina's industry config file and append the 申万二级
    (Shenwan level-2) industry tree to the `industry` table.

    NOTE(review): hardcoded local path — works only on the author's machine.
    """
    with open("/Users/admin/Desktop/doc/finance/multifactor/data/industry/sina_config_data.txt",'r') as f:
        # The dump escapes single quotes; undo that before JSON parsing.
        configstr = f.read().replace("\\'", "'")
    ldict = json.loads(configstr)
    # Index path [1][0][1][2][1] selects the Shenwan level-2 branch of the
    # config structure — confirm against the actual file layout.
    ind = ldict[1][0][1][2][1]
    allNodes = []
    makeTopNode(ind, allNodes, "申万二级")
    # (removed dead code: an unused `data = {'indcode'}` set literal)
    pdind = DataFrame(allNodes)
    pdind.columns = ['indcode','indname','level','par_indcode','par_indname','classname']
    pdind.to_sql('industry', engine, if_exists='append')

def save2DB():
    """Parse Sina's industry config file and append the 热门概念
    (hot concepts) tree to the `industry` table.

    NOTE(review): hardcoded local path — works only on the author's machine.
    """
    with open("/Users/admin/Desktop/doc/finance/multifactor/data/industry/sina_config_data.txt",'r') as f:
        # The dump escapes single quotes; undo that before JSON parsing.
        configstr = f.read().replace("\\'", "'")
    ldict = json.loads(configstr)
    # Index path [1][0][1][3][1] selects the hot-concepts branch of the
    # config structure — confirm against the actual file layout.
    ind = ldict[1][0][1][3][1]
    allNodes = []
    makeTopNode(ind, allNodes, "热门概念")
    # (removed dead code: an unused `data = {'indcode'}` set literal)
    pdind = DataFrame(allNodes)
    pdind.columns = ['indcode','indname','level','par_indcode','par_indname','classname']
    pdind.to_sql('industry', engine, if_exists='append')


import traceback
def insert(args):
    """Insert one (code, name, industry, industry_code, pe) row into stocks.

    args: 5-tuple matching the column list in the SQL below.
    """
    try:
        with connection.cursor() as cursor:
            # Create a new record
            sql = "INSERT INTO stocks (code, name, industry, industry_code, pe) VALUES (%s, %s, %s, %s, %s)"
            cursor.execute(sql, args)
        # pymysql connections are not autocommit by default, so commit
        # explicitly to persist the row.
        connection.commit()
    except Exception:
        # traceback.print_exc() already writes the traceback to stderr and
        # returns None; wrapping it in print() additionally printed "None".
        traceback.print_exc()
    # The shared module-level connection is intentionally left open.



def queryIndustrys():
    """Return all (indcode, indname) pairs, ordered by indcode ascending."""
    try:
        with connection.cursor() as cursor:
            cursor.execute("select indcode,indname from industry ORDER BY indcode asc ")
            return cursor.fetchall()
    finally:
        # The shared module-level connection is deliberately kept open.
        pass




def crawIndustry2DBForStock(indcode,indname):
    """Fetch every stock listed under one Sina industry node and insert
    each as a stocks row tagged with (indname, indcode).
    """
    content = HttpApi.httpGet2(u'http://vip.stock.finance.sina.com.cn/quotes_service/api/json_v2.php/Market_Center.getHQNodeData?page=1&num=1000&sort=symbol&asc=1&node='+indcode+'&symbol=&_s_r_a=page')
    if content is None:
        logging.error("null content:"+indcode)
        return
    for item in content:
        # pe column left blank here — presumably filled by another job.
        insert((str(item['code']), str(item['name']), indname, indcode, ""))


def crawIndustryStocks2DB():
    """Walk every industry row and crawl its member stocks into `stocks`.

    Everything before industry code "chgn_730016" is skipped — a manual
    resume checkpoint from an interrupted earlier run (that row itself
    is processed).
    """
    started = False
    count = 0
    for indcode, indname in queryIndustrys():
        if indcode == "chgn_730016":
            started = True
        if not started:
            continue
        time.sleep(2)  # throttle requests to the remote endpoint
        crawIndustry2DBForStock(indcode, indname)
        count += 1
        logging.debug("-------------crawIndustryStocks2DB-end------------"+indcode+" count:"+str(count))


def main():
    """Entry point: crawl member stocks for every industry in the DB."""
    crawIndustryStocks2DB()


if __name__ == '__main__':
    main()

19 changes: 19 additions & 0 deletions multifactor/multifactor/crawler/tushareTest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import tushare as ts

print(ts.__version__)

# Output directory for the CSV dumps (must already exist).
# Renamed from `dir`, which shadowed the `dir` builtin.
data_dir = "/data/finance/"

# Basic listing info for every stock.
stocks = ts.get_stock_basics()
stocks.to_csv(data_dir + "stocks")

# Industry classification table.
industry = ts.get_industry_classified()
industry.to_csv(data_dir + "industry")

# Concept (theme) classification table.
concepts = ts.get_concept_classified()
concepts.to_csv(data_dir + "concepts")
86 changes: 86 additions & 0 deletions multifactor/multifactor/knowledgemap/crawKnowledge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import pymysql.cursors
# from tools import HttpApi
import json
import logging
import time
# Connect to the database
# NOTE(review): credentials are hardcoded; move them to config/env vars
# before sharing or deploying this script.
connection = pymysql.connect(host='localhost',
user='root',
password='123456',
db='finance',
charset='utf8mb4')


import requests
# import logging
import json
import demjson

from selenium import webdriver
# Module-level Chrome driver shared by crawKForcode.
# NOTE(review): hardcoded local chromedriver path — machine-specific.
browser = webdriver.Chrome("/Users/admin/Desktop/soft/chromedriver")

def crawKForcode(code,filename):
    """Crawl the iwencai knowledge-graph page for one stock code and save
    the rendered page source to `filename`.
    """
    print("-------------begin------"+code+" ---"+filename)
    url =u'https://www.iwencai.com/diag/block-detail?pid=11666&codes='+code+'&codeType=stock&info={"view":{"nolazy":1,"parseArr":{"_v":"new","dateRange":[],"staying":[],"queryCompare":[],"comparesOfIndex":[]},"asyncParams":{"tid":137}}}'
    browser.get(url)
    source = browser.page_source
    # Undo \uXXXX escaping embedded in the page's JSON payload.
    # NOTE(review): the utf-8 -> unicode_escape round trip can mangle
    # non-ASCII text — confirm the saved files are readable.
    source = source.encode('utf-8').decode('unicode_escape')
    print(source)
    with open(filename, "w") as f:
        f.write(str(source))
    # (removed redundant f.close() — the with-block already closes the file)
    print("-------------end------")


def queryStocks():
    """Return every (code, name) pair from the stocks table."""
    try:
        with connection.cursor() as cursor:
            cursor.execute("SELECT code,name FROM stocks")
            return cursor.fetchall()
    finally:
        # The shared module-level connection is deliberately kept open.
        pass

def queryDStocks():
    """Return all distinct stock codes, ascending, as 1-tuples."""
    try:
        with connection.cursor() as cursor:
            cursor.execute("SELECT DISTINCT(code) FROM stocks ORDER BY code asc")
            return cursor.fetchall()
    finally:
        # The shared module-level connection is deliberately kept open.
        pass

if __name__ == '__main__':
    # Distinct stock codes, each row a 1-tuple.
    stocks = queryDStocks()
    codes = [row[0] for row in stocks]
    print(len(codes))

    # Renamed from `dir`, which shadowed the builtin of the same name.
    out_dir = "/data/knowledge/"

    for count, row in enumerate(stocks, start=1):
        time.sleep(0.5)  # throttle page loads
        crawKForcode(row[0], out_dir + row[0] + ".txt")
        logging.debug("-------------crawKnowledge2Db-end------------"+row[0]+" count:"+str(count))

    print(codes)
    logging.debug("--------------end------------")
Loading

0 comments on commit f186b14

Please sign in to comment.