-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathscrape.py
85 lines (72 loc) · 2.73 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python
import argparse
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
import scrapy
import re, requests
import bs4 as bs
import urllib.request
from sys import argv
import pandas as pd
import random
import os
# TODO: before starting extensive scraping jobs, implement a solid proxy system (late-stage dev)
# Whitespace-separated ticker symbols typed by the user, kept as a list of strings.
tickers = input("Enter Tickers: ").split()
# Starting year typed by the user; int() raises ValueError if the text is not an integer.
year = int(input("Enter from which year to start: "))
# Function to convert Tickers to CIK numbers
def getCIK(TICKERS):
    """Look up the SEC CIK number for each ticker symbol via EDGAR.

    Args:
        TICKERS: Iterable of ticker symbols. Each one is substituted into
            the EDGAR company-search URL.

    Returns:
        dict mapping the upper-cased ticker to its CIK as a string with the
        zero padding removed (``'0000320193'`` becomes ``'320193'``).
        Tickers whose response contains no ``CIK=<10 digits>`` marker are
        left out of the result.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` when a request fails or exceeds the timeout.
    """
    URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
    CIK_RE = re.compile(r'.*CIK=(\d{10}).*')
    cik_dict = {}
    for ticker in TICKERS:
        # A timeout keeps one unresponsive request from hanging the whole
        # script. The body is read in full right away, so streaming the
        # response buys nothing.
        response = requests.get(URL.format(ticker), timeout=30)
        match = CIK_RE.search(response.text)
        if match:
            # int() strips the leading zeros of the 10-digit CIK.
            cik_dict[str(ticker).upper()] = str(int(match.group(1)))
    return cik_dict
# Reads the names of the CSV files in the ./files directory
def raw_files():
    """Return the paths of all ``.csv`` entries directly inside ``./files``.

    Each path is the directory entry joined onto ``"./files"``; entries
    appear in whatever order ``os.listdir`` yields them.
    """
    directory = "./files"
    return [
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith(".csv")
    ]
# check if provided Tickers already have CIKs and return URL
def look_up(Tickers):
    """Resolve tickers to EDGAR archive URLs, skipping already-scraped CIKs.

    Reads ``Sorted.xlsx`` (columns ``Ticker`` and ``CIK``), fetches the CIK
    of every ticker that is not listed yet via ``getCIK``, writes the
    updated table back to ``Sorted.xlsx``, and builds one URL per requested
    ticker whose CIK does not already have a file in ``./files``.

    Args:
        Tickers: Iterable of ticker symbol strings.

    Returns:
        list of ``https://www.sec.gov/Archives/edgar/data/<CIK>`` strings,
        in the order of ``Tickers``. Tickers for which no CIK is known are
        skipped with a printed message.
    """
    # read existing tickers
    existing_df = pd.read_excel('Sorted.xlsx', index_col=0)
    known = set(existing_df['Ticker'].values.tolist())

    # compare existing tickers and add non existing tickers
    new_parts = []
    for t in Tickers:
        # getCIK stores symbols upper-cased, so a lower-case request must
        # also be recognised by its upper-case form.
        if t in known or str(t).upper() in known:
            print(t + ' already exists')
            continue
        print(t + " doesn't exist")
        links = getCIK([t])
        if not links:
            # EDGAR returned no CIK; nothing to add for this ticker.
            continue
        new_parts.append(
            pd.DataFrame(list(links.items()), columns=['Ticker', 'CIK'])
        )
        # Remember the new symbol so a repeated ticker is not fetched twice.
        known.update(links)
    if new_parts:
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported equivalent.
        existing_df = pd.concat([existing_df, *new_parts], ignore_index=True)
    existing_df.to_excel('Sorted.xlsx')

    # get names of already scraped CIKs: the part of each file name before
    # the first space. basename() strips the directory on every platform,
    # unlike splitting on a backslash.
    scraped = {os.path.basename(path).split(' ')[0] for path in raw_files()}

    # get CIKs for required tickers and convert to URL
    URLs = []
    for t in Tickers:
        rows = existing_df.loc[existing_df['Ticker'] == t]
        if rows.empty:
            rows = existing_df.loc[existing_df['Ticker'] == str(t).upper()]
        if rows.empty:
            print(t + ' has no CIK, skipping')
            continue
        CIK = str(rows['CIK'].values[0])
        # Check if company data already has been scraped
        if CIK in scraped:
            print(CIK + ' already scraped')
        else:
            URLs.append('https://www.sec.gov/Archives/edgar/data/' + CIK)
    return URLs
# Start the spider with the provided links
# Single crawler process shared by every crawl, configured from the
# Scrapy project settings.
process = CrawlerProcess(get_project_settings())


def scrape(links, year):
    """Queue one 'sec' spider crawl per link, then start the crawler process.

    Args:
        links: Iterable of company URLs; each element is passed to the
            spider as ``company``.
        year: Forwarded unchanged to every crawl as ``year``.
    """
    for company_url in links:
        # NOTE(review): start_url receives the whole ``links`` collection,
        # not just the current entry - confirm the spider expects that.
        process.crawl('sec', start_url=links, company=company_url, year=year)
    # Started once, after every crawl has been queued.
    process.start()


# Resolve the entered tickers to URLs and crawl them.
scrape(links=look_up(Tickers=tickers), year=year)