link_scrape.py
from bs4 import BeautifulSoup
import re, socket, urllib.request, urllib.parse, urllib.error
from urllib.parse import urlparse, urljoin
from collections import defaultdict
from time import time
import pickle, os
from configuration import MINE_BASE_DIRECTORY, PDF_LINKS_FILE, OUTPUT_FILE_PATH

# Create the base directory for mined files if it does not exist yet.
mine = MINE_BASE_DIRECTORY
if not os.path.exists(mine):
    os.makedirs(mine)

# PDF links are written to PDF_LINKS_FILE; visited URLs and progress are logged to OUTPUT_FILE_PATH.
miner = open(PDF_LINKS_FILE, "w+")
f = open(OUTPUT_FILE_PATH, 'w+')
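# Shared crawl state: the unvisited queue and its lookup map, the visited map,
# the collected PDF links, the host being crawled, and the running counters.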
unvisited = []
unvisited_map = {}
visited = defaultdict(lambda: 0)
pdfs = []
local_ip = ""
numPDF = 0
numUnvisited = 0
def crawl():
    global local_ip, unvisited, visited, pdfs, unvisited_map, f, numPDF, numUnvisited
    # Take one URL from the unvisited queue and mark it as visited.
    website = unvisited.pop()
    numUnvisited -= 1
    print(website, "\n")
    f.write(website + "\n")
    visited[website] = 1
    try:
        req = urllib.request.Request(website)
    except:
        return
    page = "xyz"
    # URLs that already end in ".pdf" are collected without being fetched.
    if website[-4:] == ".pdf":
        pdfs.append(website)
        numPDF += 1
        return
    try:
        response = urllib.request.urlopen(req, timeout=5)
    except:
        print('Error 404: Not Found')
        f.write('Error 404: Not Found\n')
        return
    # print('The Content-Type of the processed URL is: ' + str(response.getheader('Content-Type')))
    try:
        # Classify the response: collect PDFs, parse HTML, skip everything else.
        content_type = response.getheader('Content-Type')
        if ('text/pdf' in content_type or 'application/pdf' in content_type
                or 'application/x-pdf' in content_type):
            pdfs.append(website)
            numPDF += 1
            return
        if 'text/html' in content_type:
            page = response.read()
        else:
            return
    except:
        print("Invalid Response")
        return
    # Extract every anchor on the page and queue same-host links for crawling.
    soup = BeautifulSoup(page, 'html.parser')
    for link in soup.find_all('a'):
        path = link.get('href')
        classlink = link.get('class')
        idlink = link.get('id')
        if path is None or path == '' or path[0] == '#':
            continue
        if '\n' in path or '\r' in path:
            path = path.replace('\n', '')
            path = path.replace('\r', '')
            print('New line or carriage return character found in path')
        # Resolve relative links against the current page URL.
        if path[:4] == "http":
            abs_link = path
        else:
            abs_link = urljoin(website, path)
        if visited[abs_link] == 0:
            if abs_link[-4:] == ".pdf":
                visited[abs_link] = 1
                pdfs.append(abs_link)
                numPDF += 1
            else:
                try:
                    # Only follow links that stay on the same host as the start URL.
                    ip = urlparse(abs_link).netloc
                    if (ip == local_ip) and (abs_link not in unvisited_map):
                        unvisited.append(abs_link)
                        unvisited_map[abs_link] = 1
                        numUnvisited += 1
                except:
                    pass
def get_links(start_url):
    global unvisited, pdfs, unvisited_map, local_ip, numPDF, numUnvisited, miner
    numUnvisited = 0
    # Restrict the crawl to the host of the start URL.
    local_ip = urlparse(start_url).netloc
    unvisited = []
    unvisited.append(start_url)
    numUnvisited += 1
    unvisited_map[start_url] = 1
    scanned = 0
    while numUnvisited > 0:
        crawl()
        # Flush collected PDF links to disk in batches of 100.
        if (numPDF != 0) and numPDF % 100 == 0:
            for var in range(numPDF - 100, numPDF):
                miner.write(pdfs[var] + "\n")
        scanned += 1
        stri = ("%d PDFs found, %d links scanned, %d links still left" % (numPDF, scanned, numUnvisited))
        print(stri)
        stri = stri + "\n"
        f.write(stri)
    return pdfs
'''
Function: get_links
Collects the links of all PDF files found on the given site and also writes them to
the file specified by PDF_LINKS_FILE in configuration.py (a usage sketch follows at
the end of this file).
Parameters:
    start_url - URL of the base site from which we want to extract PDF links
Returns:
    list of all PDF links found on the specified website
Uses:
    Function - crawl()
'''
'''
Function: crawl
The main crawling routine that helps the get_links function collect all PDF links;
each call processes one URL popped from the unvisited queue.
Parameters:
    No parameters.
    All variables are already set by the script and the get_links function.
Returns:
    Nothing
Note to user:
    This is a helper function called by get_links. It is not intended to be invoked
    manually by the user.
'''
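
# Minimal usage sketch (not part of the original script): "http://example.com/" is a
# placeholder start URL; a real run would pass the site to be mined for PDF links,
# with the output locations taken from configuration.py as above.
if __name__ == "__main__":
    pdf_links = get_links("http://example.com/")
    print("%d PDF links collected" % len(pdf_links))
    miner.close()
    f.close()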