# validate_wiki_links.py
import sys
import os
import time
import getopt
import re
import requests
# Globals
_WEB_LINKS = 0
_HELP_MSG = \
"""
Usage: python3 validate_wiki_links.py [OPTIONS]
This script needs to be run from the mpich/doc/ directory. It will crawl
through the mpich/doc/wiki directory and sub-directories, processing all
markdown files. Any link within the file is validated to ensure it still works.
If the link is a web link, it will check for a 200 HTTP response code. If the
link is to another file, it will validate if that file exists.
Short and Long OPTIONS:
-h, --help Display this help message
--web-links Enable validating of web links. Disabled by default
"""
_ERROR_MSG = \
"Error parsing option. Please use '--help' for a list of valid options."
_ALL_FILES = {}
# Functions
'''
crawl_dir: Crawls through the starting directory and recurses into any
sub-directory. All markdown files are collected into a list and returned.
Additionally, each file name is added to the global _ALL_FILES dictionary for
later use.
'''
def crawl_dir(dir_start):
global _ALL_FILES
files = []
directory = os.fsencode(dir_start)
for file in os.listdir(directory):
filename = os.fsdecode(file)
full_path = os.path.join(dir_start, filename)
if(os.path.isdir(full_path)):
# Recursive call for a sub-directory
files += crawl_dir(full_path)
else:
# We only want to process markdown files
if(os.path.splitext(filename)[1] == ".md"):
files.append(full_path)
# We only want to add the actual file name and extension
_ALL_FILES[os.path.basename(filename)] = 0
return files
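# Illustrative example (hypothetical layout): with wiki/Index.md and
# wiki/guides/Setup.md on disk, crawl_dir("wiki") returns both paths (in no
# particular order) and seeds _ALL_FILES with {"Index.md": 0, "Setup.md": 0}.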
'''
find_links: We look for any links of the pattern [<text>](<link>) within the
document. We return all matches that are found.
'''
def find_links(file_name):
    # Non-greedy match so multiple links on one line are found separately
    pattern = re.compile(r'\[.*?\]\(.*?\)')
    with open(file_name) as f:
        content = f.read()
    return pattern.findall(content)
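# Illustrative example (hypothetical file contents): for a page containing
# "See the [FAQ](faq.md).", find_links() returns ['[FAQ](faq.md)'] for
# parse_links() to split apart.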
'''
parse_links: We process the found links and split them into their respective
<text> and <link> sections. The <link> content is aggregated and returned.
'''
def parse_links(links):
    parsed = []
for link in links:
        # Keep only the <link> portion between the parentheses; the [<text>]
        # portion is not needed for validation.
        link = link[link.find("(")+1:link.find(")")]
# In Markdown you can have a link such as
# ["my link"](<link> "wikilink")
# We still want to process those links, and just skip the remaining
# text
if len(link.split()) > 1:
link = link.split()[0]
        parsed.append(link)
    return parsed
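# Illustrative example: parse_links(['[Home](Index.md "wikilink")']) returns
# ['Index.md']; the quoted "wikilink" title after the target is dropped.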
'''
validate_links: Here we call the previous two functions to get all of our links
properly parsed and collected. Once we have the list we then check whether the
link is a web link, mailto link, or a file link.
- Web links are checked for a 200 HTTP response code to be considered valid.
- Mailto links are skipped.
- Files are checked to see if they exist in their relative locations.
We return a dictionary of each file and any broken links attached to it.
Additionally we count each time a file is linked to and store that information
in the global _ALL_FILES dictionary.
'''
def validate_links(file_name):
global _ALL_FILES
global _WEB_LINKS
    web_link_pattern = re.compile(r'https?://.*')
    mailto_pattern = re.compile(r'mailto:.*@.*\.[a-zA-Z0-9]+')
ret = {"File": file_name, "Broken": []}
links = parse_links(find_links(file_name))
for link in links:
# Web links
if web_link_pattern.match(link):
            # If we don't have web link checking enabled, skip it
if not _WEB_LINKS:
continue
            try:
                # Follow redirects and bound the request so a slow or moved
                # page is not flagged spuriously or left to hang the run.
                response = requests.head(link, allow_redirects=True, timeout=10)
                if not response.status_code == 200:
                    ret["Broken"].append(link)
            except requests.exceptions.RequestException:
                ret["Broken"].append(link)
# Mailto links - skip these by default, no easy way to validate
elif mailto_pattern.match(link):
continue
# File links
else:
            # Anchor links within the same page start with '#'; skip them
            if link.startswith('#'):
continue
# If file exists in our _ALL_FILES dictionary, increase its link
# count by 1.
if os.path.basename(link) in _ALL_FILES:
_ALL_FILES[os.path.basename(link)] += 1
directory = os.path.dirname(file_name)
file_path = os.path.join(directory, link)
if not os.path.exists(file_path):
ret["Broken"].append(file_path)
return ret
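# Illustrative result shape (hypothetical paths): a page wiki/Index.md that
# points at a missing file yields
# {"File": "wiki/Index.md", "Broken": ["wiki/missing_page.md"]}.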
'''
parse_arguments: Parse command line arguments
'''
def parse_arguments(argv):
global _HELP_MSG
global _ERROR_MSG
global _WEB_LINKS
try:
opts, args = getopt.getopt(argv, "h", ["help", "web-links"])
    except getopt.GetoptError:
        print(_ERROR_MSG)
        sys.exit(1)
    if args:
        print(_ERROR_MSG)
        sys.exit(1)
for opt, arg in opts:
# Output help message
if opt in ("-h", "--help"):
print(_HELP_MSG)
sys.exit()
# Used to enable checking web links
        elif opt == "--web-links":
            _WEB_LINKS = 1
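# Illustrative behavior: parse_arguments(["--web-links"]) sets _WEB_LINKS to 1,
# "-h"/"--help" prints _HELP_MSG and exits, and any stray positional argument
# prints _ERROR_MSG and exits.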
# Main
def main(argv):
    global _ALL_FILES
links = []
parse_arguments(argv)
all_files = crawl_dir("wiki")
for file in all_files:
links.append(validate_links(file))
print("==== Broken Links ====")
for link in links:
if link["Broken"]:
print("File: ", link["File"])
for b in link["Broken"]:
print(" Link: ", b)
print()
print()
print("==== Unlinked Files ====")
for f in _ALL_FILES:
if _ALL_FILES[f] == 0:
print(f)
if __name__ == "__main__":
main(sys.argv[1:])