-
Notifications
You must be signed in to change notification settings - Fork 0
/
smarttraveller_to_mobi.py
178 lines (155 loc) · 7.73 KB
/
smarttraveller_to_mobi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
'''
Smart Traveller information to mobi script.
This script relies on ebook-convert, the command-line converter bundled with calibre.
http://calibre-ebook.com/
Once the web designers at smarttraveller.gov.au update their website, this script will have to be rewritten to work again.
- Hoz, October 2013.
'''
import argparse
import codecs
import lxml.html
import lxml.html.clean
import os
import pickle
import time
import urllib
# Some basic settings
# Site root; prepended to the relative hrefs scraped from the country list page.
BASE_HTTP = 'http://www.smarttraveller.gov.au'
# Page that find_country_list() scrapes when no usable pickle cache exists.
COUNTRY_LIST_URL = BASE_HTTP + '/zw-cgi/view/Advice/'
# Pickle cache of the scraped country list (relative path, resolved against the working directory).
COUNTRY_LIST_FILE = 'country_list.pickle'
# Directory holding one cleaned html file per country; created by the main block if missing.
SAVE_DIR = 'country_html'
# NOTE(review): TOC_HTML is not referenced anywhere in the visible script.
TOC_HTML = os.path.join(SAVE_DIR, '_toc.html')
# Combined html document written by build_big_file() and handed to ebook-convert.
MAIN_HTML = os.path.join(SAVE_DIR, '_main.html')
# NOTE(review): TOC_HEADER_FILE is not referenced anywhere in the visible script.
TOC_HEADER_FILE = "toc_header.html"
# Output ebook name; the month/year suffix is computed once, when the module is loaded.
EBOOK_OUTPUT_FILE = "smarttraveller%s.mobi" % time.strftime("%b%Y")
# NOTE(review): EBOOK_COVER is not referenced anywhere in the visible script.
EBOOK_COVER = "cover.jpg"
# Path to calibre's ebook-convert executable.
# NOTE(review): this looks like the default macOS install location - adjust on other systems.
EBOOK_CONVERT = '/Applications/calibre.app/Contents/MacOS/ebook-convert'
def goto_website_return_html(url):
    '''
    Fetch url and return the raw response body.

    Parameters:
        url: address handed straight to urllib.urlopen.

    Returns:
        Whatever the handle's read() yields, i.e. the undecoded page content.

    Any error raised by urllib.urlopen or read() propagates to the caller.
    '''
    url_handler = urllib.urlopen(url)
    try:
        return url_handler.read()
    finally:
        # Always release the connection, even when read() fails part-way.
        url_handler.close()
def _fetch_country_list():
    '''
    Scrape COUNTRY_LIST_URL and build the country dictionary from its 'topicRow' elements.

    Returns:
        dict mapping country name -> dict with the keys 'url', 'issue_date',
        'safe_name' and 'file_name'. Empty when the page has no 'topicRow' elements.
    '''
    html = goto_website_return_html(COUNTRY_LIST_URL)
    root = lxml.html.fromstring(html)
    country_list = {}
    for element in root.find_class('topicRow'):
        country = element.find_class('hidden')[0].text
        href = BASE_HTTP + element.find_class('topicTitle')[0].get('href')
        issue_date = element.find_class('issueDate')[0].text
        if issue_date:
            # Reformat dd/mm/YYYY into e.g. "05 Oct 2013" for display in the table of contents.
            issue_date = time.strftime('%d %b %Y', time.strptime(issue_date, '%d/%m/%Y'))
        # The last path segment of the link doubles as file name stem and html anchor id.
        safe_name = href.split('/')[-1]
        country_list[country] = {
            'url': href,
            'issue_date': issue_date,
            'safe_name': safe_name,
            'file_name': os.path.join(SAVE_DIR, safe_name + '.html'),
        }
    return country_list


def find_country_list(filename):
    '''
    Return the country list, preferring the pickle cache stored at filename.

    If the cache cannot be opened or unpickled, the latest country list is
    downloaded from smarttraveller and, when it is non-empty, saved back to
    filename for the next run.

    Parameters:
        filename: path of the pickle cache file.

    Returns:
        dict mapping country name -> dict with the keys 'url', 'issue_date',
        'safe_name' and 'file_name' (or whatever object the cache file holds).

    Errors raised while downloading, parsing or writing the new cache propagate.
    '''
    try:
        # NOTE: pickle.load executes whatever the file describes; only point this
        # at a cache file this script wrote itself.
        with open(filename, 'rb') as pickle_file:
            return pickle.load(pickle_file)
    except Exception:
        # Can't open or use pickle file: must fetch a new list.
        # (Exception rather than a bare except, so Ctrl-C and SystemExit still work.)
        pass

    country_list = _fetch_country_list()
    if country_list:
        # Got data, now try to save the pickle file.
        with open(filename, 'wb') as pickle_file:
            pickle.dump(country_list, pickle_file)
    return country_list
def get_country_html(url):
    '''
    Download a country page and return its advice article as cleaned-up html.

    Parameters:
        url: country specific advice url.

    Returns:
        The <article id="theArticle"> content, serialised, run through the lxml
        Cleaner (tags not wanted for ebook reading stripped) and decoded as utf-8.

    Raises:
        IndexError: when the page has no <article id="theArticle"> element.
    '''
    html = goto_website_return_html(url)
    tree = lxml.html.fromstring(html)
    # The advice information is located in the <article id="theArticle"> tag.
    article = tree.xpath("//article[@id='theArticle']")[0]
    # Maps and videos don't really play nicely with ebooks, so drop them.
    # The leading "." keeps the search inside the article; a bare "//" would
    # search the whole document and could pick a section outside it.
    for media_section in article.xpath(".//section[@class='mediaFiles']"):
        media_section.getparent().remove(media_section)
    articlehtml = lxml.html.tostring(article)
    # I don't want extra tags!
    cleaner = lxml.html.clean.Cleaner(safe_attrs_only=True, remove_tags=['a', 'article', 'section', 'span', 'div'])
    cleansed = cleaner.clean_html(articlehtml)
    return cleansed.decode('utf-8')
def build_table_of_contents(country_list):
    '''
    Build the top half of the output html: document head plus a linked table of contents.

    Parameters:
        country_list: dict mapping country name -> dict with at least the keys
            'safe_name' (used as the anchor target) and 'issue_date'.

    Returns:
        html string that opens <html>/<body> and contains one <li> per country,
        sorted by country name. <body> and <html> are left open for the caller to close.
    '''
    # The list carries class='toc' as well as id='toc' so that the '.toc' style
    # rule declared in the head actually matches it.
    header_text = "<!DOCTYPE html><html><head><style type='text/css'>.toc { page-break-after: always; text-indent: 0em; }</style></head><body><h1>Table of Contents</h1><ul id='toc' class='toc'>"
    # Each entry links to the id of the matching chapter heading.
    entries = [
        "<li><a href=\"#%s\">%s</a> (Issued: %s)</li>" % (country_list[country]['safe_name'], country, country_list[country]['issue_date'])
        for country in sorted(country_list)
    ]
    return header_text + "".join(entries) + "</ul>\n"
def build_big_file(country_list, output_file):
    '''
    Build the big html file (it can be like 3 meg or something).

    The heading comes from build_table_of_contents(); each country's saved html
    file is then appended under its own chapter heading. The output is written
    on the fly rather than assembled in one large variable.

    Parameters:
        country_list: dict mapping country name -> dict with at least the keys
            'file_name' (saved html to read) and 'safe_name' (anchor id).
        output_file: path of the combined html file to write (utf-8).

    Errors from opening or reading a country file propagate; the output file is
    closed in every case.
    '''
    with codecs.open(output_file, mode='w', encoding='utf-8') as outfile:
        outfile.write(build_table_of_contents(country_list))
        for country in sorted(country_list):
            entry = country_list[country]
            with codecs.open(entry['file_name'], mode='r', encoding='utf-8') as cfile:
                cfile_contents = cfile.read()
            # Create a heading with table of contents link.
            # class='chapter' is something that ebook-convert looks for.
            outfile.write("<h1 class='chapter' id='%s'>%s</h1>\n" % (entry['safe_name'], country))
            # For some reason a div tag doesn't get removed when it's getting 'cleansed'. This replace is a bit of a hack.
            outfile.write(cfile_contents.replace('<div>', '').replace('</div>', ''))
        outfile.write("</body></html>")
if __name__ == '__main__':
    # Imported here because only the command-line entry point needs them.
    import subprocess
    import sys

    parser = argparse.ArgumentParser(description='http://smarttraveller.gov.au -> MOBI converter')
    parser.add_argument('-u', help='Update country html files', action='store_true')
    parser.add_argument('-b', help='Build large html file', action='store_true')
    parser.add_argument('-o', help='Output .mobi file', action='store_true')
    args = parser.parse_args()

    # If you didn't give the script anything to do, print help and quit.
    if not (args.u or args.b or args.o):
        parser.print_help()
        sys.exit()

    # Keep this in here just to make sure a directory exists.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)
        print("Created Directory: %s" % os.path.abspath(SAVE_DIR))

    # Populate country_list, either from the pickle or loaded from the website.
    country_list = find_country_list(COUNTRY_LIST_FILE)

    # Error handling, this might be a good indicator if they decide to redesign their website.
    if not country_list:
        print("Problems with finding Country List!")
        # Non-zero status so calling scripts can tell the run failed.
        sys.exit(1)
    print("Got country list with %d countries." % len(country_list))

    # Update country html files
    if args.u:
        for country in sorted(country_list):
            entry = country_list[country]
            html = get_country_html(entry['url'])
            with codecs.open(entry['file_name'], mode='w', encoding='utf-8') as outfile:
                outfile.write(html)
            print("%s written." % entry['file_name'])
        print("------------------------------\nFinished updating html files")

    # Create big html file
    if args.b:
        build_big_file(country_list, MAIN_HTML)
        print("Built output html: %s" % os.path.abspath(MAIN_HTML))

    # Create output mobi file. This takes time.
    if args.o:
        # Passed as an argument list (no shell), so paths containing spaces or
        # shell metacharacters reach ebook-convert intact.
        command = [
            EBOOK_CONVERT,
            os.path.abspath(MAIN_HTML),
            os.path.abspath(EBOOK_OUTPUT_FILE),
            '-v',
            '-v',
            '--max-toc-links=0',
            '--no-chapters-in-toc',
            '--output-profile=kindle',
            '--change-justification=justify',
            '--chapter-mark=both',
            '--authors=Australian Government',
            '--book-producer=Hoz',
            '--language=English',
            '--pretty-print',
            # NOTE(review): the former shell string passed --toc-filter=r'*', which
            # the shell collapses to the literal value r*; that value is kept as-is.
            '--toc-filter=r*',
            '--title=Smart Traveller (%s)' % time.strftime("%b, %Y"),
            '--pubdate=%s' % time.strftime("%d %b %Y"),
            '--comments=This is information taken from smarttraveller.gov.au',
        ]
        print("Executing: %s" % " ".join(command))
        subprocess.call(command)