-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgutenberg.py
130 lines (99 loc) · 4.03 KB
/
gutenberg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# import the os module
import os
# extra modules
import codecs
import rdflib
def write_gutenberg_data_rdf(fd=None):
    """Parse an RDF/XML catalog with rdflib and return the resulting graph.

    The original body read an undefined name ``fd`` and therefore always
    failed with ``NameError``; the source is now an explicit, optional
    parameter so existing zero-argument call sites still resolve.

    Args:
        fd: The source handed to ``rdflib.Graph.parse`` (the original code
            suggests an open file object).

    Returns:
        The ``rdflib.Graph`` populated from ``fd``.

    Raises:
        ValueError: If no source is supplied.

    NOTE(review): despite the name, this function never wrote any output;
    it only parses. Confirm the intended behaviour before relying on it.
    """
    if fd is None:
        raise ValueError(
            'write_gutenberg_data_rdf() needs a catalog source to parse')
    graph = rdflib.Graph()
    graph.parse(fd, format="application/rdf+xml")
    return graph
from lxml.etree import iterparse
def get_dc_element(el, el_name):
    """Return the unique Dublin Core element named *el_name* found on *el*.

    The lookup goes through ``el.iterfind`` using the tag qualified with the
    Dublin Core namespace (``http://purl.org/dc/elements/1.1/``).

    Returns the matching element when exactly one is found; returns ``None``
    when there are no matches or more than one.
    """
    qualified_tag = '{http://purl.org/dc/elements/1.1/}' + el_name
    matches = [found for found in el.iterfind(qualified_tag)]
    # Ambiguous (several matches) and missing are both reported as None.
    return matches[0] if len(matches) == 1 else None
def _etext_row(el):
    """Build the tab-separated output row for one ``etext`` element.

    Returns the row text without a trailing newline, or ``None`` when the
    record must be skipped: no single ``dc:creator``, no parsable
    ``birth-death`` year suffix on the creator text, or years outside the
    accepted window.

    Raises:
        IndexError, ValueError: If the element's first attribute value
            cannot be turned into an integer etext id.
    """
    creator = get_dc_element(el, 'creator')
    if creator is None:
        return None

    # etext id: first attribute value, e.g. "etext1234" -> 1234
    try:
        etext_id = int(el.values()[0].replace('etext', ''))
    except (IndexError, ValueError):
        # Print the raw attribute values; the parsed id may not exist yet.
        print(el.values())
        print('Error parsing etext id')
        raise

    # author, dates: the creator text is expected to end in ", BIRTH-DEATH"
    creator_text = creator.text
    if creator_text is None:
        # Creator has no direct text; there are no years to read.
        return None
    parts = creator_text.split(', ')
    if len(parts) < 2:
        # No year suffix. The original reached the same outcome implicitly,
        # through Python 2's str-vs-int ordering in the range check.
        return None
    try:
        birth_year, death_year = map(int, parts[-1].split('-'))
    except ValueError:
        # Could not determine years; skip
        return None
    if not (1730 < birth_year < 1885 and 1800 < death_year < 2000):
        return None

    title = get_dc_element(el, 'title')
    title_text = (title.text or '') if title is not None else ''
    url = 'http://www.gutenberg.org/cache/epub/%s/pg%s.txt' % (
        etext_id, etext_id)
    fields = [creator_text, birth_year, death_year, title_text, url]
    # u'%s' formatting yields text on both Python 2 and Python 3.
    row = u'\t'.join(u'%s' % field for field in fields)
    return row.replace(u'\n', u'')


def write_gutenberg_data_iterparse(infile, outfile):
    """Stream the catalog at *infile* and write matching records to *outfile*.

    Each ``{http://www.gutenberg.org/rdfterms/}etext`` element whose creator
    was born in 1731-1884 and died in 1801-1999 (exclusive bounds 1730/1885
    and 1800/2000) becomes one UTF-8 line:
    ``creator<TAB>birth<TAB>death<TAB>title<TAB>url``.

    Args:
        infile: Path of the RDF/XML catalog to read.
        outfile: Path of the tab-separated text file to create/overwrite.

    Raises:
        IndexError, ValueError: If an etext element's id cannot be parsed.
    """
    etext_tag = '{http://www.gutenberg.org/rdfterms/}etext'
    # Context managers close both files even when a record raises.
    # The input is opened in binary so the XML parser does the decoding.
    with codecs.open(outfile, 'w', 'utf-8') as out_fd, \
            open(infile, 'rb') as in_fd:
        for _event, el in iterparse(in_fd, events=('end',)):
            if el.tag != etext_tag:
                continue
            row = _etext_row(el)
            if row is not None:
                out_fd.write(row + u'\n')
            # Release the processed record and everything before it so the
            # in-memory tree does not grow with the size of the catalog.
            el.clear()
            parent = el.getparent()
            if parent is not None:
                while el.getprevious() is not None:
                    del parent[0]
# This version uses a more basic XML library, but it reads the entire
# XML file into memory, which requires a very large amount of RAM for
# large XML files (e.g. the Gutenberg catalog is 193 MB and required
# more than 8 GB of RAM to load into memory in Python as parsed XML).
# import the python module that will parse the XML input file
from xml.dom import minidom
def write_gutenberg_data(gut):
    """Write one tab-separated line per catalogued etext to gutenberg_data.txt.

    Each ``pgterms:etext`` element that has a ``dc:creator`` produces a UTF-8
    line ``creator<TAB>birth<TAB>death<TAB>title<TAB>url`` in
    ``gutenberg_data.txt`` in the current working directory. Birth and death
    years are taken from a trailing ``", BIRTH-DEATH"`` on the creator text
    and are left empty when absent or unparsable.

    Args:
        gut: A parsed DOM object exposing ``getElementsByTagName`` (e.g. the
            document returned by ``xml.dom.minidom.parse``).

    Raises:
        AssertionError: If an etext element has more than one ``dc:creator``.
    """
    # The context manager guarantees the output file is flushed and closed.
    with codecs.open('gutenberg_data.txt', 'w', 'utf-8') as outfile:
        for element in gut.getElementsByTagName('pgterms:etext'):
            creator_elements = element.getElementsByTagName('dc:creator')
            if not creator_elements:
                continue
            assert len(creator_elements) == 1, 'Should only be one creator!'

            # Concatenate the character data of the creator's children;
            # child nodes without a ``data`` attribute contribute nothing.
            creator_text = ''.join(
                getattr(child, 'data', '')
                for child in creator_elements[0].childNodes)

            # Reset per record so years never carry over from the previous
            # etext (or stay unbound on the first one).
            birth_year, death_year = '', ''
            parts = creator_text.split(', ')
            if len(parts) >= 2:
                try:
                    birth_year, death_year = map(
                        str, map(int, parts[-1].split('-')))
                except ValueError:
                    birth_year, death_year = '', ''

            # A missing or empty dc:title yields an empty title field rather
            # than an IndexError.
            title = ''
            title_elements = element.getElementsByTagName('dc:title')
            if title_elements and title_elements[0].childNodes:
                title = getattr(title_elements[0].childNodes[0], 'data', '')

            etext_num = element.getAttribute('rdf:ID').replace('etext', '')
            url = 'http://www.gutenberg.org/cache/epub/%s/pg%s.txt' % (
                etext_num, etext_num)
            fields = [creator_text, birth_year, death_year, title, url]
            row = '\t'.join(fields).replace('\n', '')
            outfile.write(row + '\n')
if __name__ == '__main__':
    # store the location of your input file in a variable
    infile = '/Users/Shared/Mary/Gutenberg/catalog.rdf'
    # Parse the XML in that file into a Python data structure.
    # We'd like to just do this:
    #     gut = minidom.parse(infile)
    # but the file is very large, so the bufsize argument is passed to make
    # the parser read the input in small chunks.
    # NOTE(review): the result is still a complete in-memory DOM document;
    # confirm that bufsize actually lowers peak memory for this catalog.
    # The original passed the undefined name ``file_path`` here, which
    # raised NameError; ``infile`` is the variable defined above.
    gut = minidom.parse(infile, parser=None, bufsize=1)