"""
$ python p3.py
kowiki-20180401-pages-articles-multistream.xml
Sequential하게 탐색하여 최대 1000개의 축구 선수 정보를 수집한 후 랜덤으로 10 개씩 추출하고
csv포멧으로 결과값 저장.
※ 추출 정규식 및 parser가 축구선수정보 > 1000에 대해서는 예외가 발생할 수 있음.
"""
import xml.etree.ElementTree as ET
import re
import csv
import random

xml_filename = 'kowiki-20180401-pages-articles-multistream.xml'
info_title = '축구 선수 정보'  # title of the football-player infobox template


def fixtag(ns, tag, nsmap):
    """Expand a namespace prefix into the '{uri}tag' form used by ElementTree."""
    return '{%s}%s' % (nsmap[ns], tag)
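
# Note: the dump declares a default namespace (an assumption here: something
# like 'http://www.mediawiki.org/xml/export-0.10/', the exact version may
# differ), so fixtag('', 'page', nsmap) produces '{<namespace-uri>}page',
# the qualified-tag form that ElementTree compares against.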


def process_page(page_elem, nsmap):
    """Extract infobox fields from one <page> element, or None if absent."""
    page_id = page_elem.find(fixtag('', 'id', nsmap)).text
    page_title = page_elem.find(fixtag('', 'title', nsmap)).text
    text = page_elem \
        .find(fixtag('', 'revision', nsmap)) \
        .find(fixtag('', 'text', nsmap)).text or ''
    text = find_infobox_text(text)
    if text is None:
        return None
    info_dict = get_infobox_dict(text)
    info_dict = process_infobox_dict(info_dict)
    return {
        'page_id': page_id,
        'page_title': page_title,
        'template_name': info_title,
        'date_of_birth': info_dict['date_of_birth'],
        'height': info_dict['height'],
        'team': info_dict['team'],
    }


def find_infobox_text(text, info_title=info_title):
    """Return the raw '{{...}}' infobox block at the start of the text, if any."""
    regex = r'({{info_title\n(?:\|\s?.*\n)+}})'.replace(
        'info_title', info_title)
    match = re.match(regex, text)
    return match.group() if match is not None else None
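
# With the default info_title the pattern becomes
#   {{축구 선수 정보\n(?:\|\s?.*\n)+}}
# Since re.match anchors at position 0, only an infobox placed at the very
# start of the page text is found, and every '|' line must end in a newline.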


def get_infobox_dict(text, info_title=info_title):
    """Parse the '| key = value' lines of an infobox into a dict."""
    info_dict = {}
    text = text[2 + len(info_title):-2]  # strip the surrounding '{{' and '}}'
    for line in text.split('\n'):
        if not line.startswith('|'):
            continue
        try:
            key, value, *_ = line.split('=')
        except ValueError:  # no '=' in the line, i.e. invalid format
            continue
        info_dict[key[1:].strip()] = value.strip()
    return info_dict
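
# A minimal sketch of the parse, on a hypothetical infobox string:
#   >>> get_infobox_dict('{{축구 선수 정보\n| 이름 = 홍길동\n| 키 = 1.75\n}}')
#   {'이름': '홍길동', '키': '1.75'}
# If a line contains more than one '=', only the text between the first and
# second '=' is kept as the value.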


def process_infobox_dict(infobox_dict):
    """Normalize the date-of-birth, height, and team fields of an infobox."""
    date_of_birth = infobox_dict.get('출생일')
    height = infobox_dict.get('키')
    team = infobox_dict.get('현 소속팀')
    if date_of_birth:
        # Pull year/month/day out of forms such as '{{출생일과 나이|1987|6|24}}'.
        match = re.search(
            r'(\d{4})[^\d]*(\d{1,2})[^\d]*(\d{1,2})',
            date_of_birth)
        assert match is not None
        year, month, day = match.groups()
        date_of_birth = '%04d-%02d-%02d' % (int(year), int(month), int(day))
    if height:
        # Accept either centimeters ('180') or meters ('1.80').
        match = re.match(r'((?:1|2)\.?\d{2})', height)
        assert match is not None
        height = match.groups()[0]
        if '.' in height:
            # round(), not int(), to avoid float truncation
            # (1.65 * 100 evaluates to 164.999...).
            height = str(round(float(height) * 100))
    if team:
        # Prefer the target of a wiki link such as '[[FC 바르셀로나]]';
        # otherwise strip any '{{...}}' templates from the raw value.
        match = re.search(r'\[\[((?:\w|\s|\d)+).*\]\]', team)
        if not match:
            team = re.sub(r'{{(?:\d|\w|\s|\|)+}}', '', team).strip()
        else:
            team = match.groups()[0]
    date_of_birth = date_of_birth or '정보 없음'
    height = height or '정보 없음'
    team = team or '없음'
    return {'date_of_birth': date_of_birth, 'height': height, 'team': team}
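
# A minimal sketch of the normalization, on hypothetical field values:
#   >>> process_infobox_dict({
#   ...     '출생일': '{{출생일과 나이|1987|6|24}}',
#   ...     '키': '1.70 m',
#   ...     '현 소속팀': '[[FC 바르셀로나]]',
#   ... })
#   {'date_of_birth': '1987-06-24', 'height': '170', 'team': 'FC 바르셀로나'}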


def run_parse():
    nsmap = {}
    results = []
    count = 0
    # Stream the dump instead of loading it whole; clear each processed
    # element so memory use stays bounded.
    for event, elem in ET.iterparse(xml_filename, events=['end', 'start-ns']):
        if event == 'start-ns':
            ns, url = elem
            nsmap[ns] = url
        elif event == 'end':
            if elem.tag == fixtag('', 'page', nsmap):
                result = process_page(elem, nsmap)
                if result is not None:
                    count += 1
                    results.append(result)
                    if count == 1000:
                        break
            elem.clear()

    # Draw 10 distinct samples (assumes at least 10 distinct results were
    # collected; otherwise this loop would not terminate).
    samples = []
    while len(samples) < 10:
        sample = results[random.randrange(0, count)]
        if sample not in samples:
            samples.append(sample)
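    # random.sample(results, 10) would be the usual one-liner, but it draws
    # distinct positions rather than distinct values, whereas the loop above
    # deduplicates by dict equality.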

    with open('results.csv', 'w', encoding='utf8', newline='') as f:
        wr = csv.writer(f)
        wr.writerow([
            'Page ID', 'Page Title', 'Template Name',
            'Date of Birth', 'Height', 'Team'])
        for sample in samples:
            wr.writerow([
                sample['page_id'], sample['page_title'], info_title,
                sample['date_of_birth'], sample['height'], sample['team']])


if __name__ == '__main__':
    run_parse()