-
Notifications
You must be signed in to change notification settings - Fork 1
/
PhotoCountyBot.py
240 lines (200 loc) · 7.64 KB
/
PhotoCountyBot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#! /usr/bin/python
# PhotoCountyBot
#
# Walk through [[Category:Wikipedia requested photographs in <state>]],
# looking for articles that can be reclassified into subcategories.
#
# Notes on sources for county information:
#
# Massachusetts: http://www.sec.state.ma.us/ele/elecct/cctidx.htm
# Pennsylvania: http://pennsylvania.hometownlocator.com/counties/
# Iowa: http://iowa.hometownlocator.com/counties/
# Maryland, Indiana, California?
import argparse
import os
import re
import sys
import time
import county_map
import mwparserfromhell as mw
import pywikibot
from pywikibot import pagegenerators
# Title template for the category the crawl starts from.  main()
# substitutes the location given on the command line for the %s.
#
startCat = 'Category:Wikipedia requested photographs in %s'
# When true, log() prints its messages; main() sets this from the
# --debug command-line flag.
debug = False
def guess_county(text, state):
    """Guess a county from the links in the first paragraph of 'text'.

    Leading paragraphs that consist solely of one template or one
    bracketed link (such as an image) are skipped.  Each [[wikilink]]
    in the first remaining paragraph is then examined in order: the
    link target is first handed to find_county_in_text(), and failing
    that to the county map's lookup().

    Returns the first county found this way, or None if no link in
    the paragraph produces one.
    """
    cm = county_map.county_map()

    # Skip grafs that are just templates or images.  The character
    # classes need the '*' quantifier; without it only one-character
    # templates such as {{x}} were ever skipped.  Nested templates and
    # links are still not recognised by this simple pattern.
    leading = re.compile(r'\s*(\{\{[^}]*\}\}|\[\[[^]]*\]\])\n\s*')
    m = leading.match(text)
    while m:
        # Every match is at least "{{}}\n" long, so the text shrinks
        # on each pass and the loop terminates.
        text = text[m.end():]
        m = leading.match(text)

    # re.split always returns at least one element, so [0] is safe.
    intro = re.split(r'\n\s*\n', text)[0]

    # look for [[Foo, Bar]] links and see if any of them are recognized towns
    for link in re.findall(r'\[\[(.*?)\]\]', intro):
        # Drop any "|display text" part of a piped link.
        exactlink = link.split('|')[0]

        county = find_county_in_text(exactlink, state)
        if county:
            log("guess_county: found '{}' in link [[{}]]".format(county, exactlink))
            return county

        county = cm.lookup(exactlink)
        if county:
            log("guess_county: found '{}' from looking up link [[{}]]".format(county, exactlink))
            return county

    return None
def lookup_county(town):
    """Look up the county for a given town from its Wikipedia article.

    The 'town' argument should be the name of a Wikipedia
    article for a town or city. lookup_county will load this
    article, look for {{Infobox settlement}} and will see if
    a county is named in one of the 'subdivision_name' parameters,
    and will return that county name if so.

    If no Wikipedia article exists for this town, or if the article
    does not have a matching infobox, or if the infobox does not
    mention a county, None is returned.
    """
    try:
        townpage = pywikibot.Page(pywikibot.Site(), town).get()
    except pywikibot.NoPage:
        # Name the exception class here; writing NoPage() builds an
        # instance, which never matches the raised exception.
        return None

    w = mw.parse(townpage)
    for t in w.filter_templates():
        if t.name.strip_code() != 'Infobox settlement':
            continue
        # Find the subdivision_name parameters and
        # look for one that names a county
        for p in t.params:
            if p.name.find('subdivision_name') == -1:
                continue
            c = p.value.filter_wikilinks(matches='County,')
            if c:
                # NOTE(review): this is the wikilink's title object as
                # produced by mwparserfromhell, not a plain string.
                return c[0].title
    return None
def find_county_in_text(text, state):
    """Return the "<name> County, <state>" suffix of 'text', if any.

    'text' must end with " County, " followed by 'state'; the part
    before " County" may not contain a comma or an opening
    parenthesis.  Leading spaces are not included in the result.

    'state' is matched literally, so names containing regex
    metacharacters, such as "Washington (state)", work as expected.

    Returns the matched string, or False when 'text' does not end in
    such a county name.
    """
    pattern = r' *([^,(]* County, %s)$' % re.escape(state)
    m = re.search(pattern, text)
    if m:
        log("find_county_in_text: found {}".format(m.group(1)))
        return m.group(1)
    return False
def maybe_create_category(county, state):
    """Create the photo-request category for 'county' if it is missing.

    Tries to fetch [[Category:Wikipedia requested photographs in
    <county>]].  If the fetch raises pywikibot.NoPage, the page is
    written with a standard body that files it under the 'state'
    category (sorted by county name) and a message is printed.
    """
    cat = 'Category:Wikipedia requested photographs in %s' % county
    catpage = pywikibot.Page(pywikibot.Site(), cat)
    try:
        # Only the NoPage exception matters; the page text is unused.
        catpage.get()
    except pywikibot.NoPage:
        body = ("{{US image sources}}\n"
                "{{howtoreqphotoin|%s}}\n"
                "<br clear=all />\n"
                "[[Category:Wikipedia requested photographs in %s|%s]]")
        catpage.put(body % (county, state, county))
        print('created category [[%s]]' % cat)
def log(msg):
    """Print 'msg' prefixed with the script name and current time.

    Does nothing unless the module-level 'debug' flag is true.
    """
    if not debug:
        return
    script = os.path.basename(__file__)
    print("{}: {} {}".format(script, time.asctime(), msg))
def canonical_name(template):
    """Return the title of the page that 'template' finally resolves to.

    'template' is a mediawiki template node.  Its name is looked up in
    the Template: namespace and redirects are followed until a page
    that is not a redirect is reached; that page's title is returned.
    """
    site = pywikibot.Site()
    target = pywikibot.Page(site, 'Template:' + unicode(template.name))
    while True:
        if not target.isRedirectPage():
            return target.title()
        target = target.getRedirectTarget()
def is_photo_request(node):
    """Tell whether 'node' is a template that is, or redirects to,
    {{image requested}}.

    Returns False for any node that is not a template.
    """
    return (isinstance(node, mw.nodes.Template)
            and canonical_name(node) == 'Template:Image requested')
class PhotoCountyBot(pywikibot.bot.Bot):
    """Bot that narrows a state-level photo request down to a county.

    For each page it is given, the bot tries to work out the county
    the article belongs to and records it in the 'in' parameter of the
    {{image requested}} template on the article's talk page, adding
    the template if the talk page has none.
    """

    def __init__(self, state, **kwargs):
        """Store 'state' and pass the remaining keyword arguments
        (e.g. generator=...) on to the base class.
        """
        self.state = state
        super(PhotoCountyBot, self).__init__(**kwargs)

    def treat(self, page):
        """Process one page from the generator.

        'page' may be either an article or its talk page.  Returns
        False if the talk page is locked; otherwise returns None,
        whether or not an edit was made.
        """
        if page.isTalkPage():
            article = page.toggleTalkPage()
            talk = page
        else:
            article = page
            talk = page.toggleTalkPage()

        try:
            text = article.get()
        except Exception:
            # KeyboardInterrupt is not an Exception subclass, so it
            # still propagates and stops the run.
            print("%s error thrown by %s" % (sys.exc_info()[0], article.title()))
            return

        # Try finding a county by:
        #   - reading the infobox of the article itself
        #   - looking for a county given explicitly in the article title
        #   - searching the text of the first paragraph for a related town
        #
        # The article title is used for the second step: a talk page
        # title would put its "Talk:" prefix into the county name.
        county = (lookup_county(article.title())
                  or find_county_in_text(article.title(), self.state)
                  or guess_county(text, self.state))
        if not county:
            print("couldn't guess at %s" % page.title())
            return

        # Find an {{image requested}} template and update it with
        # the desired location.
        try:
            oldtext = talk.get()
        except pywikibot.NoPage:
            print("no talk page for %s" % article.title())
            return

        parsed = mw.parse(oldtext)
        tmpls = parsed.filter_templates(matches=is_photo_request)
        if tmpls:
            tmpls[0].add('in', county)
        else:
            # insert a new {{image requested}} template after any
            # templates at the start of the talk page.
            # %-formatting is used because 'county' may be a parsed
            # wikicode title rather than a plain string.
            request = mw.nodes.Template(
                'image requested', [u'in=%s' % county])
            for n in parsed.nodes:
                if isinstance(n, mw.nodes.Template):
                    continue
                parsed.insert_before(n, "\n")
                parsed.insert_before(n, request)
                break
            else:
                # Every node was a template (or the page was empty),
                # so there is nothing to insert before: add at the end.
                parsed.append("\n")
                parsed.append(request)

        newtext = parsed.__unicode__()
        if not newtext:
            print("something friggin weird happened on %s" % article.title())
            return

        log(page.title())
        try:
            # oldtext/newtext are talk page text, so the edit must be
            # saved to the talk page even when 'page' is the article.
            self.userPut(
                talk, oldtext, newtext, botflag=True,
                comment='moving to [[Category:Wikipedia requested photographs in %s]] by the [[User:PhotoCatBot|PhotoCat]]' % county)
            #maybe_create_category(county, self.state, self.site)
        except pywikibot.LockedPage:
            return False
def main(argv):
    """Parse the command line and run the bot over one category.

    'argv' is the full argument vector including the program name
    (argv[0] is ignored).  --place/-p/--location/-l is required and
    names the location whose photo-request category is crawled;
    --debug/-d turns on log() output by setting the module-level
    'debug' flag.
    """
    global debug

    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', '-d',
                        help='enable debugging output',
                        action='store_true')
    parser.add_argument('--place', '-p', '--location', '-l',
                        help='specify location to start (required)',
                        required=True)
    args = parser.parse_args(argv[1:])
    debug = args.debug

    # Build the title in a local variable so that the startCat
    # template keeps its %s and main() can be called more than once.
    category_title = startCat % args.place

    site = pywikibot.Site()
    cat = pywikibot.Category(site, category_title)
    gen = pagegenerators.CategorizedPageGenerator(cat)
    bot = PhotoCountyBot(state=args.place, generator=gen)
    bot.run()
# Script entry point: run main() with the process arguments.
if __name__ == '__main__':
    try:
        main(sys.argv)
    finally:
        # Runs whether main() returned normally or raised.
        pywikibot.stopme()