Fixup for hyphens within body text

jayaddison · Nov 4, 2011 · d3480ac
1 parent 48d0a43
commit d3480ac
Showing 1 changed file with 4 additions and 2 deletions.
diff --git a/utils/extract.py b/utils/extract.py
@@ -4,7 +4,7 @@
 import logging
 import re
 
-from mwlib.parser import Section, Item
+from mwlib.parser import Section, Item, Link
 from mwlib.uparser import parseString
 from mwlib.utils import fetch_url
 from mwlib import advtree
@@ -32,14 +32,16 @@
 	if isinstance(node, Section) and node.children[0].asText().strip() in (u'Events', u'Births', u'Deaths'):
 
 		section = node.children[0].asText().strip().lower().encode('utf-8')
+		if not section == "events":
+			continue
 
 		results[section] = []
 
 		for item in [x.children[0] for x in node.children[1].allchildren() if isinstance(x, Item)]:
 
 			# Extract the year and text from the item
 			raw = item.getAllDisplayText().strip()
-			matches = re.match(u'(.*)[\s]*(\u2013|&ndash;)[\s]*(.*)', raw)
+			matches = re.match(u'([^\u2013]*)[\s]*(\u2013|&ndash;)[\s]*(.*)', raw)
 
 			if not matches:
 				logging.log(logging.WARN, '%s - Unable to split year and text from string: "%s"' % (title, raw))