Skip to content
This repository has been archived by the owner on Apr 5, 2024. It is now read-only.
/ onthisday Public archive

Commit

Permalink
Fixup for hyphens within body text
Browse files (browse the repository at this point in the history)
  • Loading branch information
jayaddison committed Nov 4, 2011
1 parent 48d0a43 commit d3480ac
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions utils/extract.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,7 +4,7 @@
import logging
import re

from mwlib.parser import Section, Item
from mwlib.parser import Section, Item, Link
from mwlib.uparser import parseString
from mwlib.utils import fetch_url
from mwlib import advtree
Expand Down Expand Up @@ -32,14 +32,16 @@
if isinstance(node, Section) and node.children[0].asText().strip() in (u'Events', u'Births', u'Deaths'):

section = node.children[0].asText().strip().lower().encode('utf-8')
if not section == "events":
continue

results[section] = []

for item in [x.children[0] for x in node.children[1].allchildren() if isinstance(x, Item)]:

# Extract the year and text from the item
raw = item.getAllDisplayText().strip()
matches = re.match(u'(.*)[\s]*(\u2013|–)[\s]*(.*)', raw)
matches = re.match(u'([^\u2013]*)[\s]*(\u2013|–)[\s]*(.*)', raw)

if not matches:
logging.log(logging.WARN, '%s - Unable to split year and text from string: "%s"' % (title, raw))
Expand Down

0 comments on commit d3480ac

Please sign in to comment.