Skip to content

Commit

Permalink
Properly remove whitespace from titles when there are entityrefs. Fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
mankyd committed Mar 4, 2014
1 parent ea5b161 commit 59f189c
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 5 deletions.
38 changes: 35 additions & 3 deletions htmlmin/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@
'*': ('hidden',),
}

# Whitespace run anchored at the very start of a string.
leading_whitespace_re = re.compile(r'^\s+')
# Whitespace run anchored at the very end of a string.
trailing_whitespace_re = re.compile(r'\s+$')
# Whitespace run at either the start or the end of a string.
leading_trailing_whitespace_re = re.compile(r'(^\s+)|(\s+$)')
# Any run of one or more whitespace characters.
whitespace_re = re.compile(r'\s+')
# One or more CR/LF characters together with any whitespace around them.
whitespace_newline_re = re.compile(r'\s*(\r|\n)+\s*')
Expand Down Expand Up @@ -103,8 +105,11 @@ def __init__(self,
self._data_buffer = []
self._in_pre_tag = 0
self._in_head = False
self._in_title = False
self._after_doctype = False
self._tag_stack = []
self._title_newly_opened = False
self.__title_trailing_whitespace = False

def _has_pre(self, attrs):
for k,v in attrs:
Expand Down Expand Up @@ -166,6 +171,9 @@ def handle_starttag(self, tag, attrs):
self._after_doctype = False
if tag == 'head':
self._in_head = True
elif self._in_head and tag == 'title':
self._in_title = True
self._title_newly_opened = True

tag_sets = ( # a list of tags and tags that they are closed by
(('li',), ('li',)),
Expand Down Expand Up @@ -222,6 +230,9 @@ def handle_endtag(self, tag):
# TODO: Did we know that we were in a head tag?! If not, we need to
# reminify everything to remove extra spaces.
self._in_head = False
elif tag == 'title':
self._in_title = False
self._title_newly_opened = False
try:
self._in_pre_tag -= self._close_tags_up_to(tag)
except OpenTagNotFoundError:
Expand Down Expand Up @@ -259,9 +270,20 @@ def handle_data(self, data):
return


# if we're in the title, remove leading and trailing whitespace
if self._tag_stack and self._tag_stack[0][0] == 'title':
data = leading_trailing_whitespace_re.sub('', data)
# if we're in the title, remove leading and trailing whitespace.
# note that the title may be parsed in chunks if entityrefs or charrefs
# are encountered.
if self._in_title:
if self.__title_trailing_whitespace:
self._data_buffer.append(' ')
self.__title_trailing_whitespace = data[-1].isspace()
if self._title_newly_opened:
self._title_newly_opened = False
data = leading_trailing_whitespace_re.sub('', data)
else:
data = trailing_whitespace_re.sub(
'', leading_whitespace_re.sub(' ', data))

data = whitespace_re.sub(' ', data)
if not data:
return
Expand All @@ -277,9 +299,19 @@ def handle_data(self, data):
self._data_buffer.append(data)

def handle_entityref(self, data):
    """Buffer a named entity reference, re-emitted as ``&<data>;``.

    While inside a title, a single space is buffered ahead of the entity
    when the trailing-whitespace flag is set and the title was not just
    opened. Both title-tracking flags are cleared afterwards. Outside a
    title the entity is buffered with no other state change.
    """
    out = self._data_buffer
    if self._in_title:
        needs_space = self.__title_trailing_whitespace
        if needs_space and not self._title_newly_opened:
            # Restore the separator that was dropped before this entity.
            out.append(' ')
        self._title_newly_opened = False
        self.__title_trailing_whitespace = False
    out.append('&{};'.format(data))

def handle_charref(self, data):
    """Buffer a character reference, re-emitted as ``&#<data>;``.

    Outside a title the reference is buffered and nothing else changes.
    Inside a title, a single space is buffered first when the
    trailing-whitespace flag is set and the title was not just opened;
    both title-tracking flags are then cleared.
    """
    reference = '&#{};'.format(data)
    if not self._in_title:
        self._data_buffer.append(reference)
        return
    if self.__title_trailing_whitespace and not self._title_newly_opened:
        # Restore the separator that was dropped before this reference.
        self._data_buffer.append(' ')
    self.__title_trailing_whitespace = False
    self._title_newly_opened = False
    self._data_buffer.append(reference)

def handle_pi(self, data):
Expand Down
4 changes: 2 additions & 2 deletions htmlmin/tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,8 @@
'<body> <pre> X </pre> </body>',
),
'remove_head_spaces': (
'<head> <title> X Y </title> </head>',
'<head><title>X Y</title></head>',
'<head> <title> &#x2603;X Y &amp; Z </title> </head>',
'<head><title>&#x2603;X Y &amp; Z</title></head>',
),
'dont_minify_scripts_or_styles': (
'<body> <script> X </script> <style> X</style> </body>',
Expand Down

0 comments on commit 59f189c

Please sign in to comment.