Refactored delimited literal tokenisation and fixed bugs
jasonl committed Aug 3, 2010
1 parent 329467e commit 851d0cd
Showing 7 changed files with 143 additions and 70 deletions.
3 changes: 2 additions & 1 deletion lib/eden/tokenizer.rb
@@ -35,12 +35,13 @@ def tokenize!
case( @state )
when :newline
advance
@expr_state = :beg
@current_line.tokens << capture_token( :newline )
@current_line.tokens.flatten!
@sf.lines << @current_line
@ln += 1
@current_line = Line.new( @ln )
@expr_state = :beg

if @heredoc_delimiter
@current_line.tokens << tokenize_heredoc_body
end
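For context: resetting @expr_state to :beg as part of the newline handling matters because several Ruby tokens are read differently at the start of an expression than after an operand. A plain-Ruby illustration (not part of this commit):

x = 10 % 3     # after an operand, `%` is the modulo operator          => 1
y = %w{a b c}  # at the start of an expression, `%` opens a literal    => ["a", "b", "c"]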
60 changes: 16 additions & 44 deletions lib/eden/tokenizers/delimited_literal_tokenizer.rb
@@ -1,50 +1,37 @@
module Eden
module DelimitedLiteralTokenizer
def tokenize_delimited_literal
match_delimiter = false
end_delimiter, start_delimiter = nil, nil
delimiter_depth = 0

advance # Pass the %

if( /[^A-Za-z0-9]/.match( cchar ) )
def_char = 'Q'
@state = :double_q_string
elsif( /[qQswWrx]/.match( cchar) )
def_char = cchar
@state = infer_delimited_literal_type
advance
else
raise "Invalid delimiter character"
end

start_delimiter = cchar
end_delimiter = find_matching_delimiter( cchar )
matched_delimiter = is_matched_delimiter?( cchar )

advance # past the delimiter

until((cchar == end_delimiter && delimiter_depth == 0) || @i >= @length )
if matched_delimiter
delimiter_depth += 1 if cchar == start_delimiter
delimiter_depth -= 1 if cchar == end_delimiter
case def_char
when 'r', 'Q', 'W', 'x'
token = tokenize_expanded_string( cchar ) do
# Regex options - See Section 8.5.5.4
if @state == :regex
advance if (cchar == 'i' or cchar == 'm')
end
end

if cchar == '\\'
advance(2)
else
advance
when 's', 'q', 'w'
token = tokenize_non_expanded_string( cchar ) do
# Regex options - See Section 8.5.5.4
if @state == :regex
advance if (cchar == 'i' or cchar == 'm')
end
end
end

if( @i < @length )
advance # Capture the closing delimiter
end

# Regex option - See Section 8.5.5.4
if @state == :regex
advance if (cchar == 'i' or cchar == 'm')
end

capture_token( @state )
return token
end

def infer_delimited_literal_type
@@ -57,20 +44,5 @@ def infer_delimited_literal_type
when 'x' then :backquote_string
end
end

def find_matching_delimiter( start_delimiter )
case start_delimiter
when '{' then '}'
when '(' then ')'
when '[' then ']'
when '<' then '>'
else
start_delimiter
end
end

def is_matched_delimiter?( cchar )
!! /[{\(\[<]/.match(cchar)
end
end
end
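The rewritten tokenize_delimited_literal now only classifies the literal from the character following the %, then delegates the actual scanning to tokenize_expanded_string or tokenize_non_expanded_string in string_tokenizer.rb. The expanded/non-expanded split mirrors Ruby's own percent-literal semantics; a plain-Ruby sketch for reference (not part of the commit):

name = "world"

# Expanded forms ('Q', 'W', 'r', 'x' and bare %) interpolate #{...}:
%Q(hi #{name})    # => "hi world"
%W(hi #{name})    # => ["hi", "world"]
%r{h. #{name}}    # => /h. world/
%x(echo #{name})  # runs `echo world` in a subshell

# Non-expanded forms ('q', 'w', 's') keep their content verbatim:
%q(hi #{name})    # => "hi #{name}"  (the #, { and } characters stay literal)
%w(hi #{name})    # => ["hi", "#{name}"]
%s(hi)            # => :hi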
47 changes: 44 additions & 3 deletions lib/eden/tokenizers/string_tokenizer.rb
@@ -1,19 +1,54 @@
module Eden
module StringTokenizer
def tokenize_single_quote_string
advance # Pass the opening quote
until( cchar == '\'' || @i >= @length)
tokenize_non_expanded_string("'")
end

# If a block is given, it gets run after the final delimiter is detected. The
# primary purpose for this is to allow the capture of regex modifiers
def tokenize_non_expanded_string( start_delimiter, &block )
delimiter_depth = 0
matched_delimiter = is_matched_delimiter?( start_delimiter )
end_delimiter = find_matching_delimiter( start_delimiter )

advance # Pass the opening delimiter

until((cchar == end_delimiter && delimiter_depth == 0) || @i >= @length)

if matched_delimiter
delimiter_depth += 1 if cchar == start_delimiter
delimiter_depth -= 1 if cchar == end_delimiter
end

if cchar == '\\'
advance(2) # Pass the escaped character
else
advance
end
end
advance # Pass the closing quote

block.call if block_given?

@expr_state = :end
capture_token( @state )
end

def find_matching_delimiter( start_delimiter )
case start_delimiter
when '{' then '}'
when '(' then ')'
when '[' then ']'
when '<' then '>'
else
start_delimiter
end
end

def is_matched_delimiter?( cchar )
!! /[{\(\[<]/.match(cchar)
end

def tokenize_backquote_string
advance
advance until cchar == '`' || @i >= @length
@@ -23,10 +58,15 @@ def tokenize_backquote_string
end

def tokenize_double_quote_string( in_string_already = false )
tokenize_expanded_string('"', in_string_already)
end

def tokenize_expanded_string( start_delimiter, in_string_already = false, &block )
saved_state = @state
tokens = []
end_delimiter = find_matching_delimiter( start_delimiter )
advance unless in_string_already # Pass the opening delimiter
until( cchar == '"' || @i >= @length )
until( cchar == end_delimiter || @i >= @length )
if cchar == '\\'
advance(2) # Pass the escaped character
elsif cchar == '#'
@@ -57,6 +97,7 @@ def tokenize_double_quote_string( in_string_already = false )
end
end
advance # Pass the closing delimiter
block.call if block_given?
@expr_state = :end
tokens << capture_token( @state )
return tokens
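find_matching_delimiter and is_matched_delimiter?, moved here from the delimited-literal tokenizer, encode the pairing rules for percent-literal delimiters: bracket-style openers map to their closing partner and may nest, which is why the scanning loops track delimiter_depth, while any other delimiter closes on its next occurrence. A plain-Ruby illustration (not part of the commit):

%w<rah <rah> rah>   # => ["rah", "<rah>", "rah"]  the nested <...> stays inside the literal
%w!rah rah!         # => ["rah", "rah"]           the second ! ends the literal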
20 changes: 15 additions & 5 deletions test/array_literal_tokenization_test.rb
@@ -5,14 +5,24 @@ def setup
@sf = Eden::SourceFile.new( "dummy.rb" )
end

def test_simple_symbol_tokenisation
@sf.stubs(:source).returns("%w{rah rah rah} %w<rah <rah> rah>")
def test_array_literal_tokenization
@sf.stubs(:source).returns("%w{rah rah rah}\n%w<rah <rah> rah>")
@sf.tokenize!
tokens = @sf.lines[0].tokens
assert_equal 3, tokens.size
assert_equal 2, tokens.size
assert_equal :array_literal, tokens[0].type
assert_equal "%w{rah rah rah}", tokens[0].content
assert_equal :array_literal, tokens[2].type
assert_equal "%w<rah <rah> rah>", tokens[2].content
tokens = @sf.lines[1].tokens
assert_equal :array_literal, tokens[0].type
assert_equal "%w<rah <rah> rah>", tokens[0].content
end

def test_should_not_expand_delimited_array_literal
@sf.stubs(:source).returns("%w{rah \#{@inst} rah}\n")
@sf.tokenize!
tokens = @sf.lines[0].tokens
assert_equal 2, tokens.size
assert_equal :array_literal, tokens[0].type
assert_equal "%w{rah \#{@inst} rah}", tokens[0].content
end
end
15 changes: 11 additions & 4 deletions test/regex_tokenization_test.rb
@@ -6,14 +6,21 @@ def setup
end

def test_delimited_regex_tokenization
@sf.stubs(:source).returns("%r{[a-z]} %r{[a-z]}i")
@sf.stubs(:source).returns("%r{[a-z]}")
@sf.tokenize!
tokens = @sf.lines[0].tokens
assert_equal 3, tokens.size
assert_equal 1, tokens.size
assert_equal :regex, tokens[0].type
assert_equal "%r{[a-z]}", tokens[0].content
assert_equal :regex, tokens[2].type
assert_equal "%r{[a-z]}i", tokens[2].content
end

def test_delimited_regex_tokenization2
@sf.stubs(:source).returns("%r{[a-z]}i")
@sf.tokenize!
tokens = @sf.lines[0].tokens
assert_equal 1, tokens.size
assert_equal :regex, tokens[0].type
assert_equal "%r{[a-z]}i", tokens[0].content
end

def test_regex_tokenization_at_line_start
48 changes: 44 additions & 4 deletions test/string_tokenization_test.rb
@@ -56,16 +56,41 @@ def test_double_quote_string_escaping
end

def test_quoted_expanded_literal_string_tokenization
@sf.stubs(:source).returns("%(test) %Q(test)")
@sf.stubs(:source).returns("%(test)\n%Q(test)")
@sf.tokenize!
tokens = @sf.lines[0].tokens
assert_equal 3, tokens.size
assert_equal 2, tokens.size
assert_equal "%(test)", tokens[0].content
assert_equal :double_q_string, tokens[0].type
assert_equal "%Q(test)", tokens[2].content
assert_equal :double_q_string, tokens[2].type
tokens = @sf.lines[1].tokens
assert_equal "%Q(test)", tokens[0].content
assert_equal :double_q_string, tokens[0].type
end

def test_should_expand_expanded_literal_strings
@sf.stubs(:source).returns("%Q(rah\#{@ivar}rah)")
@sf.tokenize!
tokens = @sf.lines[0].tokens
assert_equal 5, tokens.size
assert_equal "%Q(rah\#", tokens[0].content
assert_equal :double_q_string, tokens[0].type
assert_equal :lcurly, tokens[1].type
assert_equal "@ivar", tokens[2].content
assert_equal :instancevar, tokens[2].type
assert_equal :rcurly, tokens[3].type
assert_equal "rah)", tokens[4].content
assert_equal :double_q_string, tokens[4].type
end

def test_should_not_expand_non_expanded_literal_strings
@sf.stubs(:source).returns("%q(rah\#{@ivar}rah)")
@sf.tokenize!
tokens = @sf.lines[0].tokens
assert_equal 1, tokens.size
assert_equal "%q(rah\#{@ivar}rah)", tokens[0].content
assert_equal :single_q_string, tokens[0].type
end

def test_double_quote_string_interpolation
@sf.stubs(:source).returns("\"str\#{ @inst }str\"")
@sf.tokenize!
@@ -133,6 +158,21 @@ def test_delimited_backquote_string_tokenization
assert_equal "%x{rah --e}", tokens[0].content
end

def test_should_expand_backquote_string_delimited_literals
@sf.stubs(:source).returns("%x(rah\#{@rah})")
@sf.tokenize!
tokens = @sf.lines[0].tokens
assert_equal 5, tokens.size
assert_equal "%x(rah\#", tokens[0].content
assert_equal :backquote_string, tokens[0].type
assert_equal :lcurly, tokens[1].type
assert_equal "@rah", tokens[2].content
assert_equal :instancevar, tokens[2].type
assert_equal :rcurly, tokens[3].type
assert_equal ")", tokens[4].content
assert_equal :backquote_string, tokens[4].type
end

def test_heredoc_tokenization
@sf.stubs(:source).returns("str = <<HEREDOC\nLorem Ipsum\nHEREDOC\n")
@sf.tokenize!
20 changes: 11 additions & 9 deletions test/symbol_tokenization_test.rb
@@ -45,18 +45,20 @@ def test_dynamic_symbol_tokenisation
end

def test_dynamic_symbol_tokenization2
@sf.stubs(:source).returns("%s{rah} %s(rah) %s:rah: %s<rah<rah>rah>")
@sf.stubs(:source).returns("%s{rah}\n%s(rah)\n%s:rah:\n%s<rah<rah>rah>")
@sf.tokenize!
tokens = @sf.lines[0].tokens
assert_equal 7, tokens.size
assert_equal 2, tokens.size
assert_equal :symbol, tokens[0].type
assert_equal "%s{rah}", tokens[0].content
assert_equal :symbol, tokens[2].type
assert_equal "%s(rah)", tokens[2].content
assert_equal :symbol, tokens[4].type
assert_equal "%s:rah:", tokens[4].content
assert_equal :symbol, tokens[6].type
assert_equal "%s<rah<rah>rah>", tokens[6].content
tokens = @sf.lines[1].tokens
assert_equal :symbol, tokens[0].type
assert_equal "%s(rah)", tokens[0].content
tokens = @sf.lines[2].tokens
assert_equal :symbol, tokens[0].type
assert_equal "%s:rah:", tokens[0].content
tokens = @sf.lines[3].tokens
assert_equal :symbol, tokens[0].type
assert_equal "%s<rah<rah>rah>", tokens[0].content
end

end
