From 0a0c02adfea96316a3abeac105b764170aba1815 Mon Sep 17 00:00:00 2001 From: julp Date: Sat, 26 Dec 2015 23:45:20 +0100 Subject: [PATCH 1/5] Bring some improvements to PHP lexer: * translation of PHP's lexer into rouge * add lexer option short_open_tag to permit highlighting as if short_open_tag were off when disabled (enabled by default) * add some missing features: + binary numbers (introduced by PHP 5.4.0) + string interpolation in backquoted strings + Unicode codepoints escape syntax (\u{...}) (introduced by PHP 7.0.0) * stricter syntax: + goto(state) +# - yy_push_state(state) => push(state) +# - yy_pop_state() => pop! +# - yyless => a (positive) lookahead assertion (?=...) +# +# Note that third argument of RegexLexer#rule (if used) internally calls push, not goto +# +# Elements not handled: +# - asp_tags (<%=? ... %>) (removed by PHP 7.0.0) +# - "long" tags () (also removed by PHP 7.0.0) +# - major part of keywords can be used as identifier (introduced by PHP 7.0.0) + module Rouge module Lexers class PHP < TemplateLexer @@ -19,6 +33,7 @@ def initialize(opts={}) # if truthy, the lexer starts highlighting with php code # (no Error, +# :var_offset => Error, + h[:nowdoc] = Str::Heredoc + h[:heredoc] = Str::Heredoc + h[:backquote] = Str::Backtick + h[:single_quotes] = Str::Single + h[:double_quotes] = Str::Double + end + + # assignment by default + state :default do + rule /#{NEWLINE}/ do + token DEFAULTS[state.name.to_sym] + end + rule /./ do + token DEFAULTS[state.name.to_sym] + end + end + + # common rules for variables interpolation (its use implies common mixin) + state :interpolation do + rule /\$\{/, Str::Interpol, :looking_for_varname + rule /\{(?=\$)/, Str::Interpol, :in_scripting + rule /\$#{LABEL}(?=->#{LABEL_FIRST_CHAR})/, Name::Variable, :looking_for_property + rule /\$#{LABEL}(?=\[)/, Name::Variable, :var_offset + + # Unicode escaped sequences (\u{...}) (PHP >= 7) + rule /\\u\{[0-9a-fA-F]+\}/, Str::Escape + # regular octal (\0DD), hexadecimal (\xDD) and "named" (eg \n) sequences + rule /\\0[0-9]{2}|\\[xX][0-9A-Fa-f]{2}|\\[$efrntv\\]/, Str::Escape + end + + # handle end of (here|now)doc (its use implies default mixin) + # "label" ending the (here|now)doc must be: + # - at the beginning of the line (no indentation) + # - followed by a ; then a newline even if it is the last instruction of the PHP block + state :here_and_now_doc do + rule /^(#{LABEL})(;)(?=#{NEWLINE})/ do |m| + if m[1] == @label + groups Name::Constant, Punctuation + goto :in_scripting + else + token DEFAULTS[state.name.to_sym] + end + end + end + + ##### real lexer states ##### + state :root do - rule /<\?(php|=)?/, Comment::Preproc, :php + rule /(<\?php)([ \t]|#{NEWLINE})/i do + groups Comment::Preproc, Text + goto :in_scripting + end + rule /<\?=?/ do |m| + if '/, Comment::Preproc, :pop! - # heredocs - rule /<<<('?)([a-z_]\w*)\1\n.*?\n\2;?\n/im, Str::Heredoc - rule /\s+/, Text - rule /#.*?\n/, Comment::Single - rule %r(//.*?\n), Comment::Single + state :in_scripting do + rule /\?>/ do + token Comment::Preproc + goto :root + end + rule /#{WHITESPACE}/, Text + rule /#.*/, Comment::Single + rule %r(//.*), Comment::Single # empty comment, otherwise seen as the start of a docstring rule %r(/\*\*/), Comment::Multiline - rule %r(/\*\*.*?\*/)m, Str::Doc + rule %r(/\*\*#{WHITESPACE}.*?\*/)m, Str::Doc rule %r(/\*.*?\*/)m, Comment::Multiline - rule /(->|::)(\s*)([a-zA-Z_][a-zA-Z0-9_]*)/ do + + rule /(::)(\s*)(#{LABEL})/ do groups Operator, Text, Name::Attribute end - rule /[~!%^&*+=\|:.<>\/?@-]+/, Operator - rule /[\[\]{}();,]/, Punctuation - rule /class\b/, Keyword, :classname - # anonymous functions - rule /(function)(\s*)(?=\()/ do - groups Keyword, Text + rule /\(#{TABS_AND_SPACES}(?:int(?:eger)?|real|double|float|string|binary|array|object|bool(?:ean)|unset)#{TABS_AND_SPACES}\)/i, Operator + + rule /[\[\]();,]+/, Punctuation + + rule /\{/, Punctuation, :in_scripting + rule /}/ do + pop! + token state?(:in_scripting) ? Punctuation : Str::Interpol end - # named functions - rule /(function)(\s+)(&?)(\s*)/ do - groups Keyword, Text, Operator, Text - push :funcname + rule /(class)(#{WHITESPACE})(#{LABEL})/i do + groups Keyword, Text, Name::Class end - rule /(const)(\s+)([a-zA-Z_]\w*)/i do + # these two gives inconsistent results when preceded by "use". Eg: + # use function A\B\c as foo, A\B\d as bar /* ... */; + # A\B\c will be highlighted as Name::Function when A\B\d as Name::Other + # (by rule /#{NAMESPACED_LABEL}/) + # we can't rely on a negative lookbehind assertion (?= 7) + rule /(yield)(#{WHITESPACE})(from)/i do + groups Keyword, Text, Keyword end + rule /(true|false|null)\b/i, Keyword::Constant + rule /(int|float|bool|string|resource|object|mixed|numeric|void)\b/i, Keyword::Type + rule /(\d+\.\d*|\d*\.\d+)(e[+-]?\d+)?/i, Num::Float rule /\d+e[+-]?\d+/i, Num::Float - rule /0[0-7]+/, Num::Oct - rule /0x[a-f0-9]+/i, Num::Hex + rule /#{NUMBER_OCT}/, Num::Oct + rule /#{NUMBER_BIN}/, Num::Bin + rule /#{NUMBER_HEX}/, Num::Hex rule /\d+/, Num::Integer - rule /'([^'\\]*(?:\\.[^'\\]*)*)'/, Str::Single - rule /`([^`\\]*(?:\\.[^`\\]*)*)`/, Str::Backtick - rule /"/, Str::Double, :string + + rule /b?"/ do + token Str::Double + goto :double_quotes + end + rule /b?'/ do + token Str::Single + goto :single_quotes + end + rule /`/ do + token Str::Backtick + goto :backquote + end + rule /(b?<<<)(#{TABS_AND_SPACES})(["']?)(#{LABEL})(\3)(#{NEWLINE})/ do |m| + @label = m[4] + groups Operator, Text, Str::Heredoc, Name::Constant, Str::Heredoc, Text + goto "'" == m[3] ? :nowdoc : :heredoc + end + + rule /#{NAMESPACED_LABEL}/ do |m| + token ( + if m[0].include? '\\' + Name::Other + else + downcased = m[0].downcase + if self.class.keywords.include? downcased + Keyword + elsif self.builtins.include? downcased + Name::Builtin + else + Name::Other + end + end + ) + end + + rule /->/, Operator, :looking_for_property + # keep this last to not have precedence and create conflicts + rule /[~!%^&*+=\|:.<>\/?@-]+/, Operator + + mixin :common + mixin :default end - state :classname do - rule /\s+/, Text - rule /[a-z_][\\\w]*/i, Name::Class, :pop! + # handles "short" (ie not surrounded by brackets) + # interpolation of array or string variable offset (eg: "$foo[...]") + state :var_offset do + rule /\[/, Punctuation + rule /]/, Punctuation, :pop! + rule /#{NAMESPACED_LABEL}/, Name::Constant + # non decimal integers are treated as strings + rule /#{NUMBER_BIN}|#{NUMBER_OCT}|#{NUMBER_HEX}/, Str + rule /0|[1-9]\d*/, Num::Integer + + mixin :common + mixin :default end - state :funcname do - rule /[a-z_]\w*/i, Name::Function, :pop! + state :looking_for_property do + rule /->/, Operator + rule /#{LABEL}/, Name::Variable::Instance, :pop! # instance variable or method name (instance method call) + rule /#{WHITESPACE}/, Text + rule /./ do + pop! + restart! + end end - state :string do - rule /"/, Str::Double, :pop! - rule /[^\\{$"]+/, Str::Double - rule /\\([nrt\"$\\]|[0-7]{1,3}|x[0-9A-Fa-f]{1,2})/, - Str::Escape - rule /\$[a-zA-Z_][a-zA-Z0-9_]*(\[\S+\]|->[a-zA-Z_][a-zA-Z0-9_]*)?/, Name::Variable + state :looking_for_varname do + rule /#{LABEL}(?=[\[}])/ do + pop! + push :in_scripting + token Name::Variable + end + rule /./ do |m| + pop! + push :in_scripting + restart! + end + end + + state :nowdoc do + mixin :here_and_now_doc + mixin :default + end - rule /\{\$\{/, Str::Interpol, :interp_double - rule /\{(?=\$)/, Str::Interpol, :interp_single - rule /(\{)(\S+)(\})/ do - groups Str::Interpol, Name::Variable, Str::Interpol + state :heredoc do + mixin :common + mixin :interpolation + mixin :here_and_now_doc + mixin :default + end + + state :backquote do + mixin :common + mixin :interpolation + + rule /\\[\\`]/, Str::Escape + rule /`/ do + token Str::Backtick + goto :in_scripting end - rule /[${\\]+/, Str::Double + mixin :default end - state :interp_double do - rule /\}\}/, Str::Interpol, :pop! - mixin :php + state :single_quotes do + rule /\\[\\']/, Str::Escape + rule /'/ do + token Str::Single + goto :in_scripting + end + + mixin :default end - state :interp_single do - rule /\}/, Str::Interpol, :pop! - mixin :php + state :double_quotes do + mixin :interpolation + mixin :common + + rule /\\[\\"]/, Str::Escape + # end of (double quoted) string + rule /"/ do + token Str::Double + goto :in_scripting + end + + mixin :default end end end diff --git a/lib/rouge/regex_lexer.rb b/lib/rouge/regex_lexer.rb index dd2a75c2ca..15bbb8346d 100644 --- a/lib/rouge/regex_lexer.rb +++ b/lib/rouge/regex_lexer.rb @@ -322,6 +322,12 @@ def step(state, stream) false end + # Restore internal stream at its state before the last token read. + # You can just go back before last token, not further + def restart! + @current_stream.unscan + end + # Yield a token. # # @param tok From 74d738cbe1088ce5a53edaaa6f7d8985113d0a88 Mon Sep 17 00:00:00 2001 From: julp Date: Fri, 3 Jun 2016 15:40:56 +0200 Subject: [PATCH 2/5] remove restart! by using rule // --- lib/rouge/lexers/php.rb | 6 ++---- lib/rouge/regex_lexer.rb | 6 ------ 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/lib/rouge/lexers/php.rb b/lib/rouge/lexers/php.rb index cf6df0c6b8..4a760933ff 100644 --- a/lib/rouge/lexers/php.rb +++ b/lib/rouge/lexers/php.rb @@ -301,9 +301,8 @@ def self.analyze_text(text) rule /->/, Operator rule /#{LABEL}/, Name::Variable::Instance, :pop! # instance variable or method name (instance method call) rule /#{WHITESPACE}/, Text - rule /./ do + rule // do pop! - restart! end end @@ -313,10 +312,9 @@ def self.analyze_text(text) push :in_scripting token Name::Variable end - rule /./ do |m| + rule // do |m| pop! push :in_scripting - restart! end end diff --git a/lib/rouge/regex_lexer.rb b/lib/rouge/regex_lexer.rb index 15bbb8346d..dd2a75c2ca 100644 --- a/lib/rouge/regex_lexer.rb +++ b/lib/rouge/regex_lexer.rb @@ -322,12 +322,6 @@ def step(state, stream) false end - # Restore internal stream at its state before the last token read. - # You can just go back before last token, not further - def restart! - @current_stream.unscan - end - # Yield a token. # # @param tok From fd74a12b9d31f2ec3966bc727d1f0fc56c4b4030 Mon Sep 17 00:00:00 2001 From: julp Date: Fri, 3 Jun 2016 15:43:00 +0200 Subject: [PATCH 3/5] remove commented code --- lib/rouge/lexers/php.rb | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/rouge/lexers/php.rb b/lib/rouge/lexers/php.rb index 4a760933ff..bdba0c10fd 100644 --- a/lib/rouge/lexers/php.rb +++ b/lib/rouge/lexers/php.rb @@ -108,8 +108,6 @@ def self.analyze_text(text) end DEFAULTS = Hash.new(Error).tap do |h| -# :in_scripting => Error, -# :var_offset => Error, h[:nowdoc] = Str::Heredoc h[:heredoc] = Str::Heredoc h[:backquote] = Str::Backtick From b1b6d9bc9125e40bbddc5090f26a9812a849023a Mon Sep 17 00:00:00 2001 From: julp Date: Wed, 21 Dec 2016 16:58:10 +0100 Subject: [PATCH 4/5] fix heredoc/nowdoc when ending label is not followed by a semi-colon --- lib/rouge/lexers/php.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/rouge/lexers/php.rb b/lib/rouge/lexers/php.rb index bdba0c10fd..3d43a8adaf 100644 --- a/lib/rouge/lexers/php.rb +++ b/lib/rouge/lexers/php.rb @@ -143,7 +143,7 @@ def self.analyze_text(text) # - at the beginning of the line (no indentation) # - followed by a ; then a newline even if it is the last instruction of the PHP block state :here_and_now_doc do - rule /^(#{LABEL})(;)(?=#{NEWLINE})/ do |m| + rule /^(#{LABEL})(;?)(?=#{NEWLINE})/ do |m| if m[1] == @label groups Name::Constant, Punctuation goto :in_scripting From b70a812e233a77f994d96cfb4174fc86ac0058c9 Mon Sep 17 00:00:00 2001 From: julp Date: Wed, 21 Dec 2016 17:04:04 +0100 Subject: [PATCH 5/5] update typehinting for PHP 7.1.0 --- lib/rouge/lexers/php.rb | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/rouge/lexers/php.rb b/lib/rouge/lexers/php.rb index 3d43a8adaf..4bb70574bb 100644 --- a/lib/rouge/lexers/php.rb +++ b/lib/rouge/lexers/php.rb @@ -228,8 +228,13 @@ def self.analyze_text(text) groups Keyword, Text, Keyword end - rule /(true|false|null)\b/i, Keyword::Constant - rule /(int|float|bool|string|resource|object|mixed|numeric|void)\b/i, Keyword::Type + rule /(?:true|false|null)\b/i, Keyword::Constant + # PHP 7.0: generalized typehinting + # PHP 7.1: null allowed by prefixing type by a '?' + void (which is not nullable) and iterable added + # from builtin_types in Zend/zend_compile.c + rule /(?:void|\??(?:int|float|bool|string|iterable))\b/i, Keyword::Type + # PHP 7.1: self and callable are keywords, handle them here as nullable types + rule /\??(?:self|callable)\b/i, Keyword::Type rule /(\d+\.\d*|\d*\.\d+)(e[+-]?\d+)?/i, Num::Float rule /\d+e[+-]?\d+/i, Num::Float