Skip to content

Commit

Permalink
factor out the guessing infra to Guesser classes
Browse files Browse the repository at this point in the history
  • Loading branch information
http://jneen.net/ committed Jun 7, 2016
1 parent 7db5e04 commit b2d086a
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 85 deletions.
5 changes: 5 additions & 0 deletions lib/rouge.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ def highlight(text, lexer, formatter, &b)
load load_dir.join('rouge/text_analyzer.rb')
load load_dir.join('rouge/token.rb')

load load_dir.join('rouge/guesser.rb')
load load_dir.join('rouge/guessers/filename.rb')
load load_dir.join('rouge/guessers/mimetype.rb')
load load_dir.join('rouge/guessers/source.rb')

load load_dir.join('rouge/lexer.rb')
load load_dir.join('rouge/regex_lexer.rb')
load load_dir.join('rouge/template_lexer.rb')
Expand Down
21 changes: 21 additions & 0 deletions lib/rouge/guesser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
module Rouge
  # Base class for lexer guessers. A guesser narrows a list of candidate
  # lexers through its #filter method; Guesser.guess chains several of
  # them together.
  class Guesser
    # Runs every guesser in order over the candidate lexers.
    #
    # A guesser whose filter result has no truthy elements is ignored, and
    # the candidates it was given are passed on unchanged.
    #
    # @param guessers [Enumerable] objects responding to #filter
    # @param lexers [Array] the initial candidates
    # @return [Array] the narrowed candidates, or [] when the list is not
    #   smaller than it was at the start
    def self.guess(guessers, lexers)
      initial_count = lexers.size

      remaining = guessers.inject(lexers) do |candidates, guesser|
        narrowed = guesser.filter(candidates)
        narrowed.any? ? narrowed : candidates
      end

      # Nothing was ruled out, so there is no basis for a guess at all.
      return [] unless remaining.size < initial_count

      remaining
    end

    # Narrows the given lexers. Subclasses must override this.
    #
    # @param lexers [Array] candidate lexers
    # @raise [RuntimeError] always, in this base class
    def filter(lexers)
      raise 'abstract'
    end
  end
end
41 changes: 41 additions & 0 deletions lib/rouge/guessers/filename.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
module Rouge
  module Guessers
    # Guesser that narrows a list of lexers by matching the basename of a
    # file against each lexer's filename glob patterns.
    class Filename < Guesser
      # The filename this guesser was constructed with.
      attr_reader :filename

      # `fname` is the reader this class originally declared, but it read an
      # instance variable that was never assigned and so always returned nil.
      # It now returns the same value as #filename.
      alias_method :fname, :filename

      # @param filename [String] path or bare name of the file being guessed
      def initialize(filename)
        @filename = filename
        @basename = File.basename(filename)
      end

      # Returns the lexers that match the filename with the best (and equal)
      # specificity, i.e. the fewest wildcards in the matching pattern.
      # This helps disambiguate between, e.g. the Nginx lexer, which
      # matches `nginx.conf`, and the Conf lexer, which matches `*.conf`.
      # In this case, nginx will win because the pattern has no wildcards,
      # while `*.conf` has one.
      #
      # @param lexers [Array] candidates responding to #filenames
      # @return [Array] the most specific matches in their original order,
      #   or +lexers+ itself when no pattern matches
      def filter(lexers)
        scored = lexers.map { |lexer| [lexer, specificity(lexer)] }
        best = scored.map(&:last).compact.min
        return lexers if best.nil?

        scored.select { |_, score| score == best }.map(&:first)
      end

      private

      # Wildcard count of the lexer's most specific pattern that matches the
      # basename, or nil when none of its patterns match.
      def specificity(lexer)
        counts = lexer.filenames.map do |pattern|
          next unless File.fnmatch?(pattern, @basename, File::FNM_DOTMATCH)

          # specificity is better the fewer wildcards there are
          pattern.scan(/[*?\[]/).size
        end

        counts.compact.min
      end
    end
  end
end
14 changes: 14 additions & 0 deletions lib/rouge/guessers/mimetype.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
module Rouge
  module Guessers
    # Guesser that keeps only the lexers which list a given mimetype.
    class Mimetype < Guesser
      # The mimetype this guesser was constructed with.
      attr_reader :mimetype

      # @param mimetype [String] the mimetype to look for
      def initialize(mimetype)
        @mimetype = mimetype
      end

      # @param lexers [Array] candidates responding to #mimetypes
      # @return [Array] the candidates whose mimetypes include the target;
      #   may be empty
      def filter(lexers)
        lexers.select { |candidate| handles?(candidate) }
      end

      private

      # True when the lexer lists the target mimetype.
      def handles?(candidate)
        candidate.mimetypes.include?(@mimetype)
      end
    end
  end
end
48 changes: 48 additions & 0 deletions lib/rouge/guessers/source.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
module Rouge
  module Guessers
    # Guesser that picks the lexer whose analyze_text gives the highest
    # result for a piece of source text.
    class Source < Guesser
      # The String or readable object this guesser was constructed with.
      attr_reader :source

      # @param source [String, #read] the text to analyze, or an object
      #   whose #read returns it
      def initialize(source)
        @source = source
      end

      # @param lexers [Array] candidates responding to #analyze_text
      # @return [Array] a one-element array holding the best candidate, or
      #   an empty array when no candidate scores above the threshold
      # @raise [RuntimeError] when the source is neither a String nor
      #   readable
      def filter(lexers)
        # don't bother reading the input if
        # we've already filtered to 1
        return lexers if lexers.size == 1

        # If we're filtering against *all* lexers, we only use confident return
        # values from analyze_text. But if we've filtered down already, we can trust
        # the analysis more.
        threshold = lexers.size < 10 ? 0 : 0.5

        source_text = case @source
                      when String
                        @source
                      when ->(s) { s.respond_to? :read }
                        @source.read
                      else
                        raise 'invalid source'
                      end

        Lexer.assert_utf8!(source_text)

        source_text = TextAnalyzer.new(source_text)

        best_result = threshold
        best_match = nil
        lexers.each do |lexer|
          # a missing (nil) result counts as zero
          result = lexer.analyze_text(source_text) || 0

          # a result of exactly 1 wins immediately
          return [lexer] if result == 1

          if result > best_result
            best_match = lexer
            best_result = result
          end
        end

        # compact so that "no match" is reported as [] rather than [nil]
        [best_match].compact
      end
    end
  end
end
95 changes: 10 additions & 85 deletions lib/rouge/lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -109,26 +109,16 @@ def all
# to use.
def guesses(info={})
  # Recognised keys in +info+:
  #   :guessers - optional array of caller-supplied guessers, run first
  #   :mimetype - adds a Guessers::Mimetype when present
  #   :filename - adds a Guessers::Filename when present
  #   :source   - adds a Guessers::Source when present
  # The result is whatever Guesser.guess returns for these guessers over
  # the unique registered lexers.
  mimetype, filename, source = info.values_at(:mimetype, :filename, :source)

  # Start from the caller's guessers instead of discarding them, and work
  # on a copy so the appends below never mutate the caller's array.
  guessers = (info[:guessers] || []).dup

  guessers << Guessers::Mimetype.new(mimetype) if mimetype
  guessers << Guessers::Filename.new(filename) if filename
  guessers << Guessers::Source.new(source) if source

  Guesser.guess(guessers, registry.values.uniq)
end

class AmbiguousGuess < StandardError
Expand Down Expand Up @@ -175,71 +165,6 @@ def guess_by_source(source)
end

private
# Keeps the lexers that list the mimetype +mt+. When none of them do,
# the original list is returned unchanged.
def filter_by_mimetype(lexers, mt)
  matching = lexers.select do |candidate|
    candidate.mimetypes.include?(mt)
  end

  return lexers unless matching.any?

  matching
end

# Picks the lexers whose filename patterns match the basename of +fname+
# most specifically, where fewer wildcards in the matching pattern means
# more specific. All lexers tied at the best specificity are returned in
# their original order. For example the Nginx lexer, which matches
# `nginx.conf`, beats the Conf lexer, which matches `*.conf`, because
# the first pattern has no wildcards while the second has one.
# When no lexer matches, the original list is returned unchanged.
def filter_by_filename(lexers, fname)
  base = File.basename(fname)

  # per lexer: wildcard count of its most specific matching pattern,
  # or nil when none of its patterns match
  wildcard_counts = lexers.map do |lexer|
    matching = lexer.filenames.select do |glob|
      File.fnmatch?(glob, base, File::FNM_DOTMATCH)
    end
    matching.map { |glob| glob.scan(/[*?\[]/).size }.min
  end

  lowest = wildcard_counts.compact.min
  return lexers if lowest.nil?

  lexers.zip(wildcard_counts).select { |_, count| count == lowest }.map(&:first)
end

# Returns the lexer whose analyze_text result for +source+ is highest and
# strictly greater than +threshold+, or nil when there is none. A lexer
# whose result equals 1 is returned immediately. +source+ is either a
# String or an object responding to #read; anything else raises.
def best_by_source(lexers, source, threshold=0)
  text =
    if source.is_a?(String)
      source
    elsif source.respond_to?(:read)
      source.read
    else
      raise 'invalid source'
    end

  assert_utf8!(text)
  analyzer = TextAnalyzer.new(text)

  winner = nil
  high_score = threshold
  lexers.each do |candidate|
    # a missing (nil) result counts as zero
    score = candidate.analyze_text(analyzer) || 0
    return candidate if score == 1
    next unless score > high_score

    winner = candidate
    high_score = score
  end

  winner
end

protected
# @private
Expand Down

0 comments on commit b2d086a

Please sign in to comment.