Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Exclusion Filters #705

Merged
merged 23 commits into from
Dec 29, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Concatenate regexes prior to compilation
* Concatenating regexes into one Pattern might yield better performance under (un)certain conditions.
* Filenames are tested against regexes with no os.sep in them. This may or may not be what we want to do.
An alternative would be to test against the whole (absolute) path of each file, which would filter more aggressively.
  • Loading branch information
glubsy committed Aug 20, 2020
commit 9f223f3964fca3d0471d2994b1636f5b0b3f1433
2 changes: 1 addition & 1 deletion core/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def __init__(self, view):
self.app_mode = AppMode.Standard
self.discarded_file_count = 0
self.exclude_list = ExcludeList()
self.directories = directories.Directories()
self.directories = directories.Directories(self.exclude_list)
self.results = results.Results(self)
self.ignore_list = IgnoreList()
# In addition to "app-level" options, this dictionary also holds options that will be
Expand Down
70 changes: 38 additions & 32 deletions core/directories.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from hscommon.util import FileOrPath

from . import fs
from .exclude import ExcludeList

__all__ = [
"Directories",
Expand Down Expand Up @@ -53,17 +52,15 @@ class Directories:
Then, when the user starts the scan, :meth:`get_files` is called to retrieve all files (wrapped
in :mod:`core.fs`) that have to be scanned according to the chosen folders/states.
"""
# FIXME: if there is zero item in these sets, the for each loops will yield NOTHING
deny_list_str = set()
deny_list_re = set()
deny_list_re_files = set()

# ---Override
def __init__(self, excluded=ExcludeList()):
def __init__(self, exclude_list=None):
self._dirs = []
# {path: state}
self.states = {}
self._excluded = excluded
self._exclude_list = exclude_list
if exclude_list is not None:
exclude_list._combined_regex = False # TODO make a setter

def __contains__(self, path):
for p in self._dirs:
Expand All @@ -81,57 +78,66 @@ def __len__(self):
return len(self._dirs)

# ---Private
def _default_state_for_path(self, path, deny_list_re=deny_list_re):
def _default_state_for_path(self, path):
# New logic with regex filters
if self._exclude_list is not None and len(self._exclude_list) > 0:
# We iterate even if we only have one item here
for denied_path_re in self._exclude_list.compiled_combined:
if denied_path_re.match(str(path)):
return DirectoryState.Excluded
return None
# Old default logic, still used during initialization of DirectoryTree:
# Override this in subclasses to specify the state of some special folders.
# if path.name.startswith("."): # hidden
# return DirectoryState.Excluded
for denied_path_re in deny_list_re:
if denied_path_re.match(str(path)):
return DirectoryState.Excluded
if path.name.startswith("."):
return DirectoryState.Excluded

def _get_files(self, from_path, fileclasses, j, deny_list_re=deny_list_re_files):
def _get_files(self, from_path, fileclasses, j):
for root, dirs, files in os.walk(str(from_path)):
j.check_if_cancelled()
root = Path(root)
rootPath = Path(root)
state = self.get_state(root)
if state == DirectoryState.Excluded:
# Recursively get files from folders with lots of subfolder is expensive. However, there
# might be a subfolder in this path that is not excluded. What we want to do is to skim
# through self.states and see if we must continue, or we can stop right here to save time
if not any(p[: len(root)] == root for p in self.states):
if not any(p[: len(rootPath)] == rootPath for p in self.states):
del dirs[:]
try:
if state != DirectoryState.Excluded:
found_files = []
for f in files:
found = False
for expr in deny_list_re:
found = expr.match(f)
if found:
break
if not found:
found_files.append(fs.get_file(root + f, fileclasses=fileclasses))
# Old logic
if self._exclude_list is None or not len(self._exclude_list):
found_files = [fs.get_file(rootPath + f, fileclasses=fileclasses) for f in files]
else:
found_files = []
for f in files:
found = False
for expr in self._exclude_list.compiled_files_combined:
found = expr.match(f)
if found:
break
if not found:
found_files.append(fs.get_file(rootPath + f, fileclasses=fileclasses))
found_files = [f for f in found_files if f is not None]
# In some cases, directories can be considered as files by dupeGuru, which is
# why we have this line below. In fact, there only one case: Bundle files under
# OS X... In other situations, this forloop will do nothing.
for d in dirs[:]:
f = fs.get_file(root + d, fileclasses=fileclasses)
f = fs.get_file(rootPath + d, fileclasses=fileclasses)
if f is not None:
found_files.append(f)
dirs.remove(d)
logging.debug(
"Collected %d files in folder %s",
len(found_files),
str(root),
str(rootPath),
)
for file in found_files:
file.is_ref = state == DirectoryState.Reference
yield file
except (EnvironmentError, fs.InvalidPath):
pass

def _get_folders(self, from_folder, j, deny_list_re=deny_list_re):
def _get_folders(self, from_folder, j):
j.check_if_cancelled()
try:
for subfolder in from_folder.subfolders:
Expand Down Expand Up @@ -177,15 +183,15 @@ def get_subfolders(path):
except EnvironmentError:
return []

def get_files(self, fileclasses=None, j=job.nulljob, deny_list_re=deny_list_re_files):
def get_files(self, fileclasses=None, j=job.nulljob):
"""Returns a list of all files that are not excluded.

Returned files also have their ``is_ref`` attr set if applicable.
"""
if fileclasses is None:
fileclasses = [fs.File]
for path in self._dirs:
for file in self._get_files(path, fileclasses=fileclasses, j=j, deny_list_re=deny_list_re):
for file in self._get_files(path, fileclasses=fileclasses, j=j):
yield file

def get_folders(self, folderclass=None, j=job.nulljob):
Expand All @@ -200,15 +206,15 @@ def get_folders(self, folderclass=None, j=job.nulljob):
for folder in self._get_folders(from_folder, j):
yield folder

def get_state(self, path, deny_list_re=deny_list_re):
def get_state(self, path):
"""Returns the state of ``path``.

:rtype: :class:`DirectoryState`
"""
# direct match? easy result.
if path in self.states:
return self.states[path]
state = self._default_state_for_path(path, deny_list_re) or DirectoryState.Normal
state = self._default_state_for_path(path) or DirectoryState.Normal
prevlen = 0
# we loop through the states to find the longest matching prefix
for p, s in self.states.items():
Expand Down
94 changes: 60 additions & 34 deletions core/exclude.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from .markable import Markable
from xml.etree import ElementTree as ET
# TODO: perhaps use regex module for better Unicode support? https://pypi.org/project/regex/
# or perhaps also https://pypi.org/project/re2/
import re
from os import sep
import logging
Expand Down Expand Up @@ -50,14 +52,18 @@ class ExcludeList(Markable):
The downside is we have to compare strings every time we look for an item in the list
since we use regex strings as keys.
[regex:str, compilable:bool, error:Exception, compiled:Pattern])
If combined_regex is True, the compiled regexes will be combined into one Pattern
instead of returned as separate Patterns.
"""

# ---Override
def __init__(self):
def __init__(self, combined_regex=False):
Markable.__init__(self)
self._combined_regex = combined_regex
self._excluded = []
self._count = 0
self._excluded_compiled = set()
self._dirty = True

def __debug_test(self):
self.test_regexes = [
Expand All @@ -81,30 +87,38 @@ def __iter__(self):
yield self.is_marked(regex), regex

def __len__(self):
return self._count
"""Returns the number of marked regexes."""
return len([x for marked, x in self if marked])

def is_markable(self, regex):
return self._is_markable(regex)

def _is_markable(self, regex):
"""Return the cached result of "compilable" property"""
# FIXME save result of compilation via memoization
# return self._excluded.get(regex)[0]
for item in self._excluded:
if item[0] == regex:
return item[1]
return False # FIXME should not be needed
return False # should not be needed

def _did_mark(self, regex):
self._add_compiled(regex)

def _did_unmark(self, regex):
self._remove_compiled(regex)

def _add_compiled(self, regex):
if self._combined_regex:
self._dirty = True
return
for item in self._excluded:
if item[0] == regex:
# no need to test if already present since it's a set()
self._excluded_compiled.add(item[3])

def _did_unmark(self, regex):
self._remove_compiled(regex)

def _remove_compiled(self, regex):
if self._combined_regex:
self._dirty = True
return
for item in self._excluded_compiled:
if regex in item.pattern:
self._excluded_compiled.remove(item)
Expand Down Expand Up @@ -137,13 +151,41 @@ def error(self, regex):
@property
def compiled(self):
"""Should be used by other classes to retrieve the up-to-date list of patterns."""
return self._excluded_compiled
if not self._combined_regex:
return self._excluded_compiled
else:
return self.compiled_combined

@property
def compiled_files(self):
"""Should be used by other classes to retrieve the up-to-date list of patterns
for files only."""
return [compiled_pattern for compiled_pattern in self.compiled if sep not in compiled_pattern.pattern]
if not self._combined_regex:
# Return each compiled element separately
# return [compiled_pattern for compiled_pattern in self.compiled if sep not in compiled_pattern.pattern]
for compiled in self.compiled:
if sep not in compiled.pattern:
yield compiled
else:
return self.compiled_files_combined

@property
def compiled_combined(self):
if self._dirty:
self._cached_compiled_combined =\
re.compile('|'.join(x for marked, x in self if marked))
# Must compute the filtered out version as well
self._cached_compiled_combined_files =\
re.compile('|'.join(x for marked, x in self
if marked and sep not in x))
self._dirty = False
# returned as a tuple to get a free iterator and to avoid subclassing
return (self._cached_compiled_combined,)

@property
def compiled_files_combined(self):
# returned as a tuple to get a free iterator and to avoid subclassing
return (self._cached_compiled_combined_files,)

# ---Public
def add(self, regex, forced=False):
Expand All @@ -164,7 +206,7 @@ def add(self, regex, forced=False):
def _do_add(self, regex, iscompilable, exception, compiled):
# We need to insert at the top
self._excluded.insert(0, [regex, iscompilable, exception, compiled])
self._count = len(self._excluded)
# self._count = len(self._excluded)

def isExcluded(self, regex):
for item in self._excluded:
Expand All @@ -174,7 +216,6 @@ def isExcluded(self, regex):

def clear(self):
self._excluded = []
self._count = 0

def remove(self, regex):
for item in self._excluded:
Expand Down Expand Up @@ -286,9 +327,6 @@ def __iter__(self):
for regex in ordered_keys(self._excluded):
yield self.is_marked(regex), regex

def __len__(self):
return self._count

def is_markable(self, regex):
return self._is_markable(regex)

Expand All @@ -299,17 +337,16 @@ def _is_markable(self, regex):
return exists.get("compilable")
return False

def _did_mark(self, regex):
# self._excluded[regex][0] = True # is compilable
def _add_compiled(self, regex):
if self._combined_regex:
self._dirty = True
return
try:
self._excluded_compiled.add(self._excluded[regex]["compiled"])
except Exception as e:
print(f"Exception while adding regex {regex} to compiled set: {e}")
return

def _did_unmark(self, regex):
self._remove_compiled(regex)

def is_compilable(self, regex):
"""Returns the cached "compilable" value"""
return self._excluded[regex]["compilable"]
Expand All @@ -318,24 +355,13 @@ def error(self, regex):
"""Return the compilation error message for regex string"""
return self._excluded.get(regex).get("error")

@property
def compiled(self):
"""Should be used by other classes to retrieve the up-to-date list of patterns."""
return self._excluded_compiled

@property
def compiled_files(self):
"""Should be used by other classes to retrieve the up-to-date list of patterns
for files only."""
return [compiled_pattern for compiled_pattern in self.compiled if sep not in compiled_pattern.pattern]

# ---Public
def _do_add(self, regex, iscompilable, exception, compiled):
# We always insert at the top, so index should be 0 and other indices should be pushed by one
for value in self._excluded.values():
value["index"] += 1
self._excluded[regex] = {"index": 0, "compilable": iscompilable, "error": exception, "compiled": compiled}
self._count = len(self._excluded)
# self._count = len(self._excluded)

def isExcluded(self, regex):
if regex in self._excluded.keys():
Expand All @@ -344,13 +370,13 @@ def isExcluded(self, regex):

def clear(self):
self._excluded = {}
self._count = 0

def remove(self, regex):
old_value = self._excluded.pop(regex)
# Bring down all indices which where above it
index = old_value["index"]
if index == len(self._excluded):
if index == len(self._excluded) - 1: # we start at 0...
# Old index was at the end, no need to update other indices
self._remove_compiled(regex)
return

Expand Down
3 changes: 3 additions & 0 deletions qt/exclude_list_dialog.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,13 @@ def _setupUI(self):
gridlayout.addItem(QSpacerItem(0, 0, QSizePolicy.Minimum, QSizePolicy.Expanding), 4, 1)
gridlayout.addWidget(self.buttonClose, 5, 1)
layout.addLayout(gridlayout)
self.linedit.setPlaceholderText("Type a regular expression here...")
self.linedit.setFocus()

# --- model --> view
def show(self):
super().show()
self.linedit.setFocus()

@pyqtSlot()
def addStringFromLineEdit(self):
Expand Down