Add Exclusion Filters #705

Merged
merged 23 commits on Dec 29, 2020
Changes from 1 commit
Highlight rows when testing regex string
* Add testing feature to Exclusion dialog to allow users to test regexes against an arbitrary string.
* Fixed test suites.
* Improve comments and help dialog box.
glubsy committed Sep 1, 2020
commit ea11a566af4e351760b9fc2aca1fa1fa9cc73366
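Before the file-by-file diff, here is a minimal, self-contained sketch of the idea this commit adds: compile each exclusion regex, match it against an arbitrary test string, and flag the ones that match so the dialog can highlight their rows. The helper name highlight_matches and the sample patterns are illustrative only and are not part of the dupeGuru codebase.

import re

def highlight_matches(patterns, test_string):
    """Return a {pattern: bool} map telling which regexes match test_string."""
    results = {}
    for pattern in patterns:
        try:
            compiled = re.compile(pattern)
        except re.error:
            # Uncompilable regexes never highlight; the real exclude list also
            # stores the compilation error so the dialog can report it.
            results[pattern] = False
            continue
        # match() anchors at the start of the string, mirroring the dialog code below
        results[pattern] = compiled.match(test_string) is not None
    return results

print(highlight_matches([r"^\..*", r".*\.tmp$"], ".hidden_file"))
# {'^\\..*': True, '.*\\.tmp$': False}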
2 changes: 0 additions & 2 deletions core/directories.py
@@ -59,8 +59,6 @@ def __init__(self, exclude_list=None):
# {path: state}
self.states = {}
self._exclude_list = exclude_list
if exclude_list is not None:
exclude_list._combined_regex = False # TODO make a setter

def __contains__(self, path):
for p in self._dirs:
138 changes: 78 additions & 60 deletions core/exclude.py
@@ -20,7 +20,7 @@
r"^\$Recycle\.Bin$", # Windows
r"^\..*" # Hidden files
]
# These are too agressive
# These are too broad
forbidden_regexes = [r".*", r"\/.*", r".*\/.*", r".*\..*"]


@@ -53,20 +53,21 @@ def __init__(self, arg="Expression is already in excluded list."):


class ExcludeList(Markable):
"""Exclude list of regular expression strings to filter out directories
and files that we want to avoid scanning.
The list() class allows us to preserve item order without too much hassle.
The downside is we have to compare strings every time we look for an item in the list
since we use regex strings as keys.
[regex:str, compilable:bool, error:Exception, compiled:Pattern])
If combined_regex is True, the compiled regexes will be combined into one Pattern
instead of returned as separate Patterns.
"""
"""A list of lists holding regular expression strings and the compiled re.Pattern"""

# Used to filter out directories and files that we would rather avoid scanning.
# The list() class allows us to preserve item order without too much hassle.
# The downside is we have to compare strings every time we look for an item in the list
# since we use regex strings as keys.
# If _use_union is True, the compiled regexes will be combined into one single
# Pattern instead of separate Patterns which may or may not give better
# performance compared to looping through each Pattern individually.

# ---Override
def __init__(self, combined_regex=False):
def __init__(self, union_regex=True):
Markable.__init__(self)
self._use_combined = combined_regex
self._use_union = union_regex
# list([str regex, bool iscompilable, re.error exception, Pattern compiled], ...)
self._excluded = []
self._excluded_compiled = set()
self._dirty = True
@@ -85,6 +86,7 @@ def __len__(self):
return len(self._excluded)

def __getitem__(self, key):
"""Returns the list item corresponding to key."""
for item in self._excluded:
if item[0] == key:
return item
@@ -98,6 +100,10 @@ def __delitem__(self, key):
# TODO if necessary
pass

def get_compiled(self, key):
"""Returns the (precompiled) Pattern for key"""
return self.__getitem__(key)[3]

def is_markable(self, regex):
return self._is_markable(regex)

@@ -106,7 +112,7 @@ def _is_markable(self, regex):
for item in self._excluded:
if item[0] == regex:
return item[1]
return False # should not be necessary, regex SHOULD be in there
return False # should not be necessary, the regex SHOULD be in there

def _did_mark(self, regex):
self._add_compiled(regex)
@@ -116,7 +122,7 @@ def _did_unmark(self, regex):

def _add_compiled(self, regex):
self._dirty = True
if self._use_combined:
if self._use_union:
return
for item in self._excluded:
# FIXME probably faster to just rebuild the set from the compiled instead of comparing strings
@@ -127,7 +133,7 @@ def _add_compiled(self, regex):

def _remove_compiled(self, regex):
self._dirty = True
if self._use_combined:
if self._use_union:
return
for item in self._excluded_compiled:
if regex in item.pattern:
@@ -153,74 +159,78 @@ def compile_re(self, regex):
return True, None, compiled

def error(self, regex):
"""Return the compilation error Exception for regex. It should have a "msg" attr."""
"""Return the compilation error Exception for regex.
It should have a "msg" attr."""
for item in self._excluded:
if item[0] == regex:
return item[2]

def build_compiled_caches(self, combined=False):
if not combined:
def build_compiled_caches(self, union=False):
if not union:
self._cached_compiled_files =\
[x for x in self._excluded_compiled if sep not in x.pattern]
self._cached_compiled_paths =\
[x for x in self._excluded_compiled if sep in x.pattern]
return
# HACK returned as a tuple to get a free iterator to keep interface the same
# regardless of whether the client asked for combined or not
marked_count = [x for marked, x in self if marked]
# If there is no item, the compiled Pattern will be '' and match everything!
if not marked_count:
self._cached_compiled_combined_all = []
self._cached_compiled_combined_files = []
self._cached_compiled_combined_paths = []
self._cached_compiled_union_all = []
self._cached_compiled_union_files = []
self._cached_compiled_union_paths = []
else:
self._cached_compiled_combined_all =\
# HACK returned as a tuple to get a free iterator and keep interface
# the same regardless of whether the client asked for union or not
self._cached_compiled_union_all =\
(re.compile('|'.join(marked_count)),)
files_marked = [x for x in marked_count if sep not in x]
if not files_marked:
self._cached_compiled_combined_files = tuple()
self._cached_compiled_union_files = tuple()
else:
self._cached_compiled_combined_files =\
self._cached_compiled_union_files =\
(re.compile('|'.join(files_marked)),)
paths_marked = [x for x in marked_count if sep in x]
if not paths_marked:
self._cached_compiled_combined_paths = tuple()
self._cached_compiled_union_paths = tuple()
else:
self._cached_compiled_combined_paths =\
self._cached_compiled_union_paths =\
(re.compile('|'.join(paths_marked)),)

@property
def compiled(self):
"""Should be used by other classes to retrieve the up-to-date list of patterns."""
if self._use_combined:
if self._use_union:
if self._dirty:
self.build_compiled_caches(True)
self._dirty = False
return self._cached_compiled_combined_all
return self._cached_compiled_union_all
return self._excluded_compiled

@property
def compiled_files(self):
"""When matching against filenames only, we probably won't be seeing any
directory separator, so we filter out regexes with os.sep in them.
The interface should be expected to be a generator, even if it returns only
one item (one Pattern in the combined case)."""
one item (one Pattern in the union case)."""
if self._dirty:
self.build_compiled_caches(True if self._use_combined else False)
self.build_compiled_caches(True if self._use_union else False)
self._dirty = False
return self._cached_compiled_combined_files if self._use_combined else self._cached_compiled_files
return self._cached_compiled_union_files if self._use_union\
else self._cached_compiled_files

@property
def compiled_paths(self):
"""Returns patterns with only separators in them, for more precise filtering."""
if self._dirty:
self.build_compiled_caches(True if self._use_combined else False)
self.build_compiled_caches(True if self._use_union else False)
self._dirty = False
return self._cached_compiled_combined_paths if self._use_combined else self._cached_compiled_paths
return self._cached_compiled_union_paths if self._use_union\
else self._cached_compiled_paths

# ---Public
def add(self, regex, forced=False):
"""This interface should throw exceptions if there is an error during regex compilation"""
"""This interface should throw exceptions if there is an error during
regex compilation"""
if self.isExcluded(regex):
# This exception should never be ignored
raise AlreadyThereException()
@@ -229,7 +239,8 @@ def add(self, regex, forced=False):

iscompilable, exception, compiled = self.compile_re(regex)
if not iscompilable and not forced:
# This exception can be ignored, but taken into account to avoid adding to compiled set
# This exception can be ignored, but taken into account
# to avoid adding to compiled set
raise exception
else:
self._do_add(regex, iscompilable, exception, compiled)
@@ -249,18 +260,13 @@ def isExcluded(self, regex):
return True
return False

def clear(self):
"""Not used and needs refactoring"""
self._excluded = []

def remove(self, regex):
for item in self._excluded:
if item[0] == regex:
self._excluded.remove(item)
self._remove_compiled(regex)

def rename(self, regex, newregex):
# if regex not in self._excluded: return
if regex == newregex:
return
found = False
@@ -318,7 +324,8 @@ def load_from_xml(self, infile):
# "forced" avoids compilation exceptions and adds anyway
self.add(regex_string, forced=True)
except AlreadyThereException:
logging.error(f"Regex \"{regex_string}\" loaded from XML was already present in the list.")
logging.error(f"Regex \"{regex_string}\" \
loaded from XML was already present in the list.")
continue
if exclude_item.get("marked") == "y":
marked.add(regex_string)
@@ -328,9 +335,7 @@

def save_to_xml(self, outfile):
"""Create a XML file that can be used by load_from_xml.

outfile can be a file object or a filename.
"""
outfile can be a file object or a filename."""
root = ET.Element("exclude_list")
# reversed in order to keep order of entries when reloading from xml later
for item in reversed(self._excluded):
@@ -343,15 +348,23 @@ def save_to_xml(self, outfile):


class ExcludeDict(ExcludeList):
"""Version implemented around a dictionary instead of a list, which implies
to keep the index of each string-key as its sub-element and keep it updated
whenever insert/remove is done."""
"""Exclusion list holding a set of regular expressions as keys, the compiled
Pattern, compilation error and compilable boolean as values."""
# Implementation built around a dictionary instead of a list, which implies
# keeping the index of each string key as a sub-element and keeping it updated
# whenever an insert/remove is done.

def __init__(self, combined_regex=False):
def __init__(self, union_regex=False):
Markable.__init__(self)
self._use_combined = combined_regex
# { "regex": { "index": int, "compilable": bool, "error": str, "compiled": Pattern or None}}
# Note: "compilable" key should only be updated on add / rename
self._use_union = union_regex
# { "regex string":
# {
# "index": int,
# "compilable": bool,
# "error": str,
# "compiled": Pattern or None
# }
# }
self._excluded = {}
self._excluded_compiled = set()
self._dirty = True
@@ -361,6 +374,14 @@ def __iter__(self):
for regex in ordered_keys(self._excluded):
yield self.is_marked(regex), regex

def __getitem__(self, key):
"""Returns the dict item correponding to key"""
return self._excluded.__getitem__(key)

def get_compiled(self, key):
"""Returns the compiled item for key"""
return self.__getitem__(key).get("compiled")

def is_markable(self, regex):
return self._is_markable(regex)

@@ -373,12 +394,12 @@ def _is_markable(self, regex):

def _add_compiled(self, regex):
self._dirty = True
if self._use_combined:
if self._use_union:
return
try:
self._excluded_compiled.add(self._excluded[regex]["compiled"])
except Exception as e:
print(f"Exception while adding regex {regex} to compiled set: {e}")
logging.warning(f"Exception while adding regex {regex} to compiled set: {e}")
return

def is_compilable(self, regex):
@@ -391,7 +412,8 @@ def error(self, regex):

# ---Public
def _do_add(self, regex, iscompilable, exception, compiled):
# We always insert at the top, so index should be 0 and other indices should be pushed by one
# We always insert at the top, so index should be 0
# and other indices should be pushed by one
for value in self._excluded.values():
value["index"] += 1
self._excluded[regex] = {
@@ -406,10 +428,6 @@ def isExcluded(self, regex):
return True
return False

def clear(self):
"""Not used, need refactoring"""
self._excluded = {}

def remove(self, regex):
old_value = self._excluded.pop(regex)
# Bring down all indices which were above it
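The ExcludeList comments above describe an optional union mode: instead of looping over one compiled Pattern per marked regex, build_compiled_caches() can join all marked regex strings with '|' into a single Pattern, which may or may not be faster. A minimal sketch of the two strategies follows; the patterns list and both helper names are illustrative and not part of the codebase.

import re

patterns = [r"^\$Recycle\.Bin$", r"^\..*"]  # illustrative subset of the default regexes

def matches_any_loop(name, regexes=patterns):
    # Per-pattern strategy: compile and test each regex individually.
    return any(re.compile(r).match(name) for r in regexes)

def matches_any_union(name, regexes=patterns):
    # Union strategy: a single alternation, as build_compiled_caches() does with
    # re.compile('|'.join(...)). Note the guard in the real code: joining an
    # empty list yields '' which would match everything.
    combined = re.compile("|".join(regexes))
    return combined.match(name) is not None

print(matches_any_loop(".git"), matches_any_union(".git"))    # True True
print(matches_any_loop("photo"), matches_any_union("photo"))  # False False

Either way the caller still iterates over "the patterns", which is why the union caches above are wrapped in one-element tuples: the interface stays the same whether union mode is on or off.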
22 changes: 17 additions & 5 deletions core/gui/exclude_list_dialog.py
@@ -7,13 +7,10 @@

# from hscommon.trans import tr
from .exclude_list_table import ExcludeListTable
import logging


class ExcludeListDialogCore:
# --- View interface
# show()
#

def __init__(self, app):
self.app = app
self.exclude_list = self.app.exclude_list # Markable from exclude.py
@@ -43,7 +40,7 @@ def rename_selected(self, newregex):
self.refresh()
return True
except Exception as e:
print(f"dupeGuru Warning: {e}")
logging.warning(f"Error while renaming regex to {newregex}: {e}")
return False

def add(self, regex):
@@ -54,5 +51,20 @@ def add(self, regex):
self.exclude_list.mark(regex)
self.exclude_list_table.add(regex)

def test_string(self, test_string):
"""Sets property on row to highlight if its regex matches test_string supplied."""
matched = False
for row in self.exclude_list_table.rows:
if self.exclude_list.get_compiled(row.regex).match(test_string):
matched = True
row.highlight = True
else:
row.highlight = False
return matched

def reset_rows_highlight(self):
for row in self.exclude_list_table.rows:
row.highlight = False

def show(self):
self.view.show()
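For reference, a self-contained approximation of the new test_string() flow, using a stand-in Row class instead of the real ExcludeListTable rows (the real rows come from the GUI table model and carry more state, and the real method looks up the precompiled Pattern through exclude_list.get_compiled(row.regex) instead of recompiling on every call):

import re

class Row:
    # Stand-in for an ExcludeListTable row: just the regex string and a highlight flag.
    def __init__(self, regex):
        self.regex = regex
        self.highlight = False

def test_string(rows, test_string):
    # Flag every row whose regex matches, clear the others, and report
    # whether anything matched at all, mirroring the dialog method above.
    matched = False
    for row in rows:
        if re.compile(row.regex).match(test_string):
            row.highlight = True
            matched = True
        else:
            row.highlight = False
    return matched

rows = [Row(r"^\..*"), Row(r".*\.png$")]
print(test_string(rows, ".hidden"))   # True
print([r.highlight for r in rows])    # [True, False]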