Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Exclusion Filters #705

Merged
merged 23 commits into from
Dec 29, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Ignore path and filename based on regex
* Added initial draft for test suite
* Fixed small logging bug
  • Loading branch information
glubsy committed Aug 3, 2020
commit 470307aa3c1b66bce1fd6551ef9337b43c01693a
58 changes: 45 additions & 13 deletions core/directories.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# http://www.gnu.org/licenses/gpl-3.0.html

import os
import re
from xml.etree import ElementTree as ET
import logging

Expand Down Expand Up @@ -52,12 +53,34 @@ class Directories:
Then, when the user starts the scan, :meth:`get_files` is called to retrieve all files (wrapped
in :mod:`core.fs`) that have to be scanned according to the chosen folders/states.
"""
deny_list_str = set()
deny_list_re = set()
deny_list_re_files = set()

# ---Override
def __init__(self):
    """Start with no directories and no states, then register the default exclusions.

    The default expressions are merged into ``deny_list_str`` and compiled
    right away through :meth:`compile_re`.
    """
    default_expressions = (
        r".*Recycle\.Bin$",
        r"denyme.*",
        r".*denyme",
        r".*/test/denyme*",
        r".*/test/*denyme",
        r"denyme",
        r".*\/\..*",
        r"^\..*",
    )
    self._dirs = []
    # Maps each path to its state.
    self.states = {}
    self.deny_list_str.update(default_expressions)
    self.compile_re()

def compile_re(self):
    """Rebuild the compiled exclusion patterns from ``deny_list_str``.

    Every expression in ``deny_list_str`` that compiles is added to
    ``deny_list_re``; expressions that do not contain ``os.sep`` are added
    to ``deny_list_re_files`` as well. An expression that is not a valid
    regular expression is logged at debug level and skipped.
    """
    # Empty the sets in place instead of rebinding them: the same set objects
    # are used as default argument values elsewhere in this class, so
    # rebinding would leave those defaults pointing at stale patterns.
    self.deny_list_re.clear()
    self.deny_list_re_files.clear()
    for expr in self.deny_list_str:
        try:
            compiled = re.compile(expr)
        except re.error as e:
            logging.debug(
                'Invalid regular expression "%s" in exclude list: %s', expr, e
            )
            continue
        self.deny_list_re.add(compiled)
        # No path separator in the expression: also keep it in the
        # file-name pattern set.
        if os.sep not in expr:
            self.deny_list_re_files.add(compiled)

def __contains__(self, path):
for p in self._dirs:
Expand All @@ -75,12 +98,15 @@ def __len__(self):
return len(self._dirs)

# ---Private
def _default_state_for_path(self, path):
def _default_state_for_path(self, path, deny_list_re=deny_list_re):
# Override this in subclasses to specify the state of some special folders.
if path.name.startswith("."): # hidden
return DirectoryState.Excluded
# if path.name.startswith("."): # hidden
# return DirectoryState.Excluded
for denied_path_re in deny_list_re:
if denied_path_re.match(str(path)):
return DirectoryState.Excluded

def _get_files(self, from_path, fileclasses, j):
def _get_files(self, from_path, fileclasses, j, deny_list_re=deny_list_re_files):
for root, dirs, files in os.walk(str(from_path)):
j.check_if_cancelled()
root = Path(root)
Expand All @@ -93,9 +119,15 @@ def _get_files(self, from_path, fileclasses, j):
del dirs[:]
try:
if state != DirectoryState.Excluded:
found_files = [
fs.get_file(root + f, fileclasses=fileclasses) for f in files
]
found_files = []
for f in files:
found = False
for expr in deny_list_re:
found = expr.match(f)
if found:
break
if not found:
found_files.append(fs.get_file(root + f, fileclasses=fileclasses))
found_files = [f for f in found_files if f is not None]
# In some cases, directories can be considered as files by dupeGuru, which is
# why we have this line below. In fact, there is only one case: Bundle files under
Expand All @@ -108,15 +140,15 @@ def _get_files(self, from_path, fileclasses, j):
logging.debug(
"Collected %d files in folder %s",
len(found_files),
str(from_path),
str(root),
)
for file in found_files:
file.is_ref = state == DirectoryState.Reference
yield file
except (EnvironmentError, fs.InvalidPath):
pass

def _get_folders(self, from_folder, j):
def _get_folders(self, from_folder, j, deny_list_re=deny_list_re):
j.check_if_cancelled()
try:
for subfolder in from_folder.subfolders:
Expand Down Expand Up @@ -162,15 +194,15 @@ def get_subfolders(path):
except EnvironmentError:
return []

def get_files(self, fileclasses=None, j=job.nulljob):
def get_files(self, fileclasses=None, j=job.nulljob, deny_list_re=deny_list_re_files):
"""Returns a list of all files that are not excluded.

Returned files also have their ``is_ref`` attr set if applicable.
"""
if fileclasses is None:
fileclasses = [fs.File]
for path in self._dirs:
for file in self._get_files(path, fileclasses=fileclasses, j=j):
for file in self._get_files(path, fileclasses=fileclasses, j=j, deny_list_re=deny_list_re):
yield file

def get_folders(self, folderclass=None, j=job.nulljob):
Expand All @@ -185,15 +217,15 @@ def get_folders(self, folderclass=None, j=job.nulljob):
for folder in self._get_folders(from_folder, j):
yield folder

def get_state(self, path):
def get_state(self, path, denylist=deny_list_re):
"""Returns the state of ``path``.

:rtype: :class:`DirectoryState`
"""
# direct match? easy result.
if path in self.states:
return self.states[path]
state = self._default_state_for_path(path) or DirectoryState.Normal
state = self._default_state_for_path(path, denylist) or DirectoryState.Normal
prevlen = 0
# we loop through the states to find the longest matching prefix
for p, s in self.states.items():
Expand Down
11 changes: 8 additions & 3 deletions core/fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def can_handle(cls, path):
return not path.islink() and path.isdir()


def get_file(path, fileclasses=[File]):
def get_file(path, fileclasses=[File], deny_list_re=set()):
"""Wraps ``path`` around its appropriate :class:`File` class.

Whether a class is "appropriate" is decided by :meth:`File.can_handle`
Expand All @@ -255,10 +255,15 @@ def get_file(path, fileclasses=[File]):
"""
for fileclass in fileclasses:
if fileclass.can_handle(path):
# print(f"returning {path}")
# for expr in deny_list_re:
# if expr.match(str(path.name)):
# print(f"FOUND {repr(expr)} in {str(path.name)}")
# return
return fileclass(path)


def get_files(path, fileclasses=[File]):
def get_files(path, fileclasses=[File], deny_list_re=set()):
"""Returns a list of :class:`File` for each file contained in ``path``.

:param Path path: path to scan
Expand All @@ -268,7 +273,7 @@ def get_files(path, fileclasses=[File]):
try:
result = []
for path in path.listdir():
file = get_file(path, fileclasses=fileclasses)
file = get_file(path, fileclasses=fileclasses, deny_list_re=deny_list_re)
if file is not None:
result.append(file)
return result
Expand Down
53 changes: 52 additions & 1 deletion core/tests/directories_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ def test_get_state_returns_excluded_by_default_for_hidden_directories(tmpdir):
def test_default_path_state_override(tmpdir):
# It's possible for a subclass to override the default state of a path
class MyDirectories(Directories):
def _default_state_for_path(self, path):
def _default_state_for_path(self, path, denylist):
if "foobar" in path:
return DirectoryState.Excluded

Expand All @@ -341,3 +341,54 @@ def _default_state_for_path(self, path):
d.set_state(p1["foobar"], DirectoryState.Normal)
eq_(d.get_state(p1["foobar"]), DirectoryState.Normal)
eq_(len(list(d.get_files())), 2)


def test_exclude_list_regular_expressions(tmpdir):
    """Deny-list expressions exclude matching folders and files by default.

    Also checks that explicitly setting an excluded folder back to Normal
    makes the files it contains show up in the scan again.
    """
    d = Directories()
    # Start from empty deny lists so only the expressions added below apply.
    d.deny_list_str.clear()
    d.deny_list_re.clear()
    d.deny_list_re_files.clear()
    # This should only exclude the directory, but not the contained files if
    # its status is set to normal after loading it in the directory tree
    d.deny_list_str.add(r".*Recycle\.Bin$")
    d.deny_list_str.add(r"denyme.*")
    d.deny_list_str.add(r".*\/\..*")
    d.deny_list_str.add(r"^\..*")
    d.compile_re()
    p1 = Path(str(tmpdir))
    # Should be ignored on Windows only (by default)
    p1["Recycle.Bin"].mkdir()
    p1["Recycle.Bin/somerecycledfile"].open("w").close()

    p1["denyme_blah.txt"].open("w").close()
    p1["blah_denymetoo"].open("w").close()
    p1["blah_denyme"].open("w").close()

    p1[".hidden_file"].open("w").close()
    p1[".hidden_dir"].mkdir()
    p1[".hidden_dir/somenormalfile1"].open("w").close()
    p1[".hidden_dir/somenormalfile2_denyme"].open("w").close()

    p1["foobar"].mkdir()
    p1["foobar/somefile"].open("w").close()
    d.add_path(p1)
    eq_(d.get_state(p1["Recycle.Bin"]), DirectoryState.Excluded)
    eq_(d.get_state(p1["foobar"]), DirectoryState.Normal)
    files = [file.name for file in d.get_files()]
    assert "somerecycledfile" not in files
    assert "denyme_blah.txt" not in files
    assert ".hidden_file" not in files
    # Use the names of the files actually created under .hidden_dir above;
    # checking names that were never created would pass vacuously.
    assert "somenormalfile1" not in files
    assert "somenormalfile2_denyme" not in files
    # Overriding the default state from the Directory Tree
    d.set_state(p1["Recycle.Bin"], DirectoryState.Normal)
    d.set_state(p1[".hidden_dir"], DirectoryState.Normal)
    files = [file.name for file in d.get_files()]
    assert "somerecycledfile" in files
    assert "somenormalfile1" in files
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ setenv =
PYTHON="{envpython}"
commands =
make modules
py.test core hscommon
{posargs:py.test} core hscommon
deps =
-r{toxinidir}/requirements.txt
-r{toxinidir}/requirements-extra.txt
Expand Down