Skip to content

Commit

Permalink
[runtime env] Support .gitignore exclusion in working dir (ray-proj…
Browse files Browse the repository at this point in the history
  • Loading branch information
fishbone authored Apr 26, 2021
1 parent a38761b commit fc70106
Show file tree
Hide file tree
Showing 13 changed files with 1,973 additions and 37 deletions.
3 changes: 2 additions & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
[flake8]
exclude =
exclude =
python/ray/core/generated/
streaming/python/generated
doc/source/conf.py
python/ray/cloudpickle/
python/ray/thirdparty_files/
python/build/
python/.eggs/
python/ray/_private/thirdparty/*
max-line-length = 79
inline-quotes = "
ignore =
Expand Down
5 changes: 3 additions & 2 deletions ci/travis/format.sh
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ YAPF_FLAGS=(
)

# TODO(dmitri): When more of the codebase is typed properly, the mypy flags
# should be set to do a more stringent check.
# should be set to do a more stringent check.
MYPY_FLAGS=(
'--follow-imports=skip'
'--ignore-missing-imports'
Expand All @@ -116,6 +116,7 @@ YAPF_EXCLUDES=(
'--exclude' 'python/build/*'
'--exclude' 'python/ray/core/src/ray/gcs/*'
'--exclude' 'python/ray/thirdparty_files/*'
'--exclude' 'python/ray/_private/thirdparty/*'
)

GIT_LS_EXCLUDES=(
Expand Down Expand Up @@ -143,7 +144,7 @@ shellcheck_scripts() {
shellcheck "${SHELLCHECK_FLAGS[@]}" "$@"
}

# Runs mypy on each argument in sequence. This is different than running mypy
# Runs mypy on each argument in sequence. This is different than running mypy
# once on the list of arguments.
mypy_on_each() {
pushd python/ray
Expand Down
79 changes: 59 additions & 20 deletions python/ray/_private/runtime_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
from filelock import FileLock
from pathlib import Path
from zipfile import ZipFile
from ray._private.thirdparty.pathspec import PathSpec
from ray.job_config import JobConfig
from enum import Enum

from ray.experimental.internal_kv import (_internal_kv_put, _internal_kv_get,
_internal_kv_exists,
_internal_kv_initialized)

from typing import List, Tuple, Optional, Set, Callable
from typing import List, Tuple, Optional, Callable
from urllib.parse import urlparse
import os
import sys
Expand Down Expand Up @@ -114,18 +115,23 @@ def _xor_bytes(left: bytes, right: bytes) -> bytes:

def _dir_travel(
path: Path,
excludes: Set[Path],
excludes: List[Callable],
handler: Callable,
):
if path in excludes:
return
handler(path)
if path.is_dir():
for sub_path in path.iterdir():
_dir_travel(sub_path, excludes, handler)


def _zip_module(root: Path, relative_path: Path, excludes: Set[Path],
e = _get_gitignore(path)
if e is not None:
excludes.append(e)
skip = any([e(path) for e in excludes])
if not skip:
handler(path)
if path.is_dir():
for sub_path in path.iterdir():
_dir_travel(sub_path, excludes, handler)
if e is not None:
excludes.pop()


def _zip_module(root: Path, relative_path: Path, excludes: Optional[Callable],
zip_handler: ZipFile) -> None:
"""Go through all files and zip them into a zip file"""

Expand All @@ -141,13 +147,14 @@ def handler(path: Path):
to_path = path.relative_to(relative_path)
zip_handler.write(path, to_path)

excludes = [] if excludes is None else [excludes]
_dir_travel(root, excludes, handler)


def _hash_modules(
root: Path,
relative_path: Path,
excludes: Set[Path],
excludes: Optional[Callable],
) -> bytes:
"""Helper function to create hash of a directory.
Expand All @@ -169,6 +176,7 @@ def handler(path: Path):
nonlocal hash_val
hash_val = _xor_bytes(hash_val, md5.digest())

excludes = [] if excludes is None else [excludes]
_dir_travel(root, excludes, handler)
return hash_val

Expand All @@ -185,6 +193,36 @@ def _parse_uri(pkg_uri: str) -> Tuple[Protocol, str]:
return (protocol, uri.netloc)


def _get_excludes(path: Path, excludes: List[str]) -> Callable:
path = path.absolute()
pathspec = PathSpec.from_lines("gitwildmatch", excludes)

def match(p: Path):
path_str = str(p.absolute().relative_to(path))
path_str += "/"
return pathspec.match_file(path_str)

return match


def _get_gitignore(path: Path) -> Optional[Callable]:
path = path.absolute()
ignore_file = path / ".gitignore"
if ignore_file.is_file():
with ignore_file.open("r") as f:
pathspec = PathSpec.from_lines("gitwildmatch", f.readlines())

def match(p: Path):
path_str = str(p.absolute().relative_to(path))
if p.is_dir():
path_str += "/"
return pathspec.match_file(path_str)

return match
else:
return None


# TODO(yic): Fix this later to handle big directories in better way
def get_project_package_name(working_dir: str, py_modules: List[str],
excludes: List[str]) -> str:
Expand All @@ -208,14 +246,13 @@ def get_project_package_name(working_dir: str, py_modules: List[str],
Args:
working_dir (str): The working directory.
py_modules (list[str]): The python module.
excludes (set[str]): The dir or files that should be excluded
excludes (list[str]): The dir or files that should be excluded
Returns:
Package name as a string.
"""
RAY_PKG_PREFIX = "_ray_pkg_"
hash_val = None
excludes = {Path(p).absolute() for p in excludes}
if working_dir:
if not isinstance(working_dir, str):
raise TypeError("`working_dir` must be a string.")
Expand All @@ -224,7 +261,9 @@ def get_project_package_name(working_dir: str, py_modules: List[str],
raise ValueError(f"working_dir {working_dir} must be an existing"
" directory")
hash_val = _xor_bytes(
hash_val, _hash_modules(working_dir, working_dir, excludes))
hash_val,
_hash_modules(working_dir, working_dir,
_get_excludes(working_dir, excludes)))
for py_module in py_modules or []:
if not isinstance(py_module, str):
raise TypeError("`py_module` must be a string.")
Expand All @@ -233,7 +272,7 @@ def get_project_package_name(working_dir: str, py_modules: List[str],
raise ValueError(f"py_module {py_module} must be an existing"
" directory")
hash_val = _xor_bytes(
hash_val, _hash_modules(module_dir, module_dir.parent, excludes))
hash_val, _hash_modules(module_dir, module_dir.parent, None))
return RAY_PKG_PREFIX + hash_val.hex() + ".zip" if hash_val else None


Expand All @@ -252,15 +291,15 @@ def create_project_package(working_dir: str, py_modules: List[str],
output_path (str): The path of file to be created.
"""
pkg_file = Path(output_path).absolute()
excludes = [Path(e).absolute() for e in excludes]
with ZipFile(pkg_file, "w") as zip_handler:
if working_dir:
# put all files in /path/working_dir into zip
working_path = Path(working_dir).absolute()
_zip_module(working_path, working_path, excludes, zip_handler)
_zip_module(working_path, working_path,
_get_excludes(working_path, excludes), zip_handler)
for py_module in py_modules or []:
module_path = Path(py_module).absolute()
_zip_module(module_path, module_path.parent, excludes, zip_handler)
_zip_module(module_path, module_path.parent, None, zip_handler)


def fetch_package(pkg_uri: str) -> int:
Expand Down Expand Up @@ -359,7 +398,7 @@ def rewrite_working_dir_uri(job_config: JobConfig) -> None:
if (not job_config.runtime_env.get("working_dir_uri")) and (working_dir
or py_modules):
if excludes is None:
excludes = set()
excludes = []
pkg_name = get_project_package_name(working_dir, py_modules, excludes)
job_config.runtime_env[
"working_dir_uri"] = Protocol.GCS.value + "://" + pkg_name
Expand Down
Empty file.
Loading

0 comments on commit fc70106

Please sign in to comment.