diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..ac6d485a3 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,35 @@ +## Description of changes: + + + +## Issue #, if available: + + + +## Merge Checklist: +_Put an `x` without space in the boxes that apply. If you are unsure about any checklist, please don't hesitate to ask. We are here to help! This is simply a reminder of what we are going to look for before merging your pull request._ + +### General +- [ ] I have read the [contributor guidelines](https://github.com/mosaicml/streaming/blob/main/CONTRIBUTING.md) +- [ ] This is a documentation change or typo fix. If so, skip the rest of this checklist. +- [ ] I certify that the changes I am introducing will be backward compatible, and I have discussed concerns about this, if any, with the MosaicML team. +- [ ] I have updated any necessary documentation, including [README](https://github.com/mosaicml/streaming/blob/main/README.md) and [API docs](https://github.com/mosaicml/streaming/tree/main/docs) (if appropriate). + +### Tests +- [ ] I ran `pre-commit` on my change. (check out the `pre-commit` section of [prerequisites](https://github.com/mosaicml/streaming/blob/main/CONTRIBUTING.md#prerequisites)) +- [ ] I have added tests that prove my fix is effective or that my feature works (if appropriate). +- [ ] I ran the tests locally to make sure they pass. (check out [testing](https://github.com/mosaicml/streaming/blob/main/CONTRIBUTING.md#running-tests)) +- [ ] I have added unit and/or integration tests as appropriate to ensure backward compatibility of the changes. 
+ + diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 7878de644..b4368fcc7 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,8 +5,8 @@ version: 2 updates: - - package-ecosystem: "pip" # See documentation for possible values - directory: "/" # Location of package manifests + - package-ecosystem: "pip" # See documentation for possible values + directory: "/" # Location of package manifests schedule: interval: "weekly" # Allow up to 5 open pull requests for pip dependencies diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index a9b9d175a..f687e4e37 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -13,9 +13,9 @@ name: "CodeQL" on: push: - branches: [ main ] + branches: [main] schedule: - - cron: '0 9 * * 1' # Every Monday at 09:00 (9:00 AM) + - cron: "0 9 * * 1" # Every Monday at 09:00 (9:00 AM) jobs: analyze: @@ -29,39 +29,39 @@ jobs: strategy: fail-fast: false matrix: - language: [ 'python' ] + language: ["python"] # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] # Learn more about CodeQL language support at https://git.io/codeql-language-support steps: - - name: Checkout repository - uses: actions/checkout@v3 + - name: Checkout repository + uses: actions/checkout@v3 - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v2 - with: - languages: ${{ matrix.language }} - # If you wish to specify custom queries, you can do so here or in a config file. - # By default, queries listed here will override any specified in a config file. - # Prefix the list here with "+" to use these queries and those in the config file. - # queries: ./path/to/local/query, your-org/your-repo/queries@main + # Initializes the CodeQL tools for scanning. 
+ - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + # queries: ./path/to/local/query, your-org/your-repo/queries@main - # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). - # If this step fails, then you should remove it and run the build manually (see below) - - name: Autobuild - uses: github/codeql-action/autobuild@v2 + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v2 - # ℹ️ Command-line programs to run using the OS shell. - # 📚 https://git.io/JvXDl + # ℹ️ Command-line programs to run using the OS shell. 
+ # 📚 https://git.io/JvXDl - # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines - # and modify them (or add more) to build your code if your project - # uses a compiled language + # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines + # and modify them (or add more) to build your code if your project + # uses a compiled language - #- run: | - # make bootstrap - # make release + # - run: | + # make bootstrap + # make release - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 diff --git a/.github/workflows/pypi-release.yaml b/.github/workflows/pypi-release.yaml index 324154249..1099b2e49 100644 --- a/.github/workflows/pypi-release.yaml +++ b/.github/workflows/pypi-release.yaml @@ -51,7 +51,7 @@ jobs: id: create_release uses: actions/create-release@v1 env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token with: tag_name: ${{ github.ref }} release_name: ${{ github.ref }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fea490f8b..63914fc78 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ default_language_version: python: python3 # Skip the pre-commit check for below directories to have # a consistency with the official tfrecord preprocessing scripts -exclude: '^(streaming/text/convert/enwiki/)' +exclude: "^(streaming/text/convert/enwiki/)" repos: - repo: https://github.com/google/yapf rev: v0.32.0 @@ -11,7 +11,7 @@ repos: name: yapf description: "A formatter for Python files." 
entry: yapf - args: [-i, -vv, -p] #inplace + args: [-i, -vv, -p] # inplace language: python types: [python] additional_dependencies: @@ -83,7 +83,16 @@ repos: entry: pydocstyle language: python types: [python] - exclude: '(tests|.ci|.github)' + exclude: "(tests|.ci|.github)" additional_dependencies: - "toml" rev: 6.1.1 + - repo: https://github.com/adrienverge/yamllint.git + rev: v1.28.0 + hooks: + - id: yamllint + name: yamllint + description: This hook runs yamllint. + entry: yamllint + language: python + types: [file, yaml] diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 1e96650a8..77e68cdf1 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,12 +7,12 @@ version: 2 # Build documentation in the docs/ directory with Sphinx sphinx: - builder: html - configuration: docs/source/conf.py + builder: html + configuration: docs/source/conf.py # Optionally set the version of Python and requirements required to build your docs python: - version: "3.8" - install: - - method: pip - path: .[all] + version: "3.8" + install: + - method: pip + path: .[all] diff --git a/.yamllint.yaml b/.yamllint.yaml new file mode 100644 index 000000000..5821b8330 --- /dev/null +++ b/.yamllint.yaml @@ -0,0 +1,40 @@ +yaml-files: + - "*.yaml" + - "*.yml" + - .yamllint + +rules: + braces: + forbid: non-empty + brackets: + forbid: false + colons: enable + commas: enable + comments: + level: warning + comments-indentation: enable + document-end: + present: false + document-start: + present: false + empty-lines: enable + empty-values: disable + hyphens: enable + indentation: + spaces: 2 + indent-sequences: true + check-multi-line-strings: false + key-duplicates: enable + key-ordering: disable + line-length: + max: 200 + allow-non-breakable-words: true + allow-non-breakable-inline-mappings: true + new-line-at-end-of-file: enable + new-lines: enable + octal-values: enable + quoted-strings: + quote-type: double + required: false + trailing-spaces: enable + truthy: disable diff --git 
a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 000000000..01da718da --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,3 @@ +# Require admin approval to modify all files in the root of the repository +# This includes setup.py, the README, and the CODEOWNERS file itself! +/* @knighton @karan6181 @bandish-shah diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..7d30b8b78 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,5 @@ +# Community Guidelines + +This repository is governed by MosaicML's community guidelines and code of conduct. +For more details, including information on how to report issues affecting the community, please read the +[MosaicML Community Guidelines](https://docs.google.com/document/d/1h8S9x9bCTsA_H8ourZJy3SQVWy-6z7i28TP5rcZt8RI/edit) and the [MosaicML Code of Conduct](https://docs.google.com/document/d/1aCaMLO65qfMaqP3uDYiUsTauMvBrSKd7qgeYqz458Ew/edit). diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 000000000..bc8488f8f --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +global-exclude **/__pycache__/** +global-exclude *.pyc diff --git a/STYLE_GUIDE.md b/STYLE_GUIDE.md new file mode 100644 index 000000000..55d0a6edd --- /dev/null +++ b/STYLE_GUIDE.md @@ -0,0 +1,394 @@ +# 1. Style and Conventions + +## 1.1 Style Guide + +Streaming generally follows Google's +[Python Style Guide](https://google.github.io/styleguide/pyguide.html) for how to format and structure code. + +## 1.2. Pre-Commit Hooks + +Streaming uses [Pre Commit](https://pre-commit.com/) to enforce style checks. To configure, run +``` +pip install '.[dev]' # if not already installed +pre-commit install +``` + +The pre-commit hooks will now be run before each commit. You can also run the hooks manually via: + +``` +pre-commit run # run all hooks on changed files +pre-commit run --all-files # or, run all hooks on all files +``` + +## 1.3. 
Code Formatting + +Streaming uses the [yapf](https://github.com/google/yapf) formatter for general formatting and +[isort](https://github.com/PyCQA/isort) to sort imports. These checks run through pre-commit +(see section 2.2). These checks can also be run manually via: + +``` +pre-commit run yapf --all-files # for yapf +pre-commit run isort --all-files # for isort +``` + +The configuration is stored in [pyproject.toml](pyproject.toml). + +## 1.4. Code Structure + +As a general rule of thumb, + +- Don't: Default to using inheritance for code reuse + + Do: prefer [composition over inheritance](https://en.wikipedia.org/wiki/Composition_over_inheritance) +- Don't: strive to implement all logic using classes + + Do: strive to implement logic as pure functions when possible, and classes when there is good reason +- Don't: Have a function accept falsy values that would then result in a no-op. + + Example of the anti-pattern: + + ```python + from typing import Optional + + def custom_configuration(config: Optional[dict]): + if config is None: + # Don't do this check in the callee, which results in a no-op + return + ... + ``` + + Do: Require the caller, instead of the callee, check for and handle falsy values. It's ok to accept falsy values + for individual arguments of a caller function, so long as the entire function would not be a no-op. + + Example: + ```python + from typing import Optional + + def custom_configuration(config: dict): + ... + + def trainer(config: Optional[dict]): + if config is not None: + # Do this check in the caller function + custom_configuration(config) + ... + ``` + +# 2. Type Annotations and Typechecking + +Streaming aims to annotate all functions with type annotations (introduced in +[PEP 526](https://www.python.org/dev/peps/pep-0526/)). Type annotations help statically catch `TypeError` and +`AttributeError` bugs, in addition to other benefits, as outlined in the PEP. 
+ +For documentation on typing annotations, see: +* [PEP 483](https://peps.python.org/pep-0483/) for a simplified introduction +* [PEP 484](https://peps.python.org/pep-0484/) for the full specification +* [Python docs for `typing`](https://docs.python.org/3/library/typing.html) for the API reference + +Streaming uses [pyright](https://github.com/microsoft/pyright) +to validate type annotations. PyRight is automatically run as part of the pre-commit hooks, but you can also +run PyRight specifically via: + +``` +pre-commit run pyright --all-files +``` + +The pyright configuration is stored in [pyproject.toml](pyproject.toml). + + +## 2.1 Debugging + +Here are some suggestions to deal with pyright errors: + +1. Suppose a variable could be one of multiple types, like the following: + + ```python + from typing import Union + + def foo(x: Union[int, None]): + return x + 5 # type error -- None + 5 is not allowed! + ``` + + PyRight will complain since `None + 5` is not a valid operation. + Instead, add a check to ensure that `x is not None`: + + ```python + from typing import Union + + def foo(x: Union[int, None]): + if x is None: + raise TypeError("x must be an integer, not None!") + return x + 5 # valid + ``` + + Assert statements also work. However, assert statements should not be used for data validation + (see the assert statement section below). + ```python + from typing import Union + + def foo(x: Union[int, None]): + assert x is not None, "x should never be None" + return x + 5 # valid + ``` + +1. For variables where it is impossible for pyright to infer the correct type, use +[cast](https://docs.python.org/3/library/typing.html#typing.cast). +1. As a last resort, add a `# type: ignore` comment to the line where pyright emits an error. +Immediately following this statement, paste in the error emitted by pyright, +so other contributors will know why this error was silenced. + + +# 3. 
Public APIs +A public API, generally speaking, can be invoked by a user without a leading underscore in any portion of the path. +The following are examples of public APIs: + +* Standalone functions in public modules (e.g. `streaming.base.distributed.get_world_size`) +* Classes in public modules (e.g. `streaming.base.format.MDSWriter`) +* Public methods in public classes (e.g. `streaming.base.format.MDSWriter.write`) +* Public modules (e.g. `streaming.base.dataset`) + +The following rules apply to public APIs: +1. All public APIs must have a docstring (see the Documentation section below) +1. All parameters must have type annotations. +1. To minimize user imports, parameters should use native PyTorch or Python types whenever possible. + + It is acceptable to use a union of types, so long as one of the options is a primitive. + +1. Parameters that could take a sequence of elements should also allow `None` or a singleton. + This simplifies the user API by not having to construct a list (or tuple) to hold a single element + (or no element). For example, use `Optional[Union[torch.Tensor, Sequence[torch.Tensor]]]`. + + +# 4. Use of `assert` + +`assert` should be used only in test cases and for verifying invariants (likely required for type checking), +not for data validation. As asserts can be disabled in python by using the `-O` flag +(e.g. `python -O path/to/script.py`), they are not guaranteed to run. For data validation, instead use a style like +the following: + + + + +```python +if parameter is None: + raise ValueError("parameter must be specified and cannot be None") +``` + + +# 5. Imports and `__init__.py` + +All imports in Streaming should be absolute -- that is, they do not begin with a period. + +## 5.1 External Dependencies +1. All external dependencies must be specified in [setup.py](setup.py) for pip. + +1. If a dependency is not core to Streaming (e.g. it is for a model, dataset, or some callbacks): + 1. 
It must be specified in a entry of the `extra_deps` dictionary of [setup.py](setup.py). + This dictionary groups dependencies that can be conditionally installed. An entry named `foo` + can be installed with `pip install 'mosaicml-streaming[foo]'`. For example, running `pip install 'mosaicml-streaming[docs]'` + will install everything in `install_requires`, along with `docs`. + 1. It must also be specified in the `run_constrained` and the `test.requires` section. + 1. If the dependency is core to Streaming, add the dependency to the `install_requires` section of + [setup.py](./setup.py). + +## 5.2 Use of `__all__` + +All public modules must define `__all__` to be the list of members that should be re-exported. +The variable is necessary to 1) limit what `from XXX import *` imports, and 2) ensure that the documentation only +includes exported members, not unrelated re-imports. + +For example, from [streaming/base/dataset.py](streaming/base/dataset.py) + +```python +"""The :class:`Dataset` class, used for building streaming iterable datasets.""" +from torch.utils.data import IterableDataset + +from streaming.base.format import reader_from_json +from streaming.base.index import Index, Partition + +__all__ = ["Dataset"] # export only the Dataset, not other imports like `Index`, `Partition`, or `reader_from_json` + + +class Dataset(IterableDataset): + ... +``` + + +## 5.3 `__init__.py` + +All public classes and functions should be added to the module's `__init__.py`. + + +```python +from streaming.path.to.module.file import MyClass as MyClass +from streaming.path.to.module.file import my_func as my_func +``` + +If a file only contains public functions, then the following is also acceptable: + + +```python +from streaming.path.to.module import my_file as my_file +``` + + +# 6. Documentation + +## 6.1 Docstrings + +Streaming uses [Google Style Docstrings](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html). +All public APIs require documentation. 
+ +### 6.1.1 What to include in Docstrings? + +Docstrings, at a minimum, should include a summary of what the function or class does, along with the arguments it takes. See [below](#612-formatting-docstrings) for how to format docstrings. The [Google Style Guide](https://google.github.io/styleguide/pyguide.html) also includes some guidelines on how to write docstrings. + +### 6.1.2 Formatting Docstrings + +The following guidelines apply to documentation. +1. Each function that needs a docstring must have its input arguments, return statement (if not None), and any custom + exceptions annotated. +1. The arguments for the `__init__` signature of classes should be documented under the class-level docstring. There + should not be any `__init__`-level docstring. +1. Each argument annotation should include the type. If the argument has a default value, the type annotation should + specify "optional", and the docstring should say the default value. Some examples: + + ```python + from typing import Optional, Tuple, Union + + def foo(bar: int): + """Foo. + + Args: + bar (int): Required bar. + """ + ... + + def foo2(bar: int = 42): + """Foo2. + + Args: + bar (int, optional): The first Argument. Default: ``42``. + """ + ... + + def foo3(bar: Optional[int] = None): + """Foo3. + + Args: + bar (int, optional): The first Argument. Default: ``None``. + """ + ... + + def foo4(bar: Union[int, str] = 42): + """Foo4. + + Args: + bar (int | str, optional): The first Argument. Default: ``42``. + """ + ... + + def foo5(bar: int) -> int: + """Foo5. + + Args: + bar (int): Required bar. + + Returns: + int: Description of return statement. + """ + ... + + def foo6(bar: int) -> Tuple[int, str]: + """Foo6. + + Args: + bar (int): Required bar. + + Returns: + a (int): Returned value. + b (str): Returned value. + """ + ... 
+ ``` + +### 6.1.3 Building and Viewing Docs Locally + +Assuming you already have a development install of Streaming (see these [instructions](CONTRIBUTING.md#prerequisites)), here’s how to build and previous the docs locally. + +**️️ ⚠ Warning:** CI treats all sphinx warnings as errors, so they must be addressed before a PR can be merged. Building docs locally can help debug any warnings showing up on Jenkins! + +In a terminal, run: + + +```bash +source path/to/streaming_venv/bin/activate # activate your streaming virtual env +cd streaming/docs # cd to the docs folder inside your streaming clone +make clean # Cleans the artifacts and remove source/api_reference folder +make html # build the docs +make host # Run the docs locally +``` + +Then, navigate to [http://localhost:8000](http://localhost:8000) in your browser. + +## 6.2 Doctests + +Most docstrings should also include a `.. doctest` or `.. testcode` example to clearly illustrate how one would interact with the class or function. As part of the CI/CD process, all `.. doctest` blocks are executed to ensure the example in the documentation actually works. + +### 6.2.1 Writing Doctests + +See the [Sphinx Doctest Extension](https://www.sphinx-doc.org/en/master/usage/extensions/doctest.html) for all of the available directives. Do not use `.. code-block::` for Python examples, as they are untested. + +Any test fixtures for doctests should go in [docs/source/doctest_fixtures.py](docs/source/doctest_fixtures.py) or in a `.. testsetup::` block. + +For example: +```python +import torch +from typing import Optional + +def my_function(x: Optional[torch.Tensor]) -> torch.Tensor: + """blah function + + Args: + input (torch.Tensor): Your guess. + + Returns: + torch.Tensor: How good your input is. + + Raises: + ValueError: If your input is negative. + + Example: + .. testsetup:: + + # optional setup section, not shown in docs + import torch + x = torch.randn(42) + + + .. 
testcode:: + + # shown in docs; runs after testsetup + my_function(x) + """ + ... +``` + +All doctests load the [docs/source/doctest_fixtures.py](docs/source/doctest_fixtures.py) file *before* tests run. If there are any variables that would be helpful have defined for all tests, feel free to add them into this file. However, if a variable is more specific to an individual doctest, then it would be best to include it in a `.. testsetup::` block, as not to pollute the global fixture namespace. (Unlike pytest fixtures, all doctest fixtures are given to every doctest; they cannot be specifically requested) + +### 6.2.2 Running Doctests Locally + +Assuming you already have a development install of Streaming (see these [instructions](CONTRIBUTING.md#prerequisites)), here’s how to run the doctests. + + +```bash +source path/to/streaming_venv/bin/activate # activate your streaming virtual env +cd streaming/docs # cd to the docs folder inside your streaming clone +make clean # Cleans the artifacts and remove source/api_reference folder +make html # the html build must be completed first to ensure all doctests are identified +make doctest 2>/dev/null # For more verbosity, do not direct stderr to /dev/null +``` diff --git a/streaming/base/dataset.py b/streaming/base/dataset.py index 0c79a436f..74d3a5444 100644 --- a/streaming/base/dataset.py +++ b/streaming/base/dataset.py @@ -22,6 +22,8 @@ from streaming.base.hashing import get_hash from streaming.base.index import Index, Partition, get_index_basename +__all__ = ['Dataset'] + class DownloadStatus(IntEnum): """Download status.""" diff --git a/streaming/base/format/base/reader.py b/streaming/base/format/base/reader.py index c851ff8ce..a067b9815 100644 --- a/streaming/base/format/base/reader.py +++ b/streaming/base/format/base/reader.py @@ -7,6 +7,8 @@ from dataclasses import dataclass from typing import Any, Dict, Iterator, List, Optional +__all__ = ['FileInfo', 'Reader', 'JointReader', 'SplitReader'] + @dataclass class 
FileInfo(object): diff --git a/streaming/base/format/base/writer.py b/streaming/base/format/base/writer.py index ed7bf7d20..4394b5a3b 100644 --- a/streaming/base/format/base/writer.py +++ b/streaming/base/format/base/writer.py @@ -15,6 +15,8 @@ from streaming.base.hashing import get_hash, is_hash from streaming.base.index import get_index_basename +__all__ = ['JointWriter', 'SplitWriter'] + class Writer(ABC): """Writes a streaming dataset. diff --git a/streaming/base/format/json/reader.py b/streaming/base/format/json/reader.py index 4ac4409c9..07a34636e 100644 --- a/streaming/base/format/json/reader.py +++ b/streaming/base/format/json/reader.py @@ -13,6 +13,8 @@ from streaming.base.format.base.reader import FileInfo, SplitReader +__all__ = ['JSONReader'] + class JSONReader(SplitReader): """Provides random access to the samples of a JSON shard. diff --git a/streaming/base/format/json/writer.py b/streaming/base/format/json/writer.py index 217a123ae..1cc27b3cc 100644 --- a/streaming/base/format/json/writer.py +++ b/streaming/base/format/json/writer.py @@ -11,6 +11,8 @@ from streaming.base.format.base.writer import SplitWriter from streaming.base.format.json.encodings import is_json_encoded, is_json_encoding +__all__ = ['JSONWriter'] + class JSONWriter(SplitWriter): r"""Writes a streaming JSON dataset. diff --git a/streaming/base/format/mds/reader.py b/streaming/base/format/mds/reader.py index 698bc2979..e289434d9 100644 --- a/streaming/base/format/mds/reader.py +++ b/streaming/base/format/mds/reader.py @@ -13,6 +13,8 @@ from streaming.base.format.base.reader import FileInfo, JointReader from streaming.base.format.mds.encodings import mds_decode +__all__ = ['MDSReader'] + class MDSReader(JointReader): """Provides random access to the samples of an MDS shard. 
diff --git a/streaming/base/format/mds/writer.py b/streaming/base/format/mds/writer.py index ec97d776d..ae0e5bc44 100644 --- a/streaming/base/format/mds/writer.py +++ b/streaming/base/format/mds/writer.py @@ -11,6 +11,8 @@ from streaming.base.format.base.writer import JointWriter from streaming.base.format.mds.encodings import get_mds_encoded_size, is_mds_encoding, mds_encode +__all__ = ['MDSWriter'] + class MDSWriter(JointWriter): """Writes a streaming MDS dataset. diff --git a/streaming/base/format/xsv/reader.py b/streaming/base/format/xsv/reader.py index d53a797ab..fe0421661 100644 --- a/streaming/base/format/xsv/reader.py +++ b/streaming/base/format/xsv/reader.py @@ -13,6 +13,8 @@ from streaming.base.format.base.reader import FileInfo, SplitReader from streaming.base.format.xsv.encodings import xsv_decode +__all__ = ['XSVReader', 'CSVReader', 'TSVReader'] + class XSVReader(SplitReader): """Provides random access to the samples of an XSV shard. diff --git a/streaming/base/format/xsv/writer.py b/streaming/base/format/xsv/writer.py index 05e8c95b8..c76687479 100644 --- a/streaming/base/format/xsv/writer.py +++ b/streaming/base/format/xsv/writer.py @@ -11,6 +11,8 @@ from streaming.base.format.base.writer import SplitWriter from streaming.base.format.xsv.encodings import is_xsv_encoding, xsv_encode +__all__ = ['XSVWriter', 'CSVWriter', 'TSVWriter'] + class XSVWriter(SplitWriter): r"""Writes a streaming XSV dataset. diff --git a/streaming/base/index.py b/streaming/base/index.py index 098b03ad4..684ea648c 100644 --- a/streaming/base/index.py +++ b/streaming/base/index.py @@ -11,6 +11,8 @@ from streaming.base import distributed as dist +__all__ = ['get_index_basename', 'Partition', 'Index'] + def get_index_basename() -> str: """Get the canonical index file basename. 
diff --git a/streaming/base/local.py b/streaming/base/local.py index ad232099e..926f67c79 100644 --- a/streaming/base/local.py +++ b/streaming/base/local.py @@ -12,6 +12,8 @@ from streaming.base.format import reader_from_json from streaming.base.index import Index +__all__ = ['LocalDataset'] + class LocalDataset(Dataset): """The dataset resides locally in a machine. diff --git a/streaming/base/util.py b/streaming/base/util.py index a523bd4a9..1675dec13 100644 --- a/streaming/base/util.py +++ b/streaming/base/util.py @@ -5,6 +5,8 @@ from typing import List +__all__ = ['get_list_arg'] + def get_list_arg(text: str) -> List[str]: """Pass a list as a command-line flag. diff --git a/streaming/text/c4.py b/streaming/text/c4.py index 45dbdd6c4..343805867 100644 --- a/streaming/text/c4.py +++ b/streaming/text/c4.py @@ -13,6 +13,8 @@ from streaming.base import Dataset +__all__ = ['C4'] + class C4(Dataset): """Implementation of the C4 (Colossal Cleaned Common Crawl) dataset using streaming Dataset. diff --git a/streaming/text/enwiki.py b/streaming/text/enwiki.py index 3f1a8d46a..17a9c4ec5 100644 --- a/streaming/text/enwiki.py +++ b/streaming/text/enwiki.py @@ -9,6 +9,8 @@ from streaming.base import Dataset +__all__ = ['EnWiki'] + class EnWiki(Dataset): """Implementation of the English Wikipedia 2020-01-01 streaming dataset. diff --git a/streaming/vision/ade20k.py b/streaming/vision/ade20k.py index 3909d5799..1c274979d 100644 --- a/streaming/vision/ade20k.py +++ b/streaming/vision/ade20k.py @@ -11,6 +11,8 @@ from streaming.base import Dataset +__all__ = ['ADE20K'] + class ADE20K(Dataset): """Implementation of the ADE20K dataset using streaming Dataset. 
diff --git a/streaming/vision/cifar10.py b/streaming/vision/cifar10.py index 1dda6b1d6..3a5383fcb 100644 --- a/streaming/vision/cifar10.py +++ b/streaming/vision/cifar10.py @@ -9,6 +9,8 @@ from streaming.vision.base import ImageClassDataset +__all__ = ['CIFAR10'] + class CIFAR10(ImageClassDataset): """Implementation of the CIFAR-10 dataset using streaming Dataset. diff --git a/streaming/vision/coco.py b/streaming/vision/coco.py index a29d8682e..9e5ce4486 100644 --- a/streaming/vision/coco.py +++ b/streaming/vision/coco.py @@ -11,6 +11,8 @@ from streaming.base import Dataset +__all__ = ['COCO'] + class COCO(Dataset): """Implementation of the COCO dataset using streaming Dataset. diff --git a/streaming/vision/imagenet.py b/streaming/vision/imagenet.py index acf23837c..0d2f55fb2 100644 --- a/streaming/vision/imagenet.py +++ b/streaming/vision/imagenet.py @@ -9,6 +9,8 @@ from streaming.vision.base import ImageClassDataset +__all__ = ['ImageNet'] + class ImageNet(ImageClassDataset): """Implementation of the ImageNet dataset using streaming Dataset. 
diff --git a/tests/test_compression.py b/tests/test_compression.py index bc596b942..6e6ad32a0 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -11,9 +11,8 @@ from streaming.base.compression.compression import (Brotli, Bzip2, Gzip, Snappy, Zstandard, compress, decompress, get_compression_extension, is_compression) - -from .common.datasets import * -from .common.utils import * +from tests.common.datasets import * +from tests.common.utils import * class TestBrotli: diff --git a/tests/test_distributed.py b/tests/test_distributed.py index 6c5b4da89..1d53a6701 100644 --- a/tests/test_distributed.py +++ b/tests/test_distributed.py @@ -12,10 +12,9 @@ import streaming.base.distributed as ms_dist from streaming.base import Dataset - -from .common.datasets import * -from .common.distributed import DistributedTest -from .common.utils import * +from tests.common.datasets import * +from tests.common.distributed import DistributedTest +from tests.common.utils import * logger = logging.getLogger(__name__) diff --git a/tests/test_streaming.py b/tests/test_streaming.py index ecefaccd5..5a5a7ba28 100644 --- a/tests/test_streaming.py +++ b/tests/test_streaming.py @@ -12,9 +12,8 @@ from torch.utils.data import DataLoader from streaming.base import Dataset - -from .common.datasets import * -from .common.utils import * +from tests.common.datasets import * +from tests.common.utils import * logger = logging.getLogger(__name__) diff --git a/tests/test_writer.py b/tests/test_writer.py index 483bd5499..fbcdb6013 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -10,9 +10,8 @@ import pytest from streaming import CSVWriter, Dataset, JSONWriter, MDSWriter, TSVWriter, XSVWriter - -from .common.datasets import * -from .common.utils import * +from tests.common.datasets import * +from tests.common.utils import * logger = logging.getLogger(__name__)