diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..ac6d485a3 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,35 @@ +## Description of changes: + + + +## Issue #, if available: + + + +## Merge Checklist: +_Put an `x` without space in the boxes that apply. If you are unsure about any checklist, please don't hesitate to ask. We are here to help! This is simply a reminder of what we are going to look for before merging your pull request._ + +### General +- [ ] I have read the [contributor guidelines](https://github.com/mosaicml/streaming/blob/main/CONTRIBUTING.md) +- [ ] This is a documentation change or typo fix. If so, skip the rest of this checklist. +- [ ] I certify that the changes I am introducing will be backward compatible, and I have discussed concerns about this, if any, with the MosaicML team. +- [ ] I have updated any necessary documentation, including [README](https://github.com/mosaicml/streaming/blob/main/README.md) and [API docs](https://github.com/mosaicml/streaming/tree/main/docs) (if appropriate). + +### Tests +- [ ] I ran `pre-commit` on my change. (check out the `pre-commit` section of [prerequisites](https://github.com/mosaicml/streaming/blob/main/CONTRIBUTING.md#prerequisites)) +- [ ] I have added tests that prove my fix is effective or that my feature works (if appropriate). +- [ ] I ran the tests locally to make sure they pass. (check out [testing](https://github.com/mosaicml/streaming/blob/main/CONTRIBUTING.md#running-tests)) +- [ ] I have added unit and/or integration tests as appropriate to ensure backward compatibility of the changes. 
+ + diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 7878de644..b4368fcc7 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,8 +5,8 @@ version: 2 updates: - - package-ecosystem: "pip" # See documentation for possible values - directory: "/" # Location of package manifests + - package-ecosystem: "pip" # See documentation for possible values + directory: "/" # Location of package manifests schedule: interval: "weekly" # Allow up to 5 open pull requests for pip dependencies diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index a9b9d175a..f687e4e37 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -13,9 +13,9 @@ name: "CodeQL" on: push: - branches: [ main ] + branches: [main] schedule: - - cron: '0 9 * * 1' # Every Monday at 09:00 (9:00 AM) + - cron: "0 9 * * 1" # Every Monday at 09:00 (9:00 AM) jobs: analyze: @@ -29,39 +29,39 @@ jobs: strategy: fail-fast: false matrix: - language: [ 'python' ] + language: ["python"] # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] # Learn more about CodeQL language support at https://git.io/codeql-language-support steps: - - name: Checkout repository - uses: actions/checkout@v3 + - name: Checkout repository + uses: actions/checkout@v3 - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v2 - with: - languages: ${{ matrix.language }} - # If you wish to specify custom queries, you can do so here or in a config file. - # By default, queries listed here will override any specified in a config file. - # Prefix the list here with "+" to use these queries and those in the config file. - # queries: ./path/to/local/query, your-org/your-repo/queries@main + # Initializes the CodeQL tools for scanning. 
+ - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + # queries: ./path/to/local/query, your-org/your-repo/queries@main - # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). - # If this step fails, then you should remove it and run the build manually (see below) - - name: Autobuild - uses: github/codeql-action/autobuild@v2 + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v2 - # ℹ️ Command-line programs to run using the OS shell. - # 📚 https://git.io/JvXDl + # ℹ️ Command-line programs to run using the OS shell. 
+ # 📚 https://git.io/JvXDl - # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines - # and modify them (or add more) to build your code if your project - # uses a compiled language + # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines + # and modify them (or add more) to build your code if your project + # uses a compiled language - #- run: | - # make bootstrap - # make release + # - run: | + # make bootstrap + # make release - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 diff --git a/.github/workflows/pypi-release.yaml b/.github/workflows/pypi-release.yaml index 324154249..1099b2e49 100644 --- a/.github/workflows/pypi-release.yaml +++ b/.github/workflows/pypi-release.yaml @@ -51,7 +51,7 @@ jobs: id: create_release uses: actions/create-release@v1 env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token with: tag_name: ${{ github.ref }} release_name: ${{ github.ref }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fea490f8b..63914fc78 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ default_language_version: python: python3 # Skip the pre-commit check for below directories to have # a consistency with the official tfrecord preprocessing scripts -exclude: '^(streaming/text/convert/enwiki/)' +exclude: "^(streaming/text/convert/enwiki/)" repos: - repo: https://github.com/google/yapf rev: v0.32.0 @@ -11,7 +11,7 @@ repos: name: yapf description: "A formatter for Python files." 
entry: yapf - args: [-i, -vv, -p] #inplace + args: [-i, -vv, -p] # inplace language: python types: [python] additional_dependencies: @@ -83,7 +83,16 @@ repos: entry: pydocstyle language: python types: [python] - exclude: '(tests|.ci|.github)' + exclude: "(tests|.ci|.github)" additional_dependencies: - "toml" rev: 6.1.1 + - repo: https://github.com/adrienverge/yamllint.git + rev: v1.28.0 + hooks: + - id: yamllint + name: yamllint + description: This hook runs yamllint. + entry: yamllint + language: python + types: [file, yaml] diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 1e96650a8..77e68cdf1 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,12 +7,12 @@ version: 2 # Build documentation in the docs/ directory with Sphinx sphinx: - builder: html - configuration: docs/source/conf.py + builder: html + configuration: docs/source/conf.py # Optionally set the version of Python and requirements required to build your docs python: - version: "3.8" - install: - - method: pip - path: .[all] + version: "3.8" + install: + - method: pip + path: .[all] diff --git a/.yamllint.yaml b/.yamllint.yaml new file mode 100644 index 000000000..5821b8330 --- /dev/null +++ b/.yamllint.yaml @@ -0,0 +1,40 @@ +yaml-files: + - "*.yaml" + - "*.yml" + - .yamllint + +rules: + braces: + forbid: non-empty + brackets: + forbid: false + colons: enable + commas: enable + comments: + level: warning + comments-indentation: enable + document-end: + present: false + document-start: + present: false + empty-lines: enable + empty-values: disable + hyphens: enable + indentation: + spaces: 2 + indent-sequences: true + check-multi-line-strings: false + key-duplicates: enable + key-ordering: disable + line-length: + max: 200 + allow-non-breakable-words: true + allow-non-breakable-inline-mappings: true + new-line-at-end-of-file: enable + new-lines: enable + octal-values: enable + quoted-strings: + quote-type: double + required: false + trailing-spaces: enable + truthy: disable diff --git 
a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 000000000..01da718da --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,3 @@ +# Require admin approval to modify all files in the root of the repository +# This includes setup.py, the README, and the CODEOWNERS file itself! +/* @knighton @karan6181 @bandish-shah diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..7d30b8b78 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,5 @@ +# Community Guidelines + +This repository is governed by MosaicML's community guidelines and code of conduct. +For more details, including information on how to report issues affecting the community, please read the +[MosaicML Community Guidelines](https://docs.google.com/document/d/1h8S9x9bCTsA_H8ourZJy3SQVWy-6z7i28TP5rcZt8RI/edit) and the [MosaicML Code of Conduct](https://docs.google.com/document/d/1aCaMLO65qfMaqP3uDYiUsTauMvBrSKd7qgeYqz458Ew/edit). diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 000000000..bc8488f8f --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +global-exclude **/__pycache__/** +global-exclude *.pyc diff --git a/STYLE_GUIDE.md b/STYLE_GUIDE.md new file mode 100644 index 000000000..55d0a6edd --- /dev/null +++ b/STYLE_GUIDE.md @@ -0,0 +1,394 @@ +# 1. Style and Conventions + +## 1.1 Style Guide + +Streaming generally follows Google's +[Python Style Guide](https://google.github.io/styleguide/pyguide.html) for how to format and structure code. + +## 1.2. Pre-Commit Hooks + +Streaming uses [Pre Commit](https://pre-commit.com/) to enforce style checks. To configure, run +``` +pip install '.[dev]' # if not already installed +pre-commit install +``` + +The pre-commit hooks will now be run before each commit. You can also run the hooks manually via: + +``` +pre-commit run # run all hooks on changed files +pre-commit run --all-files # or, run all hooks on all files +``` + +## 1.3. 
Code Formatting + +Streaming uses the [yapf](https://github.com/google/yapf) formatter for general formatting and +[isort](https://github.com/PyCQA/isort) to sort imports. These checks run through pre-commit +(see section 2.2). These checks can also be run manually via: + +``` +pre-commit run yapf --all-files # for yapf +pre-commit run isort --all-files # for isort +``` + +The configuration is stored in [pyproject.toml](pyproject.toml). + +## 1.4. Code Structure + +As a general rule of thumb, + +- Don't: Default to using inheritance for code reuse + + Do: prefer [composition over inheritance](https://en.wikipedia.org/wiki/Composition_over_inheritance) +- Don't: strive to implement all logic using classes + + Do: strive to implement logic as pure functions when possible, and classes when there is good reason +- Don't: Have a function accept falsy values that would then result in a no-op. + + Example of the anti-pattern: + + ```python + from typing import Optional + + def custom_configuration(config: Optional[dict]): + if config is None: + # Don't do this check in the callee, which results in a no-op + return + ... + ``` + + Do: Require the caller, instead of the callee, check for and handle falsy values. It's ok to accept falsy values + for individual arguments of a caller function, so long as the entire function would not be a no-op. + + Example: + ```python + from typing import Optional + + def custom_configuration(config: dict): + ... + + def trainer(config: Optional[dict]): + if config is not None: + # Do this check in the caller function + custom_configuration(config) + ... + ``` + +# 2. Type Annotations and Typechecking + +Streaming aims to annotate all functions with type annotations (introduced in +[PEP 526](https://www.python.org/dev/peps/pep-0526/)). Type annotations help statically catch `TypeError` and +`AttributeError` bugs, in addition to other benefits, as outlined in the PEP. 
+ +For documentation on typing annotations, see: +* [PEP 483](https://peps.python.org/pep-0483/) for a simplified introduction +* [PEP 484](https://peps.python.org/pep-0484/) for the full specification +* [Python docs for `typing`](https://docs.python.org/3/library/typing.html) for the API reference + +Streaming uses [pyright](https://github.com/microsoft/pyright) +to validate type annotations. PyRight is automatically run as part of the pre-commit hooks, but you can also +run PyRight specifically via: + +``` +pre-commit run pyright --all-files +``` + +The pyright configuration is stored in [pyproject.toml](pyproject.toml). + + +## 2.1 Debugging + +Here are some suggestions to deal with pyright errors: + +1. Suppose a variable could be one of multiple types, like the following: + + ```python + from typing import Union + + def foo(x: Union[int, None]): + return x + 5 # type error -- None + 5 is not allowed! + ``` + + PyRight will complain since `None + 5` is not a valid operation. + Instead, add a check to ensure that `x is not None`: + + ```python + from typing import Union + + def foo(x: Union[int, None]): + if x is None: + raise TypeError("x must be an integer, not None!") + return x + 5 # valid + ``` + + Assert statements also work. However, assert statements should not be used for data validation + (see the assert statement section below). + ```python + from typing import Union + + def foo(x: Union[int, None]): + assert x is not None, "x should never be None" + return x + 5 # valid + ``` + +1. For variables where it is impossible for pyright to infer the correct type, use +[cast](https://docs.python.org/3/library/typing.html#typing.cast). +1. As a last resort, add a `# type: ignore` comment to the line where pyright emits an error. +Immediately following this statement, paste in the error emitted by pyright, +so other contributors will know why this error was silenced. + + +# 3. 
Public APIs +A public API, generally speaking, can be invoked by a user without a leading underscore in any portion of the path. +The following are examples of public APIs: + +* Standalone functions in public modules (e.g. `streaming.base.distributed.get_world_size`) +* Classes in public modules (e.g. `streaming.base.format.MDSWriter`) +* Public methods in public classes (e.g. `streaming.base.format.MDSWriter.write`) +* Public modules (e.g. `streaming.base.dataset`) + +The following rules apply to public APIs: +1. All public APIs must have a docstring (see the Documentation section below) +1. All parameters must have type annotations. +1. To minimize user imports, parameters should use native PyTorch or Python types whenever possible. + + It is acceptable to use a union of types, so long as one of the options is a primitive. + +1. Parameters that could take a sequence of elements should also allow `None` or a singleton. + This simplifies the user API by not having to construct a list (or tuple) to hold a single element + (or no element). For example, use `Optional[Union[torch.Tensor, Sequence[torch.Tensor]]]`. + + +# 4. Use of `assert` + +`assert` should be used only in test cases and for verifying invariants (likely required for type checking), +not for data validation. As asserts can be disabled in python by using the `-O` flag +(e.g. `python -O path/to/script.py`), they are not guaranteed to run. For data validation, instead use a style like +the following: + + + + +```python +if parameter is None: + raise ValueError("parameter must be specified and cannot be None") +``` + + +# 5. Imports and `__init__.py` + +All imports in Streaming should be absolute -- that is, they do not begin with a period. + +## 5.1 External Dependencies +1. All external dependencies must be specified in [setup.py](setup.py) for pip. + +1. If a dependency is not core to Streaming (e.g. it is for a model, dataset, or some callbacks): + 1. 
It must be specified in a entry of the `extra_deps` dictionary of [setup.py](setup.py). + This dictionary groups dependencies that can be conditionally installed. An entry named `foo` + can be installed with `pip install 'mosaicml-streaming[foo]'`. For example, running `pip install 'mosaicml-streaming[docs]'` + will install everything in `install_requires`, along with `docs`. + 1. It must also be specified in the `run_constrained` and the `test.requires` section. + 1. If the dependency is core to Streaming, add the dependency to the `install_requires` section of + [setup.py](./setup.py). + +## 5.2 Use of `__all__` + +All public modules must define `__all__` to be the list of members that should be re-exported. +The variable is necessary to 1) limit what `from XXX import *` imports, and 2) ensure that the documentation only +includes exported members, not unrelated re-imports. + +For example, from [streaming/base/dataset.py](streaming/base/dataset.py) + +```python +"""The :class:`Dataset` class, used for building streaming iterable datasets.""" +from torch.utils.data import IterableDataset + +from streaming.base.format import reader_from_json +from streaming.base.index import Index, Partition + +__all__ = ["Dataset"] # export only the Dataset, not other imports like `Index`, `Partition`, or `reader_from_json` + + +class Dataset(IterableDataset): + ... +``` + + +## 5.3 `__init__.py` + +All public classes and functions should be added to the module's `__init__.py`. + + +```python +from streaming.path.to.module.file import MyClass as MyClass +from streaming.path.to.module.file import my_func as my_func +``` + +If a file only contains public functions, then the following is also acceptable: + + +```python +from streaming.path.to.module import my_file as my_file +``` + + +# 6. Documentation + +## 6.1 Docstrings + +Streaming uses [Google Style Docstrings](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html). +All public APIs require documentation. 
+ +### 6.1.1 What to include in Docstrings? + +Docstrings, at a minimum, should include a summary of what the function or class does, along with the arguments it takes. See [below](#612-formatting-docstrings) for how to format docstrings. The [Google Style Guide](https://google.github.io/styleguide/pyguide.html) also includes some guidelines on how to write docstrings. + +### 6.1.2 Formatting Docstrings + +The following guidelines apply to documentation. +1. Each function that needs a docstring must have its input arguments, return statement (if not None), and any custom + exceptions annotated. +1. The arguments for the `__init__` signature of classes should be documented under the class-level docstring. There + should not be any `__init__`-level docstring. +1. Each argument annotation should include the type. If the argument has a default value, the type annotation should + specify "optional", and the docstring should say the default value. Some examples: + + ```python + from typing import Optional, Tuple, Union + + def foo(bar: int): + """Foo. + + Args: + bar (int): Required bar. + """ + ... + + def foo2(bar: int = 42): + """Foo2. + + Args: + bar (int, optional): The first Argument. Default: ``42``. + """ + ... + + def foo3(bar: Optional[int] = None): + """Foo3. + + Args: + bar (int, optional): The first Argument. Default: ``None``. + """ + ... + + def foo4(bar: Union[int, str] = 42): + """Foo4. + + Args: + bar (int | str, optional): The first Argument. Default: ``42``. + """ + ... + + def foo5(bar: int) -> int: + """Foo5. + + Args: + bar (int): Required bar. + + Returns: + int: Description of return statement. + """ + ... + + def foo6(bar: int) -> Tuple[int, str]: + """Foo6. + + Args: + bar (int): Required bar. + + Returns: + a (int): Returned value. + b (str): Returned value. + """ + ... 
+ ``` + +### 6.1.3 Building and Viewing Docs Locally + +Assuming you already have a development install of Streaming (see these [instructions](CONTRIBUTING.md#prerequisites)), here’s how to build and previous the docs locally. + +**️️ ⚠ Warning:** CI treats all sphinx warnings as errors, so they must be addressed before a PR can be merged. Building docs locally can help debug any warnings showing up on Jenkins! + +In a terminal, run: + + +```bash +source path/to/streaming_venv/bin/activate # activate your streaming virtual env +cd streaming/docs # cd to the docs folder inside your streaming clone +make clean # Cleans the artifacts and remove source/api_reference folder +make html # build the docs +make host # Run the docs locally +``` + +Then, navigate to [http://localhost:8000](http://localhost:8000) in your browser. + +## 6.2 Doctests + +Most docstrings should also include a `.. doctest` or `.. testcode` example to clearly illustrate how one would interact with the class or function. As part of the CI/CD process, all `.. doctest` blocks are executed to ensure the example in the documentation actually works. + +### 6.2.1 Writing Doctests + +See the [Sphinx Doctest Extension](https://www.sphinx-doc.org/en/master/usage/extensions/doctest.html) for all of the available directives. Do not use `.. code-block::` for Python examples, as they are untested. + +Any test fixtures for doctests should go in [docs/source/doctest_fixtures.py](docs/source/doctest_fixtures.py) or in a `.. testsetup::` block. + +For example: +```python +import torch +from typing import Optional + +def my_function(x: Optional[torch.Tensor]) -> torch.Tensor: + """blah function + + Args: + input (torch.Tensor): Your guess. + + Returns: + torch.Tensor: How good your input is. + + Raises: + ValueError: If your input is negative. + + Example: + .. testsetup:: + + # optional setup section, not shown in docs + import torch + x = torch.randn(42) + + + .. 
testcode:: + + # shown in docs; runs after testsetup + my_function(x) + """ + ... +``` + +All doctests load the [docs/source/doctest_fixtures.py](docs/source/doctest_fixtures.py) file *before* tests run. If there are any variables that would be helpful have defined for all tests, feel free to add them into this file. However, if a variable is more specific to an individual doctest, then it would be best to include it in a `.. testsetup::` block, as not to pollute the global fixture namespace. (Unlike pytest fixtures, all doctest fixtures are given to every doctest; they cannot be specifically requested) + +### 6.2.2 Running Doctests Locally + +Assuming you already have a development install of Streaming (see these [instructions](CONTRIBUTING.md#prerequisites)), here’s how to run the doctests. + + +```bash +source path/to/streaming_venv/bin/activate # activate your streaming virtual env +cd streaming/docs # cd to the docs folder inside your streaming clone +make clean # Cleans the artifacts and remove source/api_reference folder +make html # the html build must be completed first to ensure all doctests are identified +make doctest 2>/dev/null # For more verbosity, do not direct stderr to /dev/null +``` diff --git a/streaming/base/dataset.py b/streaming/base/dataset.py index 0c79a436f..74d3a5444 100644 --- a/streaming/base/dataset.py +++ b/streaming/base/dataset.py @@ -22,6 +22,8 @@ from streaming.base.hashing import get_hash from streaming.base.index import Index, Partition, get_index_basename +__all__ = ['Dataset'] + class DownloadStatus(IntEnum): """Download status.""" diff --git a/streaming/base/format/base/reader.py b/streaming/base/format/base/reader.py index c851ff8ce..a067b9815 100644 --- a/streaming/base/format/base/reader.py +++ b/streaming/base/format/base/reader.py @@ -7,6 +7,8 @@ from dataclasses import dataclass from typing import Any, Dict, Iterator, List, Optional +__all__ = ['FileInfo', 'Reader', 'JointReader', 'SplitReader'] + @dataclass class 
FileInfo(object): diff --git a/streaming/base/format/base/writer.py b/streaming/base/format/base/writer.py index ed7bf7d20..4394b5a3b 100644 --- a/streaming/base/format/base/writer.py +++ b/streaming/base/format/base/writer.py @@ -15,6 +15,8 @@ from streaming.base.hashing import get_hash, is_hash from streaming.base.index import get_index_basename +__all__ = ['JointWriter', 'SplitWriter'] + class Writer(ABC): """Writes a streaming dataset. diff --git a/streaming/base/format/json/reader.py b/streaming/base/format/json/reader.py index 4ac4409c9..07a34636e 100644 --- a/streaming/base/format/json/reader.py +++ b/streaming/base/format/json/reader.py @@ -13,6 +13,8 @@ from streaming.base.format.base.reader import FileInfo, SplitReader +__all__ = ['JSONReader'] + class JSONReader(SplitReader): """Provides random access to the samples of a JSON shard. diff --git a/streaming/base/format/json/writer.py b/streaming/base/format/json/writer.py index 217a123ae..1cc27b3cc 100644 --- a/streaming/base/format/json/writer.py +++ b/streaming/base/format/json/writer.py @@ -11,6 +11,8 @@ from streaming.base.format.base.writer import SplitWriter from streaming.base.format.json.encodings import is_json_encoded, is_json_encoding +__all__ = ['JSONWriter'] + class JSONWriter(SplitWriter): r"""Writes a streaming JSON dataset. diff --git a/streaming/base/format/mds/reader.py b/streaming/base/format/mds/reader.py index 698bc2979..e289434d9 100644 --- a/streaming/base/format/mds/reader.py +++ b/streaming/base/format/mds/reader.py @@ -13,6 +13,8 @@ from streaming.base.format.base.reader import FileInfo, JointReader from streaming.base.format.mds.encodings import mds_decode +__all__ = ['MDSReader'] + class MDSReader(JointReader): """Provides random access to the samples of an MDS shard. 
diff --git a/streaming/base/format/mds/writer.py b/streaming/base/format/mds/writer.py index ec97d776d..ae0e5bc44 100644 --- a/streaming/base/format/mds/writer.py +++ b/streaming/base/format/mds/writer.py @@ -11,6 +11,8 @@ from streaming.base.format.base.writer import JointWriter from streaming.base.format.mds.encodings import get_mds_encoded_size, is_mds_encoding, mds_encode +__all__ = ['MDSWriter'] + class MDSWriter(JointWriter): """Writes a streaming MDS dataset. diff --git a/streaming/base/format/xsv/reader.py b/streaming/base/format/xsv/reader.py index d53a797ab..fe0421661 100644 --- a/streaming/base/format/xsv/reader.py +++ b/streaming/base/format/xsv/reader.py @@ -13,6 +13,8 @@ from streaming.base.format.base.reader import FileInfo, SplitReader from streaming.base.format.xsv.encodings import xsv_decode +__all__ = ['XSVReader', 'CSVReader', 'TSVReader'] + class XSVReader(SplitReader): """Provides random access to the samples of an XSV shard. diff --git a/streaming/base/format/xsv/writer.py b/streaming/base/format/xsv/writer.py index 05e8c95b8..c76687479 100644 --- a/streaming/base/format/xsv/writer.py +++ b/streaming/base/format/xsv/writer.py @@ -11,6 +11,8 @@ from streaming.base.format.base.writer import SplitWriter from streaming.base.format.xsv.encodings import is_xsv_encoding, xsv_encode +__all__ = ['XSVWriter', 'CSVWriter', 'TSVWriter'] + class XSVWriter(SplitWriter): r"""Writes a streaming XSV dataset. diff --git a/streaming/base/index.py b/streaming/base/index.py index 098b03ad4..684ea648c 100644 --- a/streaming/base/index.py +++ b/streaming/base/index.py @@ -11,6 +11,8 @@ from streaming.base import distributed as dist +__all__ = ['get_index_basename', 'Partition', 'Index'] + def get_index_basename() -> str: """Get the canonical index file basename. 
diff --git a/streaming/base/local.py b/streaming/base/local.py index ad232099e..926f67c79 100644 --- a/streaming/base/local.py +++ b/streaming/base/local.py @@ -12,6 +12,8 @@ from streaming.base.format import reader_from_json from streaming.base.index import Index +__all__ = ['LocalDataset'] + class LocalDataset(Dataset): """The dataset resides locally in a machine. diff --git a/streaming/base/util.py b/streaming/base/util.py index a523bd4a9..1675dec13 100644 --- a/streaming/base/util.py +++ b/streaming/base/util.py @@ -5,6 +5,8 @@ from typing import List +__all__ = ['get_list_arg'] + def get_list_arg(text: str) -> List[str]: """Pass a list as a command-line flag. diff --git a/streaming/text/c4.py b/streaming/text/c4.py index 45dbdd6c4..343805867 100644 --- a/streaming/text/c4.py +++ b/streaming/text/c4.py @@ -13,6 +13,8 @@ from streaming.base import Dataset +__all__ = ['C4'] + class C4(Dataset): """Implementation of the C4 (Colossal Cleaned Common Crawl) dataset using streaming Dataset. diff --git a/streaming/text/enwiki.py b/streaming/text/enwiki.py index 3f1a8d46a..17a9c4ec5 100644 --- a/streaming/text/enwiki.py +++ b/streaming/text/enwiki.py @@ -9,6 +9,8 @@ from streaming.base import Dataset +__all__ = ['EnWiki'] + class EnWiki(Dataset): """Implementation of the English Wikipedia 2020-01-01 streaming dataset. diff --git a/streaming/vision/ade20k.py b/streaming/vision/ade20k.py index 3909d5799..1c274979d 100644 --- a/streaming/vision/ade20k.py +++ b/streaming/vision/ade20k.py @@ -11,6 +11,8 @@ from streaming.base import Dataset +__all__ = ['ADE20K'] + class ADE20K(Dataset): """Implementation of the ADE20K dataset using streaming Dataset. 
diff --git a/streaming/vision/cifar10.py b/streaming/vision/cifar10.py index 1dda6b1d6..3a5383fcb 100644 --- a/streaming/vision/cifar10.py +++ b/streaming/vision/cifar10.py @@ -9,6 +9,8 @@ from streaming.vision.base import ImageClassDataset +__all__ = ['CIFAR10'] + class CIFAR10(ImageClassDataset): """Implementation of the CIFAR-10 dataset using streaming Dataset. diff --git a/streaming/vision/coco.py b/streaming/vision/coco.py index a29d8682e..9e5ce4486 100644 --- a/streaming/vision/coco.py +++ b/streaming/vision/coco.py @@ -11,6 +11,8 @@ from streaming.base import Dataset +__all__ = ['COCO'] + class COCO(Dataset): """Implementation of the COCO dataset using streaming Dataset. diff --git a/streaming/vision/imagenet.py b/streaming/vision/imagenet.py index acf23837c..0d2f55fb2 100644 --- a/streaming/vision/imagenet.py +++ b/streaming/vision/imagenet.py @@ -9,6 +9,8 @@ from streaming.vision.base import ImageClassDataset +__all__ = ['ImageNet'] + class ImageNet(ImageClassDataset): """Implementation of the ImageNet dataset using streaming Dataset. 
diff --git a/tests/test_compression.py b/tests/test_compression.py index bc596b942..6e6ad32a0 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -11,9 +11,8 @@ from streaming.base.compression.compression import (Brotli, Bzip2, Gzip, Snappy, Zstandard, compress, decompress, get_compression_extension, is_compression) - -from .common.datasets import * -from .common.utils import * +from tests.common.datasets import * +from tests.common.utils import * class TestBrotli: diff --git a/tests/test_distributed.py b/tests/test_distributed.py index 6c5b4da89..1d53a6701 100644 --- a/tests/test_distributed.py +++ b/tests/test_distributed.py @@ -12,10 +12,9 @@ import streaming.base.distributed as ms_dist from streaming.base import Dataset - -from .common.datasets import * -from .common.distributed import DistributedTest -from .common.utils import * +from tests.common.datasets import * +from tests.common.distributed import DistributedTest +from tests.common.utils import * logger = logging.getLogger(__name__) diff --git a/tests/test_streaming.py b/tests/test_streaming.py index ecefaccd5..5a5a7ba28 100644 --- a/tests/test_streaming.py +++ b/tests/test_streaming.py @@ -12,9 +12,8 @@ from torch.utils.data import DataLoader from streaming.base import Dataset - -from .common.datasets import * -from .common.utils import * +from tests.common.datasets import * +from tests.common.utils import * logger = logging.getLogger(__name__) diff --git a/tests/test_writer.py b/tests/test_writer.py index 483bd5499..fbcdb6013 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -10,9 +10,8 @@ import pytest from streaming import CSVWriter, Dataset, JSONWriter, MDSWriter, TSVWriter, XSVWriter - -from .common.datasets import * -from .common.utils import * +from tests.common.datasets import * +from tests.common.utils import * logger = logging.getLogger(__name__)