diff --git a/docs/source/images/integrations/hf_data_card_preview.jpg b/docs/source/images/integrations/hf_data_card_preview.jpg new file mode 100644 index 0000000000..12d717028f Binary files /dev/null and b/docs/source/images/integrations/hf_data_card_preview.jpg differ diff --git a/docs/source/images/integrations/hf_push_advanced_example.jpg b/docs/source/images/integrations/hf_push_advanced_example.jpg new file mode 100644 index 0000000000..a0a38d43c9 Binary files /dev/null and b/docs/source/images/integrations/hf_push_advanced_example.jpg differ diff --git a/docs/source/integrations/huggingface.rst b/docs/source/integrations/huggingface.rst index f8d89a571e..8b65bd3b3b 100644 --- a/docs/source/integrations/huggingface.rst +++ b/docs/source/integrations/huggingface.rst @@ -10,10 +10,19 @@ FiftyOne integrates natively with Hugging Face's you can load, fine-tune, and run inference with your favorite Transformers models on your FiftyOne datasets with just a few lines of code! -.. _huggingface-setup: +FiftyOne also integrates with the `Hugging Face Hub `_, +so you can push datasets to and load datasets from the Hub with ease. + +.. _huggingface-transformers: + +Transformers Library +____________________ + + +.. _huggingface-transformers-setup: Setup -_____ +----- To get started with `Transformers `_, just install the @@ -21,12 +30,13 @@ To get started with .. code-block:: shell - pip install transformers + pip install -U transformers + -.. _huggingface-inference: +.. _huggingface-transformers-inference: Inference -_________ +--------- All `Transformers models `_ @@ -47,10 +57,10 @@ on the following sample dataset: dataset = foz.load_zoo_dataset("quickstart", max_samples=25) dataset.select_fields().keep_fields() -.. _huggingface-image-classification: +.. _huggingface-transformers-image-classification: Image classification --------------------- +^^^^^^^^^^^^^^^^^^^^ You can pass `transformers` classification models directly to FiftyOne dataset's @@ -164,10 +174,10 @@ model's name or path as a keyword argument: session = fo.launch_app(dataset) -.. _huggingface-object-detection: +.. _huggingface-transformers-object-detection: Object detection ----------------- +^^^^^^^^^^^^^^^^ You can pass `transformers` detection models directly to your FiftyOne dataset's @@ -277,10 +287,10 @@ name or path as a keyword argument: session = fo.launch_app(dataset) -.. _huggingface-semantic-segmentation: +.. _huggingface-transformers-semantic-segmentation: Semantic segmentation ---------------------- +^^^^^^^^^^^^^^^^^^^^^^ You can pass a `transformers` semantic segmentation model directly to your FiftyOne dataset's @@ -373,10 +383,10 @@ model's name or path as a keyword argument: session = fo.launch_app(dataset) -.. _huggingface-monocular-depth-estimation: +.. _huggingface-transformers-monocular-depth-estimation: Monocular depth estimation --------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^ You can pass a `transformers` monocular depth estimation model directly to your FiftyOne dataset's :meth:`apply_model() ` @@ -423,10 +433,10 @@ model's name or path as a keyword argument: session = fo.launch_app(dataset) -.. _huggingface-zero-shot-classification: +.. _huggingface-transformers-zero-shot-classification: Zero-shot classification ------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^ Zero-shot image classification models from `transformers` can be loaded directly from the :ref:`FiftyOne Model Zoo `! 
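For reference, here is a minimal sketch of how a zero-shot classification model might be loaded from the zoo and applied to the sample dataset above; the zoo model name `zero-shot-classification-transformer-torch` and the `name_or_path`/`classes` keyword arguments are assumptions based on FiftyOne Model Zoo conventions rather than something asserted by this diff:

.. code-block:: python
    :linenos:

    import fiftyone as fo
    import fiftyone.zoo as foz

    dataset = foz.load_zoo_dataset("quickstart", max_samples=25)

    # Assumed zoo model name and kwargs; verify against the Model Zoo listing
    model = foz.load_zoo_model(
        "zero-shot-classification-transformer-torch",
        name_or_path="openai/clip-vit-base-patch32",  # any zero-shot checkpoint
        classes=["cat", "dog", "bird", "car", "person"],
    )

    dataset.apply_model(model, label_field="zero_shot_predictions")

    session = fo.launch_app(dataset)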
@@ -511,10 +521,10 @@ FiftyOne format: Some zero-shot models are compatible with multiple tasks, so it is recommended that you specify the task type when converting the model. -.. _huggingface-zero-shot-detection: +.. _huggingface-transformers-zero-shot-detection: Zero-shot object detection --------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^ Zero-shot object detection models from `transformers` can be loaded directly from the :ref:`FiftyOne Model Zoo `! @@ -560,10 +570,10 @@ FiftyOne format: Some zero-shot models are compatible with multiple tasks, so it is recommended that you specify the task type when converting the model. -.. _huggingface-batch-inference: +.. _huggingface-transformers-batch-inference: Batch inference ---------------- +^^^^^^^^^^^^^^^ When using :meth:`apply_model() `, @@ -615,10 +625,10 @@ pattern below: See :ref:`this section ` for more information about performing batch updates to your FiftyOne datasets. -.. _huggingface-embeddings: +.. _huggingface-transformers-embeddings: Embeddings -__________ +---------- Any `transformers` model that supports image classification or object detection tasks — zero-shot or otherwise — can be used to compute embeddings for your @@ -633,10 +643,10 @@ samples. detection, or base model, FiftyOne will extract embeddings from the `last_hidden_state` of the model's base encoder. -.. _huggingface-image-embeddings: +.. _huggingface-transformers-image-embeddings: Image embeddings ----------------- +^^^^^^^^^^^^^^^^ To compute embeddings for images, you can pass the `transformers` model directly to your FiftyOne dataset's @@ -713,10 +723,10 @@ see if the model can be used to generate embeddings: image = Image.open(dataset.first().filepath) embedding = model.embed(np.array(image)) -.. _huggingface-text-embeddings: +.. _huggingface-transformers-text-embeddings: Text embeddings ---------------- +^^^^^^^^^^^^^^^ Zero-shot image classification and object detection models from `transformers` can also be used to compute embeddings for text: @@ -760,10 +770,10 @@ property: ) print(model.can_embed_prompts) # False -.. _huggingface-batch-embeddings: +.. _huggingface-transformers-batch-embeddings: Batch embeddings ----------------- +^^^^^^^^^^^^^^^^ You can request batch inference by passing the optional `batch_size` parameter to @@ -774,10 +784,10 @@ to dataset.compute_embeddings(model, embeddings_field="embeddings", batch_size=16) -.. _huggingface-patch-embeddings: +.. _huggingface-transformers-patch-embeddings: Patch embeddings ----------------- +^^^^^^^^^^^^^^^^ You can compute embeddings for image patches by passing `transformers` models directly to your FiftyOne dataset's @@ -804,10 +814,10 @@ method: embeddings_field="embeddings", ) -.. _huggingface-brain-methods: +.. _huggingface-transformers-brain-methods: Brain methods -_____________ +------------- Because `transformers` models can be used to compute embeddings, they can be passed to :ref:`Brain methods ` like @@ -891,3 +901,1038 @@ model: view = dataset.sort_by_similarity("A photo of a dog", k=25) session = fo.launch_app(view) + +.. _huggingface-hub: + +Hugging Face Hub +________________ + +FiftyOne integrates with the +`Hugging Face Hub `_ to allow you to +push datasets to and load datasets from the Hub with ease. This integration +simplifies the process of sharing datasets with the machine learning and +computer vision community, and allows you to easily access and work with many +of the most popular vision and multimodal datasets available! + +.. 
_huggingface-hub-setup: + +Setup +----- + +To push datasets to and load datasets from the +`Hugging Face Hub `_, you will need the +`Hugging Face Hub Python client `_, +which you can install via PyPI: + +.. code-block:: shell + + pip install "huggingface_hub>=0.20.0" + +To push a dataset to the Hub, and in some cases, to access a dataset on +the hub, you will need to have a +`Hugging Face Hub account `_. + +Hugging Face handles authentication via tokens, which you can obtain by +logging into your account and navigating to the +`Access Tokens `_ section of your +profile. At the bottom of this page, you can create a new token with write or +read access to the Hub. Once you have your token, you can set it as an +environment variable: + +.. code-block:: shell + + export HF_TOKEN="" + +.. _huggingface-hub-push-dataset: + +Pushing datasets to the Hub +--------------------------- + +If you are working with a dataset in FiftyOne and you want to quickly share it +with others, you can do so via the +:func:`push_to_hub() ` +function, which takes two positional arguments: + +- the FiftyOne sample collection (a |Dataset| or |DatasetView|) +- the `repo_name`, which will be combined with your Hugging Face username or + organization name to construct the `repo_id` where the sample collection + will be uploaded. + +As you will see, this simple function allows you to push datasets and filtered +views containing images, videos, point clouds, and other multimodal data to the +Hugging Face Hub, providing you with incredible flexibility in the process. + +.. _huggingface-hub-push-dataset-basic: + +Basic usage +^^^^^^^^^^^ + +The basic recipe for pushing a FiftyOne dataset to the Hub is just two lines of +code. As a starting point, let's use the example +:ref:`Quickstart dataset ` dataset from the +:ref:`FiftyOne Dataset Zoo `: + +.. code-block:: python + :linenos: + + import fiftyone as fo + import fiftyone.zoo as foz + + dataset = foz.load_zoo_dataset("quickstart") + +To push the dataset to the Hugging Face Hub, all you need to do is call +:func:`push_to_hub() ` with the dataset +and the desired `repo_name`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + fouh.push_to_hub(dataset, "my-quickstart-dataset") + +When you run this code, a few things happen: + +- The dataset and its media files are exported to a temporary directory and + uploaded to the specified Hugging Face repo. +- A `fiftyone.yml` config file for the dataset is generated and uploaded to + the repo, which contains all of the necessary information so that the dataset + can be loaded with + :func:`load_from_hub() `. +- A Hugging Face + `Dataset Card `_ + for the dataset is auto-generated, providing tags, metadata, license info, + and a code snippet illustrating how to load the dataset from the hub. + +Your dataset will be available on the Hub at the following URL: + +.. code-block:: text + + https://huggingface.co/datasets//my-quickstart-dataset + +Pushing a |DatasetView| to the Hub works in exactly the same way. For example, +if you want to push a filtered view of the `quickstart` dataset containing only +predictions with high confidence, you can do so by creating the view as usual, +and then passing that in to +:func:`push_to_hub() `: + +.. 
code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + from fiftyone import ViewField as F + + # create view with high confidence predictions + view = dataset.filter_labels("predictions", F("confidence") > 0.95) + + # push view to the Hub as a new dataset + fouh.push_to_hub(view, "my-quickstart-high-conf") + +When you do so, note that the view is exported as a new dataset, and other +details from the original dataset are not included. + +FiftyOne is a *visual* toolkit, so when you push a dataset to the Hub, you can +optionally include a preview (image, gif, or video) of the dataset that will be +displayed on the dataset page. To do this, you can pass the `preview_path` +argument to :func:`push_to_hub() `, with +either a relative or absolute path to the preview file on your local machine: + +.. code-block:: python + :linenos: + + import fiftyone as fo + import fiftyone.zoo as foz + + import fiftyone.utils.huggingface as fouh + + dataset = foz.load_zoo_dataset("quickstart") + + session = fo.launch_app(dataset) + # Screenshot and save the preview image to a file + + fouh.push_to_hub( + dataset, + "my-quickstart-with-preview", + preview_path="/path/to/preview.jpg" + ) + +The preview file will be uploaded to the Hub along with the dataset, and will be +displayed on the dataset card! + +.. image:: /images/integrations/hf_data_card_preview.jpg + :alt: Pushing a dataset to the Hugging Face Hub with a preview image + :align: center + + +.. _huggingface-hub-push-dataset-advanced: + +Advanced usage +^^^^^^^^^^^^^^ + +The :func:`push_to_hub() ` function +provides a number of optional arguments that allow you to customize how your +dataset is pushed to the Hub, including whether the dataset is public or private, +what license it is released under, and more. + +FiftyOne's :func:`push_to_hub() ` +function supports the Hugging Face Hub API arguments `private` and `exist_ok`. + +- **private** *(bool)*: Whether the dataset should be private. If `True`, the + dataset will be private and only accessible to you. If `False`, the dataset + will be public and accessible to anyone with the link. Defaults to `False`. +- **exist_ok** *(bool)*: Whether to overwrite an existing dataset with the same + `repo_name`. If `True`, the existing dataset will be overwritten. If `False`, + an error will be raised if a dataset with the same `repo_name` already + exists. Defaults to `False`. + +For example, to push a dataset to the Hub as private, you can do the following: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + fouh.push_to_hub(dataset, "my-private-dataset", private=True) + +You can also specify the `tags`, `license`, and `description` of the dataset, +all of which will propagate to the `fiftyone.yml` config file and the Hugging +Face Dataset Card. For example, to push a video action recognition dataset with +an MIT license and a description, you can do the following: + +.. code-block:: python + :linenos: + + import fiftyone as fo + import fiftyone.zoo as foz + import fiftyone.utils.huggingface as fouh + + dataset = foz.load_zoo_dataset("quickstart-video") + + fouh.push_to_hub( + dataset, + "my-action-recognition-dataset", + tags=["video", "action-recognition"], + license="mit", + description="A dataset of videos for action recognition tasks", + ) + +The pushed dataset will be available on the Hub and the dataset page will look +like this: + +.. 
image:: /images/integrations/hf_push_advanced_example.jpg + :alt: Pushing a dataset to the Hugging Face Hub with advanced options + :align: center + +.. note:: + + The `tags` argument can be a string or a list of strings. The tag `fiftyone` + is automatically added to all datasets pushed with FiftyOne, communicating + that the dataset was created with FiftyOne and can be loaded with the + :func:`load_from_hub() ` function. + +The license is specified as a string. For a list of supported licenses, see the +`Hugging Face Hub documentation `_. + +The `description` argument can be used for whatever you like. When the dataset +is loaded from the Hub, this description will be accessible via the dataset's +:meth:`description ` property. + +Additionally, you can specify the "format" of the uploaded dataset. By default, +the format is the standard :ref:`FiftyOneDataset ` format, +but you can also specify that the data be uploaded in any of these +:ref:`common formats `. For example, to push the +quickstart dataset in :ref:`COCO ` format, with a +Creative Commons Attribution 4.0 license, you can do the following: + +.. code-block:: python + :linenos: + + import fiftyone as fo + import fiftyone.zoo as foz + import fiftyone.utils.huggingface as fouh + import fiftyone.types as fot + + dataset = foz.load_zoo_dataset("quickstart") + dataset_type = fot.dataset_types.COCODetectionDataset + + fouh.push_to_hub( + dataset, + "quickstart-coco", + dataset_type=dataset_type, + license="cc-by-4.0", + label_field="*", # convert all label fields, not just ground truth + ) + +.. note:: + + The `label_field` argument is used to specify which label fields to convert + to the specified dataset type. By default when using some dataset formats, + only the `ground_truth` label field is converted. If you want to convert all + label fields, you can set `label_field="*"`. If you want to convert specific + label fields, you can pass a list of field names. + + +Additionally, you can specify the minimum version of FiftyOne required to load +the dataset by passing the `min_fiftyone_version` argument. This is useful when +the dataset utilizes features that are only available in versions above a certain +release. For example, to specify that the dataset requires FiftyOne version `0.23.0`: + +.. code-block:: python + :linenos: + + import fiftyone as fo + import fiftyone.zoo as foz + import fiftyone.utils.huggingface as fouh + + dataset = foz.load_zoo_dataset("quickstart") + + fouh.push_to_hub( + dataset, + "quickstart-min-version", + min_fiftyone_version="0.23.0", + ) + + + +.. _huggingface-hub-load-dataset: + +Loading datasets from the Hub +----------------------------- + +To load a dataset from the Hugging Face Hub, you can use the +:func:`load_from_hub() ` function. +This function supports loading datasets in any of the +:ref:`common formats ` supported by FiftyOne, as well +as image-based datasets stored via `Parquet `_ files, +as is common with datasets from the +`datasets `_ library which have +been uploaded to the Hugging Face Hub. Below, we will walk through all of the +ways you can load datasets from the Hub. + +In its simplest usage, the +:func:`load_from_hub() ` function +only requires the `repo_id` of the dataset you want to load. For example, to +load the :ref:`private dataset ` that we +pushed to the Hub earlier, you can do the following: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub("/my-private-dataset") + +.. 
note:: + + As long as you have an environment variable `HF_TOKEN` set with your Hugging + Face token (with read access), you can load private or gated datasets that you have + access to from the Hub. + +.. _huggingface-hub-load-dataset-from-repo-config: + +Loading datasets from repo configs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When you push a dataset to the Hub using +:func:`push_to_hub() `, a `fiftyone.yml` +config file is generated and uploaded to the repo. This file contains all of the +information necessary to load the dataset from the Hugging Face Hub. More +generally, any repo on the Hugging Face Hub that contains a `fiftyone.yml` or +`fiftyone.yaml` file (assuming the file is correctly formatted) can be loaded +using the :func:`load_from_hub() ` +function by passing the `repo_id` of the dataset, without needing to specify any +additional arguments. + +For example, to load the `quickstart` dataset that we pushed to the Hub earlier, + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub("my-quickstart-dataset") + +.. _huggingface-hub-load-dataset-from-local-config: + +Loading datasets from local configs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If the repo was uploaded to the Hugging Face Hub via FiftyOne's +:func:`push_to_hub() ` function, then +the `fiftyone.yml` config file will be generated and uploaded to the repo. +However, some common datasets like +`mnist `_ were uploaded to the Hub +using the `datasets` library and do not contain a `fiftyone.yml` or +`fiftyone.yaml` file. If you know how the dataset is structured, you can load +the dataset by passing the path to a local yaml config file that describes the +dataset via the `config_file` keyword argument. + +For example, to load the `mnist` dataset from the Hub, you might have a local +yaml config file like this: + +.. code-block:: yaml + + format: ParquetFilesDataset + classification_fields: label + +To load the dataset from the Hub, you can pass the `repo_id` of the dataset and +the path to the local yaml config file: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "mnist", + config_file="/path/to/mnist.yml", + ) + +For a comprehensive list of the supported fields in the yaml config file, see +:ref:`Supported config fields `. + +.. _huggingface-hub-load-dataset-from-kwargs: + +Loading datasets with config kwargs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In addition to loading datasets from repo configs and local configs, you can +also load datasets from the Hub by passing the necessary config arguments +directly to :func:`load_from_hub() `. +This is useful when you want to load a dataset from the Hub that does not have +a `fiftyone.yml` or `fiftyone.yaml` file, and the structure of the dataset is +simple enough that you can specify the necessary arguments directly. + +For example, to load the `mnist` dataset from the Hub, you can pass the `format` +and `classification_fields` arguments directly: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "mnist", + format="ParquetFilesDataset", + classification_fields="label", + ) + +This will tell FiftyOne that the data is stored in Parquet files, and that the +`label` field should be treated as a classification field, to be converted into +a |Classification| label field in the dataset. + +.. 
_huggingface-hub-load-dataset-config-kwargs: + +Supported config fields +^^^^^^^^^^^^^^^^^^^^^^^ + +Whether you are loading a dataset from a repo config, a local config file, or +passing the config arguments directly, you can specify a number of fields. + +Broadly speaking, these fields fall into three categories: format specification, +media field specification, and label field specification. + +Let's look at these categories in more detail: + +**Format specification**: + +- **format** *(str)*: The format of the dataset. This can be any of the + :ref:`common formats ` supported by FiftyOne — just + pass the name of the format as a string. For example, to load a dataset in the + :ref:`COCO ` format, you can pass + `format="COCODetectionDataset"`. To specify that the dataset is stored in + Parquet files, you can pass `format="ParquetFilesDataset"` (or simply + `format="parquet"` for short). This is the only required field. +- **name** *(str)*: The name of the FiftyOne |Dataset| to be created. If the + `repo_id` is cumbersome, this can be used to specify a simpler default name. + For example, for this `sheep dataset `_ + rather than using the `repo_id` `keremberke/aerial-sheep-object-detection`, you + can specify `name="sheep-detection"`. +- **subsets** *(str or list)*: The subset or subsets of the Hugging Face + dataset that are *compatible* with this config, and are *available* to be + loaded. In Hugging Face, the "dataset" in a repo can contain multiple + "subsets", which may or may not have the same schema. Take the + `Street View House Numbers `_ dataset for + example. This dataset has two subsets: `"cropped_digits"` and `"full_numbers"`. + The `cropped_digits` subset contains classification labels, while the + `full_numbers` subset contains detection labels. A single config would not be + able to specify the schema for both subsets, so you can specify the subset you + want to load (or if you are the dataset author, which subset you want to *allow* + people to load in this way) with the `subsets` field. For example, to load the + `cropped_digits` subset of the SVHN dataset, you can pass + `subsets="cropped_digits"`. Note that this is not a required field, and by + default all subsets are loaded. Also note that subsets are distinct from splits + in the dataset, which are handled by the `splits` field (see below). +- **splits** *(str or list)*: The split or splits of the Hugging Face dataset that + are *compatible* with this config, and are *available* to be loaded. As is + standard for machine learning, many datasets are split into training, validation, + and test sets. The specific names of these splits may vary from dataset to + dataset, but :func:`load_from_hub() ` + identifies the names of all splits and by default, will assume that all of + these splits are to be loaded. If you only want to load a specific split or + splits, you can specify them with the `splits` field. For example, to load the + training split of the `CIFAR10 `_ + dataset, you can pass `splits="train"`. If you want to load multiple splits, + you can pass them as a list, e.g., `splits=["train", "test"]`. Note that this + is not a required field, and by default all splits are loaded. + +**Media field specification**: + +While not all Parquet datasets contain media fields, all FiftyOne |Sample| objects +must be connected to at least one media file. 
The following fields can be used +to configure the media fields in the Hugging Face dataset that should be converted +to FiftyOne media fields: + +- **filepath** *(str)*: In FiftyOne, `filepath` is + `a default field `_ + that is used to store the path to the primary media file for each sample in + the dataset. For Hugging Face parquet datasets, primary media fields for image + datasets are typically stored in the `image` column, so this is where + FiftyOne's :func:`load_from_hub() ` + looks by default. If the primary media field is stored in a different column, + you can specify the column name with the key `filepath`. For example, the + `COYO-700M dataset `_ + has the primary media field referenced in the `url` column. Specifying + `filepath="url"` will tell FiftyOne to look in the `url` column for the + primary media file path. Images will be downloaded from the corresponding URLs + and saved to disk. +- **thumbnail_path** *(str)*: The field containing the path to a thumbnail image + for each sample in the dataset, if such a field exists. If a `thumbnail_path` + is specified, this media file will be shown in the sample grid in the FiftyOne + App. This can be useful for quickly visualizing the dataset when the primary + media field contains large (e.g., high-resolution) images. For more information + on thumbnail images, see :ref:`this section `. +- **additional_media_fields** *(dict)*: If each sample has multiple associated media + files that you may want to visualize in the FiftyOne App, you can specify + these non-default media fields in the `additional_media_fields` dictionary, + where the keys are the column names in the Hugging Face dataset and the values + are the names of the fields in the FiftyOne |Dataset| that will store the + paths. Note that this is *not* the same as :ref:`grouped datasets `. + +**Label field specification**: + +FiftyOne's Hugging Face Hub integration currently supports converting labels of +type |Classification|, |Detections|, and |Segmentation| from Hugging Face +Parquet datasets to FiftyOne label fields. The following fields can be used to +specify the label fields in the Hugging Face dataset that should be converted to +FiftyOne label fields: + +- **classification_fields** *(str or list)*: The column or columns in the Hugging + Face dataset that should be converted to FiftyOne |Classification| label fields. + For example, if the dataset contains a `label` + field that contains classification labels, you can specify + `classification_fields="label"`. If the dataset contains multiple + classification fields, you can specify them as a list, e.g., + `classification_fields=["label1", "label2"]`. This is not a required field, + and if the dataset does not contain classification labels, you can omit it. +- **detection_fields** *(str or list)*: The column or columns in the Hugging Face + dataset that should be converted to FiftyOne |Detections| label fields. If the + dataset contains detection labels, you can specify the column name or names + here. For example, if the dataset contains a `detections` field that contains + detection labels, you can specify `detection_fields="detections"`. If the + dataset contains multiple detection fields, you can specify them as a list, + e.g., `detection_fields=["detections1", "detections2"]`. This is not a required + field, and if the dataset does not contain detection labels, you can omit it. 
+- **mask_fields** *(str or list)*: The column or columns in the Hugging Face dataset + that should be converted to FiftyOne |Segmentation| label fields. The column + in the Hugging Face dataset must contain an image or the URL for an image that + can be used as a segmentation mask. If necessary, the images will be downloaded + and saved to disk. If the dataset contains mask labels, you can specify the + column name or names here. For example, if the dataset contains a `masks` field + that contains mask labels, you can specify `mask_fields="masks"`. This is not + a required field, and if the dataset does not contain mask labels, you can + omit it. + +.. _huggingface-hub-load-dataset-download: + +Configuring the download process +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When loading datasets from the Hugging Face Hub, FiftyOne will download +*all* of the data specified by the `repo_id` and the config. If no splits or +subsets are listed in the config, this means that all samples across all splits +and subsets will be downloaded. This can be a time-consuming process, especially +for large datasets, and sometimes you may only want to download a fixed number +of samples to get started exploring the dataset. + +FiftyOne's :func:`load_from_hub() ` +function supports a variety of arguments that allow you to control the download +process, from the maximum number of samples to be downloaded to the batch size +to use when making requests to the Datasets Server. Here are the supported +arguments: + +- **max_samples** *(int)*: The number of samples to download from the dataset. + If not specified, all samples will be downloaded. +- **batch_size** *(int)*: The batch size to use when making requests to the + Datasets Server. Defaults to 100, which is the max batch size allowed by the + Datasets Server. +- **num_workers** *(int)*: The number of workers to use when downloading + media files. If not specified, the number of workers will be resolved by + looking at your :ref:`FiftyOne Config `. +- **splits** *(str or list)*: The split or splits of the Hugging Face dataset + that you want to download. This overrides the `splits` field in the config. +- **subsets** *(str or list)*: The subset or subsets of the Hugging Face dataset + that you want to download. This overrides the `subsets` field in the config. +- **overwrite** *(bool)*: Whether to overwrite an existing dataset + with the same name. If `True`, the existing dataset will be overwritten. If + `False`, an error will be raised if a dataset with the same name already + exists. Defaults to `False`. +- **persistent** *(bool)*: Whether to persist the dataset to the underlying + database after it is loaded. If `True`, the dataset will be available for + loading in future FiftyOne sessions by passing the dataset's name into + FiftyOne's + :func:`load_dataset() ` function. + Defaults to `False`. +- **revision** *(str)*: The revision (specified by a commit hash of the Hugging + Face repo) of the dataset to load. If not specified, the latest revision will + be loaded. + +.. _huggingface-hub-load-dataset-basic-examples: + +Basic examples +^^^^^^^^^^^^^^ + +Okay, so :func:`load_from_hub() ` is +*very* powerful, and can be used in a *ton* of ways. All of this flexibility +can be a bit overwhelming, so let's walk through a few examples to show you how +easy it is in practice to load datasets from the Hugging Face Hub. + +.. 
note:: + + To make these downloads as fast as possible, we recommend setting the + `max_samples` argument to a reasonable number, like 1000, to get a feel for + the dataset. If you like what you see, you can always download more samples! + +**Classification Datasets** + +Let's start by loading the +`MNIST `_ dataset into FiftyOne. All you +need to do is pass the `repo_id` of the dataset — in this case `"mnist"` — to +:func:`load_from_hub() `, specify the +format as `"parquet"`, and specify the `classification_fields` as `"label"`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "mnist", + format="parquet", + classification_fields="label", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +The same exact syntax works for the `CIFAR-10 `_ +and `FashionMNIST `_ datasets, +which are also available on the Hub. In fact, you can load any of the following +classification datasets from the Hub using the same syntax, just by changing the +`repo_id`: + +- `CIFAR-10 `_ (use `"cifar10"`) +- `ImageNet `_ (use `"imagenet-1k"`) +- `FashionMNIST `_ (use `"fashion_mnist"`) +- `Tiny ImageNet `_ (use `"zh-plus/tiny-imagenet"`) +- `Food-101 `_ (use `"food101"`) +- `Dog Food `_ (use `"sasha/dog-food"`) +- `ImageNet-Sketch `_ (use `"imagenet_sketch"`) +- `Oxford Flowers `_ (use `"nelorth/oxford-flowers"`) +- `Cats vs. Dogs `_ (use `"cats_vs_dogs"`) +- `ObjectNet-1.0 `_ (use `"timm/objectnet"`) + +A very similar syntax can be used to load classification datasets that contain +*multiple* classification fields, such as +`CIFAR-100 `_ and the +`WikiArt `_ dataset. For example, +to load the CIFAR-100 dataset, you can specify the `classification_fields` as +`["coarse_label", "fine_label"]`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "cifar100", + format="parquet", + classification_fields=["coarse_label", "fine_label"], + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +To load the `WikiArt `_ dataset, +you can specify the `classification_fields` as `["artist", "genre", "style"]`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "huggan/wikiart", + format="parquet", + classification_fields=["artist", "genre", "style"], + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +As touched upon earlier, you can also load a classification *subset* of a +dataset. For example, to load the `cropped_digits` subset of the +`Street View House Numbers `_ dataset: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "svhn", + format="parquet", + classification_fields="label", + subsets="cropped_digits", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +**Detection Datasets** + +Loading detection datasets from the Hub is just as easy. For example, to load +the `MS COCO `_ +dataset, you can specify the `detection_fields` as `"objects"`, which is the +standard column name for detection features in Hugging Face datasets: + +.. 
code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "detection-datasets/coco", + format="parquet", + detection_fields="objects", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +The same syntax works for many other popular detection datasets on the Hub, +including: + +- `CPPE - 5 `_ (use `"cppe-5"`) +- `WIDER FACE `_ (use `"wider_face"`) +- `License Plate Object Detection `_ + (use `"keremberke/license-plate-object-detection"`) +- `Aerial Sheep Object Detection `_ + (use `"keremberke/aerial-sheep-object-detection"`) + +Some detection datasets have their detections stored under a column with a +different name. For example, the `full_numbers` subset of the +`Street View House Numbers `_ dataset +stores its detections under the column `digits`. To load this subset, you can +specify the `detection_fields` as `"digits"`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "svhn", + format="parquet", + detection_fields="digits", + subsets="full_numbers", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +.. note:: + + Not *all* detection datasets on the Hub are stored in a format that is + currently supported by FiftyOne. For instance, the + `Fashionpedia `_ + dataset has detections stored in Pascal VOC format, which is not the `standard + Hugging Face format `_. + +**Segmentation Datasets** + +Loading segmentation datasets from the Hub is also a breeze. For example, to +load the "instance_segmentation" subset from +`SceneParse150 `_, all you +need to do is specify the `mask_fields` as `"annotation"`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "scene_parse150", + format="parquet", + subsets="instance_segmentation", + mask_fields="annotation", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +Many other segmentation datasets on the Hub can be loaded in the same way, such +as `ADE 20K Tiny `_: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "nateraw/ade20k-tiny", + format="parquet", + mask_fields="label", + ) + + # only 20 samples in the dataset + + session = fo.launch_app(dataset) + +In other cases, because there are now *multiple* image columns — one for the +sample image and one for the mask — the naming convention for the dataset might +be different, and you may need to explicitly specify the `filepath`. For +example, to load the +`Sidewalk Semantic `_ +dataset: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + # Note: you need access to the dataset to load it! + + dataset = fouh.load_from_hub( + "segments/sidewalk-semantic", + format="parquet", + filepath="pixel_values", + mask_fields="label", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +.. note:: + + Once you have the dataset loaded into FiftyOne, you may want to set the + dataset's `mask targets `_ to specify the names of + the classes represented in the segmentation masks. + +**Unlabelled Image Datasets** + +Some datasets on the Hub contain images and metadata in the form of features, +but do not explicitly contain classification, detection, or segmentation labels. +This is common for text-to-image tasks, as well as captioning and visual question +answering tasks. These datasets can also be converted and loaded into FiftyOne! 
+Once the dataset is loaded into FiftyOne, you can process the data and generate +labels for whatever tasks you are interested in. + +Let's look at a few examples: + +For `DiffusionDB `_, you +can load the dataset as follows: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "poloclub/diffusiondb", + format="parquet", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +Here are some other popular datasets on the Hub that can be loaded following the +same syntax: + +- `Nouns `_: (use `"m1guelpf/nouns"`) +- `New Yorker Caption Contest `_: + (use `"jmhessel/newyorker_caption_contest"`) +- `Captcha Dataset `_: + (use `"project-sloth/captcha-images"`) +- `MathVista `_: (use `"AI4Math/MathVista"`) +- `TextVQA `_: (use `"textvqa"`) +- `VQA-RAD `_: (use `"flaviagiammarino/vqa-rad"`) +- `ScienceQA `_: (use `"derek-thomas/ScienceQA"`) +- `PathVQA `_: (use `"flaviagiammarino/path-vqa"`) + +Many other popular datasets on the Hub can be loaded in the same way, with slight +modifications to `filepath` or other arguments as needed. Here are a few examples: + +For `COYO-700M `_, we just +need to specify the `filepath` as `"url"`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "kakaobrain/coyo-700m", + format="parquet", + filepath="url", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +For `RedCaps `_, we instead use +`"image_url"` as the `filepath`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "red_caps", + format="parquet", + filepath="image_url", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +For `MMMU `_ +(A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for +Expert AGI), we use `"image_1"` as the `filepath`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "MMMU/MMMU", + format="parquet", + filepath="image_1", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +.. _huggingface-hub-load-dataset-advanced-examples: + +Advanced examples +^^^^^^^^^^^^^^^^^ + +The :func:`load_from_hub() ` function +also allows us to load datasets in much more complex formats, as well as with +more advanced configurations. Let's walk through a few examples to show you how +to leverage the full power of FiftyOne's Hugging Face Hub integration. + +**Loading Datasets from Revisions** + +When you load a dataset from the Hugging Face Hub, you are loading the latest +revision of the dataset. However, you can also load a specific revision of the +dataset by specifying the `revision` argument. For example, to load the last +revision of DiffusionDB before NSFW scores were added, you can specify this via: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "poloclub/diffusiondb", + format="parquet", + subset="2m_random_1k", ## just one of the subsets + max_samples=1000, + revision="5fa48ba66a44822d82d024d195fbe918e6c42ca6", + ) + + session = fo.launch_app(dataset) + +**Loading Datasets with Multiple Media Fields** + +Some datasets on the Hub contain multiple media fields for each sample. Take +`MagicBrush `_ for example, which +contains a `"source_img"` and a `"target_img"` for each sample, in addition +to a segmentation mask denoting the area of the source image to be modified. 
To +load this dataset, you can specify the `filepath` as `"source_img"` and the +target image via `additional_media_fields`. Because this is getting a bit more +complex, we'll create a local yaml config file to specify the dataset format: + +.. code-block:: yaml + + format: ParquetFilesDataset + name: magicbrush + filepath: source_img + additional_media_fields: + target_img: target_img + mask_fields: mask_img + +Now, you can load the dataset using the local yaml config file: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "osunlp/MagicBrush", + config_file="/path/to/magicbrush.yml", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +**Customizing the Download Process** + +When loading datasets from the Hub, you can customize the download process by +specifying the `batch_size`, `num_workers`, and `overwrite` arguments. For +example, to download the `full_numbers` subset of the `Street View House Numbers +`_ dataset with a batch size of 50 and 4 +workers, you can do the following: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "svhn", + format="parquet", + detection_fields="digits", + subsets="full_numbers", + max_samples=1000, + batch_size=50, + num_workers=4, + ) + + session = fo.launch_app(dataset) + +**Loading Private or Gated Datasets** + +Like public datasets, you can also load private or gated datasets from the Hub, +as long as you have the necessary permissions. If your Hugging Face token is +set as an environment variable `HF_TOKEN`, this is as simple as specifying the +`repo_id` of the dataset. If you don't have your token set, or you need to use +a specific token for a specific dataset, you can specify the `token` argument. +You can do so following this recipe: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "my-private-dataset-repo-id", + token="", + ... + ) + + session = fo.launch_app(dataset) diff --git a/fiftyone/utils/huggingface.py b/fiftyone/utils/huggingface.py new file mode 100644 index 0000000000..2456ae1631 --- /dev/null +++ b/fiftyone/utils/huggingface.py @@ -0,0 +1,1203 @@ +""" +Utilities for working with `Hugging Face `_. + +| Copyright 2017-2024, Voxel51, Inc. 
+| `voxel51.com `_ +| +""" + +from concurrent.futures import ThreadPoolExecutor +from contextlib import contextmanager +import logging +import os +from packaging.requirements import Requirement +import requests + +import yaml + +import eta.core.utils as etau + +import fiftyone as fo +import fiftyone.constants as foc +from fiftyone.core.config import Config +import fiftyone.core.dataset as fod +import fiftyone.core.labels as fol +import fiftyone.core.metadata as fom +from fiftyone.core.sample import Sample +import fiftyone.core.utils as fou +import fiftyone.types as fot + +hfh = fou.lazy_import( + "huggingface_hub", + callback=lambda: fou.ensure_package("huggingface_hub>=0.20.0"), +) + +hfu = fou.lazy_import( + "huggingface_hub.utils", + callback=lambda: fou.ensure_package("huggingface_hub>=0.20.0"), +) + + +DATASETS_SERVER_URL = "https://datasets-server.huggingface.co" +DEFAULT_MEDIA_TYPE = "image" +DATASET_METADATA_FILENAMES = ("fiftyone.yml", "fiftyone.yaml") +DATASETS_MAX_BATCH_SIZE = 100 +DEFAULT_IMAGE_FILEPATH_FEATURE = "image" +FIFTYONE_BUILTIN_FIELDS = ("id", "filepath", "tags", "metadata") +SUPPORTED_DTYPES = ( + "int8", + "int16", + "int32", + "int64", + "float16", + "float32", + "float64", + "bool", + "string", +) + +logger = logging.getLogger(__name__) + + +def push_to_hub( + dataset, + repo_name, + description=None, + license=None, + tags=None, + private=False, + exist_ok=False, + dataset_type=None, + min_fiftyone_version=None, + label_field=None, + frame_labels_field=None, + token=None, + preview_path=None, + **data_card_kwargs, +): + """Push a FiftyOne dataset to the Hugging Face Hub. + + Args: + dataset: a FiftyOne dataset + repo_name: the name of the dataset repo to create. The repo ID will be + ``{your_username}/{repo_name}`` + description (None): a description of the dataset + license (None): the license of the dataset + tags (None): a list of tags for the dataset + private (True): whether the repo should be private + exist_ok (False): if True, do not raise an error if repo already exists. + dataset_type (None): the type of the dataset to create + min_fiftyone_version (None): the minimum version of FiftyOne required + to load the dataset. For example ``"0.23.0"``. + label_field (None): controls the label field(s) to export. Only + applicable to labeled datasets. Can be any of the following: + + - the name of a label field to export + - a glob pattern of label field(s) to export + - a list or tuple of label field(s) to export + - a dictionary mapping label field names to keys to use when + constructing the label dictionaries to pass to the exporter + frame_labels_field (None): controls the frame label field(s) to export. + The "frames." prefix is optional. Only applicable to labeled video + datasets. Can be any of the following: + + - the name of a frame label field to export + - a glob pattern of frame label field(s) to export + - a list or tuple of frame label field(s) to export + - a dictionary mapping frame label field names to keys to use when + constructing the frame label dictionaries to pass to the exporter + token (None): a Hugging Face API token to use. May also be provided via + the ``HF_TOKEN`` environment variable + preview_path (None): a path to a preview image or video to display on + the readme of the dataset repo. 
+ data_card_kwargs: additional keyword arguments to pass to the + `DatasetCard` constructor + """ + if dataset_type is None: + dataset_type = fot.FiftyOneDataset + + if tags is not None: + if isinstance(tags, str): + tags = [t.strip() for t in tags.split(",")] + tags.extend(_get_dataset_tags(dataset)) + tags = sorted(tags) + else: + tags = _get_dataset_tags(dataset) + + # do this now in case HF login fails before we do anything expensive + hf_username = hfh.whoami(token=token)["name"] + repo_id = hf_username + "/" + repo_name + + with etau.TempDir() as tmp_dir: + config_filepath = os.path.join(tmp_dir, "fiftyone.yml") + + dataset.export( + export_dir=tmp_dir, + dataset_type=dataset_type, + label_field=label_field, + frame_labels_field=frame_labels_field, + export_media=True, + ) + + _populate_config_file( + config_filepath, + dataset, + dataset_type=dataset_type, + description=description, + license=license, + tags=tags, + min_fiftyone_version=min_fiftyone_version, + ) + + ## Create the dataset repo + hfh.create_repo( + repo_id, + token=token, + repo_type="dataset", + private=private, + exist_ok=exist_ok, + ) + + ## Upload the dataset to the repo + api = hfh.HfApi(token=token) + with _no_progress_bars(): + api.upload_folder( + folder_path=tmp_dir, + repo_id=repo_id, + repo_type="dataset", + ) + + # Upload preview image or video if provided + if preview_path is not None: + abs_preview_path = os.path.abspath(preview_path) + if not os.path.exists(abs_preview_path): + logger.warning( + f"Preview path {abs_preview_path} does not exist" + ) + + ext = os.path.splitext(abs_preview_path)[1] + path_in_repo = "dataset_preview" + ext + + try: + api.upload_file( + path_or_fileobj=abs_preview_path, + path_in_repo=path_in_repo, + repo_id=repo_id, + repo_type="dataset", + commit_message="Add preview", + ) + except: + logger.warning( + f"Failed to upload preview media file {abs_preview_path}" + ) + + # If fails, set preview to None + preview_path = None + + path_in_repo = path_in_repo if preview_path is not None else None + + ## Create the dataset card + card = _create_dataset_card( + repo_id, + dataset, + description=description, + license=license, + tags=tags, + preview_path=path_in_repo, + **data_card_kwargs, + ) + card.push_to_hub(repo_id) + + +def load_from_hub( + repo_id, + revision=None, + split=None, + splits=None, + subset=None, + subsets=None, + max_samples=None, + batch_size=None, + num_workers=None, + overwrite=False, + persistent=False, + name=None, + token=None, + config_file=None, + **kwargs, +): + """Loads a dataset from the Hugging Face Hub into FiftyOne. + + Args: + repo_id: the Hugging Face Hub identifier of the dataset + revision (None): the revision of the dataset to load + split (None): the split of the dataset to load + splits (None): the splits of the dataset to load + subset (None): the subset of the dataset to load + subsets (None): the subsets of the dataset to load + max_samples (None): the maximum number of samples to load + batch_size (None): the batch size to use when loading samples + num_workers (None): a suggested number of threads to use when + downloading media + overwrite (True): whether to overwrite an existing dataset with the + same name + persistent (False): whether the dataset should be persistent + name (None): an optional name to give the dataset + token (None): a Hugging Face API token to use. 
May also be provided via + the ``HF_TOKEN`` environment variable + config_file (None): the path to a config file on disk specifying how to + load the dataset if the repo has no ``fiftyone.yml`` file + **kwargs: keyword arguments specifying config parameters to load the + dataset if the repo has no ``fiftyone.yml`` file + + Returns: + a :class:`fiftyone.core.dataset.Dataset` + """ + kwargs["splits"] = splits + kwargs["split"] = split + kwargs["subsets"] = subsets + kwargs["subset"] = subset + kwargs["max_samples"] = max_samples + kwargs["batch_size"] = batch_size + kwargs["num_workers"] = num_workers + kwargs["overwrite"] = overwrite + kwargs["persistent"] = persistent + kwargs["name"] = name + kwargs["token"] = token + kwargs["config_file"] = config_file + + config = _get_dataset_metadata(repo_id, revision=revision, **kwargs) + if config is None: + raise ValueError(f"Could not find fiftyone metadata for {repo_id}") + + return _load_dataset_from_config(config, **kwargs) + + +class HFHubDatasetConfig(Config): + """Config for a Hugging Face Hub dataset. + + Args: + name: the name of the dataset + repo_type: the type of the repository + repo_id: the identifier of the repository + revision: the revision of the dataset + filename: the name of the file + format: the format of the dataset + tags: the tags of the dataset + license: the license of the dataset + description: the description of the dataset + fiftyone: the fiftyone version requirement of the dataset + """ + + def __init__(self, **kwargs): + ## Internals + self._repo_type = kwargs.get("repo_type", None) + self._repo_id = kwargs.get("repo_id", None) + self._revision = kwargs.get("revision", None) + self._filename = kwargs.get("filename", None) + self._format = kwargs.get("format", None) + + ## Dataset metadata + self.tags = kwargs.get("tags", []) + if isinstance(self.tags, str): + self.tags = [t.strip() for t in self.tags.split(",")] + elif isinstance(self.tags, list): + self.tags = [t.strip() for t in self.tags] + self.license = kwargs.get("license", None) + self.description = kwargs.get("description", None) + self._get_fiftyone_version(kwargs) + + def _get_fiftyone_version(self, kwargs): + if kwargs.get("fiftyone", None) is None: + self.version = None + else: + version = kwargs["fiftyone"].get("version", None) + if version is None: + self.version = None + else: + self.version = f"fiftyone{version}" + + +DATASET_CONTENT_TEMPLATE = """ + +{preview} + +This is a [FiftyOne](https://github.com/voxel51/fiftyone) dataset with {num_samples} samples. 
+ +## Installation + +If you haven't already, install FiftyOne: + +```bash +pip install -U fiftyone +``` + +## Usage + +```python +import fiftyone as fo +import fiftyone.utils.huggingface as fouh + +# Load the dataset +# Note: other available arguments include 'split', 'max_samples', etc +dataset = fouh.load_from_hub("{repo_id}") + +# Launch the App +session = fo.launch_app(dataset) +``` +""" + + +def _populate_config_file( + config_filepath, + dataset, + dataset_type=None, + description=None, + license=None, + tags=None, + min_fiftyone_version=None, +): + config_dict = { + "name": dataset.name, + "format": dataset_type.__name__, + "tags": tags, + } + + if min_fiftyone_version is not None: + version_val = f">={min_fiftyone_version}" + config_dict["fiftyone"] = {"version": version_val} + + if description is not None: + config_dict["description"] = description + + if license is not None: + config_dict["license"] = license + + with open(config_filepath, "w") as f: + yaml.dump(config_dict, f) + + +def _get_dataset_tasks(dataset): + def _has_label(ftype): + return bool(dataset.get_field_schema(embedded_doc_type=ftype).keys()) + + tasks = [] + if _has_label(fol.Classification) or _has_label(fol.Classifications): + tasks.append("image-classification") + if _has_label(fol.Detections): + tasks.append("object-detection") + if _has_label(fol.Segmentation): + tasks.append("semantic-segmentation") + return tasks + + +def _get_dataset_tags(dataset): + tags = ["fiftyone"] + tags.append(dataset.media_type) + tags.extend(_get_dataset_tasks(dataset)) + tags.extend(dataset.tags) + return sorted(list(set(tags))) + + +def _generate_dataset_summary(repo_id, dataset, preview_path): + format_kwargs = { + "repo_id": repo_id, + "num_samples": len(dataset), + "preview": "", + } + if preview_path is not None: + format_kwargs["preview"] = f"\n![image/png]({preview_path})\n" + return DATASET_CONTENT_TEMPLATE.format(**format_kwargs) + + +def _create_dataset_card( + repo_id, + dataset, + tags=None, + license=None, + preview_path=None, + **dataset_card_kwargs, +): + card_inputs = { + "language": "en", + "annotations_creators": [], + "task_categories": _get_dataset_tasks(dataset), + "task_ids": [], + "pretty_name": dataset.name, + "license": license, + "tags": tags, + } + + for key, value in dataset_card_kwargs.items(): + card_inputs[key] = value + + dataset_summary = _generate_dataset_summary(repo_id, dataset, preview_path) + if dataset_summary is not None: + card_inputs["dataset_summary"] = dataset_summary + + card_data = hfh.DatasetCardData(**card_inputs) + return hfh.DatasetCard.from_template(card_data) + + +def _parse_split_kwargs(**kwargs): + splits = kwargs.get("splits", None) + split = kwargs.get("split", None) + if splits is None and split is not None: + splits = split + + if isinstance(splits, str): + if "," in splits: + splits = splits.split(",") + else: + splits = [splits] + return splits + + +def _parse_subset_kwargs(**kwargs): + subsets = kwargs.get("subsets", None) + subset = kwargs.get("subset", None) + if subsets is None and subset is not None: + subsets = subset + + if isinstance(subsets, str): + subsets = [subsets] + return subsets + + +@contextmanager +def _no_progress_bars(): + pbs_disabled = hfu.are_progress_bars_disabled() + hfu.disable_progress_bars() + try: + yield + finally: + # Restore the original state + if not pbs_disabled: + hfu.enable_progress_bars() + + +class HFHubParquetFilesDatasetConfig(HFHubDatasetConfig): + """Config for a Hugging Face Hub dataset that is stored as parquet files. 
+ + Args: + name: the name of the dataset + repo_type: the type of the repository + repo_id: the identifier of the repository + revision: the revision of the dataset + filename: the name of the file + format: the format of the dataset + tags: the tags of the dataset + license: the license of the dataset + description: the description of the dataset + fiftyone: the fiftyone version requirement of the dataset + label_fields: the label fields of the dataset + media_type: the media type of the dataset + default_media_fields: the default media fields of the dataset + additional_media_fields: the additional media fields of the dataset + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.media_type = kwargs.get("media_type", DEFAULT_MEDIA_TYPE) + + self._build_name(kwargs) + self._build_media_fields_dict(kwargs) + self._build_label_fields_dict(kwargs) + self._build_allowed_splits(kwargs) + self._build_allowed_subsets(kwargs) + + def _build_name(self, kwargs): + self.name = kwargs.get("name", None) + if self.name is None: + self.name = kwargs.get("repo_id", None) + + def _build_allowed_splits(self, kwargs): + # Author specifies what splits are compatible with this config + self._allowed_splits = _parse_split_kwargs(**kwargs) + + def _build_allowed_subsets(self, kwargs): + # Author specifies what subsets are compatible with this config + self._allowed_subsets = _parse_subset_kwargs(**kwargs) + + def _build_media_fields_dict(self, kwargs): + media_fields_dict = kwargs.get("default_media_fields", {}) + if media_fields_dict.get("filepath", None) is None: + media_fields_dict["filepath"] = kwargs.get( + "filepath", DEFAULT_IMAGE_FILEPATH_FEATURE + ) + if ( + media_fields_dict.get("thumbnail_path", None) is None + and kwargs.get("thumbnail_path", None) is not None + ): + media_fields_dict["thumbnail_path"] = kwargs["thumbnail_path"] + + additional_media_fields = kwargs.get("additional_media_fields", {}) + media_fields_dict.update(additional_media_fields) + self.media_fields = media_fields_dict + + def _build_label_fields_dict(self, kwargs): + self.label_fields = kwargs.get("label_fields", {}) + label_types = ("classification", "detection", "mask") + for label_type in label_types: + label_fields = kwargs.get(f"{label_type}_fields", None) + if label_fields is not None: + if isinstance(label_fields, str): + self.label_fields[label_type] = label_fields.split(",") + elif isinstance(label_fields, list): + self.label_fields[label_type] = label_fields + + +def _parse_format_string(format_str): + if "parquet" in format_str.lower(): + return "ParquetFilesDataset" + else: + return format_str + + +def _build_config(config_dict): + format = config_dict.get("format", None) + if format is None: + raise ValueError("Dataset config must have a format key") + + format = _parse_format_string(format) + if format == "ParquetFilesDataset": + return HFHubParquetFilesDatasetConfig(**config_dict) + else: + return HFHubDatasetConfig(**config_dict) + + +def _get_headers(**kwargs): + token = kwargs.get("token", None) or os.getenv("HF_TOKEN") + if token is not None: + return {"Authorization": f"Bearer {token}"} + return None + + +def _get_dataset_metadata(repo_id, revision=None, token=None, **kwargs): + common_kwargs = dict(repo_type="dataset", revision=revision) + config_file = kwargs.get("config_file", None) + + if config_file is not None: + config_file = os.path.abspath(config_file) + filename = os.path.basename(config_file) + all_kwargs = dict(repo_id=repo_id, filename=filename, **common_kwargs) + else: + 
api = hfh.HfApi(token=token) + for filename in DATASET_METADATA_FILENAMES: + if api.file_exists(repo_id, filename, **common_kwargs): + all_kwargs = dict( + repo_id=repo_id, filename=filename, **common_kwargs + ) + logger.info( + f"Downloading config file {filename} from {repo_id}" + ) + config_file = hfh.hf_hub_download(**all_kwargs) + break + + if config_file is None and "format" not in kwargs: + return None + + if config_file is None: + config_dict = kwargs + config_dict.update(**common_kwargs) + config_dict["repo_id"] = repo_id + else: + with open(config_file, "r") as f: + config_dict = yaml.safe_load(f) + config_dict.update(**all_kwargs) + + return _build_config(config_dict) + + +def _ensure_dataset_compatibility(config): + req_str = config.version + if req_str is None: + return + + try: + req = Requirement(req_str) + except: + logger.warning( + f"Unable to parse dataset {config.name}'s fiftyone version requirement {req_str}" + ) + return + + if not req.specifier.contains(foc.VERSION): + logger.warning( + f"Dataset {config.name} requires {req_str} but you are running fiftyone=={foc.VERSION}" + ) + + +def _get_download_dir(repo_id, split=None, subset=None, **kwargs): + path_walk = [fo.config.default_dataset_dir, "huggingface", "hub", repo_id] + + ## Note: for now don't support multiple revisions storage + if subset is not None: + path_walk.append(subset) + if split is not None: + path_walk.append(split) + + download_dir = os.path.join(*path_walk) + etau.ensure_dir(download_dir) + + return download_dir + + +def _get_split_subset_pairs(config, **kwargs): + repo_id = config._repo_id + revision = config._revision + api_url = ( + f"{DATASETS_SERVER_URL}/splits?dataset={repo_id.replace('/', '%2F')}" + ) + if revision is not None: + api_url += f"&revision={revision}" + headers = _get_headers(**kwargs) + response = requests.get(api_url, headers=headers).json()["splits"] + return [(ss["split"], ss["config"]) for ss in response] + + +def _load_dataset_from_config(config, **kwargs): + _ensure_dataset_compatibility(config) + if isinstance(config, HFHubParquetFilesDatasetConfig): + return _load_parquet_files_dataset_from_config(config, **kwargs) + else: + return _load_fiftyone_dataset_from_config(config, **kwargs) + + +def _get_allowed_splits(config, **kwargs): + user_splits = _parse_split_kwargs(**kwargs) + author_splits = config._allowed_splits + + if not user_splits and not author_splits: + return None + else: + return user_splits if user_splits else author_splits + + +def _get_allowed_subsets(config, **kwargs): + user_subsets = _parse_subset_kwargs(**kwargs) + author_subsets = config._allowed_subsets + if not user_subsets and not author_subsets: + return None + else: + return user_subsets if user_subsets else author_subsets + + +def _is_valid_split_subset_pair( + split, subset, allowed_splits, allowed_subsets +): + if allowed_splits is not None and split not in allowed_splits: + return False + if allowed_subsets is not None and subset not in allowed_subsets: + return False + return True + + +def _get_label_field_names_and_types(config): + label_field_names, label_types = [], [] + label_fields = config.label_fields + if label_fields is None: + return label_field_names, label_types + + for label_type, fields in label_fields.items(): + if isinstance(fields, str): + label_field_names.append(fields) + label_types.append(label_type) + elif isinstance(fields, list): + label_field_names.extend(fields) + label_types.extend([label_type] * len(fields)) + + return label_field_names, label_types + + +def 
_get_parquet_dataset_features(
+    repo_id, split, subset, revision=None, **kwargs
+):
+    api_url = f"{DATASETS_SERVER_URL}/info?dataset={repo_id.replace('/', '%2F')}&config={subset}&split={split}"
+    if revision is not None:
+        api_url += f"&revision={revision}"
+
+    headers = _get_headers(**kwargs)
+    response = requests.get(api_url, headers=headers)
+    features = response.json()["dataset_info"]["features"]
+    return features
+
+
+def _get_num_rows(repo_id, split, subset, revision=None, **kwargs):
+    api_url = f"{DATASETS_SERVER_URL}/info?dataset={repo_id.replace('/', '%2F')}&config={subset}&split={split}"
+    if revision is not None:
+        api_url += f"&revision={revision}"
+
+    headers = _get_headers(**kwargs)
+    response = requests.get(api_url, headers=headers)
+    splits = response.json()["dataset_info"]["splits"]
+    return splits[split]["num_examples"]
+
+
+def _build_rows_request_url(
+    repo_id, split=None, subset="default", revision=None, offset=0, length=100
+):
+    url = f"{DATASETS_SERVER_URL}/rows?dataset={repo_id.replace('/', '%2F')}"
+    if split is not None:
+        url += f"&split={split}"
+    if subset is not None:
+        url += f"&config={subset}"
+    if revision is not None:
+        url += f"&revision={revision}"
+    url += f"&offset={offset}&length={length}"
+    return url
+
+
+def _get_rows(
+    repo_id,
+    split,
+    subset,
+    start_index=0,
+    end_index=100,
+    revision=None,
+    **kwargs,
+):
+    length = end_index - start_index
+    url = _build_rows_request_url(
+        repo_id, split, subset, revision, offset=start_index, length=length
+    )
+    headers = _get_headers(**kwargs)
+    response = requests.get(url, headers=headers)
+    return response.json()["rows"]
+
+
+def _download_image(url_and_filepath):
+    url, filepath = url_and_filepath
+    try:
+        if not os.path.exists(filepath):
+            with requests.get(url, stream=True) as r:
+                r.raise_for_status()
+                with open(filepath, "wb") as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        f.write(chunk)
+    except Exception as e:
+        logger.warning(f"Failed to download image from {url}: {e}")
+
+
+def _download_images(urls_and_filepaths, num_workers):
+    if num_workers <= 1:
+        for url_and_filepath in urls_and_filepaths:
+            _download_image(url_and_filepath)
+    else:
+        with ThreadPoolExecutor(max_workers=num_workers) as executor:
+            executor.map(_download_image, urls_and_filepaths)
+
+
+def _build_media_field_converter(
+    media_field_key, media_field_name, feature, download_dir
+):
+    def convert_media_field(sample_dict, row):
+        row_content = row["row"]
+        row_index = row["row_idx"]
+
+        filename = f"{media_field_name}_{row_index}.png"
+        filepath = os.path.join(download_dir, filename)
+
+        if feature["_type"] == "Image":
+            url = row_content[media_field_name]["src"]
+        else:
+            url = row_content[media_field_name]
+
+        sample_dict[media_field_key] = filepath
+
+        return (url, filepath)
+
+    return convert_media_field
+
+
+def _get_image_shape(image_path):
+    metadata = fom.ImageMetadata.build_for(image_path)
+    return (metadata.width, metadata.height)
+
+
+def _get_detection_label_field_name(feature):
+    for key, value in feature["feature"].items():
+        if value["_type"] == "ClassLabel":
+            return key
+    return None
+
+
+def _get_bounding_box_field_name(feature):
+    for key, value in feature["feature"].items():
+        if value["_type"] == "Sequence" and value["length"] == 4:
+            return key
+    return None
+
+
+def _convert_bounding_box(hf_bbox, img_size):
+    # Pass through boxes that are already in relative [0, 1] coordinates;
+    # otherwise treat them as absolute pixel coords and normalize by image size
+    x, y, w, h = hf_bbox
+    if all([0 <= c <= 1 for c in [x, y, w, h]]):
+        return hf_bbox
+    else:
+        return [
+            x / img_size[0],
+            y / img_size[1],
+            w / img_size[0],
+            h / img_size[1],
+        ]
+
+
+def _build_label_field_converter(
+    field_name, field_type, feature, config, download_dir
+):
+    def convert_classification_field(sample_dict, row):
+        row_content = row["row"]
+        label_index = row_content[field_name]
+        if label_index == -1:
+            return
+        label = feature["names"][label_index]
+        if isinstance(label, tuple):
+            label = label[0]
+        sample_dict[field_name] = fol.Classification(label=str(label))
+
+    def convert_detection_field(sample_dict, row):
+        img_w, img_h = _get_image_shape(sample_dict["filepath"])
+
+        feature_content = row["row"][field_name]
+        det_keys = list(feature["feature"].keys())
+        bbox_key = _get_bounding_box_field_name(feature)
+        det_label_key = _get_detection_label_field_name(feature)
+
+        num_dets = len(feature_content[det_label_key])
+
+        detections = []
+        for i in range(num_dets):
+            label = feature_content[det_label_key][i]
+            bounding_box = feature_content[bbox_key][i]
+
+            bounding_box = _convert_bounding_box(bounding_box, (img_w, img_h))
+            det_dict = {
+                "label": feature["feature"][det_label_key]["names"][label],
+                "bounding_box": bounding_box,
+            }
+            for key in det_keys:
+                if (
+                    key not in [bbox_key, det_label_key]
+                    and key not in FIFTYONE_BUILTIN_FIELDS
+                ):
+                    det_dict[key] = feature_content[key][i]
+
+            detections.append(fol.Detection(**det_dict))
+
+        sample_dict[field_name] = fol.Detections(detections=detections)
+
+    def convert_mask_field(sample_dict, row):
+        row_content = row["row"]
+        row_index = row["row_idx"]
+        filename = f"{field_name}_{row_index}.png"
+        filepath = os.path.join(download_dir, filename)
+
+        if feature["_type"] == "Image":
+            url = row_content[field_name]["src"]
+        else:
+            url = row_content[field_name]
+
+        sample_dict[field_name] = fol.Segmentation(mask_path=filepath)
+
+        return (url, filepath)
+
+    def convert_label_field(sample_dict, row):
+        # fallback: unrecognized label types are ignored
+        pass
+
+    if field_type == "classification":
+        return convert_classification_field
+    elif "detection" in field_type:
+        return convert_detection_field
+    elif "mask" in field_type:
+        return convert_mask_field
+
+    return convert_label_field
+
+
+def _build_dtype_field_converter(field_name, feature, config):
+    def dont_convert(sample_dict, row):
+        pass
+
+    def convert_dtype_field(sample_dict, row):
+        row_content = row["row"]
+        fo_field_name = field_name
+        if field_name in FIFTYONE_BUILTIN_FIELDS:
+            fo_field_name = f"hf_{field_name}"
+        sample_dict[fo_field_name] = row_content[field_name]
+
+    if (
+        feature["_type"] == "Value"
+        and feature["dtype"] not in SUPPORTED_DTYPES
+    ):
+        return dont_convert
+    elif (
+        feature["_type"] == "Sequence"
+        and feature["feature"]["dtype"] not in SUPPORTED_DTYPES
+    ):
+        logger.warning(
+            f"Field {field_name} has dtype {feature['feature']['dtype']} which is not supported by fiftyone"
+        )
+        return dont_convert
+    else:
+        return convert_dtype_field
+
+
+def _build_parquet_to_fiftyone_conversion(config, split, subset, **kwargs):
+    feature_converters = {}
+
+    features = _get_parquet_dataset_features(
+        config._repo_id, split, subset, revision=config._revision, **kwargs
+    )
+
+    media_field_names = list(set(config.media_fields.values()))
+    media_field_keys = list(config.media_fields.keys())
+    lf_names, lf_types = _get_label_field_names_and_types(config)
+
+    download_dir = _get_download_dir(
+        config._repo_id, split=split, subset=subset, **kwargs
+    )
+
+    ## Media field handling
+    for media_field_key in media_field_keys:
+        media_field_name = config.media_fields[media_field_key]
+        feature = features[media_field_name]
+        feature_converters[media_field_name] = _build_media_field_converter(
media_field_key, media_field_name, feature, download_dir + ) + + ## Label field handling + for lfn, lft in zip(lf_names, lf_types): + feature = features[lfn] + feature_converters[lfn] = _build_label_field_converter( + lfn, lft.replace("_fields", ""), feature, config, download_dir + ) + + for feature_name, feature in features.items(): + if feature_name in media_field_names or feature_name in lf_names: + continue + feature_converters[feature_name] = _build_dtype_field_converter( + feature_name, feature, config + ) + + return feature_converters + + +def _add_parquet_subset_to_dataset(dataset, config, split, subset, **kwargs): + feature_converters = _build_parquet_to_fiftyone_conversion( + config, split, subset + ) + + num_rows = _get_num_rows( + config._repo_id, split, subset, revision=config._revision, **kwargs + ) + max_samples = kwargs.get("max_samples", None) + if max_samples is not None: + num_rows = min(num_rows, max_samples) + + num_workers = fou.recommend_thread_pool_workers( + kwargs.get("num_workers", None) + ) + + batch_size = kwargs.get("batch_size", None) + if batch_size is None: + batch_size = DATASETS_MAX_BATCH_SIZE + + if batch_size > DATASETS_MAX_BATCH_SIZE: + logger.info( + f"Batch size {batch_size} is larger than the maximum batch size {DATASETS_MAX_BATCH_SIZE}. Using {DATASETS_MAX_BATCH_SIZE} instead" + ) + batch_size = DATASETS_MAX_BATCH_SIZE + + logger.info( + f"Downloading {num_rows} images from {config.name} ({split}, {subset})..." + ) + + tags = [split] + if subset != "default" and subset != config._repo_id: + tags.append(subset) + + with fou.ProgressBar(total=num_rows) as pb: + for start_idx in range(0, num_rows, batch_size): + urls_and_filepaths = [] + + end_idx = min(start_idx + batch_size, num_rows) + + rows = _get_rows( + config._repo_id, + split, + subset, + start_index=start_idx, + end_index=end_idx, + revision=config._revision, + ) + + samples = [] + for row in rows: + sample_dict = {} + for convert in feature_converters.values(): + res = convert(sample_dict, row) + if res is not None: + urls_and_filepaths.append(res) + + sample_dict["row_idx"] = row["row_idx"] + sample_dict["tags"] = tags + sample = Sample(**sample_dict) + samples.append(sample) + + dataset.add_samples(samples, progress=False) + + _download_images(urls_and_filepaths, num_workers) + + pb.update(count=len(samples)) + + +def _configure_dataset_media_fields(dataset, config): + media_fields = config.media_fields + media_field_keys = list(media_fields.keys()) + if len(media_field_keys) > 1: + dataset.app_config_media_fields = media_field_keys + if "thumbnail_path" in media_field_keys: + dataset.app_config.grid_media_field = "thumbnail_path" + dataset.save() + + +def _add_dataset_metadata(dataset, config): + dataset.tags = config.tags + description = config.description + if description is not None: + dataset.description = description + + dataset.info["source"] = "Hugging Face Hub" + dataset.info["repo_id"] = config._repo_id + if config.license is not None: + dataset.info["license"] = config.license + if config._revision is not None: + dataset.info["revision"] = config._revision + dataset.save() + + +def _resolve_dataset_name(config, **kwargs): + name = kwargs.get("name", None) + if name is None: + if hasattr(config, "name"): + name = config.name + else: + name = config._repo_id + return name + + +def _get_files_to_download(dataset): + filepaths = dataset.values("filepath") + filepaths = [fp for fp in filepaths if not os.path.exists(fp)] + return filepaths + + +def 
_load_fiftyone_dataset_from_config(config, **kwargs): + logger.info("Loading dataset") + + overwrite = kwargs.get("overwrite", False) + persistent = kwargs.get("persistent", False) + max_samples = kwargs.get("max_samples", None) + splits = _parse_split_kwargs(**kwargs) + + download_dir = _get_download_dir(config._repo_id, **kwargs) + + init_download_kwargs = { + "repo_id": config._repo_id, + "repo_type": "dataset", + "local_dir": download_dir, + } + + dataset_type_name = config._format.strip() + + if dataset_type_name == "FiftyOneDataset" and max_samples is not None: + # If the dataset is a FiftyOneDataset, download only the necessary files + with _no_progress_bars(): + hfh.snapshot_download( + **init_download_kwargs, + ignore_patterns="data/*", + ) + else: + with _no_progress_bars(): + hfh.snapshot_download( + **init_download_kwargs, + ) + + dataset_type = getattr( + __import__("fiftyone.types", fromlist=[dataset_type_name]), + dataset_type_name, + ) + + dataset_kwargs = { + "persistent": persistent, + "overwrite": overwrite, + "max_samples": max_samples, + "splits": splits, + "dataset_type": dataset_type, + } + + name = _resolve_dataset_name(config, **kwargs) + if name is not None: + dataset_kwargs["name"] = name + + dataset = fod.Dataset.from_dir(download_dir, **dataset_kwargs) + + if dataset_type_name != "FiftyOneDataset": + return dataset + + filepaths = _get_files_to_download(dataset) + if filepaths: + logger.info(f"Downloading {len(filepaths)} media files...") + filenames = [os.path.basename(fp) for fp in filepaths] + allowed_globs = ["data/" + fn for fn in filenames] + with _no_progress_bars(): + hfh.snapshot_download( + **init_download_kwargs, allow_patterns=allowed_globs + ) + return dataset + + +def _load_parquet_files_dataset_from_config(config, **kwargs): + logger.info("Loading parquet files dataset") + + allowed_splits = _get_allowed_splits(config, **kwargs) + allowed_subsets = _get_allowed_subsets(config, **kwargs) + + for key in ["splits", "split", "subsets", "subset"]: + if key in kwargs: + kwargs.pop(key) + + overwrite = kwargs.get("overwrite", False) + persistent = kwargs.get("persistent", False) + + split_subset_pairs = _get_split_subset_pairs(config, **kwargs) + + name_kwarg = kwargs.get("name", None) + if name_kwarg is not None: + name = name_kwarg + else: + name = config.name + max_samples = kwargs.get("max_samples", None) + if max_samples is not None: + name += f"-{max_samples}" + + dataset = fod.Dataset( + name=name, + persistent=persistent, + overwrite=overwrite, + ) + + for split, subset in split_subset_pairs: + if not _is_valid_split_subset_pair( + split, subset, allowed_splits, allowed_subsets + ): + continue + + _add_parquet_subset_to_dataset( + dataset, config, split, subset, **kwargs + ) + + _configure_dataset_media_fields(dataset, config) + _add_dataset_metadata(dataset, config) + return dataset
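+
+
+# Illustrative usage sketch (not part of the module code above): these private
+# helpers are meant to back a public loader entrypoint, assumed here to be
+# `load_from_hub()` defined elsewhere in this module, which builds a dataset
+# config from its kwargs and dispatches via `_load_dataset_from_config()`.
+# For a parquet-backed repo, such a call might look like:
+#
+#   import fiftyone.utils.huggingface as fouh
+#
+#   dataset = fouh.load_from_hub(
+#       "mnist",                        # repo_id on the Hugging Face Hub
+#       format="parquet",               # stream rows via the datasets server
+#       classification_fields="label",  # convert the `label` column to Classifications
+#       max_samples=100,                # cap the number of loaded rows
+#   )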