diff --git a/docs/source/images/integrations/hf_data_card_preview.jpg b/docs/source/images/integrations/hf_data_card_preview.jpg new file mode 100644 index 0000000000..12d717028f Binary files /dev/null and b/docs/source/images/integrations/hf_data_card_preview.jpg differ diff --git a/docs/source/images/integrations/hf_push_advanced_example.jpg b/docs/source/images/integrations/hf_push_advanced_example.jpg new file mode 100644 index 0000000000..a0a38d43c9 Binary files /dev/null and b/docs/source/images/integrations/hf_push_advanced_example.jpg differ diff --git a/docs/source/integrations/huggingface.rst b/docs/source/integrations/huggingface.rst index f8d89a571e..8b65bd3b3b 100644 --- a/docs/source/integrations/huggingface.rst +++ b/docs/source/integrations/huggingface.rst @@ -10,10 +10,19 @@ FiftyOne integrates natively with Hugging Face's you can load, fine-tune, and run inference with your favorite Transformers models on your FiftyOne datasets with just a few lines of code! -.. _huggingface-setup: +FiftyOne also integrates with the `Hugging Face Hub `_, +so you can push datasets to and load datasets from the Hub with ease. + +.. _huggingface-transformers: + +Transformers Library +____________________ + + +.. _huggingface-transformers-setup: Setup -_____ +----- To get started with `Transformers `_, just install the @@ -21,12 +30,13 @@ To get started with .. code-block:: shell - pip install transformers + pip install -U transformers + -.. _huggingface-inference: +.. _huggingface-transformers-inference: Inference -_________ +--------- All `Transformers models `_ @@ -47,10 +57,10 @@ on the following sample dataset: dataset = foz.load_zoo_dataset("quickstart", max_samples=25) dataset.select_fields().keep_fields() -.. _huggingface-image-classification: +.. _huggingface-transformers-image-classification: Image classification --------------------- +^^^^^^^^^^^^^^^^^^^^ You can pass `transformers` classification models directly to FiftyOne dataset's @@ -164,10 +174,10 @@ model's name or path as a keyword argument: session = fo.launch_app(dataset) -.. _huggingface-object-detection: +.. _huggingface-transformers-object-detection: Object detection ----------------- +^^^^^^^^^^^^^^^^ You can pass `transformers` detection models directly to your FiftyOne dataset's @@ -277,10 +287,10 @@ name or path as a keyword argument: session = fo.launch_app(dataset) -.. _huggingface-semantic-segmentation: +.. _huggingface-transformers-semantic-segmentation: Semantic segmentation ---------------------- +^^^^^^^^^^^^^^^^^^^^^^ You can pass a `transformers` semantic segmentation model directly to your FiftyOne dataset's @@ -373,10 +383,10 @@ model's name or path as a keyword argument: session = fo.launch_app(dataset) -.. _huggingface-monocular-depth-estimation: +.. _huggingface-transformers-monocular-depth-estimation: Monocular depth estimation --------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^ You can pass a `transformers` monocular depth estimation model directly to your FiftyOne dataset's :meth:`apply_model() ` @@ -423,10 +433,10 @@ model's name or path as a keyword argument: session = fo.launch_app(dataset) -.. _huggingface-zero-shot-classification: +.. _huggingface-transformers-zero-shot-classification: Zero-shot classification ------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^ Zero-shot image classification models from `transformers` can be loaded directly from the :ref:`FiftyOne Model Zoo `! 
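For reference, here is a minimal sketch of how a zero-shot classification model might be loaded from the zoo and applied to the sample dataset above; the zoo model name `zero-shot-classification-transformer-torch` and the `name_or_path`/`classes` keyword arguments are assumptions based on FiftyOne Model Zoo conventions rather than something asserted by this diff:

.. code-block:: python
    :linenos:

    import fiftyone as fo
    import fiftyone.zoo as foz

    dataset = foz.load_zoo_dataset("quickstart", max_samples=25)

    # Assumed zoo model name and kwargs; verify against the Model Zoo listing
    model = foz.load_zoo_model(
        "zero-shot-classification-transformer-torch",
        name_or_path="openai/clip-vit-base-patch32",  # any zero-shot checkpoint
        classes=["cat", "dog", "bird", "car", "person"],
    )

    dataset.apply_model(model, label_field="zero_shot_predictions")

    session = fo.launch_app(dataset)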
@@ -511,10 +521,10 @@ FiftyOne format: Some zero-shot models are compatible with multiple tasks, so it is recommended that you specify the task type when converting the model. -.. _huggingface-zero-shot-detection: +.. _huggingface-transformers-zero-shot-detection: Zero-shot object detection --------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^ Zero-shot object detection models from `transformers` can be loaded directly from the :ref:`FiftyOne Model Zoo `! @@ -560,10 +570,10 @@ FiftyOne format: Some zero-shot models are compatible with multiple tasks, so it is recommended that you specify the task type when converting the model. -.. _huggingface-batch-inference: +.. _huggingface-transformers-batch-inference: Batch inference ---------------- +^^^^^^^^^^^^^^^ When using :meth:`apply_model() `, @@ -615,10 +625,10 @@ pattern below: See :ref:`this section ` for more information about performing batch updates to your FiftyOne datasets. -.. _huggingface-embeddings: +.. _huggingface-transformers-embeddings: Embeddings -__________ +---------- Any `transformers` model that supports image classification or object detection tasks — zero-shot or otherwise — can be used to compute embeddings for your @@ -633,10 +643,10 @@ samples. detection, or base model, FiftyOne will extract embeddings from the `last_hidden_state` of the model's base encoder. -.. _huggingface-image-embeddings: +.. _huggingface-transformers-image-embeddings: Image embeddings ----------------- +^^^^^^^^^^^^^^^^ To compute embeddings for images, you can pass the `transformers` model directly to your FiftyOne dataset's @@ -713,10 +723,10 @@ see if the model can be used to generate embeddings: image = Image.open(dataset.first().filepath) embedding = model.embed(np.array(image)) -.. _huggingface-text-embeddings: +.. _huggingface-transformers-text-embeddings: Text embeddings ---------------- +^^^^^^^^^^^^^^^ Zero-shot image classification and object detection models from `transformers` can also be used to compute embeddings for text: @@ -760,10 +770,10 @@ property: ) print(model.can_embed_prompts) # False -.. _huggingface-batch-embeddings: +.. _huggingface-transformers-batch-embeddings: Batch embeddings ----------------- +^^^^^^^^^^^^^^^^ You can request batch inference by passing the optional `batch_size` parameter to @@ -774,10 +784,10 @@ to dataset.compute_embeddings(model, embeddings_field="embeddings", batch_size=16) -.. _huggingface-patch-embeddings: +.. _huggingface-transformers-patch-embeddings: Patch embeddings ----------------- +^^^^^^^^^^^^^^^^ You can compute embeddings for image patches by passing `transformers` models directly to your FiftyOne dataset's @@ -804,10 +814,10 @@ method: embeddings_field="embeddings", ) -.. _huggingface-brain-methods: +.. _huggingface-transformers-brain-methods: Brain methods -_____________ +------------- Because `transformers` models can be used to compute embeddings, they can be passed to :ref:`Brain methods ` like @@ -891,3 +901,1038 @@ model: view = dataset.sort_by_similarity("A photo of a dog", k=25) session = fo.launch_app(view) + +.. _huggingface-hub: + +Hugging Face Hub +________________ + +FiftyOne integrates with the +`Hugging Face Hub `_ to allow you to +push datasets to and load datasets from the Hub with ease. This integration +simplifies the process of sharing datasets with the machine learning and +computer vision community, and allows you to easily access and work with many +of the most popular vision and multimodal datasets available! + +.. 
_huggingface-hub-setup: + +Setup +----- + +To push datasets to and load datasets from the +`Hugging Face Hub `_, you will need the +`Hugging Face Hub Python client `_, +which you can install via PyPI: + +.. code-block:: shell + + pip install "huggingface_hub>=0.20.0" + +To push a dataset to the Hub, and in some cases, to access a dataset on +the hub, you will need to have a +`Hugging Face Hub account `_. + +Hugging Face handles authentication via tokens, which you can obtain by +logging into your account and navigating to the +`Access Tokens `_ section of your +profile. At the bottom of this page, you can create a new token with write or +read access to the Hub. Once you have your token, you can set it as an +environment variable: + +.. code-block:: shell + + export HF_TOKEN="" + +.. _huggingface-hub-push-dataset: + +Pushing datasets to the Hub +--------------------------- + +If you are working with a dataset in FiftyOne and you want to quickly share it +with others, you can do so via the +:func:`push_to_hub() ` +function, which takes two positional arguments: + +- the FiftyOne sample collection (a |Dataset| or |DatasetView|) +- the `repo_name`, which will be combined with your Hugging Face username or + organization name to construct the `repo_id` where the sample collection + will be uploaded. + +As you will see, this simple function allows you to push datasets and filtered +views containing images, videos, point clouds, and other multimodal data to the +Hugging Face Hub, providing you with incredible flexibility in the process. + +.. _huggingface-hub-push-dataset-basic: + +Basic usage +^^^^^^^^^^^ + +The basic recipe for pushing a FiftyOne dataset to the Hub is just two lines of +code. As a starting point, let's use the example +:ref:`Quickstart dataset ` dataset from the +:ref:`FiftyOne Dataset Zoo `: + +.. code-block:: python + :linenos: + + import fiftyone as fo + import fiftyone.zoo as foz + + dataset = foz.load_zoo_dataset("quickstart") + +To push the dataset to the Hugging Face Hub, all you need to do is call +:func:`push_to_hub() ` with the dataset +and the desired `repo_name`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + fouh.push_to_hub(dataset, "my-quickstart-dataset") + +When you run this code, a few things happen: + +- The dataset and its media files are exported to a temporary directory and + uploaded to the specified Hugging Face repo. +- A `fiftyone.yml` config file for the dataset is generated and uploaded to + the repo, which contains all of the necessary information so that the dataset + can be loaded with + :func:`load_from_hub() `. +- A Hugging Face + `Dataset Card `_ + for the dataset is auto-generated, providing tags, metadata, license info, + and a code snippet illustrating how to load the dataset from the hub. + +Your dataset will be available on the Hub at the following URL: + +.. code-block:: text + + https://huggingface.co/datasets//my-quickstart-dataset + +Pushing a |DatasetView| to the Hub works in exactly the same way. For example, +if you want to push a filtered view of the `quickstart` dataset containing only +predictions with high confidence, you can do so by creating the view as usual, +and then passing that in to +:func:`push_to_hub() `: + +.. 
code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + from fiftyone import ViewField as F + + # create view with high confidence predictions + view = dataset.filter_labels("predictions", F("confidence") > 0.95) + + # push view to the Hub as a new dataset + fouh.push_to_hub(view, "my-quickstart-high-conf") + +When you do so, note that the view is exported as a new dataset, and other +details from the original dataset are not included. + +FiftyOne is a *visual* toolkit, so when you push a dataset to the Hub, you can +optionally include a preview (image, gif, or video) of the dataset that will be +displayed on the dataset page. To do this, you can pass the `preview_path` +argument to :func:`push_to_hub() `, with +either a relative or absolute path to the preview file on your local machine: + +.. code-block:: python + :linenos: + + import fiftyone as fo + import fiftyone.zoo as foz + + import fiftyone.utils.huggingface as fouh + + dataset = foz.load_zoo_dataset("quickstart") + + session = fo.launch_app(dataset) + # Screenshot and save the preview image to a file + + fouh.push_to_hub( + dataset, + "my-quickstart-with-preview", + preview_path="/path/to/preview.jpg" + ) + +The preview file will be uploaded to the Hub along with the dataset, and will be +displayed on the dataset card! + +.. image:: /images/integrations/hf_data_card_preview.jpg + :alt: Pushing a dataset to the Hugging Face Hub with a preview image + :align: center + + +.. _huggingface-hub-push-dataset-advanced: + +Advanced usage +^^^^^^^^^^^^^^ + +The :func:`push_to_hub() ` function +provides a number of optional arguments that allow you to customize how your +dataset is pushed to the Hub, including whether the dataset is public or private, +what license it is released under, and more. + +FiftyOne's :func:`push_to_hub() ` +function supports the Hugging Face Hub API arguments `private` and `exist_ok`. + +- **private** *(bool)*: Whether the dataset should be private. If `True`, the + dataset will be private and only accessible to you. If `False`, the dataset + will be public and accessible to anyone with the link. Defaults to `False`. +- **exist_ok** *(bool)*: Whether to overwrite an existing dataset with the same + `repo_name`. If `True`, the existing dataset will be overwritten. If `False`, + an error will be raised if a dataset with the same `repo_name` already + exists. Defaults to `False`. + +For example, to push a dataset to the Hub as private, you can do the following: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + fouh.push_to_hub(dataset, "my-private-dataset", private=True) + +You can also specify the `tags`, `license`, and `description` of the dataset, +all of which will propagate to the `fiftyone.yml` config file and the Hugging +Face Dataset Card. For example, to push a video action recognition dataset with +an MIT license and a description, you can do the following: + +.. code-block:: python + :linenos: + + import fiftyone as fo + import fiftyone.zoo as foz + import fiftyone.utils.huggingface as fouh + + dataset = foz.load_zoo_dataset("quickstart-video") + + fouh.push_to_hub( + dataset, + "my-action-recognition-dataset", + tags=["video", "action-recognition"], + license="mit", + description="A dataset of videos for action recognition tasks", + ) + +The pushed dataset will be available on the Hub and the dataset page will look +like this: + +.. 
image:: /images/integrations/hf_push_advanced_example.jpg + :alt: Pushing a dataset to the Hugging Face Hub with advanced options + :align: center + +.. note:: + + The `tags` argument can be a string or a list of strings. The tag `fiftyone` + is automatically added to all datasets pushed with FiftyOne, communicating + that the dataset was created with FiftyOne and can be loaded with the + :func:`load_from_hub() ` function. + +The license is specified as a string. For a list of supported licenses, see the +`Hugging Face Hub documentation `_. + +The `description` argument can be used for whatever you like. When the dataset +is loaded from the Hub, this description will be accessible via the dataset's +:meth:`description ` property. + +Additionally, you can specify the "format" of the uploaded dataset. By default, +the format is the standard :ref:`FiftyOneDataset ` format, +but you can also specify that the data be uploaded in any of these +:ref:`common formats `. For example, to push the +quickstart dataset in :ref:`COCO ` format, with a +Creative Commons Attribution 4.0 license, you can do the following: + +.. code-block:: python + :linenos: + + import fiftyone as fo + import fiftyone.zoo as foz + import fiftyone.utils.huggingface as fouh + import fiftyone.types as fot + + dataset = foz.load_zoo_dataset("quickstart") + dataset_type = fot.dataset_types.COCODetectionDataset + + fouh.push_to_hub( + dataset, + "quickstart-coco", + dataset_type=dataset_type, + license="cc-by-4.0", + label_field="*", # convert all label fields, not just ground truth + ) + +.. note:: + + The `label_field` argument is used to specify which label fields to convert + to the specified dataset type. By default when using some dataset formats, + only the `ground_truth` label field is converted. If you want to convert all + label fields, you can set `label_field="*"`. If you want to convert specific + label fields, you can pass a list of field names. + + +Additionally, you can specify the minimum version of FiftyOne required to load +the dataset by passing the `min_fiftyone_version` argument. This is useful when +the dataset utilizes features that are only available in versions above a certain +release. For example, to specify that the dataset requires FiftyOne version `0.23.0`: + +.. code-block:: python + :linenos: + + import fiftyone as fo + import fiftyone.zoo as foz + import fiftyone.utils.huggingface as fouh + + dataset = foz.load_zoo_dataset("quickstart") + + fouh.push_to_hub( + dataset, + "quickstart-min-version", + min_fiftyone_version="0.23.0", + ) + + + +.. _huggingface-hub-load-dataset: + +Loading datasets from the Hub +----------------------------- + +To load a dataset from the Hugging Face Hub, you can use the +:func:`load_from_hub() ` function. +This function supports loading datasets in any of the +:ref:`common formats ` supported by FiftyOne, as well +as image-based datasets stored via `Parquet `_ files, +as is common with datasets from the +`datasets `_ library which have +been uploaded to the Hugging Face Hub. Below, we will walk through all of the +ways you can load datasets from the Hub. + +In its simplest usage, the +:func:`load_from_hub() ` function +only requires the `repo_id` of the dataset you want to load. For example, to +load the :ref:`private dataset ` that we +pushed to the Hub earlier, you can do the following: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub("/my-private-dataset") + +.. 
note:: + + As long as you have an environment variable `HF_TOKEN` set with your Hugging + Face token (with read access), you can load private or gated datasets that you have + access to from the Hub. + +.. _huggingface-hub-load-dataset-from-repo-config: + +Loading datasets from repo configs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When you push a dataset to the Hub using +:func:`push_to_hub() `, a `fiftyone.yml` +config file is generated and uploaded to the repo. This file contains all of the +information necessary to load the dataset from the Hugging Face Hub. More +generally, any repo on the Hugging Face Hub that contains a `fiftyone.yml` or +`fiftyone.yaml` file (assuming the file is correctly formatted) can be loaded +using the :func:`load_from_hub() ` +function by passing the `repo_id` of the dataset, without needing to specify any +additional arguments. + +For example, to load the `quickstart` dataset that we pushed to the Hub earlier, + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub("my-quickstart-dataset") + +.. _huggingface-hub-load-dataset-from-local-config: + +Loading datasets from local configs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If the repo was uploaded to the Hugging Face Hub via FiftyOne's +:func:`push_to_hub() ` function, then +the `fiftyone.yml` config file will be generated and uploaded to the repo. +However, some common datasets like +`mnist `_ were uploaded to the Hub +using the `datasets` library and do not contain a `fiftyone.yml` or +`fiftyone.yaml` file. If you know how the dataset is structured, you can load +the dataset by passing the path to a local yaml config file that describes the +dataset via the `config_file` keyword argument. + +For example, to load the `mnist` dataset from the Hub, you might have a local +yaml config file like this: + +.. code-block:: yaml + + format: ParquetFilesDataset + classification_fields: label + +To load the dataset from the Hub, you can pass the `repo_id` of the dataset and +the path to the local yaml config file: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "mnist", + config_file="/path/to/mnist.yml", + ) + +For a comprehensive list of the supported fields in the yaml config file, see +:ref:`Supported config fields `. + +.. _huggingface-hub-load-dataset-from-kwargs: + +Loading datasets with config kwargs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In addition to loading datasets from repo configs and local configs, you can +also load datasets from the Hub by passing the necessary config arguments +directly to :func:`load_from_hub() `. +This is useful when you want to load a dataset from the Hub that does not have +a `fiftyone.yml` or `fiftyone.yaml` file, and the structure of the dataset is +simple enough that you can specify the necessary arguments directly. + +For example, to load the `mnist` dataset from the Hub, you can pass the `format` +and `classification_fields` arguments directly: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "mnist", + format="ParquetFilesDataset", + classification_fields="label", + ) + +This will tell FiftyOne that the data is stored in Parquet files, and that the +`label` field should be treated as a classification field, to be converted into +a |Classification| label field in the dataset. + +.. 
_huggingface-hub-load-dataset-config-kwargs: + +Supported config fields +^^^^^^^^^^^^^^^^^^^^^^^ + +Whether you are loading a dataset from a repo config, a local config file, or +passing the config arguments directly, you can specify a number of fields. + +Broadly speaking, these fields fall into three categories: format specification, +media field specification, and label field specification. + +Let's look at these categories in more detail: + +**Format specification**: + +- **format** *(str)*: The format of the dataset. This can be any of the + :ref:`common formats ` supported by FiftyOne — just + pass the name of the format as a string. For example, to load a dataset in the + :ref:`COCO ` format, you can pass + `format="COCODetectionDataset"`. To specify that the dataset is stored in + Parquet files, you can pass `format="ParquetFilesDataset"` (or simply + `format="parquet"` for short). This is the only required field. +- **name** *(str)*: The name of the FiftyOne |Dataset| to be created. If the + `repo_id` is cumbersome, this can be used to specify a simpler default name. + For example, for this `sheep dataset `_ + rather than using the `repo_id` `keremberke/aerial-sheep-object-detection`, you + can specify `name="sheep-detection"`. +- **subsets** *(str or list)*: The subset or subsets of the Hugging Face + dataset that are *compatible* with this config, and are *available* to be + loaded. In Hugging Face, the "dataset" in a repo can contain multiple + "subsets", which may or may not have the same schema. Take the + `Street View House Numbers `_ dataset for + example. This dataset has two subsets: `"cropped_digits"` and `"full_numbers"`. + The `cropped_digits` subset contains classification labels, while the + `full_numbers` subset contains detection labels. A single config would not be + able to specify the schema for both subsets, so you can specify the subset you + want to load (or if you are the dataset author, which subset you want to *allow* + people to load in this way) with the `subsets` field. For example, to load the + `cropped_digits` subset of the SVHN dataset, you can pass + `subsets="cropped_digits"`. Note that this is not a required field, and by + default all subsets are loaded. Also note that subsets are distinct from splits + in the dataset, which are handled by the `splits` field (see below). +- **splits** *(str or list)*: The split or splits of the Hugging Face dataset that + are *compatible* with this config, and are *available* to be loaded. As is + standard for machine learning, many datasets are split into training, validation, + and test sets. The specific names of these splits may vary from dataset to + dataset, but :func:`load_from_hub() ` + identifies the names of all splits and by default, will assume that all of + these splits are to be loaded. If you only want to load a specific split or + splits, you can specify them with the `splits` field. For example, to load the + training split of the `CIFAR10 `_ + dataset, you can pass `splits="train"`. If you want to load multiple splits, + you can pass them as a list, e.g., `splits=["train", "test"]`. Note that this + is not a required field, and by default all splits are loaded. + +**Media field specification**: + +While not all Parquet datasets contain media fields, all FiftyOne |Sample| objects +must be connected to at least one media file. 
The following fields can be used +to configure the media fields in the Hugging Face dataset that should be converted +to FiftyOne media fields: + +- **filepath** *(str)*: In FiftyOne, `filepath` is + `a default field `_ + that is used to store the path to the primary media file for each sample in + the dataset. For Hugging Face parquet datasets, primary media fields for image + datasets are typically stored in the `image` column, so this is where + FiftyOne's :func:`load_from_hub() ` + looks by default. If the primary media field is stored in a different column, + you can specify the column name with the key `filepath`. For example, the + `COYO-700M dataset `_ + has the primary media field referenced in the `url` column. Specifying + `filepath="url"` will tell FiftyOne to look in the `url` column for the + primary media file path. Images will be downloaded from the corresponding URLs + and saved to disk. +- **thumbnail_path** *(str)*: The field containing the path to a thumbnail image + for each sample in the dataset, if such a field exists. If a `thumbnail_path` + is specified, this media file will be shown in the sample grid in the FiftyOne + App. This can be useful for quickly visualizing the dataset when the primary + media field contains large (e.g., high-resolution) images. For more information + on thumbnail images, see :ref:`this section `. +- **additional_media_fields** *(dict)*: If each sample has multiple associated media + files that you may want to visualize in the FiftyOne App, you can specify + these non-default media fields in the `additional_media_fields` dictionary, + where the keys are the column names in the Hugging Face dataset and the values + are the names of the fields in the FiftyOne |Dataset| that will store the + paths. Note that this is *not* the same as :ref:`grouped datasets `. + +**Label field specification**: + +FiftyOne's Hugging Face Hub integration currently supports converting labels of +type |Classification|, |Detections|, and |Segmentation| from Hugging Face +Parquet datasets to FiftyOne label fields. The following fields can be used to +specify the label fields in the Hugging Face dataset that should be converted to +FiftyOne label fields: + +- **classification_fields** *(str or list)*: The column or columns in the Hugging + Face dataset that should be converted to FiftyOne |Classification| label fields. + For example, if the dataset contains a `label` + field that contains classification labels, you can specify + `classification_fields="label"`. If the dataset contains multiple + classification fields, you can specify them as a list, e.g., + `classification_fields=["label1", "label2"]`. This is not a required field, + and if the dataset does not contain classification labels, you can omit it. +- **detection_fields** *(str or list)*: The column or columns in the Hugging Face + dataset that should be converted to FiftyOne |Detections| label fields. If the + dataset contains detection labels, you can specify the column name or names + here. For example, if the dataset contains a `detections` field that contains + detection labels, you can specify `detection_fields="detections"`. If the + dataset contains multiple detection fields, you can specify them as a list, + e.g., `detection_fields=["detections1", "detections2"]`. This is not a required + field, and if the dataset does not contain detection labels, you can omit it. 
+- **mask_fields** *(str or list)*: The column or columns in the Hugging Face dataset + that should be converted to FiftyOne |Segmentation| label fields. The column + in the Hugging Face dataset must contain an image or the URL for an image that + can be used as a segmentation mask. If necessary, the images will be downloaded + and saved to disk. If the dataset contains mask labels, you can specify the + column name or names here. For example, if the dataset contains a `masks` field + that contains mask labels, you can specify `mask_fields="masks"`. This is not + a required field, and if the dataset does not contain mask labels, you can + omit it. + +.. _huggingface-hub-load-dataset-download: + +Configuring the download process +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When loading datasets from the Hugging Face Hub, FiftyOne will download +*all* of the data specified by the `repo_id` and the config. If no splits or +subsets are listed in the config, this means that all samples across all splits +and subsets will be downloaded. This can be a time-consuming process, especially +for large datasets, and sometimes you may only want to download a fixed number +of samples to get started exploring the dataset. + +FiftyOne's :func:`load_from_hub() ` +function supports a variety of arguments that allow you to control the download +process, from the maximum number of samples to be downloaded to the batch size +to use when making requests to the Datasets Server. Here are the supported +arguments: + +- **max_samples** *(int)*: The number of samples to download from the dataset. + If not specified, all samples will be downloaded. +- **batch_size** *(int)*: The batch size to use when making requests to the + Datasets Server. Defaults to 100, which is the max batch size allowed by the + Datasets Server. +- **num_workers** *(int)*: The number of workers to use when downloading + media files. If not specified, the number of workers will be resolved by + looking at your :ref:`FiftyOne Config `. +- **splits** *(str or list)*: The split or splits of the Hugging Face dataset + that you want to download. This overrides the `splits` field in the config. +- **subsets** *(str or list)*: The subset or subsets of the Hugging Face dataset + that you want to download. This overrides the `subsets` field in the config. +- **overwrite** *(bool)*: Whether to overwrite an existing dataset + with the same name. If `True`, the existing dataset will be overwritten. If + `False`, an error will be raised if a dataset with the same name already + exists. Defaults to `False`. +- **persistent** *(bool)*: Whether to persist the dataset to the underlying + database after it is loaded. If `True`, the dataset will be available for + loading in future FiftyOne sessions by passing the dataset's name into + FiftyOne's + :func:`load_dataset() ` function. + Defaults to `False`. +- **revision** *(str)*: The revision (specified by a commit hash of the Hugging + Face repo) of the dataset to load. If not specified, the latest revision will + be loaded. + +.. _huggingface-hub-load-dataset-basic-examples: + +Basic examples +^^^^^^^^^^^^^^ + +Okay, so :func:`load_from_hub() ` is +*very* powerful, and can be used in a *ton* of ways. All of this flexibility +can be a bit overwhelming, so let's walk through a few examples to show you how +easy it is in practice to load datasets from the Hugging Face Hub. + +.. 
note:: + + To make these downloads as fast as possible, we recommend setting the + `max_samples` argument to a reasonable number, like 1000, to get a feel for + the dataset. If you like what you see, you can always download more samples! + +**Classification Datasets** + +Let's start by loading the +`MNIST `_ dataset into FiftyOne. All you +need to do is pass the `repo_id` of the dataset — in this case `"mnist"` — to +:func:`load_from_hub() `, specify the +format as `"parquet"`, and specify the `classification_fields` as `"label"`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "mnist", + format="parquet", + classification_fields="label", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +The same exact syntax works for the `CIFAR-10 `_ +and `FashionMNIST `_ datasets, +which are also available on the Hub. In fact, you can load any of the following +classification datasets from the Hub using the same syntax, just by changing the +`repo_id`: + +- `CIFAR-10 `_ (use `"cifar10"`) +- `ImageNet `_ (use `"imagenet-1k"`) +- `FashionMNIST `_ (use `"fashion_mnist"`) +- `Tiny ImageNet `_ (use `"zh-plus/tiny-imagenet"`) +- `Food-101 `_ (use `"food101"`) +- `Dog Food `_ (use `"sasha/dog-food"`) +- `ImageNet-Sketch `_ (use `"imagenet_sketch"`) +- `Oxford Flowers `_ (use `"nelorth/oxford-flowers"`) +- `Cats vs. Dogs `_ (use `"cats_vs_dogs"`) +- `ObjectNet-1.0 `_ (use `"timm/objectnet"`) + +A very similar syntax can be used to load classification datasets that contain +*multiple* classification fields, such as +`CIFAR-100 `_ and the +`WikiArt `_ dataset. For example, +to load the CIFAR-100 dataset, you can specify the `classification_fields` as +`["coarse_label", "fine_label"]`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "cifar100", + format="parquet", + classification_fields=["coarse_label", "fine_label"], + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +To load the `WikiArt `_ dataset, +you can specify the `classification_fields` as `["artist", "genre", "style"]`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "huggan/wikiart", + format="parquet", + classification_fields=["artist", "genre", "style"], + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +As touched upon earlier, you can also load a classification *subset* of a +dataset. For example, to load the `cropped_digits` subset of the +`Street View House Numbers `_ dataset: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "svhn", + format="parquet", + classification_fields="label", + subsets="cropped_digits", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +**Detection Datasets** + +Loading detection datasets from the Hub is just as easy. For example, to load +the `MS COCO `_ +dataset, you can specify the `detection_fields` as `"objects"`, which is the +standard column name for detection features in Hugging Face datasets: + +.. 
code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "detection-datasets/coco", + format="parquet", + detection_fields="objects", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +The same syntax works for many other popular detection datasets on the Hub, +including: + +- `CPPE - 5 `_ (use `"cppe-5"`) +- `WIDER FACE `_ (use `"wider_face"`) +- `License Plate Object Detection `_ + (use `"keremberke/license-plate-object-detection"`) +- `Aerial Sheep Object Detection `_ + (use `"keremberke/aerial-sheep-object-detection"`) + +Some detection datasets have their detections stored under a column with a +different name. For example, the `full_numbers` subset of the +`Street View House Numbers `_ dataset +stores its detections under the column `digits`. To load this subset, you can +specify the `detection_fields` as `"digits"`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "svhn", + format="parquet", + detection_fields="digits", + subsets="full_numbers", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +.. note:: + + Not *all* detection datasets on the Hub are stored in a format that is + currently supported by FiftyOne. For instance, the + `Fashionpedia `_ + dataset has detections stored in Pascal VOC format, which is not the `standard + Hugging Face format `_. + +**Segmentation Datasets** + +Loading segmentation datasets from the Hub is also a breeze. For example, to +load the "instance_segmentation" subset from +`SceneParse150 `_, all you +need to do is specify the `mask_fields` as `"annotation"`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "scene_parse150", + format="parquet", + subsets="instance_segmentation", + mask_fields="annotation", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +Many other segmentation datasets on the Hub can be loaded in the same way, such +as `ADE 20K Tiny `_: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "nateraw/ade20k-tiny", + format="parquet", + mask_fields="label", + ) + + # only 20 samples in the dataset + + session = fo.launch_app(dataset) + +In other cases, because there are now *multiple* image columns — one for the +sample image and one for the mask — the naming convention for the dataset might +be different, and you may need to explicitly specify the `filepath`. For +example, to load the +`Sidewalk Semantic `_ +dataset: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + # Note: you need access to the dataset to load it! + + dataset = fouh.load_from_hub( + "segments/sidewalk-semantic", + format="parquet", + filepath="pixel_values", + mask_fields="label", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +.. note:: + + Once you have the dataset loaded into FiftyOne, you may want to set the + dataset's `mask targets `_ to specify the names of + the classes represented in the segmentation masks. + +**Unlabelled Image Datasets** + +Some datasets on the Hub contain images and metadata in the form of features, +but do not explicitly contain classification, detection, or segmentation labels. +This is common for text-to-image tasks, as well as captioning and visual question +answering tasks. These datasets can also be converted and loaded into FiftyOne! 
+Once the dataset is loaded into FiftyOne, you can process the data and generate +labels for whatever tasks you are interested in. + +Let's look at a few examples: + +For `DiffusionDB `_, you +can load the dataset as follows: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "poloclub/diffusiondb", + format="parquet", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +Here are some other popular datasets on the Hub that can be loaded following the +same syntax: + +- `Nouns `_: (use `"m1guelpf/nouns"`) +- `New Yorker Caption Contest `_: + (use `"jmhessel/newyorker_caption_contest"`) +- `Captcha Dataset `_: + (use `"project-sloth/captcha-images"`) +- `MathVista `_: (use `"AI4Math/MathVista"`) +- `TextVQA `_: (use `"textvqa"`) +- `VQA-RAD `_: (use `"flaviagiammarino/vqa-rad"`) +- `ScienceQA `_: (use `"derek-thomas/ScienceQA"`) +- `PathVQA `_: (use `"flaviagiammarino/path-vqa"`) + +Many other popular datasets on the Hub can be loaded in the same way, with slight +modifications to `filepath` or other arguments as needed. Here are a few examples: + +For `COYO-700M `_, we just +need to specify the `filepath` as `"url"`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "kakaobrain/coyo-700m", + format="parquet", + filepath="url", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +For `RedCaps `_, we instead use +`"image_url"` as the `filepath`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "red_caps", + format="parquet", + filepath="image_url", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +For `MMMU `_ +(A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for +Expert AGI), we use `"image_1"` as the `filepath`: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "MMMU/MMMU", + format="parquet", + filepath="image_1", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +.. _huggingface-hub-load-dataset-advanced-examples: + +Advanced examples +^^^^^^^^^^^^^^^^^ + +The :func:`load_from_hub() ` function +also allows us to load datasets in much more complex formats, as well as with +more advanced configurations. Let's walk through a few examples to show you how +to leverage the full power of FiftyOne's Hugging Face Hub integration. + +**Loading Datasets from Revisions** + +When you load a dataset from the Hugging Face Hub, you are loading the latest +revision of the dataset. However, you can also load a specific revision of the +dataset by specifying the `revision` argument. For example, to load the last +revision of DiffusionDB before NSFW scores were added, you can specify this via: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "poloclub/diffusiondb", + format="parquet", + subset="2m_random_1k", ## just one of the subsets + max_samples=1000, + revision="5fa48ba66a44822d82d024d195fbe918e6c42ca6", + ) + + session = fo.launch_app(dataset) + +**Loading Datasets with Multiple Media Fields** + +Some datasets on the Hub contain multiple media fields for each sample. Take +`MagicBrush `_ for example, which +contains a `"source_img"` and a `"target_img"` for each sample, in addition +to a segmentation mask denoting the area of the source image to be modified. 
To +load this dataset, you can specify the `filepath` as `"source_img"` and the +target image via `additional_media_fields`. Because this is getting a bit more +complex, we'll create a local yaml config file to specify the dataset format: + +.. code-block:: yaml + + format: ParquetFilesDataset + name: magicbrush + filepath: source_img + additional_media_fields: + target_img: target_img + mask_fields: mask_img + +Now, you can load the dataset using the local yaml config file: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "osunlp/MagicBrush", + config_file="/path/to/magicbrush.yml", + max_samples=1000, + ) + + session = fo.launch_app(dataset) + +**Customizing the Download Process** + +When loading datasets from the Hub, you can customize the download process by +specifying the `batch_size`, `num_workers`, and `overwrite` arguments. For +example, to download the `full_numbers` subset of the `Street View House Numbers +`_ dataset with a batch size of 50 and 4 +workers, you can do the following: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "svhn", + format="parquet", + detection_fields="digits", + subsets="full_numbers", + max_samples=1000, + batch_size=50, + num_workers=4, + ) + + session = fo.launch_app(dataset) + +**Loading Private or Gated Datasets** + +Like public datasets, you can also load private or gated datasets from the Hub, +as long as you have the necessary permissions. If your Hugging Face token is +set as an environment variable `HF_TOKEN`, this is as simple as specifying the +`repo_id` of the dataset. If you don't have your token set, or you need to use +a specific token for a specific dataset, you can specify the `token` argument. +You can do so following this recipe: + +.. code-block:: python + :linenos: + + import fiftyone.utils.huggingface as fouh + + dataset = fouh.load_from_hub( + "my-private-dataset-repo-id", + token="", + ... + ) + + session = fo.launch_app(dataset) diff --git a/fiftyone/utils/huggingface.py b/fiftyone/utils/huggingface.py new file mode 100644 index 0000000000..2456ae1631 --- /dev/null +++ b/fiftyone/utils/huggingface.py @@ -0,0 +1,1203 @@ +""" +Utilities for working with `Hugging Face `_. + +| Copyright 2017-2024, Voxel51, Inc. 
+| `voxel51.com `_ +| +""" + +from concurrent.futures import ThreadPoolExecutor +from contextlib import contextmanager +import logging +import os +from packaging.requirements import Requirement +import requests + +import yaml + +import eta.core.utils as etau + +import fiftyone as fo +import fiftyone.constants as foc +from fiftyone.core.config import Config +import fiftyone.core.dataset as fod +import fiftyone.core.labels as fol +import fiftyone.core.metadata as fom +from fiftyone.core.sample import Sample +import fiftyone.core.utils as fou +import fiftyone.types as fot + +hfh = fou.lazy_import( + "huggingface_hub", + callback=lambda: fou.ensure_package("huggingface_hub>=0.20.0"), +) + +hfu = fou.lazy_import( + "huggingface_hub.utils", + callback=lambda: fou.ensure_package("huggingface_hub>=0.20.0"), +) + + +DATASETS_SERVER_URL = "https://datasets-server.huggingface.co" +DEFAULT_MEDIA_TYPE = "image" +DATASET_METADATA_FILENAMES = ("fiftyone.yml", "fiftyone.yaml") +DATASETS_MAX_BATCH_SIZE = 100 +DEFAULT_IMAGE_FILEPATH_FEATURE = "image" +FIFTYONE_BUILTIN_FIELDS = ("id", "filepath", "tags", "metadata") +SUPPORTED_DTYPES = ( + "int8", + "int16", + "int32", + "int64", + "float16", + "float32", + "float64", + "bool", + "string", +) + +logger = logging.getLogger(__name__) + + +def push_to_hub( + dataset, + repo_name, + description=None, + license=None, + tags=None, + private=False, + exist_ok=False, + dataset_type=None, + min_fiftyone_version=None, + label_field=None, + frame_labels_field=None, + token=None, + preview_path=None, + **data_card_kwargs, +): + """Push a FiftyOne dataset to the Hugging Face Hub. + + Args: + dataset: a FiftyOne dataset + repo_name: the name of the dataset repo to create. The repo ID will be + ``{your_username}/{repo_name}`` + description (None): a description of the dataset + license (None): the license of the dataset + tags (None): a list of tags for the dataset + private (True): whether the repo should be private + exist_ok (False): if True, do not raise an error if repo already exists. + dataset_type (None): the type of the dataset to create + min_fiftyone_version (None): the minimum version of FiftyOne required + to load the dataset. For example ``"0.23.0"``. + label_field (None): controls the label field(s) to export. Only + applicable to labeled datasets. Can be any of the following: + + - the name of a label field to export + - a glob pattern of label field(s) to export + - a list or tuple of label field(s) to export + - a dictionary mapping label field names to keys to use when + constructing the label dictionaries to pass to the exporter + frame_labels_field (None): controls the frame label field(s) to export. + The "frames." prefix is optional. Only applicable to labeled video + datasets. Can be any of the following: + + - the name of a frame label field to export + - a glob pattern of frame label field(s) to export + - a list or tuple of frame label field(s) to export + - a dictionary mapping frame label field names to keys to use when + constructing the frame label dictionaries to pass to the exporter + token (None): a Hugging Face API token to use. May also be provided via + the ``HF_TOKEN`` environment variable + preview_path (None): a path to a preview image or video to display on + the readme of the dataset repo. 
+ data_card_kwargs: additional keyword arguments to pass to the + `DatasetCard` constructor + """ + if dataset_type is None: + dataset_type = fot.FiftyOneDataset + + if tags is not None: + if isinstance(tags, str): + tags = [t.strip() for t in tags.split(",")] + tags.extend(_get_dataset_tags(dataset)) + tags = sorted(tags) + else: + tags = _get_dataset_tags(dataset) + + # do this now in case HF login fails before we do anything expensive + hf_username = hfh.whoami(token=token)["name"] + repo_id = hf_username + "/" + repo_name + + with etau.TempDir() as tmp_dir: + config_filepath = os.path.join(tmp_dir, "fiftyone.yml") + + dataset.export( + export_dir=tmp_dir, + dataset_type=dataset_type, + label_field=label_field, + frame_labels_field=frame_labels_field, + export_media=True, + ) + + _populate_config_file( + config_filepath, + dataset, + dataset_type=dataset_type, + description=description, + license=license, + tags=tags, + min_fiftyone_version=min_fiftyone_version, + ) + + ## Create the dataset repo + hfh.create_repo( + repo_id, + token=token, + repo_type="dataset", + private=private, + exist_ok=exist_ok, + ) + + ## Upload the dataset to the repo + api = hfh.HfApi(token=token) + with _no_progress_bars(): + api.upload_folder( + folder_path=tmp_dir, + repo_id=repo_id, + repo_type="dataset", + ) + + # Upload preview image or video if provided + if preview_path is not None: + abs_preview_path = os.path.abspath(preview_path) + if not os.path.exists(abs_preview_path): + logger.warning( + f"Preview path {abs_preview_path} does not exist" + ) + + ext = os.path.splitext(abs_preview_path)[1] + path_in_repo = "dataset_preview" + ext + + try: + api.upload_file( + path_or_fileobj=abs_preview_path, + path_in_repo=path_in_repo, + repo_id=repo_id, + repo_type="dataset", + commit_message="Add preview", + ) + except: + logger.warning( + f"Failed to upload preview media file {abs_preview_path}" + ) + + # If fails, set preview to None + preview_path = None + + path_in_repo = path_in_repo if preview_path is not None else None + + ## Create the dataset card + card = _create_dataset_card( + repo_id, + dataset, + description=description, + license=license, + tags=tags, + preview_path=path_in_repo, + **data_card_kwargs, + ) + card.push_to_hub(repo_id) + + +def load_from_hub( + repo_id, + revision=None, + split=None, + splits=None, + subset=None, + subsets=None, + max_samples=None, + batch_size=None, + num_workers=None, + overwrite=False, + persistent=False, + name=None, + token=None, + config_file=None, + **kwargs, +): + """Loads a dataset from the Hugging Face Hub into FiftyOne. + + Args: + repo_id: the Hugging Face Hub identifier of the dataset + revision (None): the revision of the dataset to load + split (None): the split of the dataset to load + splits (None): the splits of the dataset to load + subset (None): the subset of the dataset to load + subsets (None): the subsets of the dataset to load + max_samples (None): the maximum number of samples to load + batch_size (None): the batch size to use when loading samples + num_workers (None): a suggested number of threads to use when + downloading media + overwrite (True): whether to overwrite an existing dataset with the + same name + persistent (False): whether the dataset should be persistent + name (None): an optional name to give the dataset + token (None): a Hugging Face API token to use. 
May also be provided via + the ``HF_TOKEN`` environment variable + config_file (None): the path to a config file on disk specifying how to + load the dataset if the repo has no ``fiftyone.yml`` file + **kwargs: keyword arguments specifying config parameters to load the + dataset if the repo has no ``fiftyone.yml`` file + + Returns: + a :class:`fiftyone.core.dataset.Dataset` + """ + kwargs["splits"] = splits + kwargs["split"] = split + kwargs["subsets"] = subsets + kwargs["subset"] = subset + kwargs["max_samples"] = max_samples + kwargs["batch_size"] = batch_size + kwargs["num_workers"] = num_workers + kwargs["overwrite"] = overwrite + kwargs["persistent"] = persistent + kwargs["name"] = name + kwargs["token"] = token + kwargs["config_file"] = config_file + + config = _get_dataset_metadata(repo_id, revision=revision, **kwargs) + if config is None: + raise ValueError(f"Could not find fiftyone metadata for {repo_id}") + + return _load_dataset_from_config(config, **kwargs) + + +class HFHubDatasetConfig(Config): + """Config for a Hugging Face Hub dataset. + + Args: + name: the name of the dataset + repo_type: the type of the repository + repo_id: the identifier of the repository + revision: the revision of the dataset + filename: the name of the file + format: the format of the dataset + tags: the tags of the dataset + license: the license of the dataset + description: the description of the dataset + fiftyone: the fiftyone version requirement of the dataset + """ + + def __init__(self, **kwargs): + ## Internals + self._repo_type = kwargs.get("repo_type", None) + self._repo_id = kwargs.get("repo_id", None) + self._revision = kwargs.get("revision", None) + self._filename = kwargs.get("filename", None) + self._format = kwargs.get("format", None) + + ## Dataset metadata + self.tags = kwargs.get("tags", []) + if isinstance(self.tags, str): + self.tags = [t.strip() for t in self.tags.split(",")] + elif isinstance(self.tags, list): + self.tags = [t.strip() for t in self.tags] + self.license = kwargs.get("license", None) + self.description = kwargs.get("description", None) + self._get_fiftyone_version(kwargs) + + def _get_fiftyone_version(self, kwargs): + if kwargs.get("fiftyone", None) is None: + self.version = None + else: + version = kwargs["fiftyone"].get("version", None) + if version is None: + self.version = None + else: + self.version = f"fiftyone{version}" + + +DATASET_CONTENT_TEMPLATE = """ + +{preview} + +This is a [FiftyOne](https://github.com/voxel51/fiftyone) dataset with {num_samples} samples. 
+ +## Installation + +If you haven't already, install FiftyOne: + +```bash +pip install -U fiftyone +``` + +## Usage + +```python +import fiftyone as fo +import fiftyone.utils.huggingface as fouh + +# Load the dataset +# Note: other available arguments include 'split', 'max_samples', etc +dataset = fouh.load_from_hub("{repo_id}") + +# Launch the App +session = fo.launch_app(dataset) +``` +""" + + +def _populate_config_file( + config_filepath, + dataset, + dataset_type=None, + description=None, + license=None, + tags=None, + min_fiftyone_version=None, +): + config_dict = { + "name": dataset.name, + "format": dataset_type.__name__, + "tags": tags, + } + + if min_fiftyone_version is not None: + version_val = f">={min_fiftyone_version}" + config_dict["fiftyone"] = {"version": version_val} + + if description is not None: + config_dict["description"] = description + + if license is not None: + config_dict["license"] = license + + with open(config_filepath, "w") as f: + yaml.dump(config_dict, f) + + +def _get_dataset_tasks(dataset): + def _has_label(ftype): + return bool(dataset.get_field_schema(embedded_doc_type=ftype).keys()) + + tasks = [] + if _has_label(fol.Classification) or _has_label(fol.Classifications): + tasks.append("image-classification") + if _has_label(fol.Detections): + tasks.append("object-detection") + if _has_label(fol.Segmentation): + tasks.append("semantic-segmentation") + return tasks + + +def _get_dataset_tags(dataset): + tags = ["fiftyone"] + tags.append(dataset.media_type) + tags.extend(_get_dataset_tasks(dataset)) + tags.extend(dataset.tags) + return sorted(list(set(tags))) + + +def _generate_dataset_summary(repo_id, dataset, preview_path): + format_kwargs = { + "repo_id": repo_id, + "num_samples": len(dataset), + "preview": "", + } + if preview_path is not None: + format_kwargs["preview"] = f"\n![image/png]({preview_path})\n" + return DATASET_CONTENT_TEMPLATE.format(**format_kwargs) + + +def _create_dataset_card( + repo_id, + dataset, + tags=None, + license=None, + preview_path=None, + **dataset_card_kwargs, +): + card_inputs = { + "language": "en", + "annotations_creators": [], + "task_categories": _get_dataset_tasks(dataset), + "task_ids": [], + "pretty_name": dataset.name, + "license": license, + "tags": tags, + } + + for key, value in dataset_card_kwargs.items(): + card_inputs[key] = value + + dataset_summary = _generate_dataset_summary(repo_id, dataset, preview_path) + if dataset_summary is not None: + card_inputs["dataset_summary"] = dataset_summary + + card_data = hfh.DatasetCardData(**card_inputs) + return hfh.DatasetCard.from_template(card_data) + + +def _parse_split_kwargs(**kwargs): + splits = kwargs.get("splits", None) + split = kwargs.get("split", None) + if splits is None and split is not None: + splits = split + + if isinstance(splits, str): + if "," in splits: + splits = splits.split(",") + else: + splits = [splits] + return splits + + +def _parse_subset_kwargs(**kwargs): + subsets = kwargs.get("subsets", None) + subset = kwargs.get("subset", None) + if subsets is None and subset is not None: + subsets = subset + + if isinstance(subsets, str): + subsets = [subsets] + return subsets + + +@contextmanager +def _no_progress_bars(): + pbs_disabled = hfu.are_progress_bars_disabled() + hfu.disable_progress_bars() + try: + yield + finally: + # Restore the original state + if not pbs_disabled: + hfu.enable_progress_bars() + + +class HFHubParquetFilesDatasetConfig(HFHubDatasetConfig): + """Config for a Hugging Face Hub dataset that is stored as parquet files. 
+ + Args: + name: the name of the dataset + repo_type: the type of the repository + repo_id: the identifier of the repository + revision: the revision of the dataset + filename: the name of the file + format: the format of the dataset + tags: the tags of the dataset + license: the license of the dataset + description: the description of the dataset + fiftyone: the fiftyone version requirement of the dataset + label_fields: the label fields of the dataset + media_type: the media type of the dataset + default_media_fields: the default media fields of the dataset + additional_media_fields: the additional media fields of the dataset + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.media_type = kwargs.get("media_type", DEFAULT_MEDIA_TYPE) + + self._build_name(kwargs) + self._build_media_fields_dict(kwargs) + self._build_label_fields_dict(kwargs) + self._build_allowed_splits(kwargs) + self._build_allowed_subsets(kwargs) + + def _build_name(self, kwargs): + self.name = kwargs.get("name", None) + if self.name is None: + self.name = kwargs.get("repo_id", None) + + def _build_allowed_splits(self, kwargs): + # Author specifies what splits are compatible with this config + self._allowed_splits = _parse_split_kwargs(**kwargs) + + def _build_allowed_subsets(self, kwargs): + # Author specifies what subsets are compatible with this config + self._allowed_subsets = _parse_subset_kwargs(**kwargs) + + def _build_media_fields_dict(self, kwargs): + media_fields_dict = kwargs.get("default_media_fields", {}) + if media_fields_dict.get("filepath", None) is None: + media_fields_dict["filepath"] = kwargs.get( + "filepath", DEFAULT_IMAGE_FILEPATH_FEATURE + ) + if ( + media_fields_dict.get("thumbnail_path", None) is None + and kwargs.get("thumbnail_path", None) is not None + ): + media_fields_dict["thumbnail_path"] = kwargs["thumbnail_path"] + + additional_media_fields = kwargs.get("additional_media_fields", {}) + media_fields_dict.update(additional_media_fields) + self.media_fields = media_fields_dict + + def _build_label_fields_dict(self, kwargs): + self.label_fields = kwargs.get("label_fields", {}) + label_types = ("classification", "detection", "mask") + for label_type in label_types: + label_fields = kwargs.get(f"{label_type}_fields", None) + if label_fields is not None: + if isinstance(label_fields, str): + self.label_fields[label_type] = label_fields.split(",") + elif isinstance(label_fields, list): + self.label_fields[label_type] = label_fields + + +def _parse_format_string(format_str): + if "parquet" in format_str.lower(): + return "ParquetFilesDataset" + else: + return format_str + + +def _build_config(config_dict): + format = config_dict.get("format", None) + if format is None: + raise ValueError("Dataset config must have a format key") + + format = _parse_format_string(format) + if format == "ParquetFilesDataset": + return HFHubParquetFilesDatasetConfig(**config_dict) + else: + return HFHubDatasetConfig(**config_dict) + + +def _get_headers(**kwargs): + token = kwargs.get("token", None) or os.getenv("HF_TOKEN") + if token is not None: + return {"Authorization": f"Bearer {token}"} + return None + + +def _get_dataset_metadata(repo_id, revision=None, token=None, **kwargs): + common_kwargs = dict(repo_type="dataset", revision=revision) + config_file = kwargs.get("config_file", None) + + if config_file is not None: + config_file = os.path.abspath(config_file) + filename = os.path.basename(config_file) + all_kwargs = dict(repo_id=repo_id, filename=filename, **common_kwargs) + else: + 
api = hfh.HfApi(token=token) + for filename in DATASET_METADATA_FILENAMES: + if api.file_exists(repo_id, filename, **common_kwargs): + all_kwargs = dict( + repo_id=repo_id, filename=filename, **common_kwargs + ) + logger.info( + f"Downloading config file {filename} from {repo_id}" + ) + config_file = hfh.hf_hub_download(**all_kwargs) + break + + if config_file is None and "format" not in kwargs: + return None + + if config_file is None: + config_dict = kwargs + config_dict.update(**common_kwargs) + config_dict["repo_id"] = repo_id + else: + with open(config_file, "r") as f: + config_dict = yaml.safe_load(f) + config_dict.update(**all_kwargs) + + return _build_config(config_dict) + + +def _ensure_dataset_compatibility(config): + req_str = config.version + if req_str is None: + return + + try: + req = Requirement(req_str) + except: + logger.warning( + f"Unable to parse dataset {config.name}'s fiftyone version requirement {req_str}" + ) + return + + if not req.specifier.contains(foc.VERSION): + logger.warning( + f"Dataset {config.name} requires {req_str} but you are running fiftyone=={foc.VERSION}" + ) + + +def _get_download_dir(repo_id, split=None, subset=None, **kwargs): + path_walk = [fo.config.default_dataset_dir, "huggingface", "hub", repo_id] + + ## Note: for now don't support multiple revisions storage + if subset is not None: + path_walk.append(subset) + if split is not None: + path_walk.append(split) + + download_dir = os.path.join(*path_walk) + etau.ensure_dir(download_dir) + + return download_dir + + +def _get_split_subset_pairs(config, **kwargs): + repo_id = config._repo_id + revision = config._revision + api_url = ( + f"{DATASETS_SERVER_URL}/splits?dataset={repo_id.replace('/', '%2F')}" + ) + if revision is not None: + api_url += f"&revision={revision}" + headers = _get_headers(**kwargs) + response = requests.get(api_url, headers=headers).json()["splits"] + return [(ss["split"], ss["config"]) for ss in response] + + +def _load_dataset_from_config(config, **kwargs): + _ensure_dataset_compatibility(config) + if isinstance(config, HFHubParquetFilesDatasetConfig): + return _load_parquet_files_dataset_from_config(config, **kwargs) + else: + return _load_fiftyone_dataset_from_config(config, **kwargs) + + +def _get_allowed_splits(config, **kwargs): + user_splits = _parse_split_kwargs(**kwargs) + author_splits = config._allowed_splits + + if not user_splits and not author_splits: + return None + else: + return user_splits if user_splits else author_splits + + +def _get_allowed_subsets(config, **kwargs): + user_subsets = _parse_subset_kwargs(**kwargs) + author_subsets = config._allowed_subsets + if not user_subsets and not author_subsets: + return None + else: + return user_subsets if user_subsets else author_subsets + + +def _is_valid_split_subset_pair( + split, subset, allowed_splits, allowed_subsets +): + if allowed_splits is not None and split not in allowed_splits: + return False + if allowed_subsets is not None and subset not in allowed_subsets: + return False + return True + + +def _get_label_field_names_and_types(config): + label_field_names, label_types = [], [] + label_fields = config.label_fields + if label_fields is None: + return label_field_names, label_types + + for label_type, fields in label_fields.items(): + if isinstance(fields, str): + label_field_names.append(fields) + label_types.append(label_type) + elif isinstance(fields, list): + label_field_names.extend(fields) + label_types.extend([label_type] * len(fields)) + + return label_field_names, label_types + + +def 
_get_parquet_dataset_features(
+    repo_id, split, subset, revision=None, **kwargs
+):
+    api_url = f"{DATASETS_SERVER_URL}/info?dataset={repo_id.replace('/', '%2F')}&config={subset}&split={split}"
+    if revision is not None:
+        api_url += f"&revision={revision}"
+
+    headers = _get_headers(**kwargs)
+    response = requests.get(api_url, headers=headers)
+    features = response.json()["dataset_info"]["features"]
+    return features
+
+
+def _get_num_rows(repo_id, split, subset, revision=None, **kwargs):
+    api_url = f"{DATASETS_SERVER_URL}/info?dataset={repo_id.replace('/', '%2F')}&config={subset}&split={split}"
+    if revision is not None:
+        api_url += f"&revision={revision}"
+
+    headers = _get_headers(**kwargs)
+    response = requests.get(api_url, headers=headers)
+    splits = response.json()["dataset_info"]["splits"]
+    return splits[split]["num_examples"]
+
+
+def _build_rows_request_url(
+    repo_id, split=None, subset="default", revision=None, offset=0, length=100
+):
+    url = f"{DATASETS_SERVER_URL}/rows?dataset={repo_id.replace('/', '%2F')}"
+    if split is not None:
+        url += f"&split={split}"
+    if subset is not None:
+        url += f"&config={subset}"
+    if revision is not None:
+        url += f"&revision={revision}"
+    url += f"&offset={offset}&length={length}"
+    return url
+
+
+def _get_rows(
+    repo_id,
+    split,
+    subset,
+    start_index=0,
+    end_index=100,
+    revision=None,
+    **kwargs,
+):
+    length = end_index - start_index
+    url = _build_rows_request_url(
+        repo_id, split, subset, revision, offset=start_index, length=length
+    )
+    headers = _get_headers(**kwargs)
+    response = requests.get(url, headers=headers)
+    return response.json()["rows"]
+
+
+def _download_image(url_and_filepath):
+    url, filepath = url_and_filepath
+    try:
+        if not os.path.exists(filepath):
+            with requests.get(url, stream=True) as r:
+                r.raise_for_status()
+                with open(filepath, "wb") as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        f.write(chunk)
+    except Exception as e:
+        logger.warning(f"Failed to download image from {url}: {e}")
+
+
+def _download_images(urls_and_filepaths, num_workers):
+    if num_workers <= 1:
+        for url_and_filepath in urls_and_filepaths:
+            _download_image(url_and_filepath)
+    else:
+        with ThreadPoolExecutor(max_workers=num_workers) as executor:
+            executor.map(_download_image, urls_and_filepaths)
+
+
+def _build_media_field_converter(
+    media_field_key, media_field_name, feature, download_dir
+):
+    def convert_media_field(sample_dict, row):
+        row_content = row["row"]
+        row_index = row["row_idx"]
+
+        filename = f"{media_field_name}_{row_index}.png"
+        filepath = os.path.join(download_dir, filename)
+
+        if feature["_type"] == "Image":
+            url = row_content[media_field_name]["src"]
+        else:
+            url = row_content[media_field_name]
+
+        sample_dict[media_field_key] = filepath
+
+        return (url, filepath)
+
+    return convert_media_field
+
+
+def _get_image_shape(image_path):
+    metadata = fom.ImageMetadata.build_for(image_path)
+    return (metadata.width, metadata.height)
+
+
+def _get_detection_label_field_name(feature):
+    for key, value in feature["feature"].items():
+        if value["_type"] == "ClassLabel":
+            return key
+    return None
+
+
+def _get_bounding_box_field_name(feature):
+    for key, value in feature["feature"].items():
+        if value["_type"] == "Sequence" and value["length"] == 4:
+            return key
+    return None
+
+
+def _convert_bounding_box(hf_bbox, img_size):
+    # Pass through boxes that are already in relative [0, 1] coordinates;
+    # otherwise treat them as absolute pixel coords and normalize by image size
+    x, y, w, h = hf_bbox
+    if all([0 <= c <= 1 for c in [x, y, w, h]]):
+        return hf_bbox
+    else:
+        return [
+            x / img_size[0],
+            y / img_size[1],
+            w / img_size[0],
+            h / img_size[1],
+        ]
+
+
+def _build_label_field_converter(
+    field_name, field_type, feature, config, download_dir
+):
+    def convert_classification_field(sample_dict, row):
+        row_content = row["row"]
+        label_index = row_content[field_name]
+        if label_index == -1:
+            return
+        label = feature["names"][label_index]
+        if isinstance(label, tuple):
+            label = label[0]
+        sample_dict[field_name] = fol.Classification(label=str(label))
+
+    def convert_detection_field(sample_dict, row):
+        img_w, img_h = _get_image_shape(sample_dict["filepath"])
+
+        feature_content = row["row"][field_name]
+        det_keys = list(feature["feature"].keys())
+        bbox_key = _get_bounding_box_field_name(feature)
+        det_label_key = _get_detection_label_field_name(feature)
+
+        num_dets = len(feature_content[det_label_key])
+
+        detections = []
+        for i in range(num_dets):
+            label = feature_content[det_label_key][i]
+            bounding_box = feature_content[bbox_key][i]
+
+            bounding_box = _convert_bounding_box(bounding_box, (img_w, img_h))
+            det_dict = {
+                "label": feature["feature"][det_label_key]["names"][label],
+                "bounding_box": bounding_box,
+            }
+            for key in det_keys:
+                if (
+                    key not in [bbox_key, det_label_key]
+                    and key not in FIFTYONE_BUILTIN_FIELDS
+                ):
+                    det_dict[key] = feature_content[key][i]
+
+            detections.append(fol.Detection(**det_dict))
+
+        sample_dict[field_name] = fol.Detections(detections=detections)
+
+    def convert_mask_field(sample_dict, row):
+        row_content = row["row"]
+        row_index = row["row_idx"]
+        filename = f"{field_name}_{row_index}.png"
+        filepath = os.path.join(download_dir, filename)
+
+        if feature["_type"] == "Image":
+            url = row_content[field_name]["src"]
+        else:
+            url = row_content[field_name]
+
+        sample_dict[field_name] = fol.Segmentation(mask_path=filepath)
+
+        return (url, filepath)
+
+    def convert_label_field(sample_dict, row):
+        # fallback: unrecognized label types are ignored
+        pass
+
+    if field_type == "classification":
+        return convert_classification_field
+    elif "detection" in field_type:
+        return convert_detection_field
+    elif "mask" in field_type:
+        return convert_mask_field
+
+    return convert_label_field
+
+
+def _build_dtype_field_converter(field_name, feature, config):
+    def dont_convert(sample_dict, row):
+        pass
+
+    def convert_dtype_field(sample_dict, row):
+        row_content = row["row"]
+        fo_field_name = field_name
+        if field_name in FIFTYONE_BUILTIN_FIELDS:
+            fo_field_name = f"hf_{field_name}"
+        sample_dict[fo_field_name] = row_content[field_name]
+
+    if (
+        feature["_type"] == "Value"
+        and feature["dtype"] not in SUPPORTED_DTYPES
+    ):
+        return dont_convert
+    elif (
+        feature["_type"] == "Sequence"
+        and feature["feature"]["dtype"] not in SUPPORTED_DTYPES
+    ):
+        logger.warning(
+            f"Field {field_name} has dtype {feature['feature']['dtype']} which is not supported by fiftyone"
+        )
+        return dont_convert
+    else:
+        return convert_dtype_field
+
+
+def _build_parquet_to_fiftyone_conversion(config, split, subset, **kwargs):
+    feature_converters = {}
+
+    features = _get_parquet_dataset_features(
+        config._repo_id, split, subset, revision=config._revision, **kwargs
+    )
+
+    media_field_names = list(set(config.media_fields.values()))
+    media_field_keys = list(config.media_fields.keys())
+    lf_names, lf_types = _get_label_field_names_and_types(config)
+
+    download_dir = _get_download_dir(
+        config._repo_id, split=split, subset=subset, **kwargs
+    )
+
+    ## Media field handling
+    for media_field_key in media_field_keys:
+        media_field_name = config.media_fields[media_field_key]
+        feature = features[media_field_name]
+        feature_converters[media_field_name] = _build_media_field_converter(
media_field_key, media_field_name, feature, download_dir + ) + + ## Label field handling + for lfn, lft in zip(lf_names, lf_types): + feature = features[lfn] + feature_converters[lfn] = _build_label_field_converter( + lfn, lft.replace("_fields", ""), feature, config, download_dir + ) + + for feature_name, feature in features.items(): + if feature_name in media_field_names or feature_name in lf_names: + continue + feature_converters[feature_name] = _build_dtype_field_converter( + feature_name, feature, config + ) + + return feature_converters + + +def _add_parquet_subset_to_dataset(dataset, config, split, subset, **kwargs): + feature_converters = _build_parquet_to_fiftyone_conversion( + config, split, subset + ) + + num_rows = _get_num_rows( + config._repo_id, split, subset, revision=config._revision, **kwargs + ) + max_samples = kwargs.get("max_samples", None) + if max_samples is not None: + num_rows = min(num_rows, max_samples) + + num_workers = fou.recommend_thread_pool_workers( + kwargs.get("num_workers", None) + ) + + batch_size = kwargs.get("batch_size", None) + if batch_size is None: + batch_size = DATASETS_MAX_BATCH_SIZE + + if batch_size > DATASETS_MAX_BATCH_SIZE: + logger.info( + f"Batch size {batch_size} is larger than the maximum batch size {DATASETS_MAX_BATCH_SIZE}. Using {DATASETS_MAX_BATCH_SIZE} instead" + ) + batch_size = DATASETS_MAX_BATCH_SIZE + + logger.info( + f"Downloading {num_rows} images from {config.name} ({split}, {subset})..." + ) + + tags = [split] + if subset != "default" and subset != config._repo_id: + tags.append(subset) + + with fou.ProgressBar(total=num_rows) as pb: + for start_idx in range(0, num_rows, batch_size): + urls_and_filepaths = [] + + end_idx = min(start_idx + batch_size, num_rows) + + rows = _get_rows( + config._repo_id, + split, + subset, + start_index=start_idx, + end_index=end_idx, + revision=config._revision, + ) + + samples = [] + for row in rows: + sample_dict = {} + for convert in feature_converters.values(): + res = convert(sample_dict, row) + if res is not None: + urls_and_filepaths.append(res) + + sample_dict["row_idx"] = row["row_idx"] + sample_dict["tags"] = tags + sample = Sample(**sample_dict) + samples.append(sample) + + dataset.add_samples(samples, progress=False) + + _download_images(urls_and_filepaths, num_workers) + + pb.update(count=len(samples)) + + +def _configure_dataset_media_fields(dataset, config): + media_fields = config.media_fields + media_field_keys = list(media_fields.keys()) + if len(media_field_keys) > 1: + dataset.app_config_media_fields = media_field_keys + if "thumbnail_path" in media_field_keys: + dataset.app_config.grid_media_field = "thumbnail_path" + dataset.save() + + +def _add_dataset_metadata(dataset, config): + dataset.tags = config.tags + description = config.description + if description is not None: + dataset.description = description + + dataset.info["source"] = "Hugging Face Hub" + dataset.info["repo_id"] = config._repo_id + if config.license is not None: + dataset.info["license"] = config.license + if config._revision is not None: + dataset.info["revision"] = config._revision + dataset.save() + + +def _resolve_dataset_name(config, **kwargs): + name = kwargs.get("name", None) + if name is None: + if hasattr(config, "name"): + name = config.name + else: + name = config._repo_id + return name + + +def _get_files_to_download(dataset): + filepaths = dataset.values("filepath") + filepaths = [fp for fp in filepaths if not os.path.exists(fp)] + return filepaths + + +def 
_load_fiftyone_dataset_from_config(config, **kwargs): + logger.info("Loading dataset") + + overwrite = kwargs.get("overwrite", False) + persistent = kwargs.get("persistent", False) + max_samples = kwargs.get("max_samples", None) + splits = _parse_split_kwargs(**kwargs) + + download_dir = _get_download_dir(config._repo_id, **kwargs) + + init_download_kwargs = { + "repo_id": config._repo_id, + "repo_type": "dataset", + "local_dir": download_dir, + } + + dataset_type_name = config._format.strip() + + if dataset_type_name == "FiftyOneDataset" and max_samples is not None: + # If the dataset is a FiftyOneDataset, download only the necessary files + with _no_progress_bars(): + hfh.snapshot_download( + **init_download_kwargs, + ignore_patterns="data/*", + ) + else: + with _no_progress_bars(): + hfh.snapshot_download( + **init_download_kwargs, + ) + + dataset_type = getattr( + __import__("fiftyone.types", fromlist=[dataset_type_name]), + dataset_type_name, + ) + + dataset_kwargs = { + "persistent": persistent, + "overwrite": overwrite, + "max_samples": max_samples, + "splits": splits, + "dataset_type": dataset_type, + } + + name = _resolve_dataset_name(config, **kwargs) + if name is not None: + dataset_kwargs["name"] = name + + dataset = fod.Dataset.from_dir(download_dir, **dataset_kwargs) + + if dataset_type_name != "FiftyOneDataset": + return dataset + + filepaths = _get_files_to_download(dataset) + if filepaths: + logger.info(f"Downloading {len(filepaths)} media files...") + filenames = [os.path.basename(fp) for fp in filepaths] + allowed_globs = ["data/" + fn for fn in filenames] + with _no_progress_bars(): + hfh.snapshot_download( + **init_download_kwargs, allow_patterns=allowed_globs + ) + return dataset + + +def _load_parquet_files_dataset_from_config(config, **kwargs): + logger.info("Loading parquet files dataset") + + allowed_splits = _get_allowed_splits(config, **kwargs) + allowed_subsets = _get_allowed_subsets(config, **kwargs) + + for key in ["splits", "split", "subsets", "subset"]: + if key in kwargs: + kwargs.pop(key) + + overwrite = kwargs.get("overwrite", False) + persistent = kwargs.get("persistent", False) + + split_subset_pairs = _get_split_subset_pairs(config, **kwargs) + + name_kwarg = kwargs.get("name", None) + if name_kwarg is not None: + name = name_kwarg + else: + name = config.name + max_samples = kwargs.get("max_samples", None) + if max_samples is not None: + name += f"-{max_samples}" + + dataset = fod.Dataset( + name=name, + persistent=persistent, + overwrite=overwrite, + ) + + for split, subset in split_subset_pairs: + if not _is_valid_split_subset_pair( + split, subset, allowed_splits, allowed_subsets + ): + continue + + _add_parquet_subset_to_dataset( + dataset, config, split, subset, **kwargs + ) + + _configure_dataset_media_fields(dataset, config) + _add_dataset_metadata(dataset, config) + return dataset
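+
+
+# Illustrative usage sketch (not part of the module code above): these private
+# helpers are meant to back a public loader entrypoint, assumed here to be
+# `load_from_hub()` defined elsewhere in this module, which builds a dataset
+# config from its kwargs and dispatches via `_load_dataset_from_config()`.
+# For a parquet-backed repo, such a call might look like:
+#
+#   import fiftyone.utils.huggingface as fouh
+#
+#   dataset = fouh.load_from_hub(
+#       "mnist",                        # repo_id on the Hugging Face Hub
+#       format="parquet",               # stream rows via the datasets server
+#       classification_fields="label",  # convert the `label` column to Classifications
+#       max_samples=100,                # cap the number of loaded rows
+#   )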