
Merge branch 'activity-net' of github.com:voxel51/fiftyone into activity-net
ehofesmann committed Feb 5, 2022
2 parents f4680e9 + 17c6818 commit a3c0a1c
Showing 5 changed files with 128 additions and 39 deletions.
14 changes: 7 additions & 7 deletions docs/source/integrations/activitynet.rst
@@ -18,8 +18,8 @@ Loading the ActivityNet dataset
_______________________________

The FiftyOne Dataset Zoo provides support for loading both the
-:ref:`ActivityNet-100 <dataset-zoo-activitynet-100>` and
-:ref:`ActivityNet-200 <dataset-zoo-activitynet-200>` datasets.
+:ref:`ActivityNet 100 <dataset-zoo-activitynet-100>` and
+:ref:`ActivityNet 200 <dataset-zoo-activitynet-200>` datasets.

Like all other zoo datasets, you can use
:func:`load_zoo_dataset() <fiftyone.zoo.datasets.load_zoo_dataset>` to download
@@ -31,7 +31,7 @@ and load an ActivityNet split into FiftyOne:
import fiftyone as fo
import fiftyone.zoo as foz
-# Download and load 10 samples from the validation split of ActivityNet-200
+# Download and load 10 samples from the validation split of ActivityNet 200
dataset = foz.load_zoo_dataset(
"activitynet-200",
split="validation",
@@ -42,9 +42,9 @@
.. note::

-ActivityNet-200 is a superset of ActivityNet-100 so we have made sure
+ActivityNet 200 is a superset of ActivityNet 100 so we have made sure
to only store one copy of every video on disk. Videos in the
-ActivityNet-100 zoo directory are used directly by ActivityNet-200.
+ActivityNet 100 zoo directory are used directly by ActivityNet 200.

.. _activitynet-partial-downloads:

@@ -103,7 +103,7 @@ first if possible before resorting to downloading additional data from YouTube.
session.dataset = dataset
The following parameters are available to configure partial downloads of both
-ActivityNet-100 and ActivityNet-200 by passing them to
+ActivityNet 100 and ActivityNet 200 by passing them to
:func:`load_zoo_dataset() <fiftyone.zoo.datasets.load_zoo_dataset>`:

- **split** (*None*) and **splits** (*None*): a string or list of strings,
@@ -161,7 +161,7 @@ After downloading the source files, they can be loaded into FiftyOne like so:
source_dir = "/path/to/dir-with-activitynet-files"
-# Load the entire ActivityNet-200 dataset into FiftyOne
+# Load the entire ActivityNet 200 dataset into FiftyOne
dataset = foz.load_zoo_dataset("activitynet-200", source_dir=source_dir)
session = fo.launch_app(dataset)
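
For reference, the partial-download parameters described above can be combined into a single call. The following is only a sketch: the ``classes`` and ``max_samples`` keyword names and the class name are assumptions for illustration, since the full parameter list is collapsed in this diff.

    import fiftyone as fo
    import fiftyone.zoo as foz

    # Download at most 5 validation videos from one example class
    # (`classes`, `max_samples`, and the class name are assumed for illustration)
    dataset = foz.load_zoo_dataset(
        "activitynet-200",
        split="validation",
        classes=["Bathing dog"],
        max_samples=5,
    )

    session = fo.launch_app(dataset)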
16 changes: 8 additions & 8 deletions docs/source/user_guide/dataset_zoo/datasets.rst
@@ -69,7 +69,7 @@ This page lists all of the datasets available in the Dataset Zoo.

.. _dataset-zoo-activitynet-100:

-ActivityNet-100
+ActivityNet 100
---------------

ActivityNet is a large-scale video dataset for human activity understanding
@@ -86,7 +86,7 @@ version of the dataset.

**Notes**

-- ActivityNet-100 and -200 differ in the number of activity classes and
+- ActivityNet 100 and 200 differ in the number of activity classes and
videos per split
- Partial downloads will download videos (if still available) from YouTube
- Full splits can be loaded by first downloading the official source files
@@ -118,7 +118,7 @@ specified, FiftyOne will use existing downloaded data first if possible before
resorting to downloading additional data from YouTube.

The following parameters are available to configure a partial download of
-ActivityNet-100 by passing them to
+ActivityNet 100 by passing them to
:func:`load_zoo_dataset() <fiftyone.zoo.datasets.load_zoo_dataset>`:

- **split** (*None*) and **splits** (*None*): a string or list of strings,
@@ -273,7 +273,7 @@ full splits by passing the `source_dir` parameter to

.. _dataset-zoo-activitynet-200:

-ActivityNet-200
+ActivityNet 200
---------------

ActivityNet is a large-scale video dataset for human activity understanding
@@ -290,9 +290,9 @@ version of the dataset.

**Notes**

-- ActivityNet-200 is a superset of ActivityNet-100 videos
-- ActivityNet-100 and -200 differ in the number of activity classes and
-videos per split
+- ActivityNet 200 is a superset of ActivityNet 100
+- ActivityNet 100 and 200 differ in the number of activity classes and videos
+per split
- Partial downloads will download videos (if still available) from YouTube
- Full splits can be loaded by first downloading the official source files
from the
@@ -323,7 +323,7 @@ specified, FiftyOne will use existing downloaded data first if possible before
resorting to downloading additional data from YouTube.

The following parameters are available to configure a partial download of
-ActivityNet-200 by passing them to
+ActivityNet 200 by passing them to
:func:`load_zoo_dataset() <fiftyone.zoo.datasets.load_zoo_dataset>`:

- **split** (*None*) and **splits** (*None*): a string or list of strings,
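
As a quick check that both zoo entries documented above are registered, the available dataset names can be listed programmatically. A minimal sketch (the prefix filter is only for illustration):

    import fiftyone.zoo as foz

    # Print the ActivityNet entries available in the Dataset Zoo
    names = [n for n in foz.list_zoo_datasets() if n.startswith("activitynet")]
    print(names)  # expected to include "activitynet-100" and "activitynet-200"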
38 changes: 21 additions & 17 deletions fiftyone/utils/activitynet.py
@@ -70,12 +70,12 @@ def download_activitynet_split(
Returns:
a tuple of:
-- num_samples: the total number of downloaded videos, or ``None`` if
-everything was already downloaded
-- classes: the list of all classes, or ``None`` if everything was
+- **num_samples**: the total number of downloaded videos, or ``None``
+if everything was already downloaded
+- **classes**: the list of all classes, or ``None`` if everything was
already downloaded
-- did_download: whether any content was downloaded (True) or if all
-necessary files were already downloaded (False)
+- **did_download**: whether any content was downloaded (True) or if
+all necessary files were already downloaded (False)
"""
manager = ActivityNetDatasetManager.from_dataset_dir(dataset_dir, version)

@@ -222,7 +222,7 @@ def setup(self):

process_uuids = True
if self.labels_path is not None and os.path.isfile(self.labels_path):
-labels = etas.load_json(self.labels_path)
+labels = etas.read_json(self.labels_path)
info = ActivityNetInfo(labels)
sample_ids = self._video_paths_map.keys()

@@ -643,7 +643,6 @@ def _separate_versions_and_attempt_to_download(
remaining_samples = num_samples

if num_a100_ids:
-logger.info("Downloading videos...")
if num_samples is None:
num_to_download = num_a100_ids
else:
@@ -664,7 +663,6 @@
)

if remaining_samples:
-logger.info("Downloading ActivityNet-200-specific videos...")
a200_downloaded_ids, a200_errors = self._attempt_to_download(
self.a200_info.data_dir(split),
a200_ids,
@@ -690,6 +688,7 @@ def _attempt_to_download(
download_urls.append(sample_info["url"])
download_paths.append(download_path)

+logger.info("Downloading videos from YouTube...")
downloaded, errors = fouy.download_youtube_videos(
urls=download_urls,
video_paths=download_paths,
@@ -705,7 +704,7 @@

def _merge_and_write_errors(self, download_errors, error_path):
if os.path.isfile(error_path):
-prev_errors = etas.load_json(error_path)
+prev_errors = etas.read_json(error_path)
else:
prev_errors = {}

@@ -881,8 +880,9 @@ def format_annotations(self, sample_ids, split=None):


class ActivityNetSplitInfo(ActivityNetInfo):
-"""Contains information related to paths, labels, and sample ids of a
-single split"""
+"""Class that contains information related to paths, labels, and sample IDs
+of a single ActivityNet split.
+"""

def __init__(self, split_dir, version=None, raw_annotations=None):
self.split_dir = os.path.abspath(split_dir)
@@ -949,10 +949,11 @@ def _get_raw_annotations(self, version=None):
"annotations have not been loaded, then a version must be "
"provided." % self.raw_anno_path
)

anno_link = _ANNOTATION_DOWNLOAD_LINKS[version]
-etaw.download_file(anno_link, path=self.raw_anno_path, quiet=True)
+etaw.download_file(anno_link, path=self.raw_anno_path)

-return etas.load_json(self.raw_anno_path)
+return etas.read_json(self.raw_anno_path)


class ActivityNetDatasetInfo(ActivityNetInfo):
Expand All @@ -975,6 +976,7 @@ def split_info(self, split):
version=self.version,
raw_annotations=self.raw_annotations,
)

return self._split_infos[split]

@property
@@ -1027,7 +1029,9 @@ def _get_existing_sample_ids(self):
split_ids = split_info.existing_sample_ids
else:
split_ids = []

ids[split] = split_ids

return ids

def cleanup_split(self, split):
@@ -1036,9 +1040,9 @@ def cleanup_split(self, split):
def _get_raw_annotations(self):
if not os.path.isfile(self.raw_anno_path):
anno_link = _ANNOTATION_DOWNLOAD_LINKS[self.version]
-etaw.download_file(anno_link, path=self.raw_anno_path, quiet=True)
+etaw.download_file(anno_link, path=self.raw_anno_path)

-return etas.load_json(self.raw_anno_path)
+return etas.read_json(self.raw_anno_path)

@classmethod
def get_dir_info(cls, dataset_dir):
@@ -1072,7 +1076,7 @@ def get_sample_dataset_version(self, sample_id):


class ActivityNet100DatasetInfo(ActivityNetDatasetInfo):
-"""ActivityNet-100 dataset info."""
+"""ActivityNet 100 dataset info."""

@property
def version(self):
Expand All @@ -1092,7 +1096,7 @@ def update_existing_sample_ids(self):


class ActivityNet200DatasetInfo(ActivityNetDatasetInfo):
-"""ActivityNet-200 dataset info."""
+"""ActivityNet 200 dataset info."""

def __init__(self, foz_dir):
self.a100_info = ActivityNet100DatasetInfo(foz_dir)
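
To make the ``(num_samples, classes, did_download)`` return values documented near the top of this file concrete, a hypothetical call might be unpacked as follows. The argument order and the ``split``/``max_samples`` keyword names are assumptions for illustration, since the full signature is collapsed in this diff; only ``dataset_dir`` and ``version`` appear above.

    import fiftyone.utils.activitynet as foua

    # Hypothetical invocation; argument names other than dataset_dir/version are assumed
    num_samples, classes, did_download = foua.download_activitynet_split(
        "/tmp/activitynet",  # dataset_dir
        "validation",        # split (assumed)
        version="200",
        max_samples=10,      # assumed
    )

    if did_download:
        print("Downloaded %s videos spanning %d classes" % (num_samples, len(classes)))
    else:
        print("All requested videos were already on disk")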
93 changes: 89 additions & 4 deletions fiftyone/utils/youtube.py
@@ -5,11 +5,13 @@
| `voxel51.com <https://voxel51.com/>`_
|
"""
-import logging
+import importlib
import itertools
+import logging
import multiprocessing
import multiprocessing.dummy
import os
+import pkg_resources

import numpy as np

@@ -18,9 +20,14 @@

import fiftyone.core.utils as fou

-pytube = fou.lazy_import(
-"pytube", callback=lambda: fou.ensure_package("pytube"),
-)

+def _ensure_pytube():
+fou.ensure_package("pytube>=11.0.2")
+if pkg_resources.get_distribution("pytube").version == "11.0.2":
+_patch_pytube_cypher()


+pytube = fou.lazy_import("pytube", callback=_ensure_pytube)


logger = logging.getLogger(__name__)
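
For context on the ``fou.lazy_import`` pattern above: importing ``pytube`` is deferred until the module is first used, which gives the ``_ensure_pytube`` callback a chance to install and patch the package beforehand. A rough, generic sketch of the idea (not FiftyOne's actual ``lazy_import`` implementation) is:

    import importlib
    import types

    class LazyModule(types.ModuleType):
        """Illustrative lazy loader that resolves the real module on first attribute access."""

        def __init__(self, name, callback=None):
            super().__init__(name)
            self._lazy_name = name
            self._lazy_callback = callback
            self._lazy_module = None

        def __getattr__(self, attr):
            if self._lazy_module is None:
                if self._lazy_callback is not None:
                    self._lazy_callback()  # e.g., ensure/patch the package first
                self._lazy_module = importlib.import_module(self._lazy_name)
            return getattr(self._lazy_module, attr)

    # Demo with a module that is always available; the real import happens on first use
    lazy_json = LazyModule("json", callback=lambda: print("preparing json"))
    print(lazy_json.dumps({"ok": True}))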
@@ -252,6 +259,7 @@ def _download(tasks, max_videos, skip_failures, quiet):
raise ValueError(msg)

errors[idx] = error
+pb.draw()
else:
pb.update()
downloaded[idx] = video_path
@@ -290,6 +298,7 @@ def _download_multi(
raise ValueError(msg)

errors[idx] = error
+pb.draw()
else:
pb.update()
downloaded[idx] = video_path
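
The public entry point behind these download workers is ``download_youtube_videos()``. Based on the call site in ``fiftyone/utils/activitynet.py`` shown earlier in this diff, a minimal usage sketch looks like the following; the URL and output path are placeholders, and the index-keyed return format is inferred from the ``downloaded[idx]``/``errors[idx]`` assignments above.

    import fiftyone.utils.youtube as fouy

    # Placeholder inputs for illustration
    urls = ["https://www.youtube.com/watch?v=<video-id>"]
    video_paths = ["/tmp/youtube/video0.mp4"]

    downloaded, errors = fouy.download_youtube_videos(
        urls=urls,
        video_paths=video_paths,
    )

    print(downloaded)  # e.g., {0: "/tmp/youtube/video0.mp4"} for successful downloads
    print(errors)      # e.g., {0: "<error message>"} for failed downloads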
@@ -472,3 +481,79 @@ def _download_clip(stream, clip_segment, video_path):
etav.extract_clip(
stream.url, video_path, start_time=start_time, duration=duration
)


def _patch_pytube_cypher():
filepath = os.path.normpath(
os.path.join(
os.path.dirname(importlib.util.find_spec("pytube").origin),
"cipher.py",
)
)

find = """
function_patterns = [
# https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-865985377
# a.C&&(b=a.get("n"))&&(b=Dea(b),a.set("n",b))}};
# In above case, `Dea` is the relevant function name
r'a\.[A-Z]&&\(b=a\.get\("n"\)\)&&\(b=([^(]+)\(b\)',
]
logger.debug('Finding throttling function name')
for pattern in function_patterns:
regex = re.compile(pattern)
function_match = regex.search(js)
if function_match:
logger.debug("finished regex search, matched: %s", pattern)
return function_match.group(1)"""

replace = """
# Patched by FiftyOne: https://github.com/voxel51/fiftyone
# PR: https://github.com/pytube/pytube/pull/1222
function_patterns = [
# https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-865985377
# https://github.com/yt-dlp/yt-dlp/commit/48416bc4a8f1d5ff07d5977659cb8ece7640dcd8
# var Bpa = [iha];
# ...
# a.C && (b = a.get("n")) && (b = Bpa[0](b), a.set("n", b),
# Bpa.length || iha("")) }};
# In the above case, `iha` is the relevant function name
r'a\.[a-zA-Z]\s*&&\s*\([a-z]\s*=\s*a\.get\("n"\)\)\s*&&\s*'
r'\([a-z]\s*=\s*([a-zA-Z0-9$]{3})(\[\d+\])?\([a-z]\)',
]
logger.debug('Finding throttling function name')
for pattern in function_patterns:
regex = re.compile(pattern)
function_match = regex.search(js)
if function_match:
logger.debug("finished regex search, matched: %s", pattern)
if len(function_match.groups()) == 1:
return function_match.group(1)
idx = function_match.group(2)
if idx:
idx = idx.strip("[]")
array = re.search(
r'var {nfunc}\s*=\s*(\[.+?\]);'.format(
nfunc=function_match.group(1)),
js
)
if array:
array = array.group(1).strip("[]").split(",")
array = [x.strip() for x in array]
return array[int(idx)]"""

try:
with open(filepath, "r") as f:
code = f.read()

if find in code:
logger.debug("Patching '%s'", filepath)
fixed = code.replace(find, replace)
with open(filepath, "w") as f:
f.write(fixed)
elif replace in code:
logger.debug("Already patched '%s'", filepath)
else:
logger.debug("Unable to patch '%s'", filepath)
except Exception as e:
logger.debug(e)
logger.debug("Unable to patch '%s'", filepath)
6 changes: 3 additions & 3 deletions fiftyone/zoo/datasets/base.py
@@ -45,7 +45,7 @@ class version of the dataset.
Notes:
-- ActivityNet-100 and -200 differ in the number of activity classes and
+- ActivityNet 100 and 200 differ in the number of activity classes and
videos per split
- Partial downloads will download videos (if still available) from
YouTube
@@ -224,8 +224,8 @@ class version of the dataset.
Notes:
-- ActivityNet-200 is a superset of ActivityNet-100 videos
-- ActivityNet-100 and -200 differ in the number of activity classes and
+- ActivityNet 200 is a superset of ActivityNet 100
+- ActivityNet 100 and 200 differ in the number of activity classes and
videos per split
- Partial downloads will download videos (if still available) from
YouTube