Commit

Document the code and finished solving final bugs
Eduardo Matallanas committed Nov 15, 2022
1 parent ec1f450 commit c21d613
Showing 12 changed files with 373 additions and 186 deletions.
2 changes: 1 addition & 1 deletion src/dataset/__init__.py
@@ -1,2 +1,2 @@
from .hf_dataset import *
from .transcript_dataset import *
22 changes: 17 additions & 5 deletions src/dataset/hf_dataset.py
@@ -1,13 +1,22 @@
from abc import ABC, abstractmethod
from typing import Optional
from datasets import load_dataset, Dataset
from datasets.data_files import EmptyDatasetError


class HFDataset(ABC):
"""
Create a dataset to save the transcripts from Youtube.
"""
def __init__(self, name) -> None:
"""Create a dataset to save the transcripts from Youtube."""

def __init__(self, name: str, token: Optional[str] = None):
"""Initialize the Hugging Face dataset.
Args:
name (str): Repository ID of the dataset or name of the dataset.
token (Optional[str], optional): Token to upload the dataset if necessary.
Defaults to None.
"""
self.name = name
self.token = token
if name != "":
self._init_dataset()
else:
@@ -17,9 +26,11 @@ def __init__(self, name) -> None:

@abstractmethod
def generate_dataset():
"""This method is called when you want to generate a dataset."""
pass

def _init_dataset(self):
"""Load dataset if exists."""
try:
self.dataset = load_dataset(self.name)
self.exist = True
@@ -36,4 +47,5 @@ def _init_dataset(self):
pass

def upload(self):
"""Push the dataset to the hub."""
self.dataset.push_to_hub(repo_id=self.name, token=self.token)
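
For illustration, a minimal sketch (not part of this commit) of how a concrete subclass of HFDataset can satisfy the interface above; the class name, the records argument, and the use of an empty name to skip loading from the Hub are assumptions.

from datasets import Dataset
from dataset.hf_dataset import HFDataset

class InMemoryDataset(HFDataset):
    """Hypothetical subclass used only to illustrate the HFDataset interface."""

    def generate_dataset(self, records):
        # Build a datasets.Dataset from a list of dicts and keep it on the instance.
        self.dataset = Dataset.from_list(records)

ds = InMemoryDataset(name="", token=None)  # empty name: nothing is loaded from the Hub
ds.generate_dataset([{"id": "abc123", "text": "hello"}])
# ds.upload() would push the dataset using self.name and self.token
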
100 changes: 81 additions & 19 deletions src/dataset/transcript_dataset.py
@@ -1,44 +1,97 @@
import glob
import os
from typing import Any, Optional
import validators
import pandas as pd
from downloader import WhisperPP, YoutubeDownloader
from interpreter import WhisperInterpreter
from datasets import load_dataset, concatenate_datasets, Dataset
from dataset.hf_dataset import HFDataset


class TranscriptDataset(HFDataset):
"""Create a TranscriptDataset."""

def __init__(self, name: str, token: Optional[str] = None):
"""A decorator to initialize the class.
Args:
name (str): Repository id
"""
super().__init__(name, token)

def generate_dataset(
self,
input: str,
download_path: str,
overwrite: bool,
whisper_postprocessor_config: Optional[Any]
):
"""Generate a transcript dataset from audio transcriptions.
def generate_dataset(self, input, download_path, overwrite, whisper_config):
Args:
input (str): An url, a path or afile to make the transcription.
download_path (str): Path to store all the files downloaded.
overwrite (bool): Flag to overwrite the data transcription.
**whisper_postprocessor_config (Optional[Any]): Dictionary with the
configuration of the postprocessor.
"""
if validators.url(input):
self.from_url(input, download_path, overwrite, **whisper_postprocessor_config)
else:
self.from_files(input, overwrite, **whisper_postprocessor_config)

def from_url(
self,
url: str,
download_path: str = "tmp/",
overwrite: bool = False,
**whisper_postprocessor_config: dict
):
"""Loads the whisper dataset from a URL.
Args:
url (str): Url to download the video.
download_path (str, optional): Path to store all the files downloaded.
Defaults to "tmp/".
overwrite (bool, optional): Flag to overwrite the data transcription.
Defaults to False.
**whisper_postprocessor_config (Optional[Any]): Dictionary with the
configuration of the postprocessor.
"""
if self.is_empty:
emptyDataset = self.dataset
else:
#emptyDataset=self.dataset["train"].filter(lambda e: e["id"] is None)
emptyDataset=self.dataset["train"]
print(self.dataset.info)
whisper_config["repoId"] = self.name
whisperPP = WhisperPP(emptyDataset, **whisper_config)
whisper_postprocessor_config["repoId"] = self.name
whisper_postprocessor_config["token"] = self.token
whisperPP = WhisperPP(emptyDataset, **whisper_postprocessor_config)
downloader = YoutubeDownloader(download_path)
if not overwrite:
downloader.config["download_archive"] = os.path.join(download_path,"video_record.txt")
self._fill_archive(downloader.config["download_archive"])
downloader.download(url, whisperPP)
self._concatenate_datasets(whisperPP.get_data())

def from_files(
self,
input:str,
overwrite: bool = False,
**whisper_postprocessor_config
):
"""Loads dataset from files.
Args:
input (str): Input path or file to create the dataset.
overwrite (bool, optional): Flag to overwrite the data transcription.
Defaults to False.
**whisper_postprocessor_config (Optional[Any]): Dictionary with the
configuration of the postprocessor.
"""
if (whisper_postprocessor_config.get("mode", None) is not None):
interpreter = WhisperInterpreter(whisper_postprocessor_config.pop("model_size"))
process = getattr(interpreter, whisper_postprocessor_config.pop("mode"))
whisper_postprocessor_config["write"] = overwrite
result = process(input, **whisper_postprocessor_config)
if type(result) == list:
dataset = Dataset.from_list(result)
else:
@@ -49,18 +102,27 @@ def from_files(self, input:str, overwrite: bool = False, **whisper_config):

self._concatenate_datasets(dataset)

def _fill_archive(self, archive_file: str):
"""Write the download archive from the dataset if it is not empty.
Args:
archive_file (str): Path to the archive file listing previously
transcribed videos.
"""
if not self.is_empty:
with open(archive_file, "w") as f:
for id in self.dataset["train"]["id"]:
f.write(f"youtube {id}\n")

def _concatenate_datasets(self, dataset):
"""Concatenate dataset with previos ids and not having duplicates.
Args:
dataset (list | Dataset): Data of video transcription.
"""
if not self.is_empty:
selectedIDs = list(set(dataset["id"])-set(self.dataset["train"]["id"]))
filteredDataset = dataset.filter(lambda element: element["id"] in selectedIDs)
self.dataset["train"] = concatenate_datasets([self.dataset["train"],filteredDataset])
else:
self.dataset = dataset
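
To show how the pieces above fit together, here is a hedged usage sketch (not taken from the repository); the repository ID, token, URL, and postprocessor options are placeholder assumptions.

from dataset import TranscriptDataset

ds = TranscriptDataset(name="user/yt-transcripts", token="hf_xxx")  # hypothetical repo ID and token
ds.generate_dataset(
    input="https://www.youtube.com/watch?v=XXXXXXXXXXX",  # placeholder video or playlist URL
    download_path="tmp/",
    overwrite=False,
    whisper_postprocessor_config={
        "model_size": "base",  # whisper model used by WhisperInterpreter
        "mode": "transcribe",  # or "translate"
        "write": True,         # WhisperPP pops this key, so it must be provided
    },
)
ds.upload()  # push the concatenated dataset to the Hub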


2 changes: 1 addition & 1 deletion src/downloader/__init__.py
@@ -1,3 +1,3 @@
from .downloader import *
from .youtube_downloader import *
from .whisper_post_processor import *
16 changes: 10 additions & 6 deletions src/downloader/downloader.py
@@ -1,14 +1,18 @@
from abc import ABC, abstractmethod



class Downloader(ABC):
"""
A video downloader from online platforms to a specified format
"""
"""Video downloader from online platforms to a specified format."""

@abstractmethod
def __init__(self, download_path: str):
"""Initialize the downloader with the download path.
Args:
download_path: str, Path where the downloaded output is stored.
"""
self.download_path = download_path

@abstractmethod
def download(self):
pass
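
As a point of reference, a minimal hypothetical subclass of this abstract class could look as follows (YoutubeDownloader below is the real implementation in this repository); the LocalCopyDownloader name and its behaviour are invented for illustration.

import shutil
from downloader import Downloader

class LocalCopyDownloader(Downloader):
    """Hypothetical downloader that simply copies a local file into download_path."""

    def __init__(self, download_path: str):
        super().__init__(download_path)

    def download(self, source: str):
        # "Download" by copying a local file into the configured directory.
        shutil.copy(source, self.download_path)
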
57 changes: 46 additions & 11 deletions src/downloader/whisper_post_processor.py
@@ -1,23 +1,42 @@
import re
from typing import Any, Optional, Union
from interpreter import WhisperInterpreter
from utils import VIDEO_INFO, json_dump
from yt_dlp.postprocessor import PostProcessor
from datasets import Dataset


class WhisperPP(PostProcessor):
"""Whisper post-processor that transcribes the audio extracted from a
downloaded video.
"""

def __init__(self, data: Union[list,Dataset], **whisper_options: Optional[Any]):
"""Initialize the dataset to process information.
Args:
data: list or Dataset, Data structure to fill with the result of the transcription.
**whisper_options: Optional[Any], Options to process the audio with whisper.
"""
super().__init__()
self._options = whisper_options
interpreter = WhisperInterpreter(self._options.pop("model_size","base"))
self.data = data
self._process = getattr(interpreter, self._options.pop("mode","transcribe"))
self._write = self._options.pop("write")
self.videos_to_process = self._options.pop("number_videos",0)
print(self.videos_to_process)
self.repoId = self._options.pop("repoId",self._get_name())
print(self.repoId)
self.token = self._options.pop("token",None)

def run(self, info: Any):
"""Run the audio extracted from the video through whisper.
Args:
info: Any, All the info and tags extracted from the downloaded video.
Returns:
(list, Any): An empty list and the info object required by the yt_dlp library.
"""
self.to_screen(f"Processing Video {info['id']}")
result = {key: info[key] for key in VIDEO_INFO}
result.update(self._process(info["filepath"], **self._options))
@@ -27,23 +46,39 @@ def run(self, info):
json_dump(result, f"{info['filepath'].split('.')[0]}.json")
return [], info

def _update_data(self, record: dict):
"""Add the transcribed record to the data, either a Hugging Face dataset
or a list.
Args:
record: dict, Transcription of the video.
"""
dataType = type(self.data)
if dataType == list:
self.data.append(record)
else:
self.data = self.data.add_item(record)
if self.videos_to_process != 0 and self.data.num_rows%self.videos_to_process==0:
self.data.push_to_hub(repo_id=self.repoId, token=self.token)

def get_data(self):
"""Get the current data.
Returns:
list or Dataset: The data updated after processing the video or playlist.
"""
return self.data

def _get_name(self):
"""Get the name of the dataset.
Returns:
str: ID of the repository.
"""
if type(self.data) is Dataset and self.data.info.download_checksums is not None:
regex = r"(?<=datasets\/)(.*?)(?=\/resolve)"
repoId = re.compile(regex)
url = list(self.data.info.download_checksums.keys())[0]
return repoId.findall(url)[0]
return ""

37 changes: 32 additions & 5 deletions src/downloader/youtube_downloader.py
@@ -4,23 +4,50 @@
from yt_dlp.postprocessor import PostProcessor
from utils import YT_OPTIONS


class YoutubeDownloader(Downloader):

"""Download videos from YouTube with a given configuration."""

def __init__(self, download_path:str):
"""Create a downloader from youtube using specifying the path to save the output.
Args:
download_path: str, Path to download the resulting files.
"""
super().__init__(download_path)
self._ydl_options = YT_OPTIONS
self._ydl_options["outtmpl"] = os.path.join(download_path,"%(id)s.%(ext)s")


def download(self, url: str, CustomPP: PostProcessor, when: str = "post_process"):
"""Download the YouTube content.
Args:
url: str, URL of a video, playlist, or channel from YouTube.
CustomPP: PostProcessor, A custom post-processor to execute before or after
the download.
when: str, optional, When to execute the postprocessor. Defaults to
"post_process".
"""
with yt_dlp.YoutubeDL(self._ydl_options) as ydl:
ydl.add_post_processor(CustomPP, when=when)
ydl.download(url)

@property
def config(self):
"""Returns the configuration with the default values.
Returns:
dict: Configuration dictionary
"""
return self._ydl_options

@config.setter
def config(self, key: str, value: str):
"""Set a configuration parameter for the yt_dlp downloader.
Args:
key: str, Name of the property to modify.
value: str, Value of the property.
"""
self._ydl_options[key] = value
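
Finally, a hedged sketch (not from the diff) of driving YoutubeDownloader and WhisperPP directly, outside TranscriptDataset; the URL is a placeholder and the option values are assumptions based on the keys popped in WhisperPP.__init__.

from downloader import YoutubeDownloader, WhisperPP

downloader = YoutubeDownloader("tmp/")
whisper_pp = WhisperPP(
    data=[],          # collect the transcriptions in a plain list
    model_size="base",
    mode="transcribe",
    write=False,      # assumed to control dumping per-video JSON files
    number_videos=0,  # 0 disables the periodic push_to_hub
)
downloader.download("https://www.youtube.com/watch?v=XXXXXXXXXXX", whisper_pp)
print(whisper_pp.get_data())  # list of dicts with the video info and transcription
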
2 changes: 1 addition & 1 deletion src/interpreter/__init__.py
@@ -1,2 +1,2 @@
from .interpreter import *
from .whisper_interpreter import *