Commit

Document the code and finished solving final bugs
Eduardo Matallanas committed Nov 15, 2022
1 parent ec1f450 commit c21d613
Showing 12 changed files with 373 additions and 186 deletions.
2 changes: 1 addition & 1 deletion src/dataset/__init__.py
@@ -1,2 +1,2 @@
from .hf_dataset import *
from .transcript_dataset import *
22 changes: 17 additions & 5 deletions src/dataset/hf_dataset.py
@@ -1,13 +1,22 @@
from abc import ABC, abstractmethod
from typing import Optional
from datasets import load_dataset, Dataset
from datasets.data_files import EmptyDatasetError


class HFDataset(ABC):
"""
Create a dataset to save the transcripts from Youtube.
"""
def __init__(self, name) -> None:
"""Create a dataset to save the transcripts from Youtube."""

def __init__(self, name: str, token: Optional[str] = None):
"""Initialize the Hugging Face dataset.
Args:
name (str): Repository ID of the dataset or name of the dataset.
token (Optional[str], optional): Token to upload the dataset if necessary.
Defaults to None.
"""
self.name = name
self.token = token
if name != "":
self._init_dataset()
else:
@@ -17,9 +26,11 @@ def __init__(self, name) -> None:

@abstractmethod
def generate_dataset():
"""This method is called when you want to generate a dataset."""
pass

def _init_dataset(self):
"""Load dataset if exists."""
try:
self.dataset = load_dataset(self.name)
self.exist = True
@@ -36,4 +47,5 @@ def _init_dataset(self):
pass

def upload(self):
"""Push the dataset to the hub."""
self.dataset.push_to_hub(repo_id=self.name, token=self.token)
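
For illustration, a minimal sketch (not part of this commit) of how a concrete subclass of HFDataset can satisfy the interface above; the class name, the records argument, and the use of an empty name to skip loading from the Hub are assumptions.

from datasets import Dataset
from dataset.hf_dataset import HFDataset

class InMemoryDataset(HFDataset):
    """Hypothetical subclass used only to illustrate the HFDataset interface."""

    def generate_dataset(self, records):
        # Build a datasets.Dataset from a list of dicts and keep it on the instance.
        self.dataset = Dataset.from_list(records)

ds = InMemoryDataset(name="", token=None)  # empty name: nothing is loaded from the Hub
ds.generate_dataset([{"id": "abc123", "text": "hello"}])
# ds.upload() would push the dataset using self.name and self.token
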
100 changes: 81 additions & 19 deletions src/dataset/transcript_dataset.py
@@ -1,44 +1,97 @@
import glob
import os
from typing import Any, Optional
import validators
import pandas as pd
from downloader import WhisperPP, YoutubeDownloader
from interpreter import WhisperInterpreter
from datasets import load_dataset, concatenate_datasets, Dataset
from dataset.hf_dataset import HFDataset


class TranscriptDataset(HFDataset):
"""Create a TranscriptDataset."""

def __init__(self, name: str, token: Optional[str] = None):
"""A decorator to initialize the class.
Args:
name (str): Repository id
"""
super().__init__(name, token)

def generate_dataset(
self,
input: str,
download_path: str,
overwrite: bool,
whisper_postprocessor_config: Optional[Any]
):
"""Generate a transcript dataset from audio transcriptions.
def generate_dataset(self, input, download_path, overwrite, whisper_config):
Args:
input (str): An url, a path or afile to make the transcription.
download_path (str): Path to store all the files downloaded.
overwrite (bool): Flag to overwrite the data transcription.
**whisper_postprocessor_config (Optional[Any]): Dictionary with the
configuration of the postprocessor.
"""
if validators.url(input):
self.from_url(input, download_path, overwrite, **whisper_postprocessor_config)
else:
self.from_files(input, overwrite, **whisper_postprocessor_config)

def from_url(
self,
url: str,
download_path: str = "tmp/",
overwrite: bool = False,
**whisper_postprocessor_config: dict
):
"""Loads the whisper dataset from a URL.
Args:
url (str): Url to download the video.
download_path (str, optional): Path to store all the files downloaded.
Defaults to "tmp/".
overwrite (bool, optional): Flag to overwrite the data transcription.
Defaults to False.
**whisper_postprocessor_config (Optional[Any]): Dictionary with the
configuration of the postprocessor.
"""
if self.is_empty:
emptyDataset = self.dataset
else:
#emptyDataset=self.dataset["train"].filter(lambda e: e["id"] is None)
emptyDataset=self.dataset["train"]
print(self.dataset.info)
whisper_config["repoId"] = self.name
whisperPP = WhisperPP(emptyDataset, **whisper_config)
whisper_postprocessor_config["repoId"] = self.name
whisper_postprocessor_config["token"] = self.token
whisperPP = WhisperPP(emptyDataset, **whisper_postprocessor_config)
downloader = YoutubeDownloader(download_path)
if not overwrite:
downloader.config["download_archive"] = os.path.join(download_path,"video_record.txt")
self._fill_archive(downloader.config["download_archive"])
downloader.download(url, whisperPP)
self._concatenate_datasets(whisperPP.get_data())

def from_files(
self,
input:str,
overwrite: bool = False,
**whisper_postprocessor_config
):
"""Loads dataset from files.
Args:
input (str): Input path or file to create the dataset.
overwrite (bool, optional): Flag to overwrite the data transcription.
Defaults to False.
**whisper_postprocessor_config (Optional[Any]): Dictionary with the
configuration of the postprocessor.
"""
if (whisper_postprocessor_config.get("mode", None) is not None):
interpreter = WhisperInterpreter(whisper_postprocessor_config.pop("model_size"))
process = getattr(interpreter, whisper_postprocessor_config.pop("mode"))
whisper_postprocessor_config["write"] = overwrite
result = process(input, **whisper_postprocessor_config)
if type(result) == list:
dataset = Dataset.from_list(result)
else:
@@ -49,18 +102,27 @@ def from_files(self, input:str, overwrite: bool = False, **whisper_config):

self._concatenate_datasets(dataset)

def _fill_archive(self, archive_file: str):
"""Write the download archive from the dataset if it is not empty.
Args:
archive_file (str): Path to the archive file listing previously
transcribed videos.
"""
if not self.is_empty:
with open(archive_file, "w") as f:
for id in self.dataset["train"]["id"]:
f.write(f"youtube {id}\n")

def _concatenate_datasets(self, dataset):
"""Concatenate dataset with previos ids and not having duplicates.
Args:
dataset (list | Dataset): Data of video transcription.
"""
if not self.is_empty:
selectedIDs = list(set(dataset["id"])-set(self.dataset["train"]["id"]))
filteredDataset = dataset.filter(lambda element: element["id"] in selectedIDs)
self.dataset["train"] = concatenate_datasets([self.dataset["train"],filteredDataset])
else:
self.dataset = dataset
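
To show how the pieces above fit together, here is a hedged usage sketch (not taken from the repository); the repository ID, token, URL, and postprocessor options are placeholder assumptions.

from dataset import TranscriptDataset

ds = TranscriptDataset(name="user/yt-transcripts", token="hf_xxx")  # hypothetical repo ID and token
ds.generate_dataset(
    input="https://www.youtube.com/watch?v=XXXXXXXXXXX",  # placeholder video or playlist URL
    download_path="tmp/",
    overwrite=False,
    whisper_postprocessor_config={
        "model_size": "base",  # whisper model used by WhisperInterpreter
        "mode": "transcribe",  # or "translate"
        "write": True,         # WhisperPP pops this key, so it must be provided
    },
)
ds.upload()  # push the concatenated dataset to the Hub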


2 changes: 1 addition & 1 deletion src/downloader/__init__.py
@@ -1,3 +1,3 @@
from .downloader import *
from .youtube_downloader import *
from .whisper_post_processor import *
16 changes: 10 additions & 6 deletions src/downloader/downloader.py
@@ -1,14 +1,18 @@
from abc import ABC, abstractmethod



class Downloader(ABC):
"""
A video downloader from online platforms to a specified format
"""
"""Video downloader from online platforms to a specified format."""

@abstractmethod
def __init__(self, download_path: str):
"""Initialize the downloader with the download path.
Args:
download_path: str, Path where the downloaded output is stored.
"""
self.download_path = download_path

@abstractmethod
def download(self):
pass
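
As a point of reference, a minimal hypothetical subclass of this abstract class could look as follows (YoutubeDownloader below is the real implementation in this repository); the LocalCopyDownloader name and its behaviour are invented for illustration.

import shutil
from downloader import Downloader

class LocalCopyDownloader(Downloader):
    """Hypothetical downloader that simply copies a local file into download_path."""

    def __init__(self, download_path: str):
        super().__init__(download_path)

    def download(self, source: str):
        # "Download" by copying a local file into the configured directory.
        shutil.copy(source, self.download_path)
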
57 changes: 46 additions & 11 deletions src/downloader/whisper_post_processor.py
@@ -1,23 +1,42 @@
import re
from typing import Any, Optional, Union
from interpreter import WhisperInterpreter
from utils import VIDEO_INFO, json_dump
from yt_dlp.postprocessor import PostProcessor
from datasets import Dataset


class WhisperPP(PostProcessor):
"""Whisper post-processor that transcribes the audio extracted from a
downloaded video.
"""

def __init__(self, data: Union[list,Dataset], **whisper_options: Optional[Any]):
"""Initialize the dataset to process information.
Args:
data: list or Dataset, Data structure to fill with the result of the transcription.
**whisper_options: Optional[Any], Options to process the audio with whisper.
"""
super().__init__()
self._options = whisper_options
interpreter = WhisperInterpreter(self._options.pop("model_size","base"))
self.data = data
self._process = getattr(interpreter, self._options.pop("mode","transcribe"))
self._write = self._options.pop("write")
self.videos_to_process = self._options.pop("number_videos",0)
print(self.videos_to_process)
self.repoId = self._options.pop("repoId",self._get_name())
print(self.repoId)
self.token = self._options.pop("token",None)

def run(self, info: Any):
"""Run the audio extracted from the video through whisper.
Args:
info: Any, All the info and tags extracted from the downloaded video.
Returns:
(list, Any): An empty list and the info object required by the yt_dlp library.
"""
self.to_screen(f"Processing Video {info['id']}")
result = {key: info[key] for key in VIDEO_INFO}
result.update(self._process(info["filepath"], **self._options))
@@ -27,23 +46,39 @@ def run(self, info):
json_dump(result, f"{info['filepath'].split('.')[0]}.json")
return [], info

def _update_data(self, record: dict):
"""Add the transcribed record to the data, either a Hugging Face dataset
or a list.
Args:
record: dict, Transcription of the video.
"""
dataType = type(self.data)
if dataType == list:
self.data.append(record)
else:
self.data = self.data.add_item(record)
if self.videos_to_process != 0 and self.data.num_rows%self.videos_to_process==0:
self.data.push_to_hub(repo_id=self.repoId, token=self.token)

def get_data(self):
"""Get the current data.
Returns:
list or Dataset: The data updated after processing the video or playlist.
"""
return self.data

def _get_name(self):
"""Get the name of the dataset.
Returns:
str: ID of the repository.
"""
if type(self.data) is Dataset and self.data.info.download_checksums is not None:
regex = r"(?<=datasets\/)(.*?)(?=\/resolve)"
repoId = re.compile(regex)
url = list(self.data.info.download_checksums.keys())[0]
return repoId.findall(url)[0]
return ""

37 changes: 32 additions & 5 deletions src/downloader/youtube_downloader.py
@@ -4,23 +4,50 @@
from yt_dlp.postprocessor import PostProcessor
from utils import YT_OPTIONS


class YoutubeDownloader(Downloader):

"""Download videos from YouTube with a given configuration."""

def __init__(self, download_path:str):
"""Create a downloader from youtube using specifying the path to save the output.
Args:
download_path: str, Path to download the resulting files.
"""
super().__init__(download_path)
self._ydl_options = YT_OPTIONS
self._ydl_options["outtmpl"] = os.path.join(download_path,"%(id)s.%(ext)s")


def download(self, url: str, CustomPP: PostProcessor, when: str = "post_process"):
"""Download the YouTube content.
Args:
url: str, URL of a video, playlist, or channel from YouTube.
CustomPP: PostProcessor, A custom post-processor to execute before or after
the download.
when: str, optional, When to execute the postprocessor. Defaults to
"post_process".
"""
with yt_dlp.YoutubeDL(self._ydl_options) as ydl:
ydl.add_post_processor(CustomPP, when=when)
ydl.download(url)

@property
def config(self):
"""Returns the configuration with the default values.
Returns:
dict: Configuration dictionary
"""
return self._ydl_options

@config.setter
def config(self, key: str, value: str):
"""Set a configuration parameter for the yt_dlp downloader.
Args:
key: str, Name of the property to modify.
value: str, Value of the property.
"""
self._ydl_options[key] = value
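
Finally, a hedged sketch (not from the diff) of driving YoutubeDownloader and WhisperPP directly, outside TranscriptDataset; the URL is a placeholder and the option values are assumptions based on the keys popped in WhisperPP.__init__.

from downloader import YoutubeDownloader, WhisperPP

downloader = YoutubeDownloader("tmp/")
whisper_pp = WhisperPP(
    data=[],          # collect the transcriptions in a plain list
    model_size="base",
    mode="transcribe",
    write=False,      # assumed to control dumping per-video JSON files
    number_videos=0,  # 0 disables the periodic push_to_hub
)
downloader.download("https://www.youtube.com/watch?v=XXXXXXXXXXX", whisper_pp)
print(whisper_pp.get_data())  # list of dicts with the video info and transcription
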
2 changes: 1 addition & 1 deletion src/interpreter/__init__.py
@@ -1,2 +1,2 @@
from .interpreter import *
from .whisper_interpreter import *