forked from kanjieater/SubPlz
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgen.py
106 lines (96 loc) · 3.25 KB
/
gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import argparse
import sys, os
import stable_whisper
import ffmpeg
import multiprocessing
# from subsai import SubsAI
import traceback
from os import path
from pathlib import Path
from datetime import datetime, timedelta
from tqdm.contrib.concurrent import process_map
from tqdm import tqdm
from pprint import pprint
from utils import read_vtt, write_sub, grab_files
# from split_sentences import split_sentences
from run import get_working_folders, generate_transcript_from_audio, get_model
# File extensions (without the leading dot), grouped by media kind.
audio_formats = [
    'aac', 'ac3', 'alac', 'ape', 'flac', 'mp3',
    'm4a', 'ogg', 'opus', 'wav', 'm4b',
]
video_formats = [
    '3g2', '3gp', 'avi', 'flv', 'm4v',
    'mkv', 'mov', 'mp4', 'mpeg', 'webm',
]
subtitle_formats = ['ass', 'srt', 'vtt']

# Wildcard patterns of the form '*.ext' for every video and audio extension
# above; video patterns come first, then audio.
SUPPORTED_FORMATS = [
    f'*.{extension}' for extension in (*video_formats, *audio_formats)
]
def generate_subs(files, model):
    """Generate a subtitle file for each of the given media files.

    Files are processed one at a time. If processing a file raises, the
    traceback is printed, the failure is recorded, and the loop moves on to
    the next file, so one bad file does not abort the whole batch.

    Args:
        files: Iterable of media file paths to transcribe.
        model: Model object passed straight through to
            ``generate_transcript_from_audio``.

    Returns:
        A new list containing one
        ``{"working_folder": <file path>, "err": <exception>}`` dict for each
        file that failed. The list is empty when every file succeeded.
    """
    # Collect failures in a list owned by this call. Relying on a module-level
    # ``failures`` list would make the function unusable when imported and
    # would hand the caller back the very list it is accumulating into.
    failures = []

    ext = 'srt'
    lang_code = '.ja'

    for file in files:
        try:
            # Replace the media extension with the language suffix, e.g.
            # "ep01.mkv" -> "ep01.ja". NOTE(review): presumably the helper
            # appends ``ext`` to this path -- confirm against run.py.
            sub_path = str(Path(file).with_suffix(lang_code))
            generate_transcript_from_audio(file, sub_path, model, ext)
        except Exception as err:
            # Deliberate best-effort boundary: report and keep going.
            tb = traceback.format_exc()
            pprint(tb)
            # The key is "working_folder" (although the value is a file path)
            # because the summary printer reads that key.
            failures.append({"working_folder": file, "err": err})

    return failures
if __name__ == "__main__":
    # Script entry point: find the supported media files in each requested
    # folder, generate subtitles for them, then print a summary of which
    # folders were processed and which files failed.
    parser = argparse.ArgumentParser(description="Generate subs for files")
    parser.add_argument(
        "-d",
        "--dirs",
        dest="dirs",
        default=None,
        required=True,
        type=str,
        nargs="+",
        help="List of folders to run generate subs for",
    )
    # NOTE(review): the four options below are accepted on the command line
    # but are not read anywhere in this script as shown; only ``--dirs`` is.
    parser.add_argument(
        "--use-filtered-cache",
        help="Uses cached filtered files and skips cleanup. Skips making the filtered audio files.",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "--use-transcript-cache",
        help="Uses cached transcript files and skips cleanup. Skips the whisper step.",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "--align",
        help="Align existing subs",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-sd",
        "--subs_dir",
        dest="subs_dir",
        default=None,
        required=False,
        type=str,
        nargs="+",
        help="List of folders that have subs, in the same order as dirs",
    )
    args = parser.parse_args()
    working_folders = get_working_folders(args.dirs)

    # Load the model once and reuse it for every file in every folder.
    model = get_model('large-v2')

    successes = []
    failures = []
    for working_folder in working_folders:
        print(working_folder)
        media_files = grab_files(working_folder, SUPPORTED_FORMATS)
        failures.extend(generate_subs(media_files, model))
        # A folder is listed as succeeded once it has been processed, even if
        # some of its files failed; per-file failures are reported separately.
        successes.append(working_folder)

    if successes:
        print(f"The following {len(successes)} succeeded:")
        pprint(successes)
    if failures:
        print(f"The following {len(failures)} failed:")
        for failure in failures:
            pprint(failure['working_folder'])
            pprint(failure['err'])