forked from AuYang261/BIT_yanhe_download
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gen_caption.py
96 lines (85 loc) · 3.04 KB
/
gen_caption.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import whisper
import time
from zhconv import convert # 简繁体转换
import sys
import os
def seconds_to_hmsm(seconds):
    """
    Convert a duration in seconds to a timestamp of the form ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond *before* it is split into
    fields. Truncating the fractional part instead loses a millisecond to
    floating-point representation error (e.g. 1.001 would yield ",000"), and
    rounding first also lets a carry propagate cleanly (59.9996 -> 00:01:00,000).

    @params:
        seconds - Required : duration in seconds (float)
    @returns:
        str - hours, minutes and seconds zero-padded to at least two digits,
              milliseconds zero-padded to three digits
    """
    total_ms = int(round(seconds * 1000))
    hours, total_ms = divmod(total_ms, 3_600_000)
    minutes, total_ms = divmod(total_ms, 60_000)
    secs, millis = divmod(total_ms, 1000)
    # Minimum-width padding: hours simply grow wider past 99.
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def _parse_index_list(text, count):
    """Parse comma-separated integers from ``text``, keeping those that index a list of ``count`` items."""
    picked = []
    for token in text.split(","):
        token = token.strip()
        if not token:
            continue
        try:
            index = int(token)
        except ValueError:
            print(f"ignored invalid selection: {token!r}")
            continue
        # Negative indices are accepted, as with ordinary list indexing.
        if not -count <= index < count:
            print(f"ignored out-of-range selection: {index}")
            continue
        picked.append(index)
    return picked


def main():
    """
    Generate an .srt subtitle file next to each selected video.

    The video is taken from the first command-line argument if present;
    otherwise every ``.mp4`` under the current directory is listed and the
    user picks one or more by number. A whisper model is then chosen
    interactively (falling back to "base"). For each video the audio track is
    extracted with ffmpeg, transcribed, written as ``<video stem>.srt`` and
    the extracted audio file is removed again.
    """
    # Function-scope import so this block is self-contained.
    import subprocess

    # Video file paths
    video_paths = []
    if len(sys.argv) >= 2:
        video_paths.append(sys.argv[1])
    else:
        files = []
        for dirpath, _dirnames, filenames in os.walk("."):
            for filename in filenames:
                if filename.endswith(".mp4"):
                    files.append(os.path.join(dirpath, filename).replace("\\", "/"))
        for i, f in enumerate(files):
            print(f"[{i}]: ", f)
        # Parsed as plain integers; eval() on typed input would run arbitrary code.
        selection = input("select a video file by input a num(split with ','): ")
        for i in _parse_index_list(selection, len(files)):
            video_paths.append(files[i])
    print("selected video files:", video_paths)
    if not video_paths:
        # Nothing to do: avoid prompting for (and loading) a model needlessly.
        return

    models = []
    for model in whisper.available_models():
        if ".en" in model:
            continue
        print(f"[{len(models)}]: ", model)
        models.append(model)
    model_index = input("select a model by input a num(default 'base'): ")
    try:
        model_name = models[int(model_index)]
    except (ValueError, IndexError):
        # Empty, non-numeric or out-of-range input selects the default.
        model_name = "base"
    print("selected model:", model_name)

    # Load once; the model does not depend on the video being processed.
    model = whisper.load_model(model_name, download_root="whisper_models/")

    for video_path in video_paths:
        # Swap only the extension; str.replace("mp4", ...) would also rewrite
        # any "mp4" occurring elsewhere in the path (e.g. a directory name).
        stem, _ext = os.path.splitext(video_path)
        audio_path = stem + ".m4a"
        srt_path = stem + ".srt"
        # If the input already is the .m4a file, transcribe it directly and
        # never delete it afterwards.
        extracted = os.path.normcase(os.path.abspath(audio_path)) != os.path.normcase(
            os.path.abspath(video_path)
        )

        if extracted:
            # Argument list instead of a shell string: no quoting problems
            # with unusual file names.
            cmd = [
                "ffmpeg",
                "-i",
                video_path,
                "-vn",
                "-ar",
                str(whisper.audio.SAMPLE_RATE),
                audio_path,
            ]
            try:
                completed = subprocess.run(cmd, check=False)
            except OSError as err:
                print(f"failed to run ffmpeg for {video_path}: {err}")
                continue
            if completed.returncode != 0:
                print(f"ffmpeg failed for {video_path} (exit code {completed.returncode})")
                continue

        try:
            start = time.time()
            result = model.transcribe(audio_path, verbose=False, language="zh")
            print("Time cost: ", time.time() - start)
            # Write the subtitle file
            with open(srt_path, "w", encoding="utf-8") as f:
                for number, segment in enumerate(result["segments"], start=1):
                    f.write(f"{number}\n")
                    f.write(
                        seconds_to_hmsm(float(segment["start"]))
                        + " --> "
                        + seconds_to_hmsm(float(segment["end"]))
                        + "\n"
                    )
                    # The result may be Traditional Chinese; convert to Simplified (zh-cn)
                    f.write(convert(segment["text"], "zh-cn") + "\n")
                    f.write("\n")
        finally:
            # Delete the extracted audio file, even if transcription failed
            if extracted and os.path.exists(audio_path):
                os.remove(audio_path)
# Run the interactive caption generator only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()