-
Notifications
You must be signed in to change notification settings - Fork 37
/
librivox_align.py
executable file
·126 lines (95 loc) · 3.88 KB
/
librivox_align.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python3
"""
Uses Aeneas to align Librivox MP3 audio book files with book text.
YAML file format:
---
aeneas:
language: <AENEAS LANGUAGE>
gruut:
language: <GRUUT LANGUAGE>
text:
file: <BOOK TEXT FILE NAME>
audio:
<MP3 NAME>:
start_time: <SECONDS TO SKIP FROM START>
end_time: -<SECONDS TO SKIP FROM END>
start_line: <LINE TO START IN BOOK TEXT>
end_line: <LINE TO STOP IN BOOK TEXT>
<MP3 NAME>:
...
"""
import argparse
import logging
from pathlib import Path
import yaml
from aeneas.executetask import ExecuteTask
from aeneas.task import Task
import gruut
# Module-level logger used by main() for debug output.
_LOGGER = logging.getLogger("librivox_align")
# -----------------------------------------------------------------------------
def _build_task_config(language, mp3_info):
    """Build the aeneas task configuration string for one MP3 entry.

    Args:
        language: Value of ``aeneas.language`` from the book YAML.
        mp3_info: Mapping with the optional ``start_time`` / ``end_time``
            settings of the MP3 entry.

    Returns:
        The ``|``-separated configuration string passed to ``Task``.
    """
    config_string = (
        f"task_language={language}"
        "|is_text_type=plain"
        "|os_task_file_format=json"
        "|task_adjust_boundary_no_zero=True"
    )

    start_time = float(mp3_info.get("start_time", 0))
    if start_time > 0:
        # Skip seconds at the beginning
        config_string += f"|is_audio_file_head_length={start_time}"

    end_time = float(mp3_info.get("end_time", 0))
    if end_time < 0:
        # Negative value: skip that many seconds at the end
        config_string += f"|is_audio_file_tail_length={abs(end_time)}"
    elif end_time > 0:
        # Positive value: set length of audio to process
        config_string += f"|is_audio_file_process_length={end_time}"

    return config_string


def _select_text(lines, start_line, end_line):
    """Join book lines ``start_line``..``end_line`` (1-based, inclusive).

    Each selected line is stripped of surrounding whitespace and terminated
    with a single newline. Out-of-range bounds are clamped to the available
    lines instead of raising ``IndexError`` or wrapping around to the end of
    the book.
    """
    first_index = max(start_line - 1, 0)
    last_index = max(end_line, 0)
    return "".join(line.strip() + "\n" for line in lines[first_index:last_index])


def main():
    """Main entry point.

    Reads the book YAML file named on the command line and, for every entry
    under ``audio``, writes next to the MP3 file:

    * ``<name>.txt``     - one cleaned sentence per line (the text given to aeneas)
    * ``<name>.raw.txt`` - the matching raw sentence text, one per line
    * ``<name>.json``    - the sync map produced by aeneas

    All paths in the YAML file are resolved relative to the YAML file's
    directory. Text files are read and written as UTF-8.

    Raises:
        KeyError: If ``gruut.language``, ``aeneas.language``, ``text.file`` or
            ``audio`` is missing from the YAML file.
    """
    parser = argparse.ArgumentParser(prog="librivox_align.py")
    parser.add_argument("book_yml", help="YAML file with book details")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG)
    _LOGGER.debug(args)

    args.book_yml = Path(args.book_yml)
    input_dir = args.book_yml.parent

    with open(args.book_yml, "r", encoding="utf-8") as book_file:
        book = yaml.safe_load(book_file)

    # Load gruut language
    gruut_tokenizer = gruut.get_tokenizer(book["gruut"]["language"])
    language = book["aeneas"]["language"]

    # Load book text
    text_path = input_dir / book["text"]["file"]
    _LOGGER.debug("Loading book text from %s", text_path)
    with open(text_path, "r", encoding="utf-8") as text_file:
        text = text_file.readlines()

    # Process MP3 files
    for mp3_name, mp3_info in book["audio"].items():
        # An entry without any settings is loaded from YAML as None
        mp3_info = mp3_info or {}

        mp3_path = input_dir / mp3_name
        sync_path = mp3_path.with_suffix(".json")
        mp3_text_path = mp3_path.with_suffix(".txt")
        raw_text_path = mp3_path.with_suffix(".raw.txt")

        task = Task(config_string=_build_task_config(language, mp3_info))
        task.audio_file_path_absolute = mp3_path.absolute()
        task.sync_map_file_path_absolute = sync_path.absolute()

        # Clean up newlines in the selected range of book text
        mp3_text = _select_text(
            text,
            mp3_info.get("start_line", 1),
            mp3_info.get("end_line", len(text)),
        )

        # Run through gruut tokenizer to expand abbreviations, numbers, etc.
        with open(mp3_text_path, "w", encoding="utf-8") as mp3_text_file:
            with open(raw_text_path, "w", encoding="utf-8") as raw_text_file:
                for sentence in gruut_tokenizer.tokenize(
                    mp3_text, return_format="sentences"
                ):
                    clean_text = " ".join(sentence.clean_words)

                    # Each sentence is on its own line now
                    print(clean_text, file=mp3_text_file)
                    print(sentence.raw_text, file=raw_text_file)

        # Both files are closed (and therefore flushed to disk) before the
        # text file is handed to aeneas.
        task.text_file_path_absolute = mp3_text_path

        # Generate sync map JSON file
        _LOGGER.debug("Generating %s (%s)", sync_path, mp3_path)
        ExecuteTask(task).execute()
        task.output_sync_map_file()
# -----------------------------------------------------------------------------
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()