forked from espnet/espnet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdump-pcm.py
executable file
·147 lines (130 loc) · 4.75 KB
/
dump-pcm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python3
import argparse
import logging
from distutils.util import strtobool
import kaldiio
import numpy
from espnet.transform.transformation import Transformation
from espnet.utils.cli_utils import get_commandline_args
from espnet.utils.cli_writers import file_writer_helper
def get_parser():
    """Build the command-line parser for dumping PCM data from a WAV scp file.

    Returns:
        argparse.ArgumentParser: Parser with the positional ``rspecifier`` and
        ``wspecifier`` arguments plus the optional output-format,
        compression, normalization, preprocessing and segments options.
    """
    parser = argparse.ArgumentParser(
        description="dump PCM files from a WAV scp file",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # --- Output options -------------------------------------------------
    parser.add_argument(
        "--write-num-frames", type=str, help="Specify wspecifier for utt2num_frames"
    )
    parser.add_argument(
        "--filetype",
        type=str,
        default="mat",
        choices=["mat", "hdf5", "sound.hdf5", "sound"],
        help="Specify the file format for output. "
        '"mat" is the matrix format in kaldi',
    )
    parser.add_argument(
        "--format",
        type=str,
        default=None,
        help="The file format for output pcm. "
        "This option is only valid "
        'when "--filetype" is "sound.hdf5" or "sound"',
    )
    parser.add_argument(
        "--compress", type=strtobool, default=False, help="Save in compressed format"
    )
    parser.add_argument(
        "--compression-method",
        type=int,
        default=2,
        help="Specify the method(if mat) or gzip-level(if hdf5)",
    )
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")

    # --- Signal processing options --------------------------------------
    parser.add_argument(
        "--normalize",
        choices=[1, 16, 24, 32],
        type=int,
        default=None,
        help="Give the bit depth of the PCM, "
        "then normalizes data to scale in [-1,1]",
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    parser.add_argument(
        "--keep-length",
        type=strtobool,
        default=True,
        help="Truncating or zero padding if the output length "
        "is changed from the input by preprocessing",
    )

    # --- Input / output specifiers --------------------------------------
    parser.add_argument("rspecifier", type=str, help="WAV scp file")
    parser.add_argument(
        "--segments",
        type=str,
        # Adjacent literals are concatenated verbatim, so each fragment must
        # carry its own separating whitespace.
        help="segments-file format: each line is "
        '"<segment-id> <recording-id> <start-time> <end-time>", '
        'e.g. "call-861225-A-0050-0065 call-861225-A 5.0 6.5"',
    )
    parser.add_argument("wspecifier", type=str, help="Write specifier")
    return parser
def _match_length(out, length):
    """Zero-pad or truncate ``out`` along axis 0 to exactly ``length`` items.

    Args:
        out: Array returned by the preprocessing.
        length: Axis-0 length of the array before preprocessing.

    Returns:
        ``out`` unchanged when its length already equals ``length``; a
        zero-padded copy when it is shorter; a truncated slice when longer.
    """
    if len(out) < length:
        # Pad only at the end of the first axis; leave other axes untouched.
        pad_width = [(0, length - len(out))] + [(0, 0)] * (out.ndim - 1)
        return numpy.pad(out, pad_width, mode="constant")
    if len(out) > length:
        return out[:length]
    return out


def main():
    """Read waveforms from ``rspecifier`` and dump them to ``wspecifier``.

    For each utterance the waveform is optionally cast to float32 (always for
    the "mat" file type), optionally scaled by the given bit depth, optionally
    passed through the configured preprocessing, and then written with the
    selected writer. For the "sound" and "sound.hdf5" file types the sampling
    rate is written together with the array.
    """
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info("Apply preprocessing: {}".format(preprocessing))
    else:
        preprocessing = None

    with file_writer_helper(
        args.wspecifier,
        filetype=args.filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method,
        pcm_format=args.format,
    ) as writer:
        # Use the reader as a context manager so that it is closed even if
        # processing one of the utterances raises.
        with kaldiio.ReadHelper(args.rspecifier, args.segments) as reader:
            for utt_id, (rate, array) in reader:
                if args.filetype == "mat":
                    # Kaldi-matrix doesn't support integer
                    array = array.astype(numpy.float32)

                if array.ndim == 1:
                    # (Time) -> (Time, Channel)
                    array = array[:, None]

                if args.normalize is not None and args.normalize != 1:
                    # Divide by 2^(bit depth - 1); a "bit depth" of 1 means
                    # no scaling is applied.
                    array = array.astype(numpy.float32)
                    array = array / (1 << (args.normalize - 1))

                if preprocessing is not None:
                    orgtype = array.dtype
                    out = preprocessing(array, uttid_list=utt_id)
                    out = out.astype(orgtype)

                    if args.keep_length:
                        # The length can be changed by stft, for example:
                        # pad a shorter output, truncate a longer one.
                        out = _match_length(out, len(array))

                    array = out

                # shape = (Time, Channel)
                if args.filetype in ["sound.hdf5", "sound"]:
                    # Write Tuple[int, numpy.ndarray] (scipy style)
                    writer[utt_id] = (rate, array)
                else:
                    writer[utt_id] = array
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()