preprocess
MaxMax2016 committed Apr 15, 2023
1 parent ec3f7be commit af11d15
Showing 16 changed files with 768 additions and 471 deletions.
7 changes: 4 additions & 3 deletions configs/base.yaml
@@ -1,4 +1,5 @@
 train:
+  model: "sovits"
   fp16_run: False
   log_interval: 200
   eval_interval: 1000
@@ -11,14 +12,14 @@ train:
   warmup_epochs: 0
   eps: 1e-9
   batch_size: 12
-  segment_size: 8000 # WARNING: base on hop_length
   c_mel: 45
   c_kl: 1.0
   port: 8001
 #############################
 data:
-  training_files: "filelists/train.txt"
-  validation_files: "filelists/val.txt"
+  training_files: "files/train.txt"
+  validation_files: "files/valid.txt"
+  segment_size: 8000 # WARNING: base on hop_length
   max_wav_value: 32768.0
   sampling_rate: 16000
   filter_length: 512
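
The key changes: a new train.model entry, segment_size moved from train: into data:, and the filelists relocated from filelists/ to files/ (the files written by prepare/preprocess_train.py below). A minimal sketch of how downstream scripts consume this config, mirroring prepare/preprocess_spec.py:

from omegaconf import OmegaConf

hps = OmegaConf.load("./configs/base.yaml")
print(hps.train.model)          # "sovits"
print(hps.data.training_files)  # "files/train.txt"
print(hps.data.segment_size)    # 8000; now under data: rather than train: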
49 changes: 49 additions & 0 deletions prepare/preprocess_f0.py
@@ -0,0 +1,49 @@
import os
import numpy as np
import librosa
import pyworld
import argparse


def compute_f0(path, save):
    x, sr = librosa.load(path, sr=16000)
    assert sr == 16000
    # DIO gives a coarse F0 track; a 10 ms frame period (160 / 16000 s)
    # keeps pitch frames aligned with the pipeline's hop length of 160.
    f0, t = pyworld.dio(
        x.astype(np.double),
        fs=sr,
        f0_ceil=900,
        frame_period=1000 * 160 / sr,
    )
    # StoneMask refines the coarse DIO estimate.
    f0 = pyworld.stonemask(x.astype(np.double), f0, t, fs=16000)
    f0 = np.round(f0, 1)  # quantize to 0.1 Hz
    # np.save appends ".npy", so a path ending in ".nsf" lands on disk as ".nsf.npy".
    np.save(save, f0, allow_pickle=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.description = "compute F0 (pitch) for every wav in a folder"
    parser.add_argument("-w", "--wav", help="input wav folder", dest="wav")
    parser.add_argument("-p", "--pit", help="output pitch folder", dest="pit")
    args = parser.parse_args()
    print(args.wav)
    print(args.pit)
    os.makedirs(args.pit, exist_ok=True)
    wavPath = args.wav
    pitPath = args.pit

    # The input may hold per-speaker subfolders or bare wav files at the top level.
    for spks in os.listdir(wavPath):
        if os.path.isdir(f"./{wavPath}/{spks}"):
            os.makedirs(f"./{pitPath}/{spks}", exist_ok=True)
            print(f">>>>>>>>>>{spks}<<<<<<<<<<")
            for file in os.listdir(f"./{wavPath}/{spks}"):
                if file.endswith(".wav"):
                    file = file[:-4]  # strip ".wav"
                    compute_f0(f"{wavPath}/{spks}/{file}.wav", f"{pitPath}/{spks}/{file}.nsf")
        else:
            file = spks
            if file.endswith(".wav"):
                file = file[:-4]
                compute_f0(f"{wavPath}/{file}.wav", f"{pitPath}/{file}.nsf")
60 changes: 60 additions & 0 deletions prepare/preprocess_ppg.py
@@ -0,0 +1,60 @@
import os
import numpy as np
import argparse
import torch

from whisper.model import Whisper, ModelDimensions
from whisper.audio import load_audio, pad_or_trim, log_mel_spectrogram


def load_model(path) -> Whisper:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    checkpoint = torch.load(path, map_location=device)
    dims = ModelDimensions(**checkpoint["dims"])
    model = Whisper(dims)
    model.load_state_dict(checkpoint["model_state_dict"])
    return model.to(device)


def pred_ppg(whisper: Whisper, wavPath, ppgPath):
    audio = load_audio(wavPath)
    audln = audio.shape[0]
    # Whisper's mel uses a 160-sample hop and its encoder downsamples by 2,
    # so there is one encoder frame per 320 audio samples (20 ms at 16 kHz).
    ppgln = audln // 320
    # audio = pad_or_trim(audio)
    mel = log_mel_spectrogram(audio).to(whisper.device)
    with torch.no_grad():
        ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
        ppg = ppg[:ppgln]  # [length, dim=1024]; drop frames that come from mel padding
        print(ppg.shape)
        np.save(ppgPath, ppg, allow_pickle=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.description = "extract whisper PPG features for every wav in a folder"
    parser.add_argument("-w", "--wav", help="input wav folder", dest="wav")
    parser.add_argument("-p", "--ppg", help="output ppg folder", dest="ppg")
    args = parser.parse_args()
    print(args.wav)
    print(args.ppg)
    os.makedirs(args.ppg, exist_ok=True)
    wavPath = args.wav
    ppgPath = args.ppg

    whisper = load_model(os.path.join("whisper_pretrain", "medium.pt"))

    for spks in os.listdir(wavPath):
        if os.path.isdir(f"./{wavPath}/{spks}"):
            os.makedirs(f"./{ppgPath}/{spks}", exist_ok=True)
            print(f">>>>>>>>>>{spks}<<<<<<<<<<")
            for file in os.listdir(f"./{wavPath}/{spks}"):
                if file.endswith(".wav"):
                    file = file[:-4]
                    pred_ppg(whisper, f"{wavPath}/{spks}/{file}.wav", f"{ppgPath}/{spks}/{file}.ppg")
        else:
            file = spks
            if file.endswith(".wav"):
                file = file[:-4]
                pred_ppg(whisper, f"{wavPath}/{file}.wav", f"{ppgPath}/{file}.ppg")
84 changes: 84 additions & 0 deletions prepare/preprocess_speaker.py
@@ -0,0 +1,84 @@
import os
import torch
import numpy as np
import argparse

from tqdm import tqdm
from argparse import RawTextHelpFormatter
from speaker.models.lstm import LSTMSpeakerEncoder
from speaker.config import SpeakerEncoderConfig

from speaker.utils.audio import AudioProcessor
from speaker.infer import read_json


def get_spk_wavs(dataset_path, output_path):
    """Collect wav paths and mirror the speaker folder layout under output_path."""
    wav_files = []
    os.makedirs(f"./{output_path}", exist_ok=True)
    for spks in os.listdir(dataset_path):
        if os.path.isdir(f"./{dataset_path}/{spks}"):
            os.makedirs(f"./{output_path}/{spks}", exist_ok=True)
            for file in os.listdir(f"./{dataset_path}/{spks}"):
                if file.endswith(".wav"):
                    wav_files.append(f"./{dataset_path}/{spks}/{file}")
        elif spks.endswith(".wav"):
            wav_files.append(f"./{dataset_path}/{spks}")
    return wav_files


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description="""Compute embedding vectors for each wav file in a dataset.""",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("dataset_path", type=str, help="Path to dataset waves.")
    parser.add_argument(
        "output_path", type=str, help="path for output speaker embeddings."
    )
    # note: type=bool parses any non-empty string as True
    parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
    args = parser.parse_args()
    dataset_path = args.dataset_path
    output_path = args.output_path
    # pretrained model and its config
    args.model_path = os.path.join("speaker_pretrain", "best_model.pth.tar")
    args.config_path = os.path.join("speaker_pretrain", "config.json")
    config_dict = read_json(args.config_path)

    config = SpeakerEncoderConfig(config_dict)
    config.from_dict(config_dict)

    speaker_encoder = LSTMSpeakerEncoder(
        config.model_params["input_dim"],
        config.model_params["proj_dim"],
        config.model_params["lstm_dim"],
        config.model_params["num_lstm_layers"],
    )

    speaker_encoder.load_checkpoint(args.model_path, eval=True, use_cuda=args.use_cuda)

    # preprocess: normalize the input audio level and trim silences
    speaker_encoder_ap = AudioProcessor(**config.audio)
    speaker_encoder_ap.do_sound_norm = True
    speaker_encoder_ap.do_trim_silence = True

    wav_files = get_spk_wavs(dataset_path, output_path)

    # compute speaker embeddings
    for wav_file in tqdm(wav_files):
        waveform = speaker_encoder_ap.load_wav(
            wav_file, sr=speaker_encoder_ap.sample_rate
        )
        spec = speaker_encoder_ap.melspectrogram(waveform)
        spec = torch.from_numpy(spec.T)
        if args.use_cuda:
            spec = spec.cuda()
        spec = spec.unsqueeze(0)
        embed = speaker_encoder.compute_embedding(spec).detach().cpu().numpy()
        embed = embed.squeeze()
        # np.save appends ".npy", so "xxx.spk" is written as "xxx.spk.npy"
        embed_path = wav_file.replace(dataset_path, output_path)
        embed_path = embed_path.replace(".wav", ".spk")
        np.save(embed_path, embed, allow_pickle=False)
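
Each utterance gets a single fixed-length speaker embedding whose width is config.model_params["proj_dim"]. A sketch to inspect one (the path is illustrative):

import numpy as np

embed = np.load("data_svc/speaker/speaker1/0001.spk.npy")
print(embed.shape)  # (proj_dim,): one vector per utterance, not per frame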
62 changes: 62 additions & 0 deletions prepare/preprocess_spec.py
@@ -0,0 +1,62 @@
import os
import torch
import argparse

from vits import spectrogram
from vits import utils
from omegaconf import OmegaConf


def compute_spec(hps, filename, specname):
    audio, sampling_rate = utils.load_wav_to_torch(filename)
    if sampling_rate != hps.sampling_rate:
        raise ValueError(
            "{} SR doesn't match target {} SR".format(
                sampling_rate, hps.sampling_rate
            )
        )
    audio_norm = audio / hps.max_wav_value  # scale to [-1, 1]
    audio_norm = audio_norm.unsqueeze(0)
    n_fft = hps.filter_length
    sampling_rate = hps.sampling_rate
    hop_size = hps.hop_length
    win_size = hps.win_length
    spec = spectrogram.spectrogram_torch(
        audio_norm, n_fft, sampling_rate, hop_size, win_size, center=False)
    spec = torch.squeeze(spec, 0)  # [n_fft // 2 + 1, frames]
    torch.save(spec, specname)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.description = "compute linear spectrograms for every wav in a folder"
    parser.add_argument("-w", "--wav", help="input wav folder", dest="wav")
    parser.add_argument("-s", "--spe", help="output spectrogram folder", dest="spe")
    args = parser.parse_args()
    print(args.wav)
    print(args.spe)
    os.makedirs(args.spe, exist_ok=True)
    wavPath = args.wav
    spePath = args.spe
    hps = OmegaConf.load("./configs/base.yaml")

    for spks in os.listdir(wavPath):
        if os.path.isdir(f"./{wavPath}/{spks}"):
            os.makedirs(f"./{spePath}/{spks}", exist_ok=True)
            print(f">>>>>>>>>>{spks}<<<<<<<<<<")
            for file in os.listdir(f"./{wavPath}/{spks}"):
                if file.endswith(".wav"):
                    file = file[:-4]
                    compute_spec(hps.data, f"{wavPath}/{spks}/{file}.wav", f"{spePath}/{spks}/{file}.pt")
        else:
            file = spks
            if file.endswith(".wav"):
                file = file[:-4]
                compute_spec(hps.data, f"{wavPath}/{file}.wav", f"{spePath}/{file}.pt")
56 changes: 56 additions & 0 deletions prepare/preprocess_train.py
@@ -0,0 +1,56 @@
import os
import random


if __name__ == "__main__":
    os.makedirs("./files/", exist_ok=True)

    rootPath = "./data_svc/waves/"
    all_items = []
    for spks in os.listdir(f"./{rootPath}"):
        if os.path.isdir(f"./{rootPath}/{spks}"):
            for file in os.listdir(f"./{rootPath}/{spks}"):
                if file.endswith(".wav"):
                    file = file[:-4]
                    # preprocess_speaker saves embeddings as "{name}.spk.npy"
                    path_spk = f"./data_svc/speaker/{spks}/{file}.spk.npy"
                    path_wave = f"./data_svc/waves/{spks}/{file}.wav"
                    path_spec = f"./data_svc/specs/{spks}/{file}.pt"
                    path_pitch = f"./data_svc/pitch/{spks}/{file}.nsf.npy"
                    path_whisper = f"./data_svc/whisper/{spks}/{file}.ppg.npy"
                    # every feature must exist before the item enters the filelist
                    assert os.path.isfile(path_spk), path_spk
                    assert os.path.isfile(path_wave), path_wave
                    assert os.path.isfile(path_spec), path_spec
                    assert os.path.isfile(path_pitch), path_pitch
                    assert os.path.isfile(path_whisper), path_whisper
                    all_items.append(
                        f"{path_wave}|{path_spec}|{path_pitch}|{path_whisper}|{path_spk}")
        else:
            file = spks
            if file.endswith(".wav"):
                file = file[:-4]
                path_spk = f"./data_svc/speaker/{file}.spk.npy"
                path_wave = f"./data_svc/waves/{file}.wav"
                path_spec = f"./data_svc/specs/{file}.pt"
                path_pitch = f"./data_svc/pitch/{file}.nsf.npy"
                path_whisper = f"./data_svc/whisper/{file}.ppg.npy"
                assert os.path.isfile(path_spk), path_spk
                assert os.path.isfile(path_wave), path_wave
                assert os.path.isfile(path_spec), path_spec
                assert os.path.isfile(path_pitch), path_pitch
                assert os.path.isfile(path_whisper), path_whisper
                all_items.append(
                    f"{path_wave}|{path_spec}|{path_pitch}|{path_whisper}|{path_spk}")

    # hold out 50 shuffled items for validation; train on the rest
    random.shuffle(all_items)
    valids = all_items[:50]
    valids.sort()
    trains = all_items[50:]
    # trains.sort()
    fw = open("./files/valid.txt", "w", encoding="utf-8")
    for strs in valids:
        print(strs, file=fw)
    fw.close()
    fw = open("./files/train.txt", "w", encoding="utf-8")
    for strs in trains:
        print(strs, file=fw)
    fw.close()
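
Each filelist line is five pipe-separated paths. A sketch of the format and how a consumer splits it (the concrete names are illustrative):

line = ("./data_svc/waves/s1/0001.wav|./data_svc/specs/s1/0001.pt|"
        "./data_svc/pitch/s1/0001.nsf.npy|./data_svc/whisper/s1/0001.ppg.npy|"
        "./data_svc/speaker/s1/0001.spk.npy")
wave, spec, pitch, whisper, spk = line.strip().split("|")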
29 changes: 29 additions & 0 deletions prepare/preprocess_zzz.py
@@ -0,0 +1,29 @@
from tqdm import tqdm
from torch.utils.data import DataLoader
from omegaconf import OmegaConf
from vits.data_utils import TextAudioSpeakerSet
from vits.data_utils import TextAudioSpeakerCollate
from vits.data_utils import DistributedBucketSampler


# Sanity-check script: run after preprocess_train.py has written files/valid.txt.
hps = OmegaConf.load("./configs/base.yaml")
dataset = TextAudioSpeakerSet("files/valid.txt", hps.data)

# First pass: load every item once to surface missing or corrupt features.
for _ in tqdm(dataset):
    pass


# Second pass: exercise the bucketed batching exactly as training will.
sampler = DistributedBucketSampler(
    dataset,
    4,                # batch size
    [150, 300, 450],  # length bucket boundaries
    num_replicas=1,
    rank=0,
    shuffle=True)
collate_fn = TextAudioSpeakerCollate()
loader = DataLoader(dataset, num_workers=0, shuffle=False, pin_memory=True,
                    collate_fn=collate_fn, batch_sampler=sampler)


for _ in tqdm(loader):
    pass
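
Taken together, these scripts form the preprocessing pipeline. A plausible run order, inferred from the folders each script reads and writes (the commands are illustrative, not taken from the repo's README):

python prepare/preprocess_speaker.py data_svc/waves data_svc/speaker
python prepare/preprocess_f0.py -w data_svc/waves -p data_svc/pitch
python prepare/preprocess_ppg.py -w data_svc/waves -p data_svc/whisper
python prepare/preprocess_spec.py -w data_svc/waves -s data_svc/specs
python prepare/preprocess_train.py
python prepare/preprocess_zzz.py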