forked from PlayVoice/whisper-vits-svc
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ec3f7be
commit af11d15
Showing
16 changed files
with
768 additions
and
471 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import os | ||
import numpy as np | ||
import librosa | ||
import pyworld | ||
import argparse | ||
|
||
|
||
def compute_f0(path, save):
    """Extract the F0 (pitch) track of an audio file and store it with ``np.save``.

    The audio is loaded at 16 kHz, a raw F0 track is estimated with
    ``pyworld.dio`` and refined with ``pyworld.stonemask``; every value is then
    rounded to one decimal place before being written.

    Args:
        path: Path of the audio file to analyse.
        save: Destination passed to ``np.save`` (NumPy appends ``.npy`` when
            the name does not already end with it).
    """
    x, sr = librosa.load(path, sr=16000)
    assert sr == 16000
    # Both pyworld calls take float64 samples; convert once and reuse.
    samples = x.astype(np.double)
    f0, t = pyworld.dio(
        samples,
        fs=sr,
        f0_ceil=900,
        # Duration of a 160-sample hop expressed in milliseconds.
        frame_period=1000 * 160 / sr,
    )
    f0 = pyworld.stonemask(samples, f0, t, fs=sr)
    # Vectorised rounding replaces the former per-element Python loop.
    f0 = np.round(f0, 1)
    np.save(save, f0, allow_pickle=False)
|
||
|
||
# Command-line entry point: walk the wav folder and write one pitch file per
# wav, mirroring the optional per-speaker sub-folder layout.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.description = 'please enter embed parameter ...'
    # Both paths are mandatory; without them the script cannot do anything.
    parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True)
    parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True)
    args = parser.parse_args()
    print(args.wav)
    print(args.pit)
    # exist_ok allows the script to be re-run over an existing output folder.
    os.makedirs(args.pit, exist_ok=True)
    wavPath = args.wav
    pitPath = args.pit

    for spks in os.listdir(wavPath):
        # os.path.join (instead of a "./" prefix) keeps absolute input paths
        # absolute, so speaker folders are detected wherever the data lives.
        spk_dir = os.path.join(wavPath, spks)
        if os.path.isdir(spk_dir):
            os.makedirs(os.path.join(pitPath, spks), exist_ok=True)
            print(f">>>>>>>>>>{spks}<<<<<<<<<<")
            for file in os.listdir(spk_dir):
                if file.endswith(".wav"):
                    file = file[:-4]
                    compute_f0(f"{wavPath}/{spks}/{file}.wav", f"{pitPath}/{spks}/{file}.nsf")
        elif spks.endswith(".wav"):
            # A wav stored directly in the root folder (no speaker sub-folder).
            file = spks[:-4]
            compute_f0(f"{wavPath}/{file}.wav", f"{pitPath}/{file}.nsf")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import os | ||
import numpy as np | ||
import argparse | ||
import torch | ||
|
||
from whisper.model import Whisper, ModelDimensions | ||
from whisper.audio import load_audio, pad_or_trim, log_mel_spectrogram | ||
|
||
|
||
def load_model(path) -> Whisper:
    """Instantiate a Whisper model from a checkpoint file.

    The checkpoint must provide a ``"dims"`` mapping, which is expanded into
    ``ModelDimensions``, and a ``"model_state_dict"`` entry with the weights.
    CUDA is used when ``torch.cuda.is_available()`` reports it, otherwise CPU.

    Args:
        path: Location of the checkpoint handed to ``torch.load``.

    Returns:
        The model with its weights loaded, moved to the selected device.
    """
    if torch.cuda.is_available():
        target = "cuda"
    else:
        target = "cpu"
    state = torch.load(path, map_location=target)
    net = Whisper(ModelDimensions(**state["dims"]))
    net.load_state_dict(state["model_state_dict"])
    net = net.to(target)
    return net
|
||
|
||
def pred_ppg(whisper: Whisper, wavPath, ppgPath):
    """Run the Whisper encoder on one audio file and save its output.

    Args:
        whisper: Model whose ``encoder`` and ``device`` attributes are used.
        wavPath: Audio file read through ``load_audio``.
        ppgPath: Destination passed to ``np.save``.
    """
    audio = load_audio(wavPath)
    # Number of frames to keep: one per 320 audio samples.
    # NOTE(review): presumably 320 is the encoder's hop in samples - confirm.
    ppgln = audio.shape[0] // 320
    # pad_or_trim is deliberately not applied, so the mel spectrogram keeps
    # the length of the clip itself.
    mel = log_mel_spectrogram(audio).to(whisper.device)
    with torch.no_grad():
        encoded = whisper.encoder(mel.unsqueeze(0))
    # Remove only the batch axis added above. A bare squeeze() would also drop
    # any other axis of size 1 (e.g. a single-frame result), after which the
    # slice below would cut the feature axis instead of the time axis.
    ppg = encoded.squeeze(0).cpu().float().numpy()
    ppg = ppg[:ppgln]  # [length, dim=1024]
    print(ppg.shape)
    np.save(ppgPath, ppg, allow_pickle=False)
|
||
|
||
# Command-line entry point: walk the wav folder and write one encoder-output
# file per wav, mirroring the optional per-speaker sub-folder layout.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.description = 'please enter embed parameter ...'
    # Both paths are mandatory; without them the script cannot do anything.
    parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True)
    parser.add_argument("-p", "--ppg", help="ppg", dest="ppg", required=True)
    args = parser.parse_args()
    print(args.wav)
    print(args.ppg)
    # exist_ok allows the script to be re-run over an existing output folder.
    os.makedirs(args.ppg, exist_ok=True)
    wavPath = args.wav
    ppgPath = args.ppg

    whisper = load_model(os.path.join("whisper_pretrain", "medium.pt"))

    for spks in os.listdir(wavPath):
        # os.path.join (instead of a "./" prefix) keeps absolute input paths
        # absolute, so speaker folders are detected wherever the data lives.
        spk_dir = os.path.join(wavPath, spks)
        if os.path.isdir(spk_dir):
            os.makedirs(os.path.join(ppgPath, spks), exist_ok=True)
            print(f">>>>>>>>>>{spks}<<<<<<<<<<")
            for file in os.listdir(spk_dir):
                if file.endswith(".wav"):
                    file = file[:-4]
                    pred_ppg(whisper, f"{wavPath}/{spks}/{file}.wav", f"{ppgPath}/{spks}/{file}.ppg")
        elif spks.endswith(".wav"):
            # A wav stored directly in the root folder (no speaker sub-folder).
            file = spks[:-4]
            pred_ppg(whisper, f"{wavPath}/{file}.wav", f"{ppgPath}/{file}.ppg")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import os | ||
import torch | ||
import numpy as np | ||
import argparse | ||
|
||
from tqdm import tqdm | ||
from argparse import RawTextHelpFormatter | ||
from speaker.models.lstm import LSTMSpeakerEncoder | ||
from speaker.config import SpeakerEncoderConfig | ||
|
||
from speaker.utils.audio import AudioProcessor | ||
from speaker.infer import read_json | ||
|
||
|
||
def get_spk_wavs(dataset_path, output_path):
    """Collect the wav files of a dataset and create the matching output folders.

    ``dataset_path`` may contain wav files directly and/or one sub-folder per
    speaker. ``output_path`` and one sub-folder per speaker folder are created
    if they do not exist yet.

    Args:
        dataset_path: Root folder of the wav dataset.
        output_path: Root folder that will receive the results.

    Returns:
        A list of wav paths, each starting with ``dataset_path``.
    """
    wav_files = []
    # exist_ok allows re-running over an already prepared output folder.
    os.makedirs(output_path, exist_ok=True)
    for spks in os.listdir(dataset_path):
        # No "./" prefix: it turned absolute dataset paths into paths relative
        # to the working directory, so no speaker folder was ever found.
        # A plain "/" separator is kept after dataset_path because the calling
        # script derives output names by substituting dataset_path in these
        # strings.
        entry = f"{dataset_path}/{spks}"
        if os.path.isdir(entry):
            os.makedirs(f"{output_path}/{spks}", exist_ok=True)
            wav_files.extend(
                f"{entry}/{file}"
                for file in os.listdir(entry)
                if file.endswith(".wav")
            )
        elif spks.endswith(".wav"):
            wav_files.append(entry)
    return wav_files
|
||
|
||
def str2bool(value):
    """Convert a command-line word such as ``true``/``false`` to a bool.

    ``argparse`` with ``type=bool`` treats every non-empty string, including
    ``"False"``, as True; this parser interprets the text instead.

    Args:
        value: A bool (returned unchanged) or the raw argument string.

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("1", "true", "t", "yes", "y", "on"):
        return True
    if text in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


# Command-line entry point: compute one speaker embedding per wav file and save
# it under output_path with the same relative name and a ".spk" suffix.
if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description="""Compute embedding vectors for each wav file in a dataset.""",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("dataset_path", type=str, help="Path to dataset waves.")
    parser.add_argument(
        "output_path", type=str, help="path for output speaker/speaker_wavs.npy."
    )
    parser.add_argument("--use_cuda", type=str2bool, help="flag to set cuda.", default=True)
    args = parser.parse_args()
    dataset_path = args.dataset_path
    output_path = args.output_path
    # model
    args.model_path = os.path.join("speaker_pretrain", "best_model.pth.tar")
    args.config_path = os.path.join("speaker_pretrain", "config.json")
    # config
    config_dict = read_json(args.config_path)

    # model
    config = SpeakerEncoderConfig(config_dict)
    config.from_dict(config_dict)

    speaker_encoder = LSTMSpeakerEncoder(
        config.model_params["input_dim"],
        config.model_params["proj_dim"],
        config.model_params["lstm_dim"],
        config.model_params["num_lstm_layers"],
    )

    speaker_encoder.load_checkpoint(args.model_path, eval=True, use_cuda=args.use_cuda)

    # preprocess
    speaker_encoder_ap = AudioProcessor(**config.audio)
    # normalize the input audio level and trim silences
    speaker_encoder_ap.do_sound_norm = True
    speaker_encoder_ap.do_trim_silence = True

    wav_files = get_spk_wavs(dataset_path, output_path)

    # compute speaker embeddings
    for wav_file in tqdm(wav_files):
        waveform = speaker_encoder_ap.load_wav(
            wav_file, sr=speaker_encoder_ap.sample_rate
        )
        spec = speaker_encoder_ap.melspectrogram(waveform)
        spec = torch.from_numpy(spec.T)
        if args.use_cuda:
            spec = spec.cuda()
        spec = spec.unsqueeze(0)
        embed = speaker_encoder.compute_embedding(spec).detach().cpu().numpy()
        embed = embed.squeeze()
        # Rebuild the output name from the path relative to the dataset root.
        # Textual replace() rewrote every occurrence of the dataset path and of
        # ".wav", which corrupts paths containing those fragments elsewhere.
        rel_path = os.path.relpath(wav_file, dataset_path)
        embed_path = os.path.join(output_path, os.path.splitext(rel_path)[0] + ".spk")
        np.save(embed_path, embed, allow_pickle=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import os | ||
import torch | ||
import argparse | ||
|
||
from vits import spectrogram | ||
from vits import utils | ||
from omegaconf import OmegaConf | ||
|
||
|
||
def compute_spec(hps, filename, specname):
    """Compute the spectrogram of a wav file and store it with ``torch.save``.

    Args:
        hps: Settings object providing ``sampling_rate``, ``max_wav_value``,
            ``filter_length``, ``hop_length`` and ``win_length``.
        filename: Wav file read through ``utils.load_wav_to_torch``.
        specname: Destination passed to ``torch.save``.

    Raises:
        ValueError: If the file's sample rate differs from ``hps.sampling_rate``.
    """
    audio, sampling_rate = utils.load_wav_to_torch(filename)
    if sampling_rate != hps.sampling_rate:
        # The message has three placeholders; it used to receive only two
        # values, so a mismatch raised IndexError instead of this ValueError.
        raise ValueError(
            "{} {} SR doesn't match target {} SR".format(
                filename, sampling_rate, hps.sampling_rate
            )
        )
    # Scale by max_wav_value and add a leading axis for spectrogram_torch.
    audio_norm = (audio / hps.max_wav_value).unsqueeze(0)
    spec = spectrogram.spectrogram_torch(
        audio_norm,
        hps.filter_length,
        hps.sampling_rate,
        hps.hop_length,
        hps.win_length,
        center=False,
    )
    # Drop the leading axis again before saving.
    spec = torch.squeeze(spec, 0)
    torch.save(spec, specname)
|
||
|
||
# Command-line entry point: walk the wav folder and write one spectrogram file
# per wav, mirroring the optional per-speaker sub-folder layout.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.description = 'please enter embed parameter ...'
    # Both paths are mandatory; without them the script cannot do anything.
    parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True)
    parser.add_argument("-s", "--spe", help="spe", dest="spe", required=True)
    args = parser.parse_args()
    print(args.wav)
    print(args.spe)
    # exist_ok allows the script to be re-run over an existing output folder.
    os.makedirs(args.spe, exist_ok=True)
    wavPath = args.wav
    spePath = args.spe
    hps = OmegaConf.load("./configs/base.yaml")

    for spks in os.listdir(wavPath):
        # os.path.join (instead of a "./" prefix) keeps absolute input paths
        # absolute, so speaker folders are detected wherever the data lives.
        spk_dir = os.path.join(wavPath, spks)
        if os.path.isdir(spk_dir):
            os.makedirs(os.path.join(spePath, spks), exist_ok=True)
            print(f">>>>>>>>>>{spks}<<<<<<<<<<")
            for file in os.listdir(spk_dir):
                if file.endswith(".wav"):
                    file = file[:-4]
                    compute_spec(hps.data, f"{wavPath}/{spks}/{file}.wav", f"{spePath}/{spks}/{file}.pt")
        elif spks.endswith(".wav"):
            # A wav stored directly in the root folder (no speaker sub-folder).
            file = spks[:-4]
            compute_spec(hps.data, f"{wavPath}/{file}.wav", f"{spePath}/{file}.pt")
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import os | ||
import random | ||
|
||
|
||
# Build ./files/valid.txt and ./files/train.txt. Each line lists, separated by
# "|", the wave, spectrogram, pitch, whisper and speaker files of one utterance.
if __name__ == "__main__":
    os.makedirs("./files/", exist_ok=True)

    rootPath = "./data_svc/waves/"

    def build_item(stem):
        """Return the index line for one utterance.

        ``stem`` is the utterance path relative to each feature root, without
        extension. Raises FileNotFoundError naming the first missing file.
        """
        # The speaker script saves "<name>.spk" through np.save, which appends
        # ".npy"; the file on disk is therefore "<name>.spk.npy".
        path_spk = f"./data_svc/speaker/{stem}.spk.npy"
        path_wave = f"./data_svc/waves/{stem}.wav"
        path_spec = f"./data_svc/specs/{stem}.pt"
        path_pitch = f"./data_svc/pitch/{stem}.nsf.npy"
        path_whisper = f"./data_svc/whisper/{stem}.ppg.npy"
        # Explicit check rather than assert: asserts vanish under "python -O"
        # and do not say which file is missing.
        for path in (path_spk, path_wave, path_spec, path_pitch, path_whisper):
            if not os.path.isfile(path):
                raise FileNotFoundError(path)
        return f"{path_wave}|{path_spec}|{path_pitch}|{path_whisper}|{path_spk}"

    all_items = []
    for spks in os.listdir(rootPath):
        spk_dir = os.path.join(rootPath, spks)
        if os.path.isdir(spk_dir):
            for file in os.listdir(spk_dir):
                if file.endswith(".wav"):
                    all_items.append(build_item(f"{spks}/{file[:-4]}"))
        elif spks.endswith(".wav"):
            # A wav stored directly in the root folder (no speaker sub-folder).
            all_items.append(build_item(spks[:-4]))

    # The first 50 shuffled items form the validation list, the rest training.
    random.shuffle(all_items)
    valids = all_items[:50]
    valids.sort()
    trains = all_items[50:]
    # trains.sort()
    with open("./files/valid.txt", "w", encoding="utf-8") as fw:
        for strs in valids:
            print(strs, file=fw)
    with open("./files/train.txt", "w", encoding="utf-8") as fw:
        for strs in trains:
            print(strs, file=fw)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
from tqdm import tqdm | ||
from torch.utils.data import DataLoader | ||
from omegaconf import OmegaConf | ||
from vits.data_utils import TextAudioSpeakerSet | ||
from vits.data_utils import TextAudioSpeakerCollate | ||
from vits.data_utils import DistributedBucketSampler | ||
|
||
|
||
# Smoke check: iterate the validation list once sample by sample, then once
# more in batches produced by the bucket sampler.
hps = OmegaConf.load("./configs/base.yaml")
dataset = TextAudioSpeakerSet("files/valid.txt", hps.data)

# Pass 1: pull every sample through the dataset on its own.
for _sample in tqdm(dataset):
    pass

# Pass 2: same data, grouped by the sampler and merged by the collate function.
batch_size = 4
boundaries = [150, 300, 450]
sampler = DistributedBucketSampler(
    dataset, batch_size, boundaries, num_replicas=1, rank=0, shuffle=True
)
collate_fn = TextAudioSpeakerCollate()
loader = DataLoader(
    dataset,
    batch_sampler=sampler,
    collate_fn=collate_fn,
    num_workers=0,
    shuffle=False,
    pin_memory=True,
)

for _batch in tqdm(loader):
    pass
Oops, something went wrong.