Commit: only
MaxMax2016 committed Jul 4, 2023
1 parent 7cdc5af commit aa2a61e
Showing 10 changed files with 64 additions and 167 deletions.
7 changes: 2 additions & 5 deletions README.md
@@ -168,12 +168,9 @@ data_svc/

 > python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000
-- 2, Use the 16K audio to extract pitch. Note f0_ceil=900; change it to match the highest pitch in your data
-> python prepare/preprocess_f0.py -w data_svc/waves-16k/ -p data_svc/pitch
+- 2, Use the 16K audio to extract pitch
 
-Use the command below to process low-quality audio
 
-> python prepare/preprocess_f0_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch
+> python prepare/preprocess_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch
 - 3, Use the 16k audio to extract the content encoding
 > python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper
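For a single file, the pitch step above reduces to the call sequence below. A minimal sketch, assuming the pip torchcrepe package (the repo's vendored crepe module, used in the hunks that follow, appears to expose the same predict/filter API); compute_f0_single is an illustrative name, not repo code.

import librosa
import numpy as np
import torch
import torchcrepe

def compute_f0_single(wav_path, device="cpu"):
    # 16 kHz mono input, matching data_svc/waves-16k
    audio, sr = librosa.load(wav_path, sr=16000)
    audio = torch.tensor(np.copy(audio))[None]
    # hop 320 at 16 kHz = 20 ms frames; 50-1000 Hz covers speech and most singing
    pitch, periodicity = torchcrepe.predict(
        audio, sr, 320, 50, 1000, "tiny",
        batch_size=512, device=device, return_periodicity=True)
    # smooth, then zero out unvoiced frames, as preprocess_crepe.py does below
    periodicity = torchcrepe.filter.median(periodicity, 9)
    pitch = torchcrepe.filter.mean(pitch, 5)
    pitch[periodicity < 0.05] = 0
    return pitch.squeeze(0).numpy()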
11 changes: 1 addition & 10 deletions hubert/inference.py
@@ -16,16 +16,6 @@ def load_model(path, device):
     return model
 
 
-def pred_all(model, wavPath, vecPath, device):
-    feats = load_audio(wavPath)
-    feats = torch.from_numpy(feats).to(device)
-    feats = feats[None, None, :].half()
-    with torch.no_grad():
-        vec = model.units(feats).squeeze().data.cpu().float().numpy()
-        # print(vec.shape) # [length, dim=256] hop=320
-        np.save(vecPath, vec, allow_pickle=False)
-
-
 def pred_vec(model, wavPath, vecPath, device):
     audio = load_audio(wavPath)
     audln = audio.shape[0]
@@ -45,6 +35,7 @@ def pred_vec(model, wavPath, vecPath, device):
         feats = feats[None, None, :].half()
         with torch.no_grad():
             vec = model.units(feats).squeeze().data.cpu().float().numpy()
+            # print(vec.shape) # [length, dim=256] hop=320
             vec_a.extend(vec)
     np.save(vecPath, vec_a, allow_pickle=False)

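With pred_all removed, pred_vec is the file's only entry point; its middle is elided above, but the visible lines fix the pattern: long audio is cut into chunks, each chunk runs through model.units in fp16, and the per-chunk vectors are concatenated. A self-contained sketch of that pattern (units_chunked and the 10-second chunk_len are illustrative assumptions; the repo's actual chunk size is not visible in this diff):

import numpy as np
import torch

def units_chunked(model, audio, device, chunk_len=10 * 16000):
    vec_a = []
    for start in range(0, audio.shape[0], chunk_len):
        feats = torch.from_numpy(audio[start:start + chunk_len]).to(device)
        feats = feats[None, None, :].half()  # [1, 1, samples], fp16
        with torch.no_grad():
            # one 256-dim vector per 320 input samples (20 ms at 16 kHz)
            vec = model.units(feats).squeeze().data.cpu().float().numpy()
        vec_a.extend(vec)
    return np.asarray(vec_a)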
14 changes: 2 additions & 12 deletions pitch/inference.py
@@ -10,20 +10,14 @@
 def compute_f0_nn(filename, device):
     audio, sr = librosa.load(filename, sr=16000)
     assert sr == 16000
-    # Load audio
     audio = torch.tensor(np.copy(audio))[None]
-    # Here we'll use a 20 millisecond hop length
     hop_length = 320
-    # Provide a sensible frequency range for your domain (upper limit is 2006 Hz)
-    # This would be a reasonable range for speech
     fmin = 50
     fmax = 1000
-    # Select a model capacity--one of "tiny" or "full"
     model = "tiny"
-    # Pick a batch size that doesn't cause memory errors on your gpu
     batch_size = 512
     # Compute pitch using first gpu
-    pitch, periodicity = crepe.predict(
+    pitch = crepe.predict(
         audio,
         sr,
         hop_length,
@@ -32,14 +26,10 @@ def compute_f0_nn(filename, device):
         model,
         batch_size=batch_size,
         device=device,
-        return_periodicity=True,
+        return_periodicity=False,
     )
     pitch = np.repeat(pitch, 2, -1) # 320 -> 160 * 2
-    periodicity = np.repeat(periodicity, 2, -1) # 320 -> 160 * 2
-    # CREPE was not trained on silent audio. some error on silent need filter.pitPath
-    periodicity = crepe.filter.median(periodicity, 9)
     pitch = crepe.filter.mean(pitch, 5)
-    # pitch[periodicity < 0.1] = 0
     pitch = pitch.squeeze(0)
     return pitch

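The "# 320 -> 160 * 2" comment is frame-rate bookkeeping: crepe emits one value per 320 samples (20 ms at 16 kHz), and np.repeat(..., 2, -1) duplicates each frame so the pitch track aligns with features computed at a 160-sample (10 ms) hop. A shape check of that arithmetic (a standalone sketch, not repo code):

import numpy as np

sr, seconds = 16000, 2.0
n_20ms = int(sr * seconds) // 320     # 100 crepe frames, 20 ms each
pitch = np.zeros((1, n_20ms))
pitch = np.repeat(pitch, 2, -1)       # 320 -> 160 * 2
assert pitch.shape[-1] == int(sr * seconds) // 160  # 200 frames, 10 ms each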
13 changes: 7 additions & 6 deletions prepare/preprocess_f0_crepe.py → prepare/preprocess_crepe.py
@@ -1,8 +1,9 @@
-import os
+import sys,os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import numpy as np
 import librosa
 import torch
-import torchcrepe
+import crepe
 import argparse
 from tqdm import tqdm
 from multiprocessing import set_start_method
@@ -20,11 +21,11 @@ def compute_f0(filename, save, device):
     fmin = 50
     fmax = 1000
     # Select a model capacity--one of "tiny" or "full"
-    model = "full"
+    model = "tiny"
     # Pick a batch size that doesn't cause memory errors on your gpu
     batch_size = 512
     # Compute pitch using first gpu
-    pitch, periodicity = torchcrepe.predict(
+    pitch, periodicity = crepe.predict(
         audio,
         sr,
         hop_length,
@@ -38,8 +39,8 @@ def compute_f0(filename, save, device):
     pitch = np.repeat(pitch, 2, -1) # 320 -> 160 * 2
     periodicity = np.repeat(periodicity, 2, -1) # 320 -> 160 * 2
     # CREPE was not trained on silent audio. some error on silent need filter.pitPath
-    periodicity = torchcrepe.filter.median(periodicity, 9)
-    pitch = torchcrepe.filter.mean(pitch, 5)
+    periodicity = crepe.filter.median(periodicity, 9)
+    pitch = crepe.filter.mean(pitch, 5)
     pitch[periodicity < 0.05] = 0
     pitch = pitch.squeeze(0)
     np.save(save, pitch, allow_pickle=False)
61 changes: 0 additions & 61 deletions prepare/preprocess_f0.py

This file was deleted.

60 changes: 23 additions & 37 deletions prepare/preprocess_ppg.py
@@ -3,20 +3,22 @@
 import numpy as np
 import argparse
 import torch
-from tqdm import tqdm
-from multiprocessing import Pool
+import random
 from whisper.model import Whisper, ModelDimensions
 from whisper.audio import load_audio, pad_or_trim, log_mel_spectrogram
-from concurrent.futures import ThreadPoolExecutor, as_completed
 
 
 def load_model(path) -> Whisper:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    checkpoint = torch.load(path, map_location=device)
+    checkpoint = torch.load(path, map_location="cpu")
     dims = ModelDimensions(**checkpoint["dims"])
     print(dims)
     model = Whisper(dims)
-    model.load_state_dict(checkpoint["model_state_dict"])
+    del model.decoder
+    # cut = len(model.encoder.blocks) // 2
+    # cut = -1 * cut
+    # del model.encoder.blocks[cut:]
+    model.load_state_dict(checkpoint["model_state_dict"], strict=False)
     model.eval()
     model.half()
     model.to(device)
@@ -27,58 +29,42 @@ def pred_ppg(whisper: Whisper, wavPath, ppgPath):
     audio = load_audio(wavPath)
     audln = audio.shape[0]
     ppgln = audln // 320
-    audio = pad_or_trim(audio)
+    # audio = pad_or_trim(audio)
     mel = log_mel_spectrogram(audio).half().to(whisper.device)
     with torch.no_grad():
         ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
-        ppg = ppg[:ppgln,] # [length, dim=1024]
+        ppg = ppg[:ppgln,] # [length, dim=1280]
         # print(ppg.shape)
         np.save(ppgPath, ppg, allow_pickle=False)
 
-def process_file(file):
-    if file.endswith(".wav"):
-        file = file[:-4]
-        pred_ppg(whisper, f"{wavPath}/{spks}/{file}.wav", f"{ppgPath}/{spks}/{file}.ppg")
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.description = 'please enter embed parameter ...'
     parser.add_argument("-w", "--wav", help="wav", dest="wav")
     parser.add_argument("-p", "--ppg", help="ppg", dest="ppg")
-    parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1)
-
     args = parser.parse_args()
     print(args.wav)
     print(args.ppg)
-    if not os.path.exists(args.ppg):
-        os.makedirs(args.ppg)
-
+    os.makedirs(args.ppg, exist_ok=True)
     wavPath = args.wav
     ppgPath = args.ppg
-
     whisper = load_model(os.path.join("whisper_pretrain", "large-v2.pt"))
+    spkPaths = os.listdir(wavPath)
+    random.shuffle(spkPaths)
 
-    for spks in os.listdir(wavPath):
+    for spks in spkPaths:
         if os.path.isdir(f"./{wavPath}/{spks}"):
-            if not os.path.exists(f"./{ppgPath}/{spks}"):
-                os.makedirs(f"./{ppgPath}/{spks}")
+            os.makedirs(f"./{ppgPath}/{spks}", exist_ok=True)
             print(f">>>>>>>>>>{spks}<<<<<<<<<<")
-            if args.thread_count == 1:
-                for file in os.listdir(f"./{wavPath}/{spks}"):
-                    if file.endswith(".wav"):
-                        print(file)
-                        file = file[:-4]
-                        pred_ppg(whisper, f"{wavPath}/{spks}/{file}.wav", f"{ppgPath}/{spks}/{file}.ppg")
-            else:
-                if args.thread_count == 0:
-                    process_num = os.cpu_count()
-                else:
-                    process_num = args.thread_count
-                with ThreadPoolExecutor(max_workers=process_num) as executor:
-                    futures = [executor.submit(process_file, file) for file in os.listdir(f"./{wavPath}/{spks}")]
-                    for future in tqdm(as_completed(futures), total=len(futures)):
-                        pass
-                # with Pool(processes=process_num) as pool:
-                #     results = [pool.apply_async(process_file, (file,)) for file in os.listdir(f"./{wavPath}/{spks}")]
-                #     for result in tqdm(results, total=len(results)):
-                #         result.wait()
+            for file in os.listdir(f"./{wavPath}/{spks}"):
+                if file.endswith(".wav"):
+                    # print(file)
+                    file = file[:-4]
+                    path_wav = f"{wavPath}/{spks}/{file}.wav"
+                    path_ppg = f"{ppgPath}/{spks}/{file}.ppg"
+                    if os.path.isfile(f"{path_ppg}.npy"):
+                        continue
+                    pred_ppg(whisper, path_wav, path_ppg)
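The ppgln = audln // 320 bookkeeping follows from Whisper's geometry: log_mel_spectrogram uses a 160-sample hop at 16 kHz and the encoder's convolution stack halves the time axis, so one encoder frame covers 320 input samples; slicing ppg[:ppgln,] trims frames beyond the true signal length. The corrected dim=1280 is the encoder width of the large-v2 checkpoint loaded above. A worked check of the arithmetic (numbers only, no model call; the hop and downsampling factors are standard Whisper values, stated here as assumptions):

sr = 16000
audln = 7 * sr             # a 7-second clip: 112000 samples
mel_frames = audln // 160  # 700 mel frames
ppgln = audln // 320       # 350 encoder frames kept by ppg[:ppgln,]
assert ppgln == mel_frames // 2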
41 changes: 13 additions & 28 deletions svc_preprocessing.py
@@ -1,39 +1,24 @@
 import argparse
 import subprocess
+import signal
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-t", type=int, default=0, help="thread count")
-parser.add_argument("--crepe", action="store_true", help="Use crepe to extract f0")
 args = parser.parse_args()
 
-if args.crepe:
-    commands = [
-        "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000 -t 0",
-        "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000 -t 0",
-        "python prepare/preprocess_f0_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch -t "+str(args.t),
-        "python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper -t 1",
-        "python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert -t 1",
-        "python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker -t 0",
-        "python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer -t 0",
-        "python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs -t 0",
-        "python prepare/preprocess_train.py",
-        "python prepare/preprocess_zzz.py",
-    ]
-
-else:
-    commands = [
-        "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000 -t 0",
-        "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000 -t 0",
-        "python prepare/preprocess_f0.py -w data_svc/waves-16k/ -p data_svc/pitch -t "+str(args.t),
-        "python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper -t 1",
-        "python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert -t 1",
-        "python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker -t 0",
-        "python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer -t 0",
-        "python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs -t 0",
-        "python prepare/preprocess_train.py",
-        "python prepare/preprocess_zzz.py",
-    ]
+commands = [
+    "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000 -t 0",
+    "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000 -t 0",
+    "python prepare/preprocess_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch -t " + str(args.t),
+    "python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper",
+    "python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert -t 1",
+    "python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker -t 0",
+    "python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer -t 0",
+    "python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs -t 0",
+    "python prepare/preprocess_train.py",
+    "python prepare/preprocess_zzz.py",
+]
 
 
 for command in commands:
     print(f"Command: {command}")
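The loop body after the print is elided above; presumably each preprocessing step runs to completion before the next begins. A minimal sketch of that execution loop, assuming subprocess.run with shell=True and stop-on-failure (the repo's actual loop body is not shown in this diff):

import subprocess

for command in commands:
    print(f"Command: {command}")
    proc = subprocess.run(command, shell=True)
    if proc.returncode != 0:
        raise SystemExit(f"preprocessing step failed: {command}")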
2 changes: 1 addition & 1 deletion vits_decoder/generator.py
@@ -113,7 +113,7 @@ def __init__(self, hp):

     def forward(self, spk, x, f0):
         # Perturbation
-        x = x + torch.randn_like(x)
+        x = x + torch.randn_like(x)
         # adapter
         x = self.adapter(x, spk)
         x = self.conv_pre(x)
2 changes: 1 addition & 1 deletion vits_extend/dataloader.py
@@ -16,7 +16,7 @@ def create_dataloader_train(hps, n_gpus, rank):
         shuffle=True)
     train_loader = DataLoader(
         train_dataset,
-        num_workers=2,
+        num_workers=4,
         shuffle=False,
         pin_memory=True,
         collate_fn=collate_fn,
20 changes: 14 additions & 6 deletions whisper/inference.py
@@ -10,14 +10,22 @@

 def load_model(path) -> Whisper:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    checkpoint = torch.load(path, map_location=device)
+    checkpoint = torch.load(path, map_location="cpu")
     dims = ModelDimensions(**checkpoint["dims"])
     print(dims)
     model = Whisper(dims)
-    model.load_state_dict(checkpoint["model_state_dict"])
+    del model.decoder
+    # cut = len(model.encoder.blocks) // 2
+    # cut = -1 * cut
+    # del model.encoder.blocks[cut:]
+    model.load_state_dict(checkpoint["model_state_dict"], strict=False)
     model.eval()
     model.half()
     model.to(device)
+    # torch.save({
+    #     'dims': checkpoint["dims"],
+    #     'model_state_dict': model.state_dict(),
+    # }, "large-v2.pt")
     return model


@@ -26,10 +34,10 @@ def pred_ppg(whisper: Whisper, wavPath, ppgPath):
     audln = audio.shape[0]
     ppg_a = []
     idx_s = 0
-    while (idx_s + 25 * 16000 < audln):
-        short = audio[idx_s:idx_s + 25 * 16000]
-        idx_s = idx_s + 25 * 16000
-        ppgln = 25 * 16000 // 320
+    while (idx_s + 15 * 16000 < audln):
+        short = audio[idx_s:idx_s + 15 * 16000]
+        idx_s = idx_s + 15 * 16000
+        ppgln = 15 * 16000 // 320
         # short = pad_or_trim(short)
         mel = log_mel_spectrogram(short).half().to(whisper.device)
         with torch.no_grad():
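Each encoder pass now covers 15 s of 16 kHz audio: 15 * 16000 = 240,000 samples, or 750 encoder frames at 320 samples per frame, which presumably lowers the peak memory of each whisper.encoder call relative to the old 25 s window. The loop arithmetic as a standalone check (a sketch; the final partial chunk is presumably handled after the visible hunk):

sr, window_s = 16000, 15
chunk = window_s * sr         # 240000 samples per pass
assert chunk // 320 == 750    # encoder frames per full chunk

audln, idx_s, spans = 100 * sr, 0, []
while idx_s + chunk < audln:  # same condition as the loop above
    spans.append((idx_s, idx_s + chunk))
    idx_s += chunk
# samples [idx_s, audln) remain for a final shorter pass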
