preprocess

332plim · Apr 15, 2023 · af11d15 · af11d15
1 parent ec3f7be
commit af11d15
Show file tree

Hide file tree

Showing 16 changed files with 768 additions and 471 deletions.
diff --git a/configs/base.yaml b/configs/base.yaml
@@ -1,4 +1,5 @@
 train:
+  model: "sovits"
   fp16_run: False
   log_interval: 200
   eval_interval: 1000
@@ -11,14 +12,14 @@ train:
   warmup_epochs: 0
   eps: 1e-9
   batch_size: 12
-  segment_size: 8000  # WARNING: base on hop_length
   c_mel: 45
   c_kl: 1.0
   port: 8001
 #############################
 data: 
-  training_files: "filelists/train.txt"
-  validation_files: "filelists/val.txt"
+  training_files: "files/train.txt"
+  validation_files: "files/valid.txt"
+  segment_size: 8000  # WARNING: based on hop_length
   max_wav_value: 32768.0
   sampling_rate: 16000
   filter_length: 512

diff --git a/prepare/preprocess_f0.py b/prepare/preprocess_f0.py
@@ -0,0 +1,49 @@
+import os
+import numpy as np
+import librosa
+import pyworld
+import argparse
+
+
+def compute_f0(path, save):
+    """Extract an F0 contour from a wav file and store it with ``np.save``.
+
+    The audio is loaded at 16 kHz, a first F0 estimate is computed with
+    ``pyworld.dio`` (one frame every 160 samples), refined with
+    ``pyworld.stonemask`` and rounded to one decimal place.
+
+    Args:
+        path: Path of the input wav file.
+        save: Output path handed to ``np.save``; NumPy appends ``.npy`` when
+            the name does not already end with it.
+    """
+    x, sr = librosa.load(path, sr=16000)
+    assert sr == 16000
+    # Convert to float64 once and reuse the result for both pyworld calls.
+    x = x.astype(np.double)
+    f0, t = pyworld.dio(
+        x,
+        fs=sr,
+        f0_ceil=900,
+        frame_period=1000 * 160 / sr,  # 160-sample hop expressed in ms
+    )
+    f0 = pyworld.stonemask(x, f0, t, fs=sr)
+    # Vectorised rounding replaces the per-element Python loop.
+    f0 = np.round(f0, 1)
+    np.save(save, f0, allow_pickle=False)
+
+
+if __name__ == "__main__":
+    # Usage: -w <wav directory> -p <output directory for the pitch files>.
+    parser = argparse.ArgumentParser()
+    parser.description = 'please enter embed parameter ...'
+    # Both paths are mandatory; let argparse report a missing one.
+    parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True)
+    parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True)
+    args = parser.parse_args()
+    print(args.wav)
+    print(args.pit)
+    wavPath = args.wav
+    pitPath = args.pit
+    # exist_ok allows re-running the script on an existing output tree.
+    os.makedirs(pitPath, exist_ok=True)
+
+    for spks in os.listdir(wavPath):
+        # os.path.join (instead of a "./" prefix) keeps absolute paths intact.
+        entry = os.path.join(wavPath, spks)
+        if os.path.isdir(entry):
+            # Sub-directory: mirror it in the output tree and process its wavs.
+            os.makedirs(os.path.join(pitPath, spks), exist_ok=True)
+            print(f">>>>>>>>>>{spks}<<<<<<<<<<")
+            for file in os.listdir(entry):
+                if file.endswith(".wav"):
+                    stem = file[:-4]
+                    compute_f0(
+                        os.path.join(entry, file),
+                        os.path.join(pitPath, spks, f"{stem}.nsf"),
+                    )
+        elif spks.endswith(".wav"):
+            # Wav file located directly in the input directory.
+            compute_f0(entry, os.path.join(pitPath, f"{spks[:-4]}.nsf"))
diff --git a/prepare/preprocess_ppg.py b/prepare/preprocess_ppg.py
@@ -0,0 +1,60 @@
+import os
+import numpy as np
+import argparse
+import torch
+
+from whisper.model import Whisper, ModelDimensions
+from whisper.audio import load_audio, pad_or_trim, log_mel_spectrogram
+
+
+def load_model(path) -> Whisper:
+    """Build a Whisper model from the checkpoint stored at ``path``.
+
+    The checkpoint must provide a ``"dims"`` mapping (expanded into
+    ``ModelDimensions``) and a ``"model_state_dict"`` entry. The model is
+    placed on CUDA when available, otherwise on the CPU.
+    """
+    if torch.cuda.is_available():
+        target = "cuda"
+    else:
+        target = "cpu"
+    state = torch.load(path, map_location=target)
+    net = Whisper(ModelDimensions(**state["dims"]))
+    net.load_state_dict(state["model_state_dict"])
+    net = net.to(target)
+    return net
+
+
+def pred_ppg(whisper: Whisper, wavPath, ppgPath):
+    """Run the Whisper encoder on one wav file and save the result.
+
+    Args:
+        whisper: Loaded Whisper model; its encoder and device are used.
+        wavPath: Path of the input audio file.
+        ppgPath: Output path handed to ``np.save``.
+    """
+    samples = load_audio(wavPath)
+    # Number of encoder frames kept for this file: one per 320 audio samples.
+    n_frames = samples.shape[0] // 320
+    features = log_mel_spectrogram(samples).to(whisper.device)
+    with torch.no_grad():
+        encoded = whisper.encoder(features.unsqueeze(0))
+        ppg = encoded.squeeze().data.cpu().float().numpy()
+        ppg = ppg[:n_frames,]  # [length, dim=1024]
+        print(ppg.shape)
+        np.save(ppgPath, ppg, allow_pickle=False)
+
+
+if __name__ == "__main__":
+    # Usage: -w <wav directory> -p <output directory for the ppg files>.
+    parser = argparse.ArgumentParser()
+    parser.description = 'please enter embed parameter ...'
+    # Both paths are mandatory; let argparse report a missing one.
+    parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True)
+    parser.add_argument("-p", "--ppg", help="ppg", dest="ppg", required=True)
+    args = parser.parse_args()
+    print(args.wav)
+    print(args.ppg)
+    wavPath = args.wav
+    ppgPath = args.ppg
+    # exist_ok allows re-running the script on an existing output tree.
+    os.makedirs(ppgPath, exist_ok=True)
+
+    whisper = load_model(os.path.join("whisper_pretrain", "medium.pt"))
+
+    for spks in os.listdir(wavPath):
+        # os.path.join (instead of a "./" prefix) keeps absolute paths intact.
+        entry = os.path.join(wavPath, spks)
+        if os.path.isdir(entry):
+            # Sub-directory: mirror it in the output tree and process its wavs.
+            os.makedirs(os.path.join(ppgPath, spks), exist_ok=True)
+            print(f">>>>>>>>>>{spks}<<<<<<<<<<")
+            for file in os.listdir(entry):
+                if file.endswith(".wav"):
+                    stem = file[:-4]
+                    pred_ppg(
+                        whisper,
+                        os.path.join(entry, file),
+                        os.path.join(ppgPath, spks, f"{stem}.ppg"),
+                    )
+        elif spks.endswith(".wav"):
+            # Wav file located directly in the input directory.
+            pred_ppg(whisper, entry, os.path.join(ppgPath, f"{spks[:-4]}.ppg"))
diff --git a/prepare/preprocess_speaker.py b/prepare/preprocess_speaker.py
@@ -0,0 +1,84 @@
+import os
+import torch
+import numpy as np
+import argparse
+
+from tqdm import tqdm
+from argparse import RawTextHelpFormatter
+from speaker.models.lstm import LSTMSpeakerEncoder
+from speaker.config import SpeakerEncoderConfig
+
+from speaker.utils.audio import AudioProcessor
+from speaker.infer import read_json
+
+
+def get_spk_wavs(dataset_path, output_path):
+    """Collect the wav files of a dataset and create the output folders.
+
+    ``dataset_path`` may hold wav files directly and/or sub-directories
+    containing wav files. ``output_path`` is created, together with one
+    sub-directory for every sub-directory found in ``dataset_path``.
+
+    Args:
+        dataset_path: Directory scanned for wav files.
+        output_path: Directory in which the mirrored folders are created.
+
+    Returns:
+        List of wav paths, each of the form ``./{dataset_path}/...``.
+    """
+    wav_files = []
+    # exist_ok allows re-running on an already populated output tree.
+    os.makedirs(f"./{output_path}", exist_ok=True)
+    for spks in os.listdir(dataset_path):
+        entry = f"./{dataset_path}/{spks}"
+        if os.path.isdir(entry):
+            os.makedirs(f"./{output_path}/{spks}", exist_ok=True)
+            wav_files.extend(
+                f"{entry}/{file}"
+                for file in os.listdir(entry)
+                if file.endswith(".wav")
+            )
+        elif spks.endswith(".wav"):
+            wav_files.append(entry)
+    return wav_files
+
+
+if __name__ == "__main__":
+    # Computes one speaker embedding per wav file of a dataset and stores it
+    # next to the mirrored path below output_path.
+
+    parser = argparse.ArgumentParser(
+        description="""Compute embedding vectors for each wav file in a dataset.""",
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument("dataset_path", type=str, help="Path to dataset waves.")
+    parser.add_argument(
+        "output_path", type=str, help="path for output speaker/speaker_wavs.npy."
+    )
+    # type=bool would turn every non-empty string (including "False") into
+    # True, so the textual value is interpreted explicitly instead.
+    parser.add_argument(
+        "--use_cuda",
+        type=lambda value: value.strip().lower()
+        not in ("", "0", "false", "no", "off"),
+        help="flag to set cuda.",
+        default=True,
+    )
+    args = parser.parse_args()
+    dataset_path = args.dataset_path
+    output_path = args.output_path
+    # Fall back to the CPU when CUDA is requested but not available.
+    use_cuda = bool(args.use_cuda) and torch.cuda.is_available()
+    # model
+    args.model_path = os.path.join("speaker_pretrain", "best_model.pth.tar")
+    args.config_path = os.path.join("speaker_pretrain", "config.json")
+    # config
+    config_dict = read_json(args.config_path)
+
+    # model
+    config = SpeakerEncoderConfig(config_dict)
+    config.from_dict(config_dict)
+
+    speaker_encoder = LSTMSpeakerEncoder(
+        config.model_params["input_dim"],
+        config.model_params["proj_dim"],
+        config.model_params["lstm_dim"],
+        config.model_params["num_lstm_layers"],
+    )
+
+    speaker_encoder.load_checkpoint(args.model_path, eval=True, use_cuda=use_cuda)
+
+    # preprocess
+    speaker_encoder_ap = AudioProcessor(**config.audio)
+    # normalize the input audio level and trim silences
+    speaker_encoder_ap.do_sound_norm = True
+    speaker_encoder_ap.do_trim_silence = True
+
+    wav_files = get_spk_wavs(dataset_path, output_path)
+
+    # compute speaker embeddings
+    for wav_file in tqdm(wav_files):
+        waveform = speaker_encoder_ap.load_wav(
+            wav_file, sr=speaker_encoder_ap.sample_rate
+        )
+        spec = speaker_encoder_ap.melspectrogram(waveform)
+        spec = torch.from_numpy(spec.T)
+        if use_cuda:
+            spec = spec.cuda()
+        spec = spec.unsqueeze(0)
+        embed = speaker_encoder.compute_embedding(spec).detach().cpu().numpy()
+        embed = embed.squeeze()
+        # Swap only the first dataset_path occurrence and only the trailing
+        # extension, so other parts of the path are never rewritten.
+        embed_path = wav_file.replace(dataset_path, output_path, 1)
+        embed_path = os.path.splitext(embed_path)[0] + ".spk"
+        # np.save appends ".npy", so the file on disk ends with ".spk.npy".
+        np.save(embed_path, embed, allow_pickle=False)
diff --git a/prepare/preprocess_spec.py b/prepare/preprocess_spec.py
@@ -0,0 +1,62 @@
+import os
+import torch
+import argparse
+
+from vits import spectrogram
+from vits import utils
+from omegaconf import OmegaConf
+
+
+def compute_spec(hps, filename, specname):
+    """Compute the spectrogram of a wav file and store it with ``torch.save``.
+
+    Args:
+        hps: Configuration object providing ``sampling_rate``,
+            ``max_wav_value``, ``filter_length``, ``hop_length`` and
+            ``win_length``.
+        filename: Path of the input wav file.
+        specname: Path of the spectrogram file to write.
+
+    Raises:
+        ValueError: If the file's sampling rate differs from
+            ``hps.sampling_rate``.
+    """
+    audio, sampling_rate = utils.load_wav_to_torch(filename)
+    if sampling_rate != hps.sampling_rate:
+        # The template has three placeholders; the file name fills the first
+        # one (with only two arguments str.format raised IndexError here).
+        raise ValueError(
+            "{} {} SR doesn't match target {} SR".format(
+                filename, sampling_rate, hps.sampling_rate
+            )
+        )
+    # Scale the samples by max_wav_value and add a leading dimension.
+    audio_norm = audio / hps.max_wav_value
+    audio_norm = audio_norm.unsqueeze(0)
+    n_fft = hps.filter_length
+    sampling_rate = hps.sampling_rate
+    hop_size = hps.hop_length
+    win_size = hps.win_length
+    spec = spectrogram.spectrogram_torch(
+        audio_norm, n_fft, sampling_rate, hop_size, win_size, center=False)
+    # Drop the leading dimension again before saving.
+    spec = torch.squeeze(spec, 0)
+    torch.save(spec, specname)
+
+
+if __name__ == "__main__":
+    # Usage: -w <wav directory> -s <output directory for the spectrograms>.
+    parser = argparse.ArgumentParser()
+    parser.description = 'please enter embed parameter ...'
+    # Both paths are mandatory; let argparse report a missing one.
+    parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True)
+    parser.add_argument("-s", "--spe", help="spe", dest="spe", required=True)
+    args = parser.parse_args()
+    print(args.wav)
+    print(args.spe)
+    wavPath = args.wav
+    spePath = args.spe
+    # exist_ok allows re-running the script on an existing output tree.
+    os.makedirs(spePath, exist_ok=True)
+    hps = OmegaConf.load("./configs/base.yaml")
+
+    for spks in os.listdir(wavPath):
+        # os.path.join (instead of a "./" prefix) keeps absolute paths intact.
+        entry = os.path.join(wavPath, spks)
+        if os.path.isdir(entry):
+            # Sub-directory: mirror it in the output tree and process its wavs.
+            os.makedirs(os.path.join(spePath, spks), exist_ok=True)
+            print(f">>>>>>>>>>{spks}<<<<<<<<<<")
+            for file in os.listdir(entry):
+                if file.endswith(".wav"):
+                    stem = file[:-4]
+                    compute_spec(
+                        hps.data,
+                        os.path.join(entry, file),
+                        os.path.join(spePath, spks, f"{stem}.pt"),
+                    )
+        elif spks.endswith(".wav"):
+            # Wav file located directly in the input directory.
+            compute_spec(
+                hps.data, entry, os.path.join(spePath, f"{spks[:-4]}.pt"))
+
+
+
+
+
diff --git a/prepare/preprocess_train.py b/prepare/preprocess_train.py
@@ -0,0 +1,56 @@
+import os
+import random
+
+
+if __name__ == "__main__":
+    # Builds ./files/valid.txt and ./files/train.txt. Each line lists, for one
+    # utterance: wave|spec|pitch|whisper|speaker feature paths.
+    os.makedirs("./files/", exist_ok=True)
+
+    rootPath = "./data_svc/waves/"
+
+    # Collect "<sub-directory>/<stem>" for wavs inside a sub-directory and
+    # "<stem>" for wavs placed directly in rootPath.
+    stems = []
+    for spks in os.listdir(rootPath):
+        entry = os.path.join(rootPath, spks)
+        if os.path.isdir(entry):
+            stems.extend(
+                f"{spks}/{file[:-4]}"
+                for file in os.listdir(entry)
+                if file.endswith(".wav")
+            )
+        elif spks.endswith(".wav"):
+            stems.append(spks[:-4])
+
+    all_items = []
+    for stem in stems:
+        paths = [
+            f"./data_svc/waves/{stem}.wav",
+            f"./data_svc/specs/{stem}.pt",
+            f"./data_svc/pitch/{stem}.nsf.npy",
+            f"./data_svc/whisper/{stem}.ppg.npy",
+            # np.save appends ".npy" to the "*.spk" name used when the
+            # speaker embeddings are written, hence ".spk.npy".
+            f"./data_svc/speaker/{stem}.spk.npy",
+        ]
+        for path in paths:
+            # Explicit check: assert statements are stripped under "python -O".
+            if not os.path.isfile(path):
+                raise FileNotFoundError(f"missing preprocessed file: {path}")
+        all_items.append("|".join(paths))
+
+    random.shuffle(all_items)
+    # The first 50 shuffled items form the validation list, the rest the
+    # training list (which is empty when there are 50 items or fewer).
+    valids = all_items[:50]
+    valids.sort()
+    trains = all_items[50:]
+    with open("./files/valid.txt", "w", encoding="utf-8") as fw:
+        fw.writelines(f"{strs}\n" for strs in valids)
+    with open("./files/train.txt", "w", encoding="utf-8") as fw:
+        fw.writelines(f"{strs}\n" for strs in trains)
diff --git a/prepare/preprocess_zzz.py b/prepare/preprocess_zzz.py
@@ -0,0 +1,29 @@
+from tqdm import tqdm
+from torch.utils.data import DataLoader
+from omegaconf import OmegaConf
+from vits.data_utils import TextAudioSpeakerSet
+from vits.data_utils import TextAudioSpeakerCollate
+from vits.data_utils import DistributedBucketSampler
+
+
+# Smoke test for the generated file list: iterate once over the dataset
+# itself, then once over a DataLoader built on top of it.
+hps = OmegaConf.load("./configs/base.yaml")
+dataset = TextAudioSpeakerSet("files/valid.txt", hps.data)
+
+# Pass 1: visit every dataset item individually.
+for _ in tqdm(dataset):
+    continue
+
+
+collate_fn = TextAudioSpeakerCollate()
+# NOTE(review): 4 and [150, 300, 450] are presumably the batch size and the
+# bucket boundaries of the sampler; confirm against DistributedBucketSampler.
+sampler = DistributedBucketSampler(
+    dataset, 4, [150, 300, 450], num_replicas=1, rank=0, shuffle=True
+)
+loader = DataLoader(
+    dataset,
+    batch_sampler=sampler,
+    collate_fn=collate_fn,
+    num_workers=0,
+    pin_memory=True,
+    shuffle=False,
+)
+
+
+# Pass 2: visit every batch produced by the sampler and collate function.
+for _ in tqdm(loader):
+    continue