Commit: only
MaxMax2016 committed Jul 4, 2023
1 parent 7cdc5af commit aa2a61e
Showing 10 changed files with 64 additions and 167 deletions.
7 changes: 2 additions & 5 deletions README.md
@@ -168,12 +168,9 @@ data_svc/

 > python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000
-- 2, Use the 16K audio to extract pitch. Note f0_ceil=900; change it to match the highest pitch in your data
-> python prepare/preprocess_f0.py -w data_svc/waves-16k/ -p data_svc/pitch
+- 2, Use the 16K audio to extract pitch
 
-Use the command below to process low-quality audio
 
-> python prepare/preprocess_f0_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch
+> python prepare/preprocess_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch
 - 3, Use the 16k audio to extract the content encoding
 > python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper
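For a single file, the pitch step above reduces to the call sequence below. A minimal sketch, assuming the pip torchcrepe package (the repo's vendored crepe module, used in the hunks that follow, appears to expose the same predict/filter API); compute_f0_single is an illustrative name, not repo code.

import librosa
import numpy as np
import torch
import torchcrepe

def compute_f0_single(wav_path, device="cpu"):
    # 16 kHz mono input, matching data_svc/waves-16k
    audio, sr = librosa.load(wav_path, sr=16000)
    audio = torch.tensor(np.copy(audio))[None]
    # hop 320 at 16 kHz = 20 ms frames; 50-1000 Hz covers speech and most singing
    pitch, periodicity = torchcrepe.predict(
        audio, sr, 320, 50, 1000, "tiny",
        batch_size=512, device=device, return_periodicity=True)
    # smooth, then zero out unvoiced frames, as preprocess_crepe.py does below
    periodicity = torchcrepe.filter.median(periodicity, 9)
    pitch = torchcrepe.filter.mean(pitch, 5)
    pitch[periodicity < 0.05] = 0
    return pitch.squeeze(0).numpy()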
11 changes: 1 addition & 10 deletions hubert/inference.py
@@ -16,16 +16,6 @@ def load_model(path, device):
     return model
 
 
-def pred_all(model, wavPath, vecPath, device):
-    feats = load_audio(wavPath)
-    feats = torch.from_numpy(feats).to(device)
-    feats = feats[None, None, :].half()
-    with torch.no_grad():
-        vec = model.units(feats).squeeze().data.cpu().float().numpy()
-        # print(vec.shape) # [length, dim=256] hop=320
-        np.save(vecPath, vec, allow_pickle=False)
-
-
 def pred_vec(model, wavPath, vecPath, device):
     audio = load_audio(wavPath)
     audln = audio.shape[0]
@@ -45,6 +35,7 @@ def pred_vec(model, wavPath, vecPath, device):
         feats = feats[None, None, :].half()
         with torch.no_grad():
             vec = model.units(feats).squeeze().data.cpu().float().numpy()
+            # print(vec.shape) # [length, dim=256] hop=320
             vec_a.extend(vec)
     np.save(vecPath, vec_a, allow_pickle=False)

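With pred_all removed, pred_vec is the file's only entry point; its middle is elided above, but the visible lines fix the pattern: long audio is cut into chunks, each chunk runs through model.units in fp16, and the per-chunk vectors are concatenated. A self-contained sketch of that pattern (units_chunked and the 10-second chunk_len are illustrative assumptions; the repo's actual chunk size is not visible in this diff):

import numpy as np
import torch

def units_chunked(model, audio, device, chunk_len=10 * 16000):
    vec_a = []
    for start in range(0, audio.shape[0], chunk_len):
        feats = torch.from_numpy(audio[start:start + chunk_len]).to(device)
        feats = feats[None, None, :].half()  # [1, 1, samples], fp16
        with torch.no_grad():
            # one 256-dim vector per 320 input samples (20 ms at 16 kHz)
            vec = model.units(feats).squeeze().data.cpu().float().numpy()
        vec_a.extend(vec)
    return np.asarray(vec_a)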
14 changes: 2 additions & 12 deletions pitch/inference.py
@@ -10,20 +10,14 @@
 def compute_f0_nn(filename, device):
     audio, sr = librosa.load(filename, sr=16000)
     assert sr == 16000
-    # Load audio
     audio = torch.tensor(np.copy(audio))[None]
-    # Here we'll use a 20 millisecond hop length
     hop_length = 320
-    # Provide a sensible frequency range for your domain (upper limit is 2006 Hz)
-    # This would be a reasonable range for speech
     fmin = 50
     fmax = 1000
-    # Select a model capacity--one of "tiny" or "full"
     model = "tiny"
-    # Pick a batch size that doesn't cause memory errors on your gpu
     batch_size = 512
     # Compute pitch using first gpu
-    pitch, periodicity = crepe.predict(
+    pitch = crepe.predict(
         audio,
         sr,
         hop_length,
@@ -32,14 +26,10 @@ def compute_f0_nn(filename, device):
         model,
         batch_size=batch_size,
         device=device,
-        return_periodicity=True,
+        return_periodicity=False,
     )
     pitch = np.repeat(pitch, 2, -1) # 320 -> 160 * 2
-    periodicity = np.repeat(periodicity, 2, -1) # 320 -> 160 * 2
-    # CREPE was not trained on silent audio. some error on silent need filter.pitPath
-    periodicity = crepe.filter.median(periodicity, 9)
     pitch = crepe.filter.mean(pitch, 5)
-    # pitch[periodicity < 0.1] = 0
     pitch = pitch.squeeze(0)
     return pitch

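The "# 320 -> 160 * 2" comment is frame-rate bookkeeping: crepe emits one value per 320 samples (20 ms at 16 kHz), and np.repeat(..., 2, -1) duplicates each frame so the pitch track aligns with features computed at a 160-sample (10 ms) hop. A shape check of that arithmetic (a standalone sketch, not repo code):

import numpy as np

sr, seconds = 16000, 2.0
n_20ms = int(sr * seconds) // 320     # 100 crepe frames, 20 ms each
pitch = np.zeros((1, n_20ms))
pitch = np.repeat(pitch, 2, -1)       # 320 -> 160 * 2
assert pitch.shape[-1] == int(sr * seconds) // 160  # 200 frames, 10 ms each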
13 changes: 7 additions & 6 deletions prepare/preprocess_f0_crepe.py → prepare/preprocess_crepe.py
@@ -1,8 +1,9 @@
-import os
+import sys,os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import numpy as np
 import librosa
 import torch
-import torchcrepe
+import crepe
 import argparse
 from tqdm import tqdm
 from multiprocessing import set_start_method
@@ -20,11 +21,11 @@ def compute_f0(filename, save, device):
     fmin = 50
     fmax = 1000
     # Select a model capacity--one of "tiny" or "full"
-    model = "full"
+    model = "tiny"
     # Pick a batch size that doesn't cause memory errors on your gpu
     batch_size = 512
     # Compute pitch using first gpu
-    pitch, periodicity = torchcrepe.predict(
+    pitch, periodicity = crepe.predict(
         audio,
         sr,
         hop_length,
@@ -38,8 +39,8 @@ def compute_f0(filename, save, device):
     pitch = np.repeat(pitch, 2, -1) # 320 -> 160 * 2
     periodicity = np.repeat(periodicity, 2, -1) # 320 -> 160 * 2
     # CREPE was not trained on silent audio. some error on silent need filter.pitPath
-    periodicity = torchcrepe.filter.median(periodicity, 9)
-    pitch = torchcrepe.filter.mean(pitch, 5)
+    periodicity = crepe.filter.median(periodicity, 9)
+    pitch = crepe.filter.mean(pitch, 5)
     pitch[periodicity < 0.05] = 0
     pitch = pitch.squeeze(0)
     np.save(save, pitch, allow_pickle=False)
61 changes: 0 additions & 61 deletions prepare/preprocess_f0.py

This file was deleted.

60 changes: 23 additions & 37 deletions prepare/preprocess_ppg.py
@@ -3,20 +3,22 @@
 import numpy as np
 import argparse
 import torch
-from tqdm import tqdm
-from multiprocessing import Pool
+import random
 from whisper.model import Whisper, ModelDimensions
 from whisper.audio import load_audio, pad_or_trim, log_mel_spectrogram
-from concurrent.futures import ThreadPoolExecutor, as_completed
 
 
 def load_model(path) -> Whisper:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    checkpoint = torch.load(path, map_location=device)
+    checkpoint = torch.load(path, map_location="cpu")
     dims = ModelDimensions(**checkpoint["dims"])
     print(dims)
     model = Whisper(dims)
-    model.load_state_dict(checkpoint["model_state_dict"])
+    del model.decoder
+    # cut = len(model.encoder.blocks) // 2
+    # cut = -1 * cut
+    # del model.encoder.blocks[cut:]
+    model.load_state_dict(checkpoint["model_state_dict"], strict=False)
     model.eval()
     model.half()
     model.to(device)
@@ -27,58 +29,42 @@ def pred_ppg(whisper: Whisper, wavPath, ppgPath):
     audio = load_audio(wavPath)
     audln = audio.shape[0]
     ppgln = audln // 320
-    audio = pad_or_trim(audio)
+    # audio = pad_or_trim(audio)
     mel = log_mel_spectrogram(audio).half().to(whisper.device)
     with torch.no_grad():
         ppg = whisper.encoder(mel.unsqueeze(0)).squeeze().data.cpu().float().numpy()
-        ppg = ppg[:ppgln,] # [length, dim=1024]
+        ppg = ppg[:ppgln,] # [length, dim=1280]
         # print(ppg.shape)
         np.save(ppgPath, ppg, allow_pickle=False)
 
-def process_file(file):
-    if file.endswith(".wav"):
-        file = file[:-4]
-        pred_ppg(whisper, f"{wavPath}/{spks}/{file}.wav", f"{ppgPath}/{spks}/{file}.ppg")
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.description = 'please enter embed parameter ...'
     parser.add_argument("-w", "--wav", help="wav", dest="wav")
     parser.add_argument("-p", "--ppg", help="ppg", dest="ppg")
-    parser.add_argument("-t", "--thread_count", help="thread count to process, set 0 to use all cpu cores", dest="thread_count", type=int, default=1)
-
     args = parser.parse_args()
     print(args.wav)
     print(args.ppg)
-    if not os.path.exists(args.ppg):
-        os.makedirs(args.ppg)
-
+    os.makedirs(args.ppg, exist_ok=True)
     wavPath = args.wav
     ppgPath = args.ppg
-
     whisper = load_model(os.path.join("whisper_pretrain", "large-v2.pt"))
+    spkPaths = os.listdir(wavPath)
+    random.shuffle(spkPaths)
 
-    for spks in os.listdir(wavPath):
+    for spks in spkPaths:
         if os.path.isdir(f"./{wavPath}/{spks}"):
-            if not os.path.exists(f"./{ppgPath}/{spks}"):
-                os.makedirs(f"./{ppgPath}/{spks}")
+            os.makedirs(f"./{ppgPath}/{spks}", exist_ok=True)
             print(f">>>>>>>>>>{spks}<<<<<<<<<<")
-            if args.thread_count == 1:
-                for file in os.listdir(f"./{wavPath}/{spks}"):
-                    if file.endswith(".wav"):
-                        print(file)
-                        file = file[:-4]
-                        pred_ppg(whisper, f"{wavPath}/{spks}/{file}.wav", f"{ppgPath}/{spks}/{file}.ppg")
-            else:
-                if args.thread_count == 0:
-                    process_num = os.cpu_count()
-                else:
-                    process_num = args.thread_count
-                with ThreadPoolExecutor(max_workers=process_num) as executor:
-                    futures = [executor.submit(process_file, file) for file in os.listdir(f"./{wavPath}/{spks}")]
-                    for future in tqdm(as_completed(futures), total=len(futures)):
-                        pass
-                # with Pool(processes=process_num) as pool:
-                #     results = [pool.apply_async(process_file, (file,)) for file in os.listdir(f"./{wavPath}/{spks}")]
-                #     for result in tqdm(results, total=len(results)):
-                #         result.wait()
+            for file in os.listdir(f"./{wavPath}/{spks}"):
+                if file.endswith(".wav"):
+                    # print(file)
+                    file = file[:-4]
+                    path_wav = f"{wavPath}/{spks}/{file}.wav"
+                    path_ppg = f"{ppgPath}/{spks}/{file}.ppg"
+                    if os.path.isfile(f"{path_ppg}.npy"):
+                        continue
+                    pred_ppg(whisper, path_wav, path_ppg)
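The ppgln = audln // 320 bookkeeping follows from Whisper's geometry: log_mel_spectrogram uses a 160-sample hop at 16 kHz and the encoder's convolution stack halves the time axis, so one encoder frame covers 320 input samples; slicing ppg[:ppgln,] trims frames beyond the true signal length. The corrected dim=1280 is the encoder width of the large-v2 checkpoint loaded above. A worked check of the arithmetic (numbers only, no model call; the hop and downsampling factors are standard Whisper values, stated here as assumptions):

sr = 16000
audln = 7 * sr             # a 7-second clip: 112000 samples
mel_frames = audln // 160  # 700 mel frames
ppgln = audln // 320       # 350 encoder frames kept by ppg[:ppgln,]
assert ppgln == mel_frames // 2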
41 changes: 13 additions & 28 deletions svc_preprocessing.py
@@ -1,39 +1,24 @@
 import argparse
 import subprocess
+import signal
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-t", type=int, default=0, help="thread count")
-parser.add_argument("--crepe", action="store_true", help="Use crepe to extract f0")
 args = parser.parse_args()
 
-if args.crepe:
-    commands = [
-        "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000 -t 0",
-        "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000 -t 0",
-        "python prepare/preprocess_f0_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch -t "+str(args.t),
-        "python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper -t 1",
-        "python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert -t 1",
-        "python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker -t 0",
-        "python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer -t 0",
-        "python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs -t 0",
-        "python prepare/preprocess_train.py",
-        "python prepare/preprocess_zzz.py",
-    ]
-
-else:
-    commands = [
-        "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000 -t 0",
-        "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000 -t 0",
-        "python prepare/preprocess_f0.py -w data_svc/waves-16k/ -p data_svc/pitch -t "+str(args.t),
-        "python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper -t 1",
-        "python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert -t 1",
-        "python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker -t 0",
-        "python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer -t 0",
-        "python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs -t 0",
-        "python prepare/preprocess_train.py",
-        "python prepare/preprocess_zzz.py",
-    ]
+commands = [
+    "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-16k -s 16000 -t 0",
+    "python prepare/preprocess_a.py -w ./dataset_raw -o ./data_svc/waves-32k -s 32000 -t 0",
+    "python prepare/preprocess_crepe.py -w data_svc/waves-16k/ -p data_svc/pitch -t " + str(args.t),
+    "python prepare/preprocess_ppg.py -w data_svc/waves-16k/ -p data_svc/whisper",
+    "python prepare/preprocess_hubert.py -w data_svc/waves-16k/ -v data_svc/hubert -t 1",
+    "python prepare/preprocess_speaker.py data_svc/waves-16k/ data_svc/speaker -t 0",
+    "python prepare/preprocess_speaker_ave.py data_svc/speaker/ data_svc/singer -t 0",
+    "python prepare/preprocess_spec.py -w data_svc/waves-32k/ -s data_svc/specs -t 0",
+    "python prepare/preprocess_train.py",
+    "python prepare/preprocess_zzz.py",
+]
 
 
 for command in commands:
     print(f"Command: {command}")
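The loop body after the print is elided above; presumably each preprocessing step runs to completion before the next begins. A minimal sketch of that execution loop, assuming subprocess.run with shell=True and stop-on-failure (the repo's actual loop body is not shown in this diff):

import subprocess

for command in commands:
    print(f"Command: {command}")
    proc = subprocess.run(command, shell=True)
    if proc.returncode != 0:
        raise SystemExit(f"preprocessing step failed: {command}")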
2 changes: 1 addition & 1 deletion vits_decoder/generator.py
@@ -113,7 +113,7 @@ def __init__(self, hp):

     def forward(self, spk, x, f0):
         # Perturbation
-        x = x + torch.randn_like(x)
+        x = x + torch.randn_like(x)
         # adapter
         x = self.adapter(x, spk)
         x = self.conv_pre(x)
2 changes: 1 addition & 1 deletion vits_extend/dataloader.py
@@ -16,7 +16,7 @@ def create_dataloader_train(hps, n_gpus, rank):
         shuffle=True)
     train_loader = DataLoader(
         train_dataset,
-        num_workers=2,
+        num_workers=4,
         shuffle=False,
         pin_memory=True,
         collate_fn=collate_fn,
20 changes: 14 additions & 6 deletions whisper/inference.py
@@ -10,14 +10,22 @@

 def load_model(path) -> Whisper:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    checkpoint = torch.load(path, map_location=device)
+    checkpoint = torch.load(path, map_location="cpu")
     dims = ModelDimensions(**checkpoint["dims"])
     print(dims)
     model = Whisper(dims)
-    model.load_state_dict(checkpoint["model_state_dict"])
+    del model.decoder
+    # cut = len(model.encoder.blocks) // 2
+    # cut = -1 * cut
+    # del model.encoder.blocks[cut:]
+    model.load_state_dict(checkpoint["model_state_dict"], strict=False)
     model.eval()
     model.half()
     model.to(device)
+    # torch.save({
+    #     'dims': checkpoint["dims"],
+    #     'model_state_dict': model.state_dict(),
+    # }, "large-v2.pt")
     return model


@@ -26,10 +34,10 @@ def pred_ppg(whisper: Whisper, wavPath, ppgPath):
     audln = audio.shape[0]
     ppg_a = []
     idx_s = 0
-    while (idx_s + 25 * 16000 < audln):
-        short = audio[idx_s:idx_s + 25 * 16000]
-        idx_s = idx_s + 25 * 16000
-        ppgln = 25 * 16000 // 320
+    while (idx_s + 15 * 16000 < audln):
+        short = audio[idx_s:idx_s + 15 * 16000]
+        idx_s = idx_s + 15 * 16000
+        ppgln = 15 * 16000 // 320
         # short = pad_or_trim(short)
         mel = log_mel_spectrogram(short).half().to(whisper.device)
         with torch.no_grad():
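Each encoder pass now covers 15 s of 16 kHz audio: 15 * 16000 = 240,000 samples, or 750 encoder frames at 320 samples per frame, which presumably lowers the peak memory of each whisper.encoder call relative to the old 25 s window. The loop arithmetic as a standalone check (a sketch; the final partial chunk is presumably handled after the visible hunk):

sr, window_s = 16000, 15
chunk = window_s * sr         # 240000 samples per pass
assert chunk // 320 == 750    # encoder frames per full chunk

audln, idx_s, spans = 100 * sr, 0, []
while idx_s + chunk < audln:  # same condition as the loop above
    spans.append((idx_s, idx_s + chunk))
    idx_s += chunk
# samples [idx_s, audln) remain for a final shorter pass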
