Skip to content

Commit

Permalink
Switch to logging for print statements
Browse files Browse the repository at this point in the history
  • Loading branch information
Unal Ege Gaznepoglu committed Dec 22, 2023
1 parent 9df17c7 commit 8b09872
Show file tree
Hide file tree
Showing 25 changed files with 136 additions and 92 deletions.
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import torch
torch.set_num_threads(1)

Expand All @@ -11,6 +12,7 @@
from anonymization.modules.tts.IMSToucan.TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.EnergyCalculator import EnergyCalculator
from anonymization.modules.tts.IMSToucan.TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.PitchCalculator import Parselmouth

logger = logging.getLogger(__name__)

class ImsProsodyExtractor:

Expand Down Expand Up @@ -54,7 +56,7 @@ def extract_prosody(self,
try:
norm_wave = self.ap.audio_to_wave_tensor(normalize=True, audio=wave)
except ValueError:
print('Something went wrong, the reference wave might be too short.')
logger.error('Something went wrong, the reference wave might be too short.')
raise RuntimeError

with torch.inference_mode():
Expand Down
10 changes: 6 additions & 4 deletions anonymization/modules/prosody/prosody_extraction.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import torch
torch.set_num_threads(1)

Expand All @@ -8,6 +9,7 @@
from .extraction import *
from utils import read_kaldi_format

logger = logging.getLogger(__name__)

class ProsodyExtraction:

Expand Down Expand Up @@ -47,7 +49,7 @@ def extract_prosody(self, dataset_path: Path, texts, dataset_name=None):
wav_scp = {utt: wav_scp[utt] for utt in unprocessed_utts}

if wav_scp:
print(f'Extract prosody for {len(wav_scp)} of {len(wav_scp) + len(data_prosody)} utterances')
logger.info(f'Extract prosody for {len(wav_scp)} of {len(wav_scp) + len(data_prosody)} utterances')
data_prosody.new = True
i = 0
for utt, wav_path in tqdm(wav_scp.items()):
Expand All @@ -56,7 +58,7 @@ def extract_prosody(self, dataset_path: Path, texts, dataset_name=None):
utt_prosody = self.extractor.extract_prosody(transcript=text, ref_audio_path=wav_path,
input_is_phones=text_is_phones)
except IndexError:
print(f'Index Error for {utt}')
logger.warn(f'IndexError for {utt}')
continue
duration, pitch, energy, start_silence, end_silence = utt_prosody
data_prosody.add_instance(utterance=utt, duration=duration, pitch=pitch, energy=energy,
Expand All @@ -69,8 +71,8 @@ def extract_prosody(self, dataset_path: Path, texts, dataset_name=None):
data_prosody.save_prosody(dataset_results_dir)

elif len(data_prosody.utterances) > 0:
print('No prosody extraction necessary; load stored values instead...')
logger.info('No prosody extraction necessary; load stored values instead...')
else:
print(f'No utterances could be found in {dataset_path}!')
logger.warn(f'No utterances could be found in {dataset_path}!')

return data_prosody
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
from pathlib import Path
import torch
import numpy as np
Expand All @@ -10,6 +11,7 @@
from ..speaker_embeddings import SpeakerEmbeddings
from .utils.WGAN import EmbeddingsGenerator

logger = logging.getLogger(__name__)

class GANAnonymizer(BaseAnonymizer):
"""
Expand Down Expand Up @@ -84,9 +86,9 @@ def anonymize_embeddings(
or 'utt' for utterance level).
"""
if emb_level == "spk":
print(f"Anonymize embeddings of {len(speaker_embeddings)} speakers...")
logger.info(f"Anonymize embeddings of {len(speaker_embeddings)} speakers...")
elif emb_level == "utt":
print(f"Anonymize embeddings of {len(speaker_embeddings)} utterances...")
logger.info(f"Anonymize embeddings of {len(speaker_embeddings)} utterances...")

identifiers = []
speakers = []
Expand Down Expand Up @@ -117,7 +119,7 @@ def anonymize_embeddings(
return anon_embeddings

def _generate_artificial_embeddings(self, gan_model_path: Path, n: int):
print(f"Generate {n} artificial speaker embeddings...")
logger.info(f"Generate {n} artificial speaker embeddings...")
generator = EmbeddingsGenerator(gan_path=gan_model_path, device=self.device)
gan_vectors = generator.generate_embeddings(n=n)
unused_indices = np.arange(len(gan_vectors))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
from pathlib import Path
import numpy as np
import torch
Expand All @@ -15,6 +16,8 @@
from ..speaker_embeddings import SpeakerEmbeddings
from utils import transform_path

logger = logging.getLogger(__name__)

REVERSED_GENDERS = {
"m": "f",
"f": "m"
Expand Down Expand Up @@ -144,7 +147,7 @@ def __init__(
self.stats_per_dim_path = stats_per_dim_path or Path()

def _load_pool_embeddings(self, pool_data_dir, pool_vec_path, embed_model_path):
print(pool_data_dir)
logger.debug(pool_data_dir)
if pool_vec_path.exists():
pool_embeddings = SpeakerEmbeddings(
vec_type=self.vec_type, emb_level="spk", device=self.device
Expand All @@ -168,7 +171,7 @@ def anonymize_embeddings(self, speaker_embeddings: torch.Tensor, emb_level: str
vectors_a=self.pool_embeddings.vectors, vectors_b=speaker_embeddings.vectors
)

print(f"Anonymize embeddings of {len(speaker_embeddings)} speakers...")
logging.info(f"Anonymize embeddings of {len(speaker_embeddings)} speakers...")
identifiers = []
speakers = []
anon_vectors = []
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import logging
from pathlib import Path
import torch
from os import PathLike
Expand All @@ -8,6 +9,7 @@
from .base_anon import BaseAnonymizer
from ..speaker_embeddings import SpeakerEmbeddings

logger = logging.getLogger(__name__)

class RandomAnonymizer(BaseAnonymizer):
"""
Expand Down Expand Up @@ -73,7 +75,7 @@ def anonymize_embeddings(self, speaker_embeddings, emb_level="spk"):
utterance level).
"""
if self.scaling_ranges:
print("Anonymize vectors in scale!")
logger.debug("Anonymize vectors in scale!")
return self._anonymize_data_in_scale(speaker_embeddings)
else:
identifiers = []
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# This code is based on the descriptions in https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/processing/PLDA_LDA.py
import logging
from pathlib import Path
from speechbrain.processing.PLDA_LDA import PLDA, StatObject_SB, Ndx, fast_PLDA_scoring
import numpy as np
import torch

logger = logging.getLogger(__name__)
class PLDAModel:

def __init__(self, train_embeddings, results_path: Path=None, save_plda=True):
Expand Down Expand Up @@ -64,13 +66,13 @@ def _train_plda(self, train_embeddings):
vectors = train_embeddings.vectors.to(torch.float64)

modelset = np.array([f'md{speaker}' for speaker in train_embeddings.original_speakers], dtype="|O")
print(len(modelset), len(set(modelset)))
logger.debug(len(modelset), len(set(modelset)))
segset, s, stat0 = self._get_vector_stats(vectors, sg_tag='sg', utt_ids=train_embeddings.get_utt_list())

xvectors_stat = StatObject_SB(modelset=modelset, segset=segset, start=s, stop=s, stat0=stat0,
stat1=vectors.cpu().numpy())

print(vectors.shape)
logger.debug(vectors.shape)

plda = PLDA(rank_f=100)
plda.plda(xvectors_stat)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import logging
from pathlib import Path

from .anonymization.base_anon import BaseAnonymizer
from .speaker_embeddings import SpeakerEmbeddings

logger = logging.getLogger(__name__)

class SpeakerAnonymization:

Expand Down Expand Up @@ -38,14 +40,14 @@ def anonymize_embeddings(self, speaker_embeddings, dataset_name):
self.force_compute:
# if there are already anonymized speaker embeddings from this model and the computation is not forced,
# simply load them
print('No computation of anonymized embeddings necessary; load existing anonymized speaker embeddings '
logger.info('No computation of anonymized embeddings necessary; load existing anonymized speaker embeddings '
'instead...')
anon_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level=self.emb_level, device=self.device)
anon_embeddings.load_vectors(dataset_results_dir)
return anon_embeddings
else:
# otherwise, create new anonymized speaker embeddings
print('Anonymize speaker embeddings...')
logger.info('Anonymize speaker embeddings...')
anon_embeddings = self.anonymizer.anonymize_embeddings(speaker_embeddings, emb_level=self.emb_level)

if self.save_intermediate:
Expand All @@ -58,5 +60,5 @@ def _load_anonymizer(self, settings: dict):
'The anonymizer must be an instance of BaseAnonymizer, or a ' \
f'subclass of it, but received an instance of {type(anon_method)}'

print(f'Model type of anonymizer: {type(anon_method).__name__}')
logger.info(f'Model type of anonymizer: {type(anon_method).__name__}')
return anon_method
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
from tqdm import tqdm
from pathlib import Path
import torch
Expand All @@ -14,7 +15,7 @@
from utils import read_kaldi_format

set_start_method('spawn', force=True)

logger = logging.getLogger(__name__)

class SpeakerExtraction:

Expand Down Expand Up @@ -57,12 +58,12 @@ def extract_speakers(self, dataset_path, dataset_name=None, emb_level=None):
speaker_embeddings = SpeakerEmbeddings(vec_type=self.vec_type, emb_level='utt', device=self.devices[0])

if (dataset_results_dir / 'speaker_vectors.pt').exists() and not self.force_compute:
print('No speaker extraction necessary; load existing embeddings instead...')
logger.info('No speaker extraction necessary; load existing embeddings instead...')
speaker_embeddings.load_vectors(dataset_results_dir)
# assume the loaded vectors are computed according to the setting in config
speaker_embeddings.emb_level = emb_level
else:
print(f'Extract embeddings of {len(wav_scp)} utterances')
logger.info(f'Extract embeddings of {len(wav_scp)} utterances')
speaker_embeddings.new = True

if self.n_processes > 1:
Expand Down Expand Up @@ -126,7 +127,7 @@ def extraction_job(data):
try:
spk_embs = [extractor.extract_vector(audio=norm_wave, sr=fs) for extractor in speaker_extractors]
except RuntimeError as e:
print(f'Runtime error: {utt}, {signal.shape}, {norm_wave.shape}')
logger.warn(f'Runtime error: {utt}, {signal.shape}, {norm_wave.shape}')
continue

if len(spk_embs) == 1:
Expand Down
11 changes: 6 additions & 5 deletions anonymization/modules/text/speech_recognition.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map
import time
import logging
from torch.multiprocessing import set_start_method
from itertools import cycle, repeat
import numpy as np
Expand All @@ -11,7 +12,7 @@
from utils import read_kaldi_format

set_start_method('spawn', force=True)

logger = logging.getLogger(__name__)

class SpeechRecognition:

Expand Down Expand Up @@ -49,13 +50,13 @@ def recognize_speech(self, dataset_path, dataset_name=None, utterance_list=None)
texts.load_text(in_dir=dataset_results_dir)

if len(texts) == len(utt2spk):
print('No speech recognition necessary; load existing text instead...')
logger.info('No speech recognition necessary; load existing text instead...')
else:
if len(texts) > 0:
print(f'No speech recognition necessary for {len(texts)} of {len(utt2spk)} utterances')
logger.info(f'No speech recognition necessary for {len(texts)} of {len(utt2spk)} utterances')
# otherwise, recognize the speech
dataset_results_dir.mkdir(exist_ok=True, parents=True)
print(f'Recognize speech of {len(utt2spk)} utterances...')
logger.info(f'Recognize speech of {len(utt2spk)} utterances...')
wav_scp = read_kaldi_format(dataset_path / 'wav.scp')

utterances = []
Expand Down Expand Up @@ -86,7 +87,7 @@ def recognize_speech(self, dataset_path, dataset_name=None, utterance_list=None)

end = time.time()
total_time = round(end - start, 2)
print(f'Total time for speech recognition: {total_time} seconds ({round(total_time / 60, 2)} minutes / '
logger.info(f'Total time for speech recognition: {total_time} seconds ({round(total_time / 60, 2)} minutes / '
f'{round(total_time / 60 / 60, 2)} hours)')
texts = self._combine_texts(main_text_instance=texts, additional_text_instances=new_texts)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import itertools
import os
import logging

import librosa.display as lbd
import matplotlib.pyplot as plt
Expand All @@ -15,6 +16,7 @@
from ..Preprocessing.TextFrontend import get_language_id
from ..TrainingInterfaces.Spectrogram_to_Embedding.StyleEmbedding import StyleEmbedding

logger = logging.getLogger(__name__)

class AnonFastSpeech2(torch.nn.Module):

Expand Down Expand Up @@ -174,7 +176,7 @@ def read_to_file(self,
for (text, durations, pitch, energy) in itertools.zip_longest(text_list, dur_list, pitch_list, energy_list):
if text.strip() != "":
if not silent:
print("Now synthesizing: {}".format(text))
logger.info("Now synthesizing: {}".format(text))
if wav is None:
if durations is not None:
durations = durations.to(self.device)
Expand Down
7 changes: 4 additions & 3 deletions anonymization/modules/tts/IMSToucan/Utility/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

import os
from abc import ABC

import logging
import torch

logger = logging.getLogger(__name__)

def cumsum_durations(durations):
out = [0]
Expand Down Expand Up @@ -39,11 +40,11 @@ def get_most_recent_checkpoint(checkpoint_dir, verbose=True):
if el.endswith(".pt") and el != "best.pt":
checkpoint_list.append(int(el.split(".")[0].split("_")[1]))
if len(checkpoint_list) == 0:
print("No previous checkpoints found, cannot reload.")
logger.info("No previous checkpoints found, cannot reload.")
return None
checkpoint_list.sort(reverse=True)
if verbose:
print("Reloading checkpoint_{}.pt".format(checkpoint_list[0]))
logger.info("Reloading checkpoint_{}.pt".format(checkpoint_list[0]))
return os.path.join(checkpoint_dir, "checkpoint_{}.pt".format(checkpoint_list[0]))


Expand Down
4 changes: 3 additions & 1 deletion anonymization/modules/tts/IMSToucan/UtteranceCloner.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import soundfile as sf
import torch
from torch.optim import SGD
Expand All @@ -10,6 +11,7 @@
from .TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.EnergyCalculator import EnergyCalculator
from .TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.PitchCalculator import Parselmouth

logger = logging.getLogger(__name__)

class UtteranceCloner:

Expand Down Expand Up @@ -59,7 +61,7 @@ def extract_prosody(self,
try:
norm_wave = self.ap.audio_to_wave_tensor(normalize=True, audio=wave)
except ValueError:
print('Something went wrong, the reference wave might be too short.')
logger.error('Something went wrong, the reference wave might be too short.')
raise RuntimeError

with torch.inference_mode():
Expand Down
4 changes: 3 additions & 1 deletion anonymization/modules/tts/ims_tts.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import torch
import resampy
import logging

from .IMSToucan.InferenceInterfaces.AnonFastSpeech2 import AnonFastSpeech2

logger = logging.getLogger(__name__)

class ImsTTS:

Expand Down Expand Up @@ -36,7 +38,7 @@ def read_text(self, text, speaker_embedding, text_is_phones=True, duration=None,
if i > 30:
break
if i > 0:
print(f'Synthesized utt in {i} takes')
logger.info(f'Synthesized utt in {i} takes')

# start and end silence are computed for a sampling rate of 16000, so we have to rescale them for a different output sr
factor = self.output_sr // 16000
Expand Down
Loading

0 comments on commit 8b09872

Please sign in to comment.