diff --git a/.gitignore b/.gitignore
index ae2c404..0bba77b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 **/__pycache__/**
 *.wav
+audio/*
\ No newline at end of file
diff --git a/README.md b/README.md
index ddc6cbe..fd71608 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,18 @@ If you want to just play around with the TTS, this works as stand-alone.
 python3 glados-tts/glados.py
 ```
 
+The TTS Engine can also be used remotely on a machine more powerful than the Pi to process in-house TTS (executed from the glados-tts directory):
+```console
+python3 engine-remote.py
+```
+
+The default port is 8124.
+Be sure to update the settings.env variable in your main Glados-voice-assistant directory:
+```
+TTS_ENGINE_API = http://192.168.1.3:8124/synthesize/
+```
+
+
 ## Description
 
 The initial, regular Tacotron model was trained first on LJSpeech, and then on a heavily modified version of the Ellen McClain dataset (all non-Portal 2 voice lines removed, punctuation added).
diff --git a/engine.py b/engine.py
index d06721f..1680f6c 100644
--- a/engine.py
+++ b/engine.py
@@ -1,61 +1,121 @@
-# importing sys
 import sys
 import os
-
 sys.path.insert(0, os.getcwd()+'/glados_tts')
 import torch
 from utils.tools import prepare_text
 from scipy.io.wavfile import write
 import time
-
-sys.path.insert(0, os.getcwd()+'/glados_tts')
-
+
 print("\033[1;94mINFO:\033[;97m Initializing TTS Engine...")
 
 
 # Select the device
 if torch.is_vulkan_available():
-	device = 'vulkan'
+    device = 'vulkan'
 if torch.cuda.is_available():
-	device = 'cuda'
+    device = 'cuda'
 else:
-	device = 'cpu'
+    device = 'cpu'
 
 # Load models
-glados = torch.jit.load('glados_tts/models/glados.pt')
-vocoder = torch.jit.load('glados_tts/models/vocoder-gpu.pt', map_location=device)
+if __name__ == "__main__":
+    glados = torch.jit.load('models/glados.pt')
+    vocoder = torch.jit.load('models/vocoder-gpu.pt', map_location=device)
+else:
+    glados = torch.jit.load('glados_tts/models/glados.pt')
+    vocoder = torch.jit.load('glados_tts/models/vocoder-gpu.pt', map_location=device)
 
 # Prepare models in RAM
 for i in range(4):
-	init = glados.generate_jit(prepare_text(str(i)))
-	init_mel = init['mel_post'].to(device)
-	init_vo = vocoder(init_mel)
-
-
-def glados_tts(text):
-
-	# Tokenize, clean and phonemize input text
-	x = prepare_text(text).to('cpu')
-
-	with torch.no_grad():
-
-		# Generate generic TTS-output
-		old_time = time.time()
-		tts_output = glados.generate_jit(x)
-
-		# Use HiFiGAN as vocoder to make output sound like GLaDOS
-		mel = tts_output['mel_post'].to(device)
-		audio = vocoder(mel)
-		print("\033[1;94mINFO:\033[;97m The audio sample took " + str(round((time.time() - old_time) * 1000)) + " ms to generate.")
-
-		# Normalize audio to fit in wav-file
-		audio = audio.squeeze()
-		audio = audio * 32768.0
-		audio = audio.cpu().numpy().astype('int16')
-		output_file = ('output.wav')
-
-		# Write audio file to disk
-		# 22,05 kHz sample rate
-		write(output_file, 22050, audio)
-
-		return True
\ No newline at end of file
+    init = glados.generate_jit(prepare_text(str(i)))
+    init_mel = init['mel_post'].to(device)
+    init_vo = vocoder(init_mel)
+
+
+def glados_tts(text, key=False):
+
+    # Tokenize, clean and phonemize input text
+    x = prepare_text(text).to('cpu')
+
+    with torch.no_grad():
+
+        # Generate generic TTS-output
+        old_time = time.time()
+        tts_output = glados.generate_jit(x)
+
+        # Use HiFiGAN as vocoder to make output sound like GLaDOS
+        mel = tts_output['mel_post'].to(device)
+        audio = vocoder(mel)
+        print("\033[1;94mINFO:\033[;97m The audio sample took " + str(round((time.time() - old_time) * 1000)) + " ms to generate.")
+
+        # Normalize audio to fit in wav-file
+        audio = audio.squeeze()
+        audio = audio * 32768.0
+        audio = audio.cpu().numpy().astype('int16')
+        if(key):
+            output_file = ('audio/GLaDOS-tts-temp-output-'+key+'.wav')
+        else:
+            output_file = ('audio/GLaDOS-tts-temp-output.wav')
+
+        # Write audio file to disk
+        # 22.05 kHz sample rate
+        write(output_file, 22050, audio)
+
+        return True
+
+
+# If the script is run directly, assume remote engine
+if __name__ == "__main__":
+
+    # Remote Engine Variables
+    PORT = 8124
+    CACHE = True
+
+    from flask import Flask, request, send_file
+    import urllib.parse
+    import shutil
+
+    print("\033[1;94mINFO:\033[;97m Initializing TTS Server...")
+
+    app = Flask(__name__)
+
+    @app.route('/synthesize/', defaults={'text': ''})
+    @app.route('/synthesize/<path:text>')
+    def synthesize(text):
+        if(text == ''): return 'No input'
+
+        line = urllib.parse.unquote(request.url[request.url.find('synthesize/')+11:])
+        filename = "GLaDOS-tts-"+line.replace(" ", "-")
+        filename = filename.replace("!", "")
+        filename = filename.replace("°c", "degrees celsius")
+        filename = filename.replace(",", "")+".wav"
+        file = os.getcwd()+'/audio/'+filename
+
+        # Check for Local Cache
+        if(os.path.isfile(file)):
+
+            # Update access time. This will allow for routine cleanups
+            os.utime(file, None)
+            print("\033[1;94mINFO:\033[;97m The audio sample sent from cache.")
+            return send_file(file)
+
+        # Generate New Sample
+        key = str(time.time())[7:]
+        if(glados_tts(line, key)):
+            tempfile = os.getcwd()+'/audio/GLaDOS-tts-temp-output-'+key+'.wav'
+
+            # If the line isn't too long, store in cache
+            if(len(line) < 200 and CACHE):
+                shutil.move(tempfile, file)
+            else:
+                return send_file(tempfile)
+                os.remove(tempfile)
+
+            return send_file(file)
+
+        else:
+            return 'TTS Engine Failed'
+
+    cli = sys.modules['flask.cli']
+    cli.show_server_banner = lambda *x: None
+    app.run(host="0.0.0.0", port=PORT)
\ No newline at end of file
diff --git a/utils/tools.py b/utils/tools.py
index f79bee0..32e01bd 100644
--- a/utils/tools.py
+++ b/utils/tools.py
@@ -8,4 +8,4 @@ def prepare_text(text: str)->str:
     text = text + '.'
     cleaner = Cleaner('english_cleaners', True, 'en-us')
     tokenizer = Tokenizer()
-    return torch.as_tensor(tokenizer(cleaner(text)), dtype=torch.int, device='cpu').unsqueeze(0)
+    return torch.as_tensor(tokenizer(cleaner(text)), dtype=torch.int, device='cpu').unsqueeze(0)
\ No newline at end of file
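For a quick end-to-end check of the remote endpoint this patch adds, the sketch below requests one line from `/synthesize/` and saves the returned WAV. It assumes the server is reachable at the `192.168.1.3:8124` address shown in the README example; the `fetch_tts` helper and the output filename are illustrative and not part of the patch.

```python
# Minimal client sketch for the remote TTS endpoint.
# Host and port are assumptions taken from the README example; adjust for your setup.
import urllib.parse
import urllib.request

TTS_ENGINE_API = "http://192.168.1.3:8124/synthesize/"

def fetch_tts(text: str, out_path: str = "glados-reply.wav") -> str:
    # The text travels in the URL path, so it must be percent-encoded.
    url = TTS_ENGINE_API + urllib.parse.quote(text)
    with urllib.request.urlopen(url) as response:
        audio = response.read()  # the server replies with a 22.05 kHz, 16-bit WAV file
    with open(out_path, "wb") as f:
        f.write(audio)
    return out_path

if __name__ == "__main__":
    print(fetch_tts("The cake is a lie."))
```

Requesting the same text twice should hit the server-side cache for lines under 200 characters (with `CACHE = True`), which shows up in the engine log as "The audio sample sent from cache."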