From 0b0d153196ce8728e848ec615fe09aa49b1a26ca Mon Sep 17 00:00:00 2001 From: eternalliving Date: Fri, 18 Mar 2022 20:23:57 -0700 Subject: [PATCH 1/6] Created Remote engine for Glados TTS Former-commit-id: f877ce93657a0aecb61b17b22f73c4262f4e9f74 --- .gitignore | 2 +- README.md | 12 ++++++++ engine-remote.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 engine-remote.py diff --git a/.gitignore b/.gitignore index ae2c404..ed52ba6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ **/__pycache__/** -*.wav +*.wav \ No newline at end of file diff --git a/README.md b/README.md index ddc6cbe..714d2bb 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,18 @@ If you want to just play around with the TTS, this works as stand-alone. python3 glados-tts/glados.py ``` +The TTS Engine can also be used remotely on a machine more powerful than the Pi to process in-house TTS (executed from the glados-tts directory): +```console +python3 engine-remote.py +``` + +Default port is 8124 +Be sure to update settings.env variable in your main GLados-voice-assistant directory: +``` +TTS_ENGINE_URL = http://192.168.1.3:8124/synthesize/ +``` + + ## Description The initial, regular Tacotron model was trained first on LJSpeech, and then on a heavily modified version of the Ellen McClain dataset (all non-Portal 2 voice lines removed, punctuation added). 
diff --git a/engine-remote.py b/engine-remote.py new file mode 100644 index 0000000..f8dd32c --- /dev/null +++ b/engine-remote.py @@ -0,0 +1,75 @@ + +from flask import Flask, request, send_file +import torch +from utils.tools import prepare_text +from scipy.io.wavfile import write +import time +import os +import urllib.parse + +app = Flask(__name__) + +@app.route('/synthesize/', defaults={'text': ''}) +@app.route('/synthesize/') +def synthesize(text): + text = request.url[request.url.find('synthesize/')+11:] + if(text == ''): return 'No input' + if(glados_tts(urllib.parse.unquote(text))): + return send_file(os.getcwd()+'/output.wav') + else: + return 'TTS Engine Failed' + +print("Initializing TTS Engine...") + +# Select the device +if torch.is_vulkan_available(): + device = 'vulkan' +if torch.cuda.is_available(): + device = 'cuda' +else: + device = 'cpu' + +# Load models +glados = torch.jit.load('models/glados.pt') +vocoder = torch.jit.load('models/vocoder-gpu.pt', map_location=device) + +# Prepare models in RAM +for i in range(4): + init = glados.generate_jit(prepare_text(str(i))) + init_mel = init['mel_post'].to(device) + init_vo = vocoder(init_mel) + + +def glados_tts(text): + + # Tokenize, clean and phonemize input text + x = prepare_text(text).to('cpu') + + with torch.no_grad(): + + # Generate generic TTS-output + old_time = time.time() + tts_output = glados.generate_jit(x) + print("Forward Tacotron took " + str((time.time() - old_time) * 1000) + "ms") + + # Use HiFiGAN as vocoder to make output sound like GLaDOS + old_time = time.time() + mel = tts_output['mel_post'].to(device) + audio = vocoder(mel) + print("HiFiGAN took " + str((time.time() - old_time) * 1000) + "ms") + + # Normalize audio to fit in wav-file + audio = audio.squeeze() + audio = audio * 32768.0 + audio = audio.cpu().numpy().astype('int16') + output_file = ('output.wav') + + # Write audio file to disk + # 22,05 kHz sample rate + write(output_file, 22050, audio) + + return True + 
+print("Initializing TTS Server...") +if __name__ == "__main__": + app.run(host="0.0.0.0", port=8124) \ No newline at end of file From 4ee1b8dd7bcea5330dbc1b9f3cd8706eb8541990 Mon Sep 17 00:00:00 2001 From: eternalliving Date: Sun, 20 Mar 2022 04:14:36 -0700 Subject: [PATCH 2/6] Updated TTS Engine to be in single script Former-commit-id: a3f85ac4be7be2496e0f85e2bc771252b0d12b88 --- .gitignore | 3 +- engine-remote.py | 75 ------------------------------------------------ engine.py | 74 +++++++++++++++++++++++++++++++++++++++-------- utils/tools.py | 9 ++++++ 4 files changed, 73 insertions(+), 88 deletions(-) delete mode 100644 engine-remote.py diff --git a/.gitignore b/.gitignore index ed52ba6..0bba77b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ **/__pycache__/** -*.wav \ No newline at end of file +*.wav +audio/* \ No newline at end of file diff --git a/engine-remote.py b/engine-remote.py deleted file mode 100644 index f8dd32c..0000000 --- a/engine-remote.py +++ /dev/null @@ -1,75 +0,0 @@ - -from flask import Flask, request, send_file -import torch -from utils.tools import prepare_text -from scipy.io.wavfile import write -import time -import os -import urllib.parse - -app = Flask(__name__) - -@app.route('/synthesize/', defaults={'text': ''}) -@app.route('/synthesize/') -def synthesize(text): - text = request.url[request.url.find('synthesize/')+11:] - if(text == ''): return 'No input' - if(glados_tts(urllib.parse.unquote(text))): - return send_file(os.getcwd()+'/output.wav') - else: - return 'TTS Engine Failed' - -print("Initializing TTS Engine...") - -# Select the device -if torch.is_vulkan_available(): - device = 'vulkan' -if torch.cuda.is_available(): - device = 'cuda' -else: - device = 'cpu' - -# Load models -glados = torch.jit.load('models/glados.pt') -vocoder = torch.jit.load('models/vocoder-gpu.pt', map_location=device) - -# Prepare models in RAM -for i in range(4): - init = glados.generate_jit(prepare_text(str(i))) - init_mel = 
init['mel_post'].to(device) - init_vo = vocoder(init_mel) - - -def glados_tts(text): - - # Tokenize, clean and phonemize input text - x = prepare_text(text).to('cpu') - - with torch.no_grad(): - - # Generate generic TTS-output - old_time = time.time() - tts_output = glados.generate_jit(x) - print("Forward Tacotron took " + str((time.time() - old_time) * 1000) + "ms") - - # Use HiFiGAN as vocoder to make output sound like GLaDOS - old_time = time.time() - mel = tts_output['mel_post'].to(device) - audio = vocoder(mel) - print("HiFiGAN took " + str((time.time() - old_time) * 1000) + "ms") - - # Normalize audio to fit in wav-file - audio = audio.squeeze() - audio = audio * 32768.0 - audio = audio.cpu().numpy().astype('int16') - output_file = ('output.wav') - - # Write audio file to disk - # 22,05 kHz sample rate - write(output_file, 22050, audio) - - return True - -print("Initializing TTS Server...") -if __name__ == "__main__": - app.run(host="0.0.0.0", port=8124) \ No newline at end of file diff --git a/engine.py b/engine.py index d06721f..287c32b 100644 --- a/engine.py +++ b/engine.py @@ -1,16 +1,12 @@ -# importing sys -import sys -import os - -sys.path.insert(0, os.getcwd()+'/glados_tts') - import torch from utils.tools import prepare_text from scipy.io.wavfile import write import time +import sys +import os sys.path.insert(0, os.getcwd()+'/glados_tts') - + print("\033[1;94mINFO:\033[;97m Initializing TTS Engine...") # Select the device @@ -22,8 +18,8 @@ device = 'cpu' # Load models -glados = torch.jit.load('glados_tts/models/glados.pt') -vocoder = torch.jit.load('glados_tts/models/vocoder-gpu.pt', map_location=device) +glados = torch.jit.load('models/glados.pt') +vocoder = torch.jit.load('models/vocoder-gpu.pt', map_location=device) # Prepare models in RAM for i in range(4): @@ -32,7 +28,7 @@ init_vo = vocoder(init_mel) -def glados_tts(text): +def glados_tts(text, key=False): # Tokenize, clean and phonemize input text x = prepare_text(text).to('cpu') @@ -52,10 
+48,64 @@ def glados_tts(text): audio = audio.squeeze() audio = audio * 32768.0 audio = audio.cpu().numpy().astype('int16') - output_file = ('output.wav') + if(key): + output_file = ('audio/GLaDOS-tts-temp-output-'+key+'.wav') + else: + output_file = ('audio/GLaDOS-tts-temp-output.wav') # Write audio file to disk # 22,05 kHz sample rate write(output_file, 22050, audio) - return True \ No newline at end of file + return True + + +# If the script is run directly, assume remote engine +if __name__ == "__main__": + + # Remote Engine Veritables + PORT = 8124 + CACHE = True + + from flask import Flask, request, send_file + from utils.tools import cleanTSSFile + import urllib.parse + + print("Initializing TTS Server...") + + app = Flask(__name__) + + @app.route('/synthesize/', defaults={'text': ''}) + @app.route('/synthesize/') + def synthesize(text): + if(text == ''): return 'No input' + + line = urllib.parse.unquote(request.url[request.url.find('synthesize/')+11:]) + file = os.getcwd()+'/audio/'+cleanTTSFile(line) + + # Check for Local Cache + if(os.path.isfile(file)): + + # Update access time. 
This will allow for routine cleanups + os.utime(file, None) + + return send_file(file) + + # Generate New Sample + key = time.time()[7:] + if(glados_tts(line, key)): + tempfile = os.getcwd()+'/audio/GLaDOS-tts-temp-output-'+key+'.wav' + return send_file(tempfile) + + # If the line isn't too long, store in cache + if(len(line) < 200): + shutil.move(tempfile, 'audio/'+file) + else: + os.remove(tempfile) + + else: + return 'TTS Engine Failed' + + cli = sys.modules['flask.cli'] + cli.show_server_banner = lambda *x: None + app.run(host="0.0.0.0", PORT) \ No newline at end of file diff --git a/utils/tools.py b/utils/tools.py index f79bee0..8c6c449 100644 --- a/utils/tools.py +++ b/utils/tools.py @@ -9,3 +9,12 @@ def prepare_text(text: str)->str: cleaner = Cleaner('english_cleaners', True, 'en-us') tokenizer = Tokenizer() return torch.as_tensor(tokenizer(cleaner(text)), dtype=torch.int, device='cpu').unsqueeze(0) + +# Check Local Cache +def cleanTTSFile(line): + filename = "GLaDOS-tts-"+line.replace(" ", "-") + filename = filename.replace("!", "") + filename = filename.replace("°c", "degrees celcius") + filename = filename.replace(",", "")+".wav" + + return filename \ No newline at end of file From b3a28a8f09ce43a2b0daff14eb2b5ca029e22e1c Mon Sep 17 00:00:00 2001 From: eternalliving Date: Sun, 20 Mar 2022 07:41:03 -0700 Subject: [PATCH 3/6] Fixing all my mistakes :) Former-commit-id: 1ffa7d7203f9019d09538751996086cc64a24259 --- engine.py | 65 ++++++++++++++++++++++++++------------------------ utils/tools.py | 11 +-------- 2 files changed, 35 insertions(+), 41 deletions(-) diff --git a/engine.py b/engine.py index 287c32b..87c5a23 100644 --- a/engine.py +++ b/engine.py @@ -11,11 +11,11 @@ # Select the device if torch.is_vulkan_available(): - device = 'vulkan' + device = 'vulkan' if torch.cuda.is_available(): - device = 'cuda' + device = 'cuda' else: - device = 'cpu' + device = 'cpu' # Load models glados = torch.jit.load('models/glados.pt') @@ -23,41 +23,41 @@ # Prepare 
models in RAM for i in range(4): - init = glados.generate_jit(prepare_text(str(i))) - init_mel = init['mel_post'].to(device) - init_vo = vocoder(init_mel) + init = glados.generate_jit(prepare_text(str(i))) + init_mel = init['mel_post'].to(device) + init_vo = vocoder(init_mel) def glados_tts(text, key=False): - # Tokenize, clean and phonemize input text - x = prepare_text(text).to('cpu') + # Tokenize, clean and phonemize input text + x = prepare_text(text).to('cpu') - with torch.no_grad(): + with torch.no_grad(): - # Generate generic TTS-output - old_time = time.time() - tts_output = glados.generate_jit(x) + # Generate generic TTS-output + old_time = time.time() + tts_output = glados.generate_jit(x) - # Use HiFiGAN as vocoder to make output sound like GLaDOS - mel = tts_output['mel_post'].to(device) - audio = vocoder(mel) - print("\033[1;94mINFO:\033[;97m The audio sample took " + str(round((time.time() - old_time) * 1000)) + " ms to generate.") + # Use HiFiGAN as vocoder to make output sound like GLaDOS + mel = tts_output['mel_post'].to(device) + audio = vocoder(mel) + print("\033[1;94mINFO:\033[;97m The audio sample took " + str(round((time.time() - old_time) * 1000)) + " ms to generate.") - # Normalize audio to fit in wav-file - audio = audio.squeeze() - audio = audio * 32768.0 - audio = audio.cpu().numpy().astype('int16') + # Normalize audio to fit in wav-file + audio = audio.squeeze() + audio = audio * 32768.0 + audio = audio.cpu().numpy().astype('int16') if(key): output_file = ('audio/GLaDOS-tts-temp-output-'+key+'.wav') else: output_file = ('audio/GLaDOS-tts-temp-output.wav') - # Write audio file to disk - # 22,05 kHz sample rate - write(output_file, 22050, audio) + # Write audio file to disk + # 22,05 kHz sample rate + write(output_file, 22050, audio) - return True + return True # If the script is run directly, assume remote engine @@ -68,10 +68,9 @@ def glados_tts(text, key=False): CACHE = True from flask import Flask, request, send_file - from utils.tools 
import cleanTSSFile import urllib.parse - print("Initializing TTS Server...") + print("\033[1;94mINFO:\033[;97m Initializing TTS Server...") app = Flask(__name__) @@ -81,7 +80,11 @@ def synthesize(text): if(text == ''): return 'No input' line = urllib.parse.unquote(request.url[request.url.find('synthesize/')+11:]) - file = os.getcwd()+'/audio/'+cleanTTSFile(line) + filename = "GLaDOS-tts-"+line.replace(" ", "-") + filename = filename.replace("!", "") + filename = filename.replace("°c", "degrees celcius") + filename = filename.replace(",", "")+".wav" + file = os.getcwd()+'/audio/'+filename # Check for Local Cache if(os.path.isfile(file)): @@ -92,14 +95,14 @@ def synthesize(text): return send_file(file) # Generate New Sample - key = time.time()[7:] + key = str(time.time())[7:] if(glados_tts(line, key)): tempfile = os.getcwd()+'/audio/GLaDOS-tts-temp-output-'+key+'.wav' return send_file(tempfile) # If the line isn't too long, store in cache - if(len(line) < 200): - shutil.move(tempfile, 'audio/'+file) + if(len(line) < 200 and CACHE): + shutil.move(tempfile, filename) else: os.remove(tempfile) @@ -108,4 +111,4 @@ def synthesize(text): cli = sys.modules['flask.cli'] cli.show_server_banner = lambda *x: None - app.run(host="0.0.0.0", PORT) \ No newline at end of file + app.run(host="0.0.0.0", port=PORT) \ No newline at end of file diff --git a/utils/tools.py b/utils/tools.py index 8c6c449..32e01bd 100644 --- a/utils/tools.py +++ b/utils/tools.py @@ -8,13 +8,4 @@ def prepare_text(text: str)->str: text = text + '.' 
cleaner = Cleaner('english_cleaners', True, 'en-us') tokenizer = Tokenizer() - return torch.as_tensor(tokenizer(cleaner(text)), dtype=torch.int, device='cpu').unsqueeze(0) - -# Check Local Cache -def cleanTTSFile(line): - filename = "GLaDOS-tts-"+line.replace(" ", "-") - filename = filename.replace("!", "") - filename = filename.replace("°c", "degrees celcius") - filename = filename.replace(",", "")+".wav" - - return filename \ No newline at end of file + return torch.as_tensor(tokenizer(cleaner(text)), dtype=torch.int, device='cpu').unsqueeze(0) \ No newline at end of file From c5dd8ef6bdefa60c1495182bf0dae5b2f3d2f8f6 Mon Sep 17 00:00:00 2001 From: eternalliving Date: Sun, 20 Mar 2022 07:44:36 -0700 Subject: [PATCH 4/6] fixed readme error Former-commit-id: 086e4011c089d079ae174a954a86f1a6d6737ac3 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 714d2bb..fd71608 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,9 @@ python3 engine-remote.py ``` Default port is 8124 -Be sure to update settings.env variable in your main GLados-voice-assistant directory: +Be sure to update settings.env variable in your main Glados-voice-assistant directory: ``` -TTS_ENGINE_URL = http://192.168.1.3:8124/synthesize/ +TTS_ENGINE_API = http://192.168.1.3:8124/synthesize/ ``` From 0aa6b5565297de9ab118f0b319909adbd611d2a5 Mon Sep 17 00:00:00 2001 From: eternalliving Date: Sun, 20 Mar 2022 08:22:14 -0700 Subject: [PATCH 5/6] Fixed model locations Former-commit-id: db2b8ebbcf0149fa4d9f9ade83f6dc8fd6844a51 --- engine.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/engine.py b/engine.py index 87c5a23..c222a85 100644 --- a/engine.py +++ b/engine.py @@ -1,11 +1,11 @@ +import sys +import os +sys.path.insert(0, os.getcwd()+'/glados_tts') + import torch from utils.tools import prepare_text from scipy.io.wavfile import write import time -import sys -import os - -sys.path.insert(0, 
os.getcwd()+'/glados_tts') print("\033[1;94mINFO:\033[;97m Initializing TTS Engine...") @@ -18,8 +18,12 @@ device = 'cpu' # Load models -glados = torch.jit.load('models/glados.pt') -vocoder = torch.jit.load('models/vocoder-gpu.pt', map_location=device) +if __name__ == "__main__": + glados = torch.jit.load('models/glados.pt') + vocoder = torch.jit.load('models/vocoder-gpu.pt', map_location=device) +else: + glados = torch.jit.load('glados_tts/models/glados.pt') + vocoder = torch.jit.load('glados_tts/models/vocoder-gpu.pt', map_location=device) # Prepare models in RAM for i in range(4): From 7f22a9c4c736ca28e4b285195942580d8870519b Mon Sep 17 00:00:00 2001 From: eternalliving Date: Sun, 20 Mar 2022 09:03:04 -0700 Subject: [PATCH 6/6] Fixed cache arrangement and all my other errors Former-commit-id: 285a9cf34a5f892c87e19c472ceef4c8d0bd5fcd --- engine.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/engine.py b/engine.py index c222a85..1680f6c 100644 --- a/engine.py +++ b/engine.py @@ -73,6 +73,7 @@ def glados_tts(text, key=False): from flask import Flask, request, send_file import urllib.parse + import shutil print("\033[1;94mINFO:\033[;97m Initializing TTS Server...") @@ -95,21 +96,23 @@ def synthesize(text): # Update access time. This will allow for routine cleanups os.utime(file, None) - + print("\033[1;94mINFO:\033[;97m The audio sample sent from cache.") return send_file(file) # Generate New Sample key = str(time.time())[7:] if(glados_tts(line, key)): tempfile = os.getcwd()+'/audio/GLaDOS-tts-temp-output-'+key+'.wav' - return send_file(tempfile) - + # If the line isn't too long, store in cache if(len(line) < 200 and CACHE): - shutil.move(tempfile, filename) + shutil.move(tempfile, file) else: + return send_file(tempfile) os.remove(tempfile) + return send_file(file) + else: return 'TTS Engine Failed'