Merge pull request #3 from eternalliving/RemoteEngine

Created Remote engine for Glados TTS Former-commit-id: c645589
R2D2FISH · Mar 21, 2022 · 02e79b2 · 02e79b2
2 parents 4be29fe + 7f22a9c
commit 02e79b2
Show file tree

Hide file tree

Showing 4 changed files with 116 additions and 43 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 **/__pycache__/**
 *.wav
+audio/*
diff --git a/README.md b/README.md
@@ -6,6 +6,18 @@ If you want to just play around with the TTS, this works as stand-alone.
 python3 glados-tts/glados.py
 ```
 
+The TTS engine can also be run remotely on a machine more powerful than the Pi to process TTS in-house (run this from the glados-tts directory):
+```console
+python3 engine-remote.py
+```
+
+The default port is 8124.
+Be sure to update the `TTS_ENGINE_API` variable in `settings.env` in your main Glados-voice-assistant directory:
+```
+TTS_ENGINE_API			= http://192.168.1.3:8124/synthesize/
+```
+
+
 ## Description
 The initial, regular Tacotron model was trained first on LJSpeech, and then on a heavily modified version of the Ellen McClain dataset (all non-Portal 2 voice lines removed, punctuation added).
 

diff --git a/engine.py b/engine.py
@@ -1,61 +1,121 @@
-# importing sys
 import sys
 import os
-
 sys.path.insert(0, os.getcwd()+'/glados_tts')
 
 import torch
 from utils.tools import prepare_text
 from scipy.io.wavfile import write
 import time
-
-sys.path.insert(0, os.getcwd()+'/glados_tts')
-
+
 print("\033[1;94mINFO:\033[;97m Initializing TTS Engine...")
 
 # Select the device
 if torch.is_vulkan_available():
-    device = 'vulkan'
+	device = 'vulkan'
 if torch.cuda.is_available():
-    device = 'cuda'
+	device = 'cuda'
 else:
-    device = 'cpu'
+	device = 'cpu'
 
 # Load models
-glados = torch.jit.load('glados_tts/models/glados.pt')
-vocoder = torch.jit.load('glados_tts/models/vocoder-gpu.pt', map_location=device)
+if __name__ == "__main__":
+	glados = torch.jit.load('models/glados.pt')
+	vocoder = torch.jit.load('models/vocoder-gpu.pt', map_location=device)
+else:
+	glados = torch.jit.load('glados_tts/models/glados.pt')
+	vocoder = torch.jit.load('glados_tts/models/vocoder-gpu.pt', map_location=device)
 
 # Prepare models in RAM
 for i in range(4):
-    init = glados.generate_jit(prepare_text(str(i)))
-    init_mel = init['mel_post'].to(device)
-    init_vo = vocoder(init_mel)
-
-
-def glados_tts(text):
-
-    # Tokenize, clean and phonemize input text
-    x = prepare_text(text).to('cpu')
-
-    with torch.no_grad():
-
-        # Generate generic TTS-output
-        old_time = time.time()
-        tts_output = glados.generate_jit(x)
-
-        # Use HiFiGAN as vocoder to make output sound like GLaDOS
-        mel = tts_output['mel_post'].to(device)
-        audio = vocoder(mel)
-        print("\033[1;94mINFO:\033[;97m The audio sample took " + str(round((time.time() - old_time) * 1000)) + " ms to generate.")
-
-        # Normalize audio to fit in wav-file
-        audio = audio.squeeze()
-        audio = audio * 32768.0
-        audio = audio.cpu().numpy().astype('int16')
-        output_file = ('output.wav')
-
-        # Write audio file to disk
-        # 22,05 kHz sample rate 
-        write(output_file, 22050, audio)
-
-    return True
+	init = glados.generate_jit(prepare_text(str(i)))
+	init_mel = init['mel_post'].to(device)
+	init_vo = vocoder(init_mel)
+
+
+def glados_tts(text, key=False):
+
+	# Tokenize, clean and phonemize input text
+	x = prepare_text(text).to('cpu')
+
+	with torch.no_grad():
+
+		# Generate generic TTS-output
+		old_time = time.time()
+		tts_output = glados.generate_jit(x)
+
+		# Use HiFiGAN as vocoder to make output sound like GLaDOS
+		mel = tts_output['mel_post'].to(device)
+		audio = vocoder(mel)
+		print("\033[1;94mINFO:\033[;97m The audio sample took " + str(round((time.time() - old_time) * 1000)) + " ms to generate.")
+
+		# Normalize audio to fit in wav-file
+		audio = audio.squeeze()
+		audio = audio * 32768.0
+		audio = audio.cpu().numpy().astype('int16')
+		if(key):
+			output_file = ('audio/GLaDOS-tts-temp-output-'+key+'.wav')
+		else:
+			output_file = ('audio/GLaDOS-tts-temp-output.wav')
+
+		# Write audio file to disk
+		# 22.05 kHz sample rate
+		write(output_file, 22050, audio)
+
+	return True
+
+
+# If the script is run directly, assume remote engine
+if __name__ == "__main__":
+
+	# Remote Engine Variables
+	PORT = 8124
+	CACHE = True
+
+	from flask import Flask, request, send_file
+	import urllib.parse
+	import shutil
+
+	print("\033[1;94mINFO:\033[;97m Initializing TTS Server...")
+
+	app = Flask(__name__)
+
+	@app.route('/synthesize/', defaults={'text': ''})
+	@app.route('/synthesize/<path:text>')
+	def synthesize(text):
+		if(text == ''): return 'No input'
+
+		line = urllib.parse.unquote(request.url[request.url.find('synthesize/')+11:])
+		filename = "GLaDOS-tts-"+line.replace(" ", "-")
+		filename = filename.replace("!", "")
+		filename = filename.replace("°c", "degrees celcius")
+		filename = filename.replace(",", "")+".wav"
+		file = os.getcwd()+'/audio/'+filename
+
+		# Check for Local Cache
+		if(os.path.isfile(file)):
+
+			# Update access time. This will allow for routine cleanups
+			os.utime(file, None)
+			print("\033[1;94mINFO:\033[;97m The audio sample sent from cache.")
+			return send_file(file)
+
+		# Generate New Sample
+		key = str(time.time())[7:]
+		if(glados_tts(line, key)):
+			tempfile = os.getcwd()+'/audio/GLaDOS-tts-temp-output-'+key+'.wav'
+
+			# If the line isn't too long, store in cache
+			if(len(line) < 200 and CACHE):
+				shutil.move(tempfile, file)
+			else:
+				return send_file(tempfile)
+				os.remove(tempfile)
+
+			return send_file(file)
+
+		else:
+			return 'TTS Engine Failed'
+
+	cli = sys.modules['flask.cli']
+	cli.show_server_banner = lambda *x: None
+	app.run(host="0.0.0.0", port=PORT)
diff --git a/utils/tools.py b/utils/tools.py
@@ -8,4 +8,4 @@ def prepare_text(text: str)->str:
         text = text + '.'
     cleaner = Cleaner('english_cleaners', True, 'en-us')
     tokenizer = Tokenizer()
-    return torch.as_tensor(tokenizer(cleaner(text)), dtype=torch.int, device='cpu').unsqueeze(0)
+    return torch.as_tensor(tokenizer(cleaner(text)), dtype=torch.int, device='cpu').unsqueeze(0)