好的,这就是我最终决定的——而且效果很好。使用 ZeroMQ 进行消息传递,我具有获取 wav 播放时间的功能,另一个收集有关即将说出的语音的数据,然后在发送语音之前将所有数据发送到电机核心。运动核心处理时间问题以使下巴与语音同步。所以,我实际上并没有将生成 wav 并将 wav 播放时间的长度返回到最终使用它的节点上的代码,但事实证明消息传递足够快,所以有足够的时间空间来接收、处理和执行运动控制以完美匹配语音。在此处发布此内容,以防它对将来处理类似问题的人们有所帮助。
import time
import zmq
import os
import re
import wave
import contextlib
context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind("tcp://*:5555") #Listens for speech to output
print("Connecting to Motor Control")
jawCmd = context.socket(zmq.PUB)
jawCmd.connect("tcp://192.168.1.210:5554") #Sends to MotorFunctions for Jaw Movement
def getPlayTime(): # Checks to see if current file duration has changed
fname = '/tmp/speech.wav' # and if yes, sends new duration
with contextlib.closing(wave.open(fname,'r')) as f:
frames = f.getnframes()
rate = f.getframerate()
duration = round(frames / float(rate), 3)
speakTime = str(duration)
return speakTime
def set_voice(V,T):
T2 = '"' + T + '"'
audioFile = "/tmp/speech.wav" # /tmp set as tmpfs, or RAMDISK to reduce SD Card write ops
if V == "A":
voice = "Allison"
elif V == "B":
voice = "Belle"
elif V == "C":
voice = "Callie"
elif V == "D":
voice = "Dallas"
elif V == "V":
voice = "David"
else:
voice = "Belle"
os.system("swift -n " + voice + " -o " + audioFile + " " +T2) # Record audio
tailTrim = .5 # Calculate Jaw Timing
speakTime = eval(getPlayTime()) # Start by getting playlength
speakTime = round((speakTime - tailTrim), 2) # Chop .5 s for trailing silence
wordList = T.split()
jawString = []
for index in range(len(wordList)):
wordLen = len(wordList[index])
jawString.append(wordLen)
jawString = str(jawString)
speakTime = str(speakTime)
jawString = speakTime + "|" + jawString # 3.456|[4, 2, 7, 4, 2, 9, 3, 4, 3, 6] - will split on "|"
jawCmd.send_string(jawString) # Send Jaw Operating Sequence
os.system("aplay " + audioFile) # Play audio
pronunciationDict = {'teh':'the','process':'prawcess','Maeve':'Mayve','Mariposa':'May-reeposah','Lila':'Lala','Trump':'Ass hole'}
def adjustResponse(response): # Adjusts spellings in output string to create better speech output.
for key, value in pronunciationDict.items():
if key in response or key.lower() in response:
response = re.sub(key, value, response, flags=re.I)
return response
SpeakText="Speech center connected and online."
set_voice(V,SpeakText) # Cepstral Voices: A = Allison; B = Belle; C = Callie; D = Dallas; V = David;
while True:
SpeakText = socket.recv().decode('utf-8') # .decode gets rid of the b' in front of the string
SpeakTextX = adjustResponse(SpeakText) # Run the string through the pronunciation dictionary
print("SpeakText = ",SpeakTextX)
set_voice(V,SpeakTextX)
print("Received request: %s" % SpeakTextX)
socket.send_string(str(SpeakTextX)) # Send data back to source for confirmation