Previous questions have presented this same or similar inquiry, but none appear to use window.speechSynthesis(), though workarounds exist using espeak or meSpeak (see How to create or convert text to audio at chromium browser?) or by making requests to an external server.

How to capture and record the audio output of a window.speechSynthesis.speak() call and return the result as a Blob, ArrayBuffer, AudioBuffer, or another object type?
The Web Speech API specification does not currently provide a means, or even a hint, for returning or for capturing and recording the audio output of a window.speechSynthesis.speak() call.
See also Re: MediaStream, ArrayBuffer, Blob audio result from speak() for recording?. In pertinent part, use cases include, but are not limited to:

Persons who have issues speaking, i.e., who have suffered a stroke or another communication-inhibiting affliction. They could convert text to an audio file and send the file to another individual or group. This feature would help them communicate with other persons, similar to the technologies that assisted Stephen Hawking;

Currently, the only person who can hear the audio output is the person in front of the browser; in essence, the full potential of the text-to-speech functionality goes unused. The audio result could be used as an attachment within an email, a media stream, a chat system, or another communication application; that is, it provides control over the generated audio output;

Another application would be to provide a free, libre, open-source audio dictionary and translation service: client to client, client to server, server to client.
It is possible to capture the audio output of a window.speechSynthesis.speak() call using navigator.mediaDevices.getUserMedia() and MediaRecorder(). Chromium browsers return the expected result; the Firefox implementation has issues. Select Monitor of Built-in Audio Analog Stereo at the navigator.mediaDevices.getUserMedia() prompt.

The workaround is cumbersome. We should be able to get the generated audio, at least as a Blob, without navigator.mediaDevices.getUserMedia() and MediaRecorder().
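At its core the workaround is: capture the system monitor device with getUserMedia(), record it with MediaRecorder() while speak() plays, and collect the recorded chunks into a Blob. A minimal sketch of that idea, assuming Monitor of Built-in Audio Analog Stereo is selected at the prompt; the proof of concept below wraps these same steps in a class:

// Minimal sketch: record speechSynthesis.speak() output via the monitor device
navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
  const recorder = new MediaRecorder(stream);
  const chunks = [];
  recorder.ondataavailable = e => chunks.push(e.data);
  recorder.onstop = () => {
    // stop capture and expose the recording as a single Blob
    stream.getTracks().forEach(track => track.stop());
    console.log("recorded", new Blob(chunks, { type: recorder.mimeType }));
  };
  const utterance = new SpeechSynthesisUtterance("hello world");
  utterance.onend = () => recorder.stop();
  recorder.start();
  window.speechSynthesis.speak(utterance);
});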
More interest from browser users, JavaScript and C++ developers, browser implementers, and specification authors is evidently necessary for further input; to create a proper specification for the feature and a consistent implementation in browsers' source code; see How to implement option to return Blob, ArrayBuffer, or AudioBuffer from window.speechSynthesis.speak() call.
At Chromium, a speech dispatcher program should be installed and the instance launched with the --enable-speech-dispatcher flag set, as window.speechSynthesis.getVoices() otherwise returns an empty array; see How to use Web Speech API at chromium?.
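A quick way to verify that a speech backend is actually available, using only the standard API, is to check how many voices are reported:

// Logs 0 if no speech backend is available, e.g., at Chromium on Linux
// without speech-dispatcher installed and --enable-speech-dispatcher set
window.speechSynthesis.onvoiceschanged = () =>
  console.log(window.speechSynthesis.getVoices().length);
console.log(window.speechSynthesis.getVoices().length);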
Proof of concept
// SpeechSynthesisRecorder.js guest271314 6-17-2017
// Motivation: Get audio output from `window.speechSynthesis.speak()` call
// as `ArrayBuffer`, `AudioBuffer`, `Blob`, `MediaSource`, `MediaStream`, `ReadableStream`, or other object or data types
// See https://lists.w3.org/Archives/Public/public-speech-api/2017Jun/0000.html
// https://github.com/guest271314/SpeechSynthesisRecorder
// Configuration: Analog Stereo Duplex
// Input Devices: Monitor of Built-in Audio Analog Stereo, Built-in Audio Analog Stereo
class SpeechSynthesisRecorder {
constructor({text = "", utteranceOptions = {}, recorderOptions = {}, dataType = ""}) {
if (text === "") throw new Error("no words to synthesize");
this.dataType = dataType;
this.text = text;
this.mimeType = MediaRecorder.isTypeSupported("audio/webm; codecs=opus")
? "audio/webm; codecs=opus" : "audio/ogg; codecs=opus";
this.utterance = new SpeechSynthesisUtterance(this.text);
this.speechSynthesis = window.speechSynthesis;
this.mediaStream_ = new MediaStream();
this.mediaSource_ = new MediaSource();
this.mediaRecorder = new MediaRecorder(this.mediaStream_, {
mimeType: this.mimeType,
bitsPerSecond: 256 * 8 * 1024
});
this.audioContext = new AudioContext();
this.audioNode = new Audio();
this.chunks = [];
if (utteranceOptions) {
if (utteranceOptions.voice) {
this.speechSynthesis.onvoiceschanged = e => {
const voice = this.speechSynthesis.getVoices().find(({
name: _name
}) => _name === utteranceOptions.voice);
this.utterance.voice = voice;
console.log(voice, this.utterance);
}
this.speechSynthesis.getVoices();
}
let {
lang, rate, pitch
} = utteranceOptions;
Object.assign(this.utterance, {
lang, rate, pitch
});
}
this.audioNode.controls = true;
document.body.appendChild(this.audioNode);
}
start(text = "") {
if (text) this.text = text;
if (this.text === "") throw new Error("no words to synthesize");
return navigator.mediaDevices.getUserMedia({
audio: true
})
.then(stream => new Promise(resolve => {
const track = stream.getAudioTracks()[0];
this.mediaStream_.addTrack(track);
// return the current `MediaStream`
if (this.dataType && this.dataType === "mediaStream") {
resolve({tts:this, data:this.mediaStream_});
};
this.mediaRecorder.ondataavailable = event => {
if (event.data.size > 0) {
this.chunks.push(event.data);
};
};
this.mediaRecorder.onstop = () => {
track.stop();
this.mediaStream_.getAudioTracks()[0].stop();
this.mediaStream_.removeTrack(track);
console.log(`Completed recording ${this.utterance.text}`, this.chunks);
resolve(this);
}
this.mediaRecorder.start();
this.utterance.onstart = () => {
console.log(`Starting recording SpeechSynthesisUtterance ${this.utterance.text}`);
}
this.utterance.onend = () => {
this.mediaRecorder.stop();
console.log(`Ending recording SpeechSynthesisUtterance ${this.utterance.text}`);
}
this.speechSynthesis.speak(this.utterance);
}));
}
blob() {
if (!this.chunks.length) throw new Error("no data to return");
return Promise.resolve({
tts: this,
data: this.chunks.length === 1 ? this.chunks[0] : new Blob(this.chunks, {
type: this.mimeType
})
});
}
arrayBuffer(blob) {
if (!this.chunks.length) throw new Error("no data to return");
return new Promise(resolve => {
const reader = new FileReader;
reader.onload = e => resolve(({
tts: this,
data: reader.result
}));
reader.readAsArrayBuffer(blob ? new Blob([blob], {
type: blob.type
}) : this.chunks.length === 1 ? this.chunks[0] : new Blob(this.chunks, {
type: this.mimeType
}));
});
}
audioBuffer() {
if (!this.chunks.length) throw new Error("no data to return");
return this.arrayBuffer()
.then(({data}) => this.audioContext.decodeAudioData(data)) // `arrayBuffer()` resolves with `{tts, data}`
.then(buffer => ({
tts: this,
data: buffer
}))
}
mediaSource() {
if (!this.chunks.length) throw new Error("no data to return");
return this.arrayBuffer()
.then(({
data: ab
}) => new Promise((resolve, reject) => {
this.mediaSource_.onsourceended = () => resolve({
tts: this,
data: this.mediaSource_
});
this.mediaSource_.onsourceopen = () => {
if (MediaSource.isTypeSupported(this.mimeType)) {
const sourceBuffer = this.mediaSource_.addSourceBuffer(this.mimeType);
sourceBuffer.mode = "sequence"
sourceBuffer.onupdateend = () =>
this.mediaSource_.endOfStream();
sourceBuffer.appendBuffer(ab);
} else {
reject(`${this.mimeType} is not supported`)
}
}
this.audioNode.src = URL.createObjectURL(this.mediaSource_);
}));
}
readableStream({size = 1024, controllerOptions, rsOptions = {}} = {}) {
if (!this.chunks.length) throw new Error("no data to return");
const src = this.chunks.slice(0);
const chunk = size;
return Promise.resolve({
tts: this,
data: new ReadableStream(controllerOptions || {
start(controller) {
console.log(src.length);
controller.enqueue(src.splice(0, chunk))
},
pull(controller) {
// close once all chunks have been enqueued
if (src.length === 0) {
controller.close();
return;
}
controller.enqueue(src.splice(0, chunk));
}
}, rsOptions)
});
}
}
Usage
let ttsRecorder = new SpeechSynthesisRecorder({
text: "The revolution will not be televised",
utteranceOptions: {
voice: "english-us espeak",
lang: "en-US",
pitch: .75,
rate: 1
}
});
// ArrayBuffer
ttsRecorder.start()
// `tts` : `SpeechSynthesisRecorder` instance, `data` : audio as `dataType` or method call result
.then(tts => tts.arrayBuffer())
.then(({tts, data}) => {
// do stuff with `ArrayBuffer`, `AudioBuffer`, `Blob`,
// `MediaSource`, `MediaStream`, `ReadableStream`
// `data` : `ArrayBuffer`
tts.audioNode.src = URL.createObjectURL(new Blob([data], {type:tts.mimeType}));
tts.audioNode.title = tts.utterance.text;
tts.audioNode.onloadedmetadata = () => {
console.log(tts.audioNode.duration);
tts.audioNode.play();
}
})
// AudioBuffer
ttsRecorder.start()
.then(tts => tts.audioBuffer())
.then(({tts, data}) => {
// `data` : `AudioBuffer`
let source = tts.audioContext.createBufferSource();
source.buffer = data;
source.connect(tts.audioContext.destination);
source.start()
})
// Blob
ttsRecorder.start()
.then(tts => tts.blob())
.then(({tts, data}) => {
// `data` : `Blob`
tts.audioNode.src = URL.createObjectURL(data);
tts.audioNode.title = tts.utterance.text;
tts.audioNode.onloadedmetadata = () => {
console.log(tts.audioNode.duration);
tts.audioNode.play();
}
})
// ReadableStream
ttsRecorder.start()
.then(tts => tts.readableStream())
.then(({tts, data}) => {
// `data` : `ReadableStream`
console.log(tts, data);
data.getReader().read().then(({value, done}) => {
tts.audioNode.src = URL.createObjectURL(value[0]);
tts.audioNode.title = tts.utterance.text;
tts.audioNode.onloadedmetadata = () => {
console.log(tts.audioNode.duration);
tts.audioNode.play();
}
})
})
// MediaSource
ttsRecorder.start()
.then(tts => tts.mediaSource())
.then(({tts, data}) => {
console.log(tts, data);
// `data` : `MediaSource`
// `mediaSource()` has already set `tts.audioNode.src` to an object URL of `data`
tts.audioNode.title = tts.utterance.text;
tts.audioNode.onloadedmetadata = () => {
console.log(tts.audioNode.duration);
tts.audioNode.play();
}
})
// MediaStream
ttsRecorder = new SpeechSynthesisRecorder({
text: "The revolution will not be televised",
utteranceOptions: {
voice: "english-us espeak",
lang: "en-US",
pitch: .75,
rate: 1
},
dataType:"mediaStream"
});
ttsRecorder.start()
.then(({tts, data}) => {
// `data` : `MediaStream`
// do stuff with active `MediaStream`
})
.catch(err => console.log(err))
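One possible way to "do stuff" with the active MediaStream, sketched here as an assumption rather than part of the answer above, is to route it live through the Web Audio API:

// Sketch: play the captured stream live through the recorder's AudioContext
ttsRecorder.start()
.then(({tts, data}) => {
const source = tts.audioContext.createMediaStreamSource(data);
source.connect(tts.audioContext.destination);
})
.catch(err => console.log(err));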
Here is updated code from the previous answer that works at Chrome 96:
<script>
(async () => {
const text = "The revolution will not be televised";
const blob = await new Promise(async (resolve, reject) => {
console.log("picking system audio");
const stream = await navigator.mediaDevices.getDisplayMedia({video:true, audio:true});
const track = stream.getAudioTracks()[0];
if (!track)
return reject(new Error("System audio not available"));
stream.getVideoTracks().forEach(track => track.stop());
const mediaStream = new MediaStream();
mediaStream.addTrack(track);
const chunks = [];
const mediaRecorder = new MediaRecorder(mediaStream, {bitsPerSecond:128000});
mediaRecorder.ondataavailable = event => {
if (event.data.size > 0)
chunks.push(event.data);
}
mediaRecorder.onstop = () => {
stream.getTracks().forEach(track => track.stop());
mediaStream.removeTrack(track);
resolve(new Blob(chunks));
}
mediaRecorder.start();
const utterance = new SpeechSynthesisUtterance(text);
utterance.onend = () => mediaRecorder.stop();
window.speechSynthesis.speak(utterance);
console.log("speaking...");
});
console.log("audio available", blob);
const player = new Audio();
player.src = URL.createObjectURL(blob);
player.autoplay = true;
player.controls = true;
})()
</script>
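The question also asks for ArrayBuffer and AudioBuffer results. Both can be derived from the recorded Blob with standard APIs; a short sketch (blobToBuffers is a hypothetical helper name, not part of the answer above):

// Hypothetical helper: derive the other requested types from the recorded Blob
async function blobToBuffers(blob) {
  const arrayBuffer = await blob.arrayBuffer(); // `ArrayBuffer`
  const audioContext = new AudioContext();
  // decodeAudioData() detaches the buffer passed to it, so hand it a copy
  const audioBuffer = await audioContext.decodeAudioData(arrayBuffer.slice(0)); // `AudioBuffer`
  return { arrayBuffer, audioBuffer };
}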