我无法直接回答您的问题,但或许可以给您一些有用的建议。我最近搭建的一个网站里成功实现了 Transcribe 的 websocket 音频流。我用的是 VueJS,但整个过程应该非常相似。我没有使用 AWS Transcribe JavaScript SDK,而是参考了一篇 AWS 博客文章,以及文章中提供的 GitHub 示例代码来编写我的代码。
这两种资源对于让它发挥作用都至关重要。如果你克隆 git repo 并运行代码,如果我没记错的话,你应该有一个工作示例。直到今天,我还没有完全理解代码是如何工作的,因为我不了解音频,但它确实有效。
后来我对 GitHub 上的代码做了修改,整理成几个 JS 文件并加入到我的项目中。接着我需要计算 AWS Signature V4 签名,把它发送给 Transcribe API,然后它会返回一个可以用 JS 打开的 websocket 链接。发送到 Transcribe websocket 的数据来自所连接的麦克风,可以通过 MediaDevices.getUserMedia() 获取。上面提到的 GitHub 代码包含了把麦克风音频转换成 Transcribe 所需格式的文件,因为 Transcribe 只接受 8000 和 16000 的采样率,具体取决于您选择的语言。
理解 Transcribe 文档并找到我必须放在一起的所有部分很棘手,因为流式传输到 Transcribe 似乎有点边缘情况,但我希望我提到的资源会让你更容易一些。
编辑:添加源代码
获取转录 websocket 链接。
我把它部署在运行 Node.js 的 AWS Lambda 函数中,但您也可以把 exports.handler 里面的所有内容复制到普通的 JS 文件中。您需要 crypto-js、aws-sdk 和 moment 这几个 Node 模块!
//THIS SCRIPT IS BASED ON https://docs.aws.amazon.com/transcribe/latest/dg/websocket.html
const crypto = require('crypto-js');
const moment = require('moment');
const aws = require('aws-sdk');
// Deployment settings: replace every placeholder below before running the script.
// NOTE(review): these look like long-lived IAM credentials; avoid committing real values to source control.
const awsRegion = "!YOUR-REGION!"; // region used to build the Transcribe endpoint host and the signing scope
const accessKey = "!YOUR-IAM-ACCESS-KEY!"; // placed in the X-Amz-Credential query parameter
const secretAccessKey = "!YOUR-IAM-SECRET-KEY!"; // seed for the derived signing key
exports.handler = async (event) => {
console.log(event);
// let body = JSON.parse(event.body); I made a body object below for you to test with
let body = {
languageCode: "en-US", //or en-GB etc. I found en-US works better, even for British people due to the higher sample rate, which makes the audio clearer.
sampleRate: 16000
}
console.log(crypto.enc.Hex.stringify(signature_key));
let method = "GET"
let region = awsRegion;
let endpoint = "wss://transcribestreaming." + region + ".amazonaws.com:8443"
let host = "transcribestreaming." + region + ".amazonaws.com:8443"
let amz_date = new moment().format('yyyyMMDDTHHmmss') + 'Z';
let datestamp = new moment().format('yyyyMMDD');
let service = 'transcribe';
let linkExpirationSeconds = 60;
let signatureString = crypto.enc.Hex.stringify(signature_key);
let languageCode = body.languageCode;
let sampleRate = body.sampleRate
let canonical_uri = "/stream-transcription-websocket"
let canonical_headers = "host:" + host + "\n"
let signed_headers = "host"
let algorithm = "AWS4-HMAC-SHA256"
let credential_scope = datestamp + "%2F" + region + "%2F" + service + "%2F" + "aws4_request"
// Date and time of request - NOT url formatted
let credential_scope2 = datestamp + "/" + region + "/" + service + "/" + "aws4_request"
let canonical_querystring = "X-Amz-Algorithm=" + algorithm
canonical_querystring += "&X-Amz-Credential="+ accessKey + "%2F" + credential_scope
canonical_querystring += "&X-Amz-Date=" + amz_date
canonical_querystring += "&X-Amz-Expires=" + linkExpirationSeconds
canonical_querystring += "&X-Amz-SignedHeaders=" + signed_headers
canonical_querystring += "&language-code=" + languageCode + "&media-encoding=pcm&sample-rate=" + sampleRate
//Empty hash as playload is unknown
let emptyHash = crypto.SHA256("");
let payload_hash = crypto.enc.Hex.stringify(emptyHash);
let canonical_request = method + '\n'
+ canonical_uri + '\n'
+ canonical_querystring + '\n'
+ canonical_headers + '\n'
+ signed_headers + '\n'
+ payload_hash
let hashedCanonicalRequest = crypto.SHA256(canonical_request);
let string_to_sign = algorithm + "\n"
+ amz_date + "\n"
+ credential_scope2 + "\n"
+ crypto.enc.Hex.stringify(hashedCanonicalRequest);
//Create the signing key
let signing_key = getSignatureKey(secretAccessKey, datestamp, region, service);
//Sign the string_to_sign using the signing key
let inBytes = crypto.HmacSHA256(string_to_sign, signing_key);
let signature = crypto.enc.Hex.stringify(inBytes);
canonical_querystring += "&X-Amz-Signature=" + signature;
let request_url = endpoint + canonical_uri + "?" + canonical_querystring;
//The final product
console.log(request_url);
let response = {
statusCode: 200,
headers: {
"Access-Control-Allow-Origin": "*"
},
body: JSON.stringify(request_url)
};
return response;
};
/**
 * Derives the signing key by chaining HMAC-SHA256 over the date stamp,
 * region, service and the fixed "aws4_request" terminator, seeded with
 * "AWS4" + the secret key.
 *
 * @param {string} key - Secret access key.
 * @param {string} dateStamp - Date portion of the credential scope.
 * @param {string} regionName - Region portion of the credential scope.
 * @param {string} serviceName - Service portion of the credential scope.
 * @returns {*} Result of the final crypto.HmacSHA256 call.
 */
function getSignatureKey(key, dateStamp, regionName, serviceName) {
    const scopeParts = [dateStamp, regionName, serviceName, "aws4_request"];
    // Each step's output becomes the HMAC key for the next scope part.
    return scopeParts.reduce(
        (derivedKey, scopePart) => crypto.HmacSHA256(scopePart, derivedKey),
        "AWS4" + key
    );
}
用于打开 websocket、发送音频和接收响应的代码
安装 npm 模块:microphone-stream(不确定它是否仍然可用,但它在那个 GitHub 存储库的源代码中,我可能只是把它直接粘贴到了 node_modules 文件夹里)、@aws-sdk/util-utf8-node、@aws-sdk/eventstream-marshaller
import audioUtils from "../js/audioUtils.js"; //For encoding audio data as PCM
import mic from "microphone-stream"; //Collect microphone input as a stream of raw bytes
import * as util_utf8_node from "@aws-sdk/util-utf8-node"; //Utilities for encoding and decoding UTF8
import * as marshaller from "@aws-sdk/eventstream-marshaller"; //For converting binary event stream messages to and from JSON
let micStream; // microphone-stream instance that emits audio chunks
let mediaStream; // MediaStream returned by getUserMedia
let inputSampleRate; // The sample rate your mic is producing
let transcribeSampleRate = 16000; // The sample rate you requested from Transcribe
let transcribeLanguageCode = "en-US"; // The language you want Transcribe to use
let websocket;
// Held in a plain variable rather than on `this`: the helper function below is
// called without a receiver, so `this` would be undefined inside it.
let eventStreamMarshaller;

/**
 * Requests microphone access, opens the Transcribe websocket and wires up the
 * handlers that stream audio out and log transcription messages coming back.
 * Wrapped in a function so the early `return` on a denied microphone is legal.
 *
 * @returns {Promise<void>}
 */
async function startTranscription() {
    // first we get the microphone input from the browser (as a promise)...
    try {
        mediaStream = await window.navigator.mediaDevices.getUserMedia({
            video: false,
            audio: true
        });
    } catch (error) {
        console.log(error);
        alert("Error. Please make sure you allow this website to access your microphone");
        return;
    }

    eventStreamMarshaller = new marshaller.EventStreamMarshaller(util_utf8_node.toUtf8, util_utf8_node.fromUtf8);

    // let's get the mic input from the browser, via the microphone-stream module
    micStream = new mic();
    micStream.on("format", data => {
        inputSampleRate = data.sampleRate;
    });
    micStream.setStream(mediaStream);

    //THIS IS WHERE YOU NEED TO GET YOURSELF A LINK FROM TRANSCRIBE
    //AS MENTIONED I USED AWS LAMBDA FOR THIS
    //LOOK AT THE ABOVE CODE FOR GETTING A TRANSCRIBE LINK
    getTranscribeLink(transcribeLanguageCode, transcribeSampleRate); // Not a real function, you need to make this! The options are what would be in the body object in AWS Lambda
    let url = "!YOUR-GENERATED-URL!";

    // Configure your websocket
    websocket = new WebSocket(url);
    websocket.binaryType = "arraybuffer";

    websocket.onopen = () => {
        // Make the spinner disappear
        micStream.on("data", rawAudioChunk => {
            // the audio stream is raw audio bytes. Transcribe expects PCM with additional metadata, encoded as binary
            const binary = convertAudioToBinaryMessage(rawAudioChunk);
            // The converter returns undefined when there is no raw audio;
            // sending that would put the text "undefined" on the socket.
            if (binary !== undefined && websocket.readyState === websocket.OPEN) {
                websocket.send(binary);
            }
        });
    };

    // handle messages, errors, and close events
    websocket.onmessage = message => {
        // convert the binary event stream message to JSON
        const messageWrapper = eventStreamMarshaller.unmarshall(Buffer.from(message.data));
        // Decode the body as UTF-8 so multi-byte characters in a transcript survive.
        const messageBody = JSON.parse(util_utf8_node.toUtf8(messageWrapper.body));
        //THIS IS WHERE YOU DO SOMETHING WITH WHAT YOU GET FROM TRANSCRIBE
        console.log("Got something from Transcribe!:");
        console.log(messageBody);
    };
}

startTranscription().catch(error => console.error(error));

// FUNCTIONS
/**
 * Turns one microphone chunk into a binary event-stream message for Transcribe.
 *
 * @param {*} audioChunk - Chunk emitted by the microphone stream's "data" event.
 * @returns {*} The marshalled message, or undefined when `mic.toRaw` yields
 *              null/undefined for the chunk.
 */
function convertAudioToBinaryMessage(audioChunk) {
    const raw = mic.toRaw(audioChunk);
    if (raw == null) return;

    // downsample and convert the raw audio bytes to PCM
    const downsampledBuffer = audioUtils.downsampleBuffer(raw, inputSampleRate, transcribeSampleRate);
    const pcmEncodedBuffer = audioUtils.pcmEncode(downsampledBuffer);

    // add the right JSON headers and structure to the message
    const audioEventMessage = getAudioEventMessage(Buffer.from(pcmEncodedBuffer));

    // convert the JSON object + headers into a binary event stream message
    return eventStreamMarshaller.marshall(audioEventMessage);
}
/**
 * Wraps audio data in the envelope object used for an audio event:
 * two string-typed headers plus the buffer as the body.
 *
 * @param {*} buffer - Audio payload; stored as `body` without copying.
 * @returns {{headers: object, body: *}} The envelope object.
 */
function getAudioEventMessage(buffer) {
    // Both headers share the same shape, so build them with one small factory.
    const stringHeader = (value) => ({ type: 'string', value });

    const headers = {
        ':message-type': stringHeader('event'),
        ':event-type': stringHeader('AudioEvent'),
    };

    return { headers, body: buffer };
}
audioUtils.js
// Default export bundles both helpers so they can be reached as
// `audioUtils.pcmEncode(...)` and `audioUtils.downsampleBuffer(...)`.
export default {
    pcmEncode,
    downsampleBuffer,
};
/**
 * Encodes samples as 16-bit signed little-endian integers.
 * Each sample is clamped to [-1, 1] first; negative values are scaled by
 * 0x8000 and non-negative values by 0x7FFF.
 *
 * @param {ArrayLike<number>} input - Samples to encode.
 * @returns {ArrayBuffer} Buffer holding two bytes per input sample.
 */
export function pcmEncode(input) {
    const BYTES_PER_SAMPLE = 2;
    const encoded = new ArrayBuffer(input.length * BYTES_PER_SAMPLE);
    const writer = new DataView(encoded);

    for (let index = 0; index < input.length; index += 1) {
        const clamped = Math.max(-1, Math.min(1, input[index]));
        // Asymmetric scale because the int16 range is [-32768, 32767].
        const scale = clamped < 0 ? 0x8000 : 0x7FFF;
        writer.setInt16(index * BYTES_PER_SAMPLE, clamped * scale, true);
    }

    return encoded;
}
/**
 * Resamples `buffer` from `inputSampleRate` to `outputSampleRate` by
 * averaging each window of input samples that maps onto one output sample.
 *
 * When the two rates are equal the input buffer itself is returned (no copy).
 * If a window contains no input samples (which happens when the input rate is
 * lower than the output rate) the previous output sample is repeated instead
 * of dividing 0 by 0 and writing NaN.
 *
 * @param {ArrayLike<number>} buffer - Input samples.
 * @param {number} [inputSampleRate=44100] - Rate of `buffer`.
 * @param {number} [outputSampleRate=16000] - Desired rate.
 * @returns {ArrayLike<number>} `buffer` itself when the rates match, otherwise
 *          a new Float32Array of resampled values.
 */
export function downsampleBuffer(buffer, inputSampleRate = 44100, outputSampleRate = 16000) {
    if (outputSampleRate === inputSampleRate) {
        return buffer;
    }

    const sampleRateRatio = inputSampleRate / outputSampleRate;
    const newLength = Math.round(buffer.length / sampleRateRatio);
    const result = new Float32Array(newLength);

    let offsetBuffer = 0;
    for (let offsetResult = 0; offsetResult < result.length; offsetResult++) {
        const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
        const windowEnd = Math.min(nextOffsetBuffer, buffer.length);

        let accum = 0;
        let count = 0;
        for (let i = offsetBuffer; i < windowEnd; i++) {
            accum += buffer[i];
            count++;
        }

        if (count > 0) {
            result[offsetResult] = accum / count;
        } else {
            // Empty window: hold the last value (or the first input sample for
            // the very first output) rather than emitting NaN.
            result[offsetResult] = offsetResult > 0 ? result[offsetResult - 1] : buffer[0];
        }

        offsetBuffer = nextOffsetBuffer;
    }

    return result;
}
我想就是这些了。这些内容应该足以让你把它跑起来。