0

这是AWS Transcribe S3 .wav file to text的后续问题。我使用流来读取 .wav 文件内容并将其发送到 AWS。

我没有取回正确的成绩单,而是像一堆“是的”一样胡说八道。陈述。看起来 AWS 无法正确解释字节流,但我不确定出了什么问题。我想知道文件是否需要以某种方式编码,即我不能直接从文件发送原始 .wav 字节?或者我可能需要告诉服务这是 .wav 格式?

这里有什么问题?输入文件是一个有效的 .wav 语音文件,当我听它时听起来很清晰。

这是我的java代码:

package com.amazonaws.transcribe;

import org.reactivestreams.Publisher;
import org.reactivestreams.Subscriber;
import org.reactivestreams.Subscription;
import software.amazon.awssdk.core.SdkBytes;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.transcribestreaming.TranscribeStreamingAsyncClient;
import software.amazon.awssdk.services.transcribestreaming.model.*;

import javax.sound.sampled.*;
import java.io.*;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicLong;


public class TranscribeFileFromStream {
    private static final Region REGION = Region.US_EAST_1;
    private static TranscribeStreamingAsyncClient client;

    public static void main(String args[]) throws URISyntaxException, ExecutionException, InterruptedException, LineUnavailableException {
        System.out.println(System.getProperty("java.version"));
        client = TranscribeStreamingAsyncClient.builder()
                .region(REGION)
                .build();
        try {
            CompletableFuture<Void> result = client.startStreamTranscription(getRequest(16000),
                    new AudioStreamPublisher(getStreamFromFile()),
                    getResponseHandler());
            result.get();
        } finally {
            if (client != null) {
                client.close();
            }
        }
    }

    private static InputStream getStreamFromFile() {
        try {
            File inputFile = new File("~/work/transcribe/src/main/resources/story/media/Story3.m4a.wav");
            InputStream audioStream = new FileInputStream(inputFile);
            return audioStream;
        } catch (FileNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    private static StartStreamTranscriptionRequest getRequest(Integer mediaSampleRateHertz) {
        return StartStreamTranscriptionRequest.builder()
                .languageCode(LanguageCode.EN_US)
                .mediaEncoding(MediaEncoding.PCM)
                .mediaSampleRateHertz(mediaSampleRateHertz)
                .build();
    }

    private static StartStreamTranscriptionResponseHandler getResponseHandler() {
        return StartStreamTranscriptionResponseHandler.builder()
                .onResponse(r -> {
                    System.out.println("Received Initial response");
                })
                .onError(e -> {
                    System.out.println(e.getMessage());
                    StringWriter sw = new StringWriter();
                    e.printStackTrace(new PrintWriter(sw));
                    System.out.println("Error Occurred: " + sw.toString());
                })
                .onComplete(() -> {
                    System.out.println("=== All records stream successfully ===");
                })
                .subscriber(event -> {
                    List<Result> results = ((TranscriptEvent) event).transcript().results();
                    if (results.size() > 0) {
                        if (!results.get(0).alternatives().get(0).transcript().isEmpty()) {
                            System.out.println(results.get(0).alternatives().get(0).transcript());
                        } else {
                            System.out.println("Empty result");
                        }
                    } else {
                        System.out.println("No results");
                    }
                })
                .build();
    }

    private static class AudioStreamPublisher implements Publisher<AudioStream> {
        private final InputStream inputStream;
        private static Subscription currentSubscription;


        private AudioStreamPublisher(InputStream inputStream) {
            this.inputStream = inputStream;
        }

        @Override
        public void subscribe(Subscriber<? super AudioStream> s) {

            if (this.currentSubscription == null) {
                this.currentSubscription = new SubscriptionImpl(s, inputStream);
            } else {
                this.currentSubscription.cancel();
                this.currentSubscription = new SubscriptionImpl(s, inputStream);
            }
            s.onSubscribe(currentSubscription);
        }
    }

    public static class SubscriptionImpl implements Subscription {
        private static final int CHUNK_SIZE_IN_BYTES = 1024 * 1;
        private final Subscriber<? super AudioStream> subscriber;
        private final InputStream inputStream;
        private ExecutorService executor = Executors.newFixedThreadPool(1);
        private AtomicLong demand = new AtomicLong(0);

        SubscriptionImpl(Subscriber<? super AudioStream> s, InputStream inputStream) {
            this.subscriber = s;
            this.inputStream = inputStream;
        }

        @Override
        public void request(long n) {
            if (n <= 0) {
                subscriber.onError(new IllegalArgumentException("Demand must be positive"));
            }

            demand.getAndAdd(n);

            executor.submit(() -> {
                try {
                    do {
                        ByteBuffer audioBuffer = getNextEvent();
                        if (audioBuffer.remaining() > 0) {
                            AudioEvent audioEvent = audioEventFromBuffer(audioBuffer);
                            subscriber.onNext(audioEvent);
                        } else {
                            subscriber.onComplete();
                            break;
                        }
                    } while (demand.decrementAndGet() > 0);
                } catch (Exception e) {
                    subscriber.onError(e);
                }
            });
        }

        @Override
        public void cancel() {
            executor.shutdown();
        }

        private ByteBuffer getNextEvent() {
            ByteBuffer audioBuffer = null;
            byte[] audioBytes = new byte[CHUNK_SIZE_IN_BYTES];

            int len = 0;
            try {
                len = inputStream.read(audioBytes);

                if (len <= 0) {
                    audioBuffer = ByteBuffer.allocate(0);
                } else {
                    audioBuffer = ByteBuffer.wrap(audioBytes, 0, len);
                }
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }

            return audioBuffer;
        }

        private AudioEvent audioEventFromBuffer(ByteBuffer bb) {
            return AudioEvent.builder()
                    .audioChunk(SdkBytes.fromByteBuffer(bb))
                    .build();
        }
    }
}

这是我的程序输出:

Received Initial response
No results
No results
Yeah.
No results
Yeah.
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
Yeah.
No results
No results
Oh,
No results
Oh,
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
Oh,
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
4

2 回答 2

1

音频文件的采样率为 44.1 kHz。它被转换为 16 kHz,并且可以正常工作:

https://drive.google.com/file/d/1mYVbNlYK3SpGT4NbFRYGn86177eTCqhd/view?usp=sharing

在此处输入图像描述

于 2021-01-22T22:21:14.540 回答
1

正如 smac2020 指出的那样,采样率是错误的。调试传递给 AWS 的错误元数据值很棘手,因为 AWS 没有错误。你只是得到一个不正确的转录。所以,这里的教训是,确保你知道正确的价值观是什么。其中一些可以自动检测到。

如果您使用的是 mac,则 mediainfo 工具非常有用。

 brew install mediainfo

ffmpeg 也是如此:

brew install ffmpeg

这是一个更新的示例,我使用 AudioFormat.java 自动检测采样率。理想情况下,AWS sdk 会为您执行此操作。如果媒体文件超出可转录的参数范围,则会引发异常。请注意,我必须使用以下工具将原始文件修改为 16,000 采样率:nch.com.au/switch/index.html。如果 SDK 还能够修改采样率等,那就太好了(提示,提示),以便可以更改文件以适应输入参数。

package com.amazonaws.transcribe;

import org.reactivestreams.Publisher;
import org.reactivestreams.Subscriber;
import org.reactivestreams.Subscription;
import software.amazon.awssdk.core.SdkBytes;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.transcribestreaming.TranscribeStreamingAsyncClient;
import software.amazon.awssdk.services.transcribestreaming.model.*;

import javax.sound.sampled.*;
import java.io.*;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicLong;

import static javax.sound.sampled.AudioFormat.Encoding.*;


public class TranscribeFileFromStream {
    private static final Region REGION = Region.US_EAST_1;
    private static TranscribeStreamingAsyncClient client;

    public static void main(String args[]) throws Exception {
        System.setProperty("AWS_ACCESS_KEY_ID", "myId");
        System.setProperty("AWS_SECRET_ACCESS_KEY", "myKey");

        System.out.println(System.getProperty("java.version"));
       // BasicConfigurator.configure();
        client = TranscribeStreamingAsyncClient.builder()
                .region(REGION)
                .build();
        try {
            File inputFile = new File("/home/me/work/transcribe/src/main/resources/test-file.wav");
            CompletableFuture<Void> result = client.startStreamTranscription(
                    getRequest(inputFile),
                    new AudioStreamPublisher(getStreamFromFile(inputFile)),
                    getResponseHandler());
            result.get();
        } finally {
            if (client != null) {
                client.close();
            }
        }
    }

    private static InputStream getStreamFromFile(File inputFile) {
        try {
            return new FileInputStream(inputFile);
        } catch (FileNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    private static StartStreamTranscriptionRequest getRequest(File inputFile) throws IOException, UnsupportedAudioFileException {
        //TODO: I read the file twice in this example.  Can this be more performant?
        AudioInputStream audioInputStream = AudioSystem.getAudioInputStream(inputFile);
        AudioFormat audioFormat = audioInputStream.getFormat();
        return StartStreamTranscriptionRequest.builder()
                .languageCode(LanguageCode.EN_US)
                //.mediaEncoding(MediaEncoding.PCM)
                .mediaEncoding(getAwsMediaEncoding(audioFormat))
                .mediaSampleRateHertz(getAwsSampleRate(audioFormat))
                .build();
    }

    private static MediaEncoding getAwsMediaEncoding(AudioFormat audioFormat) {
        final String javaMediaEncoding = audioFormat.getEncoding().toString();

        if (PCM_SIGNED.toString().equals(javaMediaEncoding)) {
            return MediaEncoding.PCM;
        } else if (PCM_UNSIGNED.toString().equals(javaMediaEncoding)){
            return MediaEncoding.PCM;
        } /*else if (ALAW.toString().equals(javaMediaEncoding)){
            //WARNING: I have no idea how ALAW maps to AWS media encodings.
            return MediaEncoding.OGG_OPUS;
        } else if (ULAW.toString().equals(javaMediaEncoding)){
            //WARNING: I have no idea how ULAW maps to AWS encodings.  
            return MediaEncoding.FLAC;
        }*/

        throw new IllegalArgumentException("Not a recognized media encoding:" + javaMediaEncoding);
    }

    private static Integer getAwsSampleRate(AudioFormat audioFormat) {
        return Math.round(audioFormat.getSampleRate());
    }

    private static StartStreamTranscriptionResponseHandler getResponseHandler() {
        return StartStreamTranscriptionResponseHandler.builder()
                .onResponse(r -> {
                    System.out.println("Received Initial response");
                })
                .onError(e -> {
                    System.out.println(e.getMessage());
                    StringWriter sw = new StringWriter();
                    e.printStackTrace(new PrintWriter(sw));
                    System.out.println("Error Occurred: " + sw.toString());
                })
                .onComplete(() -> {
                    System.out.println("=== All records stream successfully ===");
                })
                .subscriber(event -> {
                    List<Result> results = ((TranscriptEvent) event).transcript().results();
                    if (results.size() > 0) {
                        if (!results.get(0).alternatives().get(0).transcript().isEmpty()) {
                            System.out.println(results.get(0).alternatives().get(0).transcript());
                        } else {
                            System.out.println("Empty result");
                        }
                    } else {
                        System.out.println("No results");
                    }
                })
                .build();
    }

    private static class AudioStreamPublisher implements Publisher<AudioStream> {
        private final InputStream inputStream;
        private static Subscription currentSubscription;


        private AudioStreamPublisher(InputStream inputStream) {
            this.inputStream = inputStream;
        }

        @Override
        public void subscribe(Subscriber<? super AudioStream> s) {

            if (this.currentSubscription == null) {
                this.currentSubscription = new SubscriptionImpl(s, inputStream);
            } else {
                this.currentSubscription.cancel();
                this.currentSubscription = new SubscriptionImpl(s, inputStream);
            }
            s.onSubscribe(currentSubscription);
        }
    }

    public static class SubscriptionImpl implements Subscription {
        private static final int CHUNK_SIZE_IN_BYTES = 1024 * 1;
        private final Subscriber<? super AudioStream> subscriber;
        private final InputStream inputStream;
        private ExecutorService executor = Executors.newFixedThreadPool(1);
        private AtomicLong demand = new AtomicLong(0);

        SubscriptionImpl(Subscriber<? super AudioStream> s, InputStream inputStream) {
            this.subscriber = s;
            this.inputStream = inputStream;
        }

        @Override
        public void request(long n) {
            if (n <= 0) {
                subscriber.onError(new IllegalArgumentException("Demand must be positive"));
            }

            demand.getAndAdd(n);

            executor.submit(() -> {
                try {
                    do {
                        ByteBuffer audioBuffer = getNextEvent();
                        if (audioBuffer.remaining() > 0) {
                            AudioEvent audioEvent = audioEventFromBuffer(audioBuffer);
                            subscriber.onNext(audioEvent);
                        } else {
                            subscriber.onComplete();
                            break;
                        }
                    } while (demand.decrementAndGet() > 0);
                } catch (Exception e) {
                    subscriber.onError(e);
                }
            });
        }

        @Override
        public void cancel() {
            executor.shutdown();
        }

        private ByteBuffer getNextEvent() {
            ByteBuffer audioBuffer = null;
            byte[] audioBytes = new byte[CHUNK_SIZE_IN_BYTES];

            int len = 0;
            try {
                len = inputStream.read(audioBytes);

                if (len <= 0) {
                    audioBuffer = ByteBuffer.allocate(0);
                } else {
                    audioBuffer = ByteBuffer.wrap(audioBytes, 0, len);
                }
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }

            return audioBuffer;
        }

        private AudioEvent audioEventFromBuffer(ByteBuffer bb) {
            return AudioEvent.builder()
                    .audioChunk(SdkBytes.fromByteBuffer(bb))
                    .build();
        }
    }
}
于 2021-01-23T18:55:24.520 回答