1

代码是这样的:

from google.cloud import speech_v1
from google.cloud.speech_v1 import enums

def sample_long_running_recognize(storage_uri):

    client = speech_v1.SpeechClient()

    enable_word_time_offsets = True
    sample_rate_hertz = 44100
    audio_channel_count = 2
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16

    language_code = "en-US"
    config = {
        "enable_word_time_offsets": enable_word_time_offsets,
        "language_code": language_code,
        "encoding": encoding,
#        "audio_channel_count": audio_channel_count,
        "sample_rate_hertz": sample_rate_hertz
    }
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)

    print(u"Waiting for operation to complete...")
    response = operation.result()

    # The first result includes start and end time word offsets
    result = response.results[0]
    # First alternative is the most probable result
    alternative = result.alternatives[0]
    print(u"Transcript: {}".format(alternative.transcript))
    # Print the start and end time of each word
    for word in alternative.words:
        print(u"Word: {}".format(word.word))
        print(
            u"Start time: {} seconds {} nanos".format(
                word.start_time.seconds, word.start_time.nanos
            )
        )
        print(
            u"End time: {} seconds {} nanos".format(
                word.end_time.seconds, word.end_time.nanos
            )
        )

我的音频文件是 43 秒的 wav 文件,采样率为 44100Hz,它有立体声通道。我尝试了相同的文件,仅将通道更改为单声道,但它仅将前 27 秒转换为相同的。我已经搜索了很多关于语音到文本 api 中的部分输出的信息,但我找不到类似的情况。

4

0 回答 0