I need some help with a Python script: the Penn Phonetics Lab forced aligner (P2FA). I have it installed and everything works, just a little too well :) I'm trying to match a transcript to the timing of the speech, and it does that, but it also goes further and gives the timing of every "phone" (the individual sounds, a bit like syllables). To illustrate, here is a small excerpt of the output:
IntervalTier
phone
0.012471655
1.758503401
sp
1.758503401
2.197505669
AY1
2.197505669
2.287301587
T
2.287301587
2.397052154
AO1
2.397052154
2.466893424
K
2.466893424
2.496825397
IH0
2.496825397
2.52675737
NG
2.52675737
2.556689342

IntervalTier
word
0.012471655
1.758503401
sp
1.758503401
2.197505669
I
2.197505669
2.52675737
TALKING
2.52675737
2.756235828
You can see from the pronunciation guide that it uses AY1 for "I", and that TALKING gets broken up into a whole string of phones. On one hand I *could* use the data as-is, but a 20-minute file takes about 15 minutes to process, and I think it could run much faster if it limited itself to finding only the words. That matters, because I have hundreds of hours of audio to get through. There is an expensive ($11,000) commercial product that does the same job in about 30 seconds, so it would be great if we could speed this up even a little.
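For context, everything I do downstream only needs the word tier. This is the kind of minimal post-processing I mean (a rough sketch against the short TextGrid format shown above, not my exact code; read_word_tier and the skipping of "sp" pauses are just for illustration):

def read_word_tier(textgrid_file):
    # Tiny parser for the "ooTextFile short" TextGrid the aligner writes.
    # Returns (word, start, end) tuples from the "word" tier only.
    with open(textgrid_file) as f:
        lines = [l.strip().strip('"') for l in f]
    # find the word tier: the line "IntervalTier" followed by "word"
    i = 0
    while not (lines[i] == 'IntervalTier' and lines[i + 1] == 'word'):
        i += 1
    n = int(lines[i + 4])                 # number of intervals in the tier
    entries = lines[i + 5:i + 5 + 3 * n]  # each interval is start, end, label
    words = []
    for k in range(0, len(entries), 3):
        start, end, label = float(entries[k]), float(entries[k + 1]), entries[k + 2]
        if label != 'sp':                 # drop the pause/silence intervals
            words.append((label, start, end))
    return words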
I tried commenting out the part that writes out the "phones" and changing the HVite options to do word-level matching with -o SWT (from this page: http://www.ee.columbia.edu/ln/LabROSA/doc/HTKBook21/node143.html), but I get this error:
Traceback (most recent call last):
File "./align.py", line 316, in <module>
writeTextGrid(outfile, readAlignedMLF(output_mlf, SR, float(wave_start)))
File "./align.py", line 146, in readAlignedMLF
ph = lines[j].split()[2]
IndexError: list index out of range
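For reference, the only code change behind the faster run was in viterbi(): I added the -o SWT output-formatting option to the HVite command (a sketch of the edit; the exact placement of the flag may have differed, but the rest of the command is unchanged):

def viterbi(input_mlf, word_dictionary, output_mlf, phoneset, hmmdir) :
    # same as the original below, but with the "-o SWT" output formatting option added
    os.system('HVite -T 1 -a -m -o SWT -I ' + input_mlf + ' -H ' + hmmdir + '/macros -H ' + hmmdir + '/hmmdefs -S ./tmp/test.scp -i ' + output_mlf + ' -p 0.0 -s 5.0 ' + word_dictionary + ' ' + phoneset + ' > ./tmp/aligned.results')

My guess at the cause: readAlignedMLF() (the function the traceback points at, line 146 of align.py) assumes each aligned line has the phone in the third column (split()[2]) and, when a line has five columns, the word label in the fifth (split()[4]); if -o SWT suppresses some of those columns, split()[2] runs off the end of the list.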
With the modified HVite command the script runs noticeably faster, but of course it ends with that error. Here is the full Python script:
#!/usr/bin/env python
""" Command-line usage:
python align.py [options] wave_file transcript_file output_file
where options may include:
-r sampling_rate -- override which sample rate model to use, one of 8000, 11025, and 16000
-s start_time -- start of portion of wavfile to align (in seconds, default 0)
-e end_time -- end of portion of wavfile to align (in seconds, default to end)
You can also import this file as a module and use the functions directly.
"""
import os
import sys
import getopt
import wave
import re
def prep_wav(orig_wav, out_wav, sr_override, wave_start, wave_end):
    global sr_models

    if os.path.exists(out_wav) and False :
        f = wave.open(out_wav, 'r')
        SR = f.getframerate()
        f.close()
        print "Already re-sampled the wav file to " + str(SR)
        return SR

    f = wave.open(orig_wav, 'r')
    SR = f.getframerate()
    f.close()

    soxopts = ""
    if float(wave_start) != 0.0 or wave_end != None :
        soxopts += " trim " + wave_start
        if wave_end != None :
            soxopts += " " + str(float(wave_end)-float(wave_start))

    if (sr_models != None and SR not in sr_models) or (sr_override != None and SR != sr_override) or soxopts != "":
        new_sr = 11025
        if sr_override != None :
            new_sr = sr_override
        print "Resampling wav file from " + str(SR) + " to " + str(new_sr) + soxopts + "..."
        SR = new_sr
        os.system("sox " + orig_wav + " -r " + str(SR) + " " + out_wav + " polyphase" + soxopts)
    else:
        #print "Using wav file, already at sampling rate " + str(SR) + "."
        os.system("cp -f " + orig_wav + " " + out_wav)

    return SR

def prep_mlf(trsfile, mlffile, word_dictionary, surround, between):
    # Read in the dictionary to ensure all of the words
    # we put in the MLF file are in the dictionary. Words
    # that are not are skipped with a warning.
    f = open(word_dictionary, 'r')
    dict = { } # build hash table
    for line in f.readlines():
        if line != "\n" and line != "" :
            dict[line.split()[0]] = True
    f.close()

    f = open(trsfile, 'r')
    lines = f.readlines()
    f.close()

    words = []
    if surround != None:
        words += surround.split(',')

    i = 0
    # this pattern matches hyphenated words, such as TWENTY-TWO; however, it doesn't work with longer things like SOMETHING-OR-OTHER
    hyphenPat = re.compile(r'([A-Z]+)-([A-Z]+)')
    while (i < len(lines)):
        txt = lines[i].replace('\n', '')
        txt = txt.replace('{breath}', '{BR}').replace('<noise>', '{NS}')
        txt = txt.replace('{laugh}', '{LG}').replace('{laughter}', '{LG}')
        txt = txt.replace('{cough}', '{CG}').replace('{lipsmack}', '{LS}')

        for pun in [',', '.', ':', ';', '!', '?', '"', '%', '(', ')', '--', '---']:
            txt = txt.replace(pun, '')
        txt = txt.upper()

        # break up any hyphenated words into two separate words
        txt = re.sub(hyphenPat, r'\1 \2', txt)

        txt = txt.split()
        for wrd in txt:
            if (wrd in dict):
                words.append(wrd)
                if between != None:
                    words.append(between)
            else:
                print "SKIPPING WORD", wrd
        i += 1

    # remove the last 'between' token from the end
    if between != None:
        words.pop()
    if surround != None:
        words += surround.split(',')

    writeInputMLF(mlffile, words)

def writeInputMLF(mlffile, words) :
    fw = open(mlffile, 'w')
    fw.write('#!MLF!#\n')
    fw.write('"*/tmp.lab"\n')
    for wrd in words:
        fw.write(wrd + '\n')
    fw.write('.\n')
    fw.close()

def readAlignedMLF(mlffile, SR, wave_start):
    # This reads a MLF alignment output file with phone and word
    # alignments and returns a list of words, each word is a list containing
    # the word label followed by the phones, each phone is a tuple
    # (phone, start_time, end_time) with times in seconds.
    f = open(mlffile, 'r')
    lines = [l.rstrip() for l in f.readlines()]
    f.close()

    if len(lines) < 3 :
        raise ValueError("Alignment did not complete succesfully.")

    j = 2
    ret = []
    while (lines[j] <> '.'):
        if (len(lines[j].split()) == 5): # Is this the start of a word; do we have a word label?
            # Make a new word list in ret and put the word label at the beginning
            wrd = lines[j].split()[4]
            ret.append([wrd])

        # Append this phone to the latest word (sub-)list
        ph = lines[j].split()[2]
        if (SR == 11025):
            st = (float(lines[j].split()[0])/10000000.0 + 0.0125)*(11000.0/11025.0)
            en = (float(lines[j].split()[1])/10000000.0 + 0.0125)*(11000.0/11025.0)
        else:
            st = float(lines[j].split()[0])/10000000.0 + 0.0125
            en = float(lines[j].split()[1])/10000000.0 + 0.0125
        if st < en:
            ret[-1].append([ph, st+wave_start, en+wave_start])

        j += 1

    return ret

def writeTextGrid(outfile, word_alignments) :
    # make the list of just phone alignments
    phons = []
    for wrd in word_alignments :
        phons.extend(wrd[1:]) # skip the word label

    # make the list of just word alignments
    # we're getting elements of the form:
    #   ["word label", ["phone1", start, end], ["phone2", start, end], ...]
    wrds = []
    for wrd in word_alignments :
        # If no phones make up this word, then it was an optional word
        # like a pause that wasn't actually realized.
        if len(wrd) == 1 :
            continue
        wrds.append([wrd[0], wrd[1][1], wrd[-1][2]]) # word label, first phone start time, last phone end time

    # write the phone interval tier
    fw = open(outfile, 'w')
    fw.write('File type = "ooTextFile short"\n')
    fw.write('"TextGrid"\n')
    fw.write('\n')
    fw.write(str(phons[0][1]) + '\n')
    fw.write(str(phons[-1][2]) + '\n')
    fw.write('<exists>\n')
    fw.write('2\n')
    fw.write('"IntervalTier"\n')
    fw.write('"phone"\n')
    fw.write(str(phons[0][1]) + '\n')
    fw.write(str(phons[-1][-1]) + '\n')
    fw.write(str(len(phons)) + '\n')
    for k in range(len(phons)):
        fw.write(str(phons[k][1]) + '\n')
        fw.write(str(phons[k][2]) + '\n')
        fw.write('"' + phons[k][0] + '"' + '\n')

    # write the word interval tier
    fw.write('"IntervalTier"\n')
    fw.write('"word"\n')
    fw.write(str(phons[0][1]) + '\n')
    fw.write(str(phons[-1][-1]) + '\n')
    fw.write(str(len(wrds)) + '\n')
    for k in range(len(wrds) - 1):
        fw.write(str(wrds[k][1]) + '\n')
        fw.write(str(wrds[k+1][1]) + '\n')
        fw.write('"' + wrds[k][0] + '"' + '\n')
    fw.write(str(wrds[-1][1]) + '\n')
    fw.write(str(phons[-1][2]) + '\n')
    fw.write('"' + wrds[-1][0] + '"' + '\n')
    fw.close()

def prep_working_directory() :
    os.system("rm -r -f ./tmp")
    os.system("mkdir ./tmp")

def prep_scp(wavfile) :
    fw = open('./tmp/codetr.scp', 'w')
    fw.write(wavfile + ' ./tmp/tmp.plp\n')
    fw.close()
    fw = open('./tmp/test.scp', 'w')
    fw.write('./tmp/tmp.plp\n')
    fw.close()

def create_plp(hcopy_config) :
    os.system('HCopy -T 1 -C ' + hcopy_config + ' -S ./tmp/codetr.scp')

def viterbi(input_mlf, word_dictionary, output_mlf, phoneset, hmmdir) :
    os.system('HVite -T 1 -a -m -I ' + input_mlf + ' -H ' + hmmdir + '/macros -H ' + hmmdir + '/hmmdefs -S ./tmp/test.scp -i ' + output_mlf + ' -p 0.0 -s 5.0 ' + word_dictionary + ' ' + phoneset + ' > ./tmp/aligned.results')

def getopt2(name, opts, default = None) :
    value = [v for n,v in opts if n==name]
    if len(value) == 0 :
        return default
    return value[0]

if __name__ == '__main__':
    try:
        opts, args = getopt.getopt(sys.argv[1:], "r:s:e:", ["model="])

        # get the three mandatory arguments
        if len(args) != 3 :
            raise ValueError("Specify wavefile, a transcript file, and an output file!")

        wavfile, trsfile, outfile = args

        sr_override = getopt2("-r", opts, None)
        wave_start = getopt2("-s", opts, "0.0")
        wave_end = getopt2("-e", opts, None)
        surround_token = "sp" #getopt2("-p", opts, 'sp')
        between_token = "sp" #getopt2("-b", opts, 'sp')

        if surround_token.strip() == "":
            surround_token = None
        if between_token.strip() == "":
            between_token = None

        mypath = getopt2("--model", opts, None)
    except :
        print __doc__
        (type, value, traceback) = sys.exc_info()
        print value
        sys.exit(0)

    # If no model directory was given explicitly, get the directory containing this script.
    hmmsubdir = ""
    sr_models = None
    if mypath == None :
        mypath = os.path.dirname(os.path.abspath(sys.argv[0])) + "/model"
        hmmsubdir = "FROM-SR"
        # sample rates for which there are acoustic models set up, otherwise
        # the signal must be resampled to one of these rates.
        sr_models = [8000, 11025, 16000]

    if sr_override != None and sr_models != None and not sr_override in sr_models :
        raise ValueError, "invalid sample rate: not an acoustic model available"

    word_dictionary = "./tmp/dict"
    input_mlf = './tmp/tmp.mlf'
    output_mlf = './tmp/aligned.mlf'

    # create working directory
    prep_working_directory()

    # create ./tmp/dict by concatenating our dict with a local one
    if os.path.exists("dict.local"):
        os.system("cat " + mypath + "/dict dict.local > " + word_dictionary)
    else:
        os.system("cat " + mypath + "/dict > " + word_dictionary)

    # prepare wavefile: do a resampling if necessary
    tmpwav = "./tmp/sound.wav"
    SR = prep_wav(wavfile, tmpwav, sr_override, wave_start, wave_end)

    if hmmsubdir == "FROM-SR" :
        hmmsubdir = "/" + str(SR)

    # prepare mlf file
    prep_mlf(trsfile, input_mlf, word_dictionary, surround_token, between_token)

    # prepare scp files
    prep_scp(tmpwav)

    # generate the plp file using a given configuration file for HCopy
    create_plp(mypath + hmmsubdir + '/config')

    # run Viterbi decoding
    #print "Running HVite..."
    mpfile = mypath + '/monophones'
    if not os.path.exists(mpfile) :
        mpfile = mypath + '/hmmnames'
    viterbi(input_mlf, word_dictionary, output_mlf, mpfile, mypath + hmmsubdir)

    # output the alignment as a Praat TextGrid
    writeTextGrid(outfile, readAlignedMLF(output_mlf, SR, float(wave_start)))
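In case it clarifies what I'm after, this is roughly what I imagine the reading step would have to become if HVite could be made to emit one "start end word" line per word. I haven't managed to get output in that shape, so this is an untested sketch, and readAlignedMLFWords is just my placeholder name:

def readAlignedMLFWords(mlffile, SR, wave_start):
    # Hypothetical word-level counterpart to readAlignedMLF(), assuming each
    # aligned line were just "start end word" (no per-phone lines, no scores).
    f = open(mlffile, 'r')
    lines = [l.rstrip() for l in f.readlines()]
    f.close()
    if len(lines) < 3 :
        raise ValueError("Alignment did not complete succesfully.")
    ret = []
    j = 2
    while (lines[j] != '.'):
        start, end, wrd = lines[j].split()[:3]
        st = float(start)/10000000.0 + 0.0125
        en = float(end)/10000000.0 + 0.0125
        if (SR == 11025): # same sample-rate fudge factor as readAlignedMLF()
            st = st*(11000.0/11025.0)
            en = en*(11000.0/11025.0)
        if st < en:
            # mimic the [word, [phone, start, end]] shape writeTextGrid() expects,
            # using the word itself in place of the phone entries
            ret.append([wrd, [wrd, st+wave_start, en+wave_start]])
        j += 1
    return ret

So my question is: is there a combination of HVite options (or another change to align.py) that gives word-level timings like that directly, so the alignment runs faster without hitting the IndexError?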