2

我一直在使用这个网页作为语音共振峰跟踪的指南......

http://iitg.vlab.co.in/?sub=59&brch=164&sim=615&cnt=1

一切似乎进展顺利,除了最后一步:把倒谱转换为平滑的频谱表示,以便用简单的峰值拾取来做共振峰跟踪。频谱图看起来不错,倒谱图(可以这么叫吗?:P)看起来也不错(据我所知),但最后阶段的结果(平滑的共振峰表示)并不是我所期望的。

我将每个阶段的样本作为视觉图像上传到...

http://imgur.com/a/62duS

该样本是一个人发元音“i”(如“beed”中的元音)的录音。根据这个网站...

http://home.cc.umanitoba.ca/~robh/howto.html#formants

第一个共振峰应该在 500 Hz 左右,第二个和第三个共振峰分别在 2200 Hz 和 2800 Hz 左右。频谱图显示的结果与此非常相似,但在最后阶段我得到的结果却类似于......

F1 - 891 F2 - 1550 F3 - 2329

任何见解将不胜感激。一段时间以来,我一直在兜圈子。我的代码如下所示...

// FFT configuration: frames of 2^log2n = 512 samples, processed with unit stride.
UInt32 log2n = 9;
UInt32 n = 1u << log2n;
UInt32 halfN = n >> 1;
UInt32 window = n;
UInt32 stride = 1;

// The FFT setup object is obtained from the app delegate.
FFTSetup setupReal = [appDelegate getFftSetup];

// Hop between the starts of consecutive frames: the samples left over after
// one window, divided evenly between the quantizeCount frames.
int stepSize = (hpBuffer.sampleCount-window) / quantizeCount;

// Estimate a per-frame volume straight from the raw samples, because that
// seems more reliable than deriving it from the FFT.
// For each of the quantizeCount frames the volume is the largest sample value
// inside a short window of volumeWindow samples. The running peak starts at
// 0.0f and is only ever replaced by a larger value, so every stored volume
// is >= 0 and negative samples are ignored.
UInt32 volumeWindow = 128;
volumeBuffer = malloc(sizeof(float)*quantizeCount);
int windowPos = 0;
for (int i = 0; i < quantizeCount; i++, windowPos += stepSize) {
    // windowPos == i * stepSize here, i.e. the same frame start that the
    // spectral analysis uses (offset = s * stepSize), so volumeBuffer[i]
    // describes the same stretch of audio as spectrum frame i. Advancing
    // windowPos *before* use would shift every volume one frame late.
    const int windowEnd = windowPos + (int)volumeWindow;
    float peak = 0.0f;
    for (int p = windowPos; p < windowEnd; p++) {
        if (sampleBuffer.buffer[p] > peak)
            peak = sampleBuffer.buffer[p];
    }
    volumeBuffer[i] = peak;
}

// Normalize the volume buffer in place.
[FloatAudioBuffer normalizePositiveBuffer:volumeBuffer ofSize:quantizeCount];

// Size of one sample; every buffer below holds plain floats.
const size_t floatSize = sizeof(float);

// Split-complex scratch storage used by the FFT calls: 4096 floats for the
// real parts and 4096 for the imaginary parts.
COMPLEX_SPLIT complexArray;
complexArray.realp = malloc(4096*floatSize);
complexArray.imagp = malloc(4096*floatSize);

// Scratch buffer holding one windowed frame of n samples.
float *hamBuffer = malloc(n*floatSize);

// Per-frame result buffers, each laid out as quantizeCount consecutive rows:
//   spectrumBuffer / cepstrumBuffer         - halfN floats per frame
//   formantBuffer                           - 4096 floats per frame
//   lowCepstrumBuffer / featureBuffer       - featureCount floats per frame
spectrumBuffer = malloc(floatSize*halfN*quantizeCount);
formantBuffer = malloc(floatSize*4096*quantizeCount);
cepstrumBuffer = malloc(floatSize*halfN*quantizeCount);
lowCepstrumBuffer = malloc(floatSize*featureCount*quantizeCount);
featureBuffer = malloc(floatSize*featureCount*quantizeCount);

// Produce one data point per quantize segment: for frame s compute the log
// magnitude spectrum, a cepstrum, a smoothed "formant" spectrum rebuilt from
// the first cepstral coefficients, and finally a list of peak positions.
float TWOPI = 2.0f * M_PI;
for (int s=0; s < quantizeCount; s++) {
    // Copy n samples starting at s*stepSize into a separate scratch array,
    // multiplying each by the Hamming window 0.54 - 0.46*cos(2*pi*i/(n-1)).
    int offset = (int)(s * stepSize);
    for (int i=0; i < n; i++)
        hamBuffer[i] = hpBuffer.buffer[offset+i] * ((1.0f-0.46f) - 0.46f*cos(TWOPI*i/((float)n-1.0f)));

    // Repack the real frame as halfN complex pairs in split-complex form
    // (per the vDSP docs: even-indexed samples -> realp, odd-indexed -> imagp),
    // which is the input layout vDSP_fft_zrip expects.
    vDSP_ctoz((COMPLEX*)hamBuffer, 2, &complexArray, 1, halfN);

    // In-place forward real FFT of 2^log2n points.
    vDSP_fft_zrip(setupReal, &complexArray, stride, log2n, FFT_FORWARD);

    // Squared magnitude of each bin, written over realp; imagp is then cleared.
    // imagp[0] is zeroed first so that it does not contribute to bin 0
    // (in vDSP's packed output that slot presumably carries the Nyquist term
    // rather than an imaginary part -- confirm against the vDSP docs).
    complexArray.imagp[0] = 0.0f;
    vDSP_zvmags(&complexArray, 1, complexArray.realp, 1, halfN);
    bzero(complexArray.imagp, (halfN) * sizeof(float));

    // Scale the squared magnitudes by 1/(2n).
    float scale = 1.0f / (2.0f*(float)n);
    vDSP_vsmul(complexArray.realp, 1, &scale, complexArray.realp, 1, halfN);

    // Log magnitude: sqrt undoes the squaring, logf compresses the range.
    // A bin whose value is exactly 0 yields -inf here.
    for (int i=0; i < halfN; i++)
        complexArray.realp[i] = logf(sqrtf(complexArray.realp[i]));

    // Store the halfN log-magnitude values as row s of the spectrum buffer.
    memcpy(&spectrumBuffer[s*halfN], complexArray.realp, halfN*sizeof(float));


    // Repack the halfN log-magnitude values as halfN/2 split-complex pairs
    // (even-indexed values -> realp, odd-indexed -> imagp) for the next FFT call.
    // NOTE(review): FFT_INVERSE in vDSP_fft_zrip appears to expect packed
    // frequency-domain data, so the odd-indexed log magnitudes would be read
    // as imaginary parts here. That is probably not the inverse transform of
    // a real log spectrum that a cepstrum needs -- confirm.
    vDSP_ctoz((COMPLEX*)&spectrumBuffer[s*halfN], 2, &complexArray, 1, halfN/2);

    // Create the cepstrum: inverse real FFT of 2^(log2n-1) points.
    vDSP_fft_zrip(setupReal, &complexArray, stride, log2n-1, FFT_INVERSE);

    // Unpack the split-complex result to halfN contiguous floats, written
    // directly into row s of the cepstrum buffer.
    vDSP_ztoc(&complexArray, 1, (COMPLEX*)&cepstrumBuffer[s*halfN], 2, halfN/2);

    // Keep the first featureCount cepstral values as row s of the low cepstrum buffer.
    memcpy(&lowCepstrumBuffer[s*featureCount], &cepstrumBuffer[s*halfN], featureCount*sizeof(float));

    // Lifter and zero-pad: build an 8192-point array holding the first 15
    // cepstral values followed by zeros.
    // NOTE(review): tempArray is allocated on every iteration and no free()
    // is visible in this excerpt -- confirm it is released later in the loop.
    float *tempArray = malloc(8192*sizeof(float));
    for (int i=0; i < 8192; i++) {
        if (i < 15)
            tempArray[i] = cepstrumBuffer[s*halfN+i];
        else
            tempArray[i] = 0.0f;
    }
    // Repack as 4096 split-complex pairs and run an 8192-point forward FFT
    // (log2f(8192) == 13; the float is converted at the call).
    // NOTE(review): setupReal comes from the app delegate; it must have been
    // created for at least 2^13 points for this call to be valid -- confirm.
    vDSP_ctoz((COMPLEX*)tempArray, 2, &complexArray, 1, 4096);
    float newLog2n = log2f(8192.0f);
    complexArray.imagp[0] = 0.0f;
    vDSP_fft_zrip(setupReal, &complexArray, stride, newLog2n, FFT_FORWARD);
    // Squared magnitude of each of the 4096 bins, then clear imagp.
    vDSP_zvmags(&complexArray, 1, complexArray.realp, 1, 4096);
    bzero(complexArray.imagp, (4096) * sizeof(float));

    // Scale the squared magnitudes by 1/(2*8192).
    scale = 1.0f / (2.0f*(float)8192);
    vDSP_vsmul(complexArray.realp, 1, &scale, complexArray.realp, 1, 4096);

    // Square root turns the squared magnitudes back into magnitudes.
    // NOTE(review): taking the magnitude discards the sign of the transform.
    // Cepstral smoothing normally keeps the real part of the transformed
    // (symmetric) liftered cepstrum as the smoothed log spectrum, so this
    // step may be what displaces the formant peaks -- confirm.
    for (int i=0; i < 4096; i++)
        complexArray.realp[i] = sqrtf(complexArray.realp[i]);

    // Store the 4096 values as row s of the formant buffer.
    memcpy(&formantBuffer[s*4096], complexArray.realp, 4096*sizeof(float));

    // complexArray.realp now holds the smoothed "formant" spectrum.
    // It is large, so extract features from it here.
    // Simple peak picking: scan bins 1..4094 in ascending order and record
    // the index of every bin strictly greater than both neighbours, stopping
    // after 6 peaks. Unused slots stay 0.0f. The stored values are bin
    // indices (as floats), not frequencies.
    // NOTE(review): peaks is allocated on every iteration; no free() is
    // visible in this excerpt.
    int formantIndex = 0;
    float *peaks = malloc(6*sizeof(float));
    for (int i=0; i < 6; i++)
        peaks[i] = 0.0f;
    for (int i=1; i < 4096-1 && formantIndex < 6; i++) {
        if (complexArray.realp[i-1] < complexArray.realp[i] &&
            complexArray.realp[i+1] < complexArray.realp[i])
            peaks[formantIndex++] = i;
    }
4

0 回答 0