3

我尝试为语音识别应用程序实现 DTW 算法,我已经成功完成,现在我正在尝试通过修剪来提高 DTW 算法的性能。我已经尝试寻找对此算法的改进,并发现我应该以某种方式计算 2D 数组DTW中特定范围内的值,如 中所示Image #1,但我似乎并不确切知道如何做到这一点。有人可以提供任何帮助吗?包含算法的代码(C#) Sakoe-Chiba 乐队和 Ikatora 平行四边形


/// <summary>
///   Calculates the distance between two audio frames in the form of two MFCCFrame objects as input parameters
///   returns the difference in a double
/// </summary>

double distance(MFCCFrame frame1, MFCCFrame frame2)
{
    double tempSum = 0;
    for (int i = 0; i < 13; i++)
        tempSum += Math.Pow(Math.Abs(frame1.Features[i] - frame2.Features[i]), 2);
    return Math.Sqrt(tempSum);
}

/// <summary>
///   DTW Algorithms
///   Takes input 2 sequences: seq1 and seq2 to calculate the shortest distance between them
///   Uses the function "distance" defined above to calculate distance between two frames
///   2D array -> DTW[,] with dimentions: number of frames in seq 1, number of frames in seq2
///   returns the last element in the 2D array, which is the difference between the two sequences
/// </summary>

double DTWDistance(Sequence seq1, Sequence seq2)
{
    int m = seq1.Frames.Length, n = seq2.Frames.Length;
    double[,] DTW = new double[m, n];
    DTW[0, 0] = 0;
    for (int i = 0; i < m; i++)
    {
        for (int j = 0; j < n; j++)
        {
            double cost = distance(seq1.Frames[i], seq2.Frames[j]);
            if (i == 0 && j == 0)
                DTW[i, j] = cost;
            else if (i == 0)
                DTW[i, j] = cost + DTW[i, j - 1];
            else if (j == 0)
                DTW[i, j] = cost + DTW[i - 1, j];
            else
                DTW[i, j] = (cost + Math.Min(DTW[i - 1, j], Math.Min(DTW[i, j - 1], DTW[i - 1, j - 1])));
        }
    }
}
4

1 回答 1

3

由于没有人回答,我想我会回答这个问题,以防有人也需要帮助。这是常规的 DTW 算法:

/// <summary>
///   DTW Algorithms
///   Takes input 2 sequences: input and template to calculate the shortest distance between them
///   Uses the function "distance" defined above to calculate distance between two frames
///   2D array -> DTW[,] with dimentions: number of frames in input, number of frames in template
///   returns the last element in the 2D array, which is the difference between the two sequences
/// </summary>
/// 
double DTWDistance(Sequence input, Sequence template)
{
    int rows = input.Frames.Length, columns = template.Frames.Length;
    if (rows < (double)(columns / 2) || columns < (double)(rows / 2))
    {
        return double.MaxValue;
    }
        double[,] DTW = new double[rows, columns];
    DTW[0, 0] = 0;
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < columns; j++)
        {
            double cost = distance(input.Frames[i], template.Frames[j]);
            if (i == 0 && j == 0)
                DTW[i, j] = cost;
            else if (i == 0)
                DTW[i, j] = cost + DTW[i, j - 1];
            else if (j == 0)
                DTW[i, j] = cost + DTW[i - 1, j];
            else
                DTW[i, j] = (cost + Math.Min(DTW[i - 1, j], DTW[i - 1, j - 1]));// insert ,match
        }
    }
    return DTW[rows - 1, columns - 1];
}

而且我还在常规 DTW 算法上实施了修剪策略,如下所示:

/// <summary>
/// DTW Algotithm with Pruning
/// Only Sakoe-Chiba band is caluculated and the rest is pruned
/// </summary>
float Pruning_DTWDistance(Sequence input, Sequence output)
{
    int rows = input.Frames.Length, columns = output.Frames.Length;
    if (rows < (double)(columns / 2) || columns < (double)(rows / 2))
    {
        return float.MaxValue;
    }
    float cost;
    float[,] DTW = new float[rows + 1, columns + 1];
    int w = Math.Abs(columns - rows);// window length -> |rows - columns|<= w
    for (int i = 1; i <= rows; i++)
    {
        for (int j = Math.Max(1, i - w); j <= Math.Min(columns, i + w); j++)
        {
            if (DTW[i - 1, j] == 0)
                DTW[i - 1, j] = float.MaxValue;
            if (DTW[i - 1, j - 1] == 0)
                DTW[i - 1, j - 1] = float.MaxValue;
            DTW[0, 0] = 0;
            cost = distance(input.Frames[i - 1], output.Frames[j - 1]);// frames 0 based
            DTW[i, j] = (cost + Math.Min(DTW[i - 1, j], DTW[i - 1, j - 1]));// insert ,match
        }
    }
    return DTW[rows, columns];
}

这两个函数都使用了一个辅助函数distance,它的定义如下:

/// <summary>
///   Calculates the distance between two audio frames in the form of MFCCFrame objects
///   returns the difference in a float
/// </summary>
/// 
float distance(MFCCFrame frame1, MFCCFrame frame2)
{
    double tempSum = 0;
    for (int i = 0; i < 13; i++)
        tempSum += Math.Pow(Math.Abs(frame1.Features[i] - frame2.Features[i]), 2);
    return (float)Math.Sqrt(tempSum);
}

[编辑] 为了提高 DTW 算法的内存消耗,我只使用了 2 个数组而不是 2D 矩阵,此更改后的算法如下所示:

    double DTW_improved(Sequence input, Sequence template)
    {
        // Don't compare two sequences if one of their lengths is half the other's
        if (input.Frames.Length <= (0.5 * template.Frames.Length) || template.Frames.Length <= (0.5 * input.Frames.Length))
            return double.PositiveInfinity;
        int rows = template.Frames.Length, columns = input.Frames.Length;

        double[] c1 = new double[rows];
        double[] c2 = new double[rows];
        double[] temp; // To hold address only (use it in swapping address) 
        c1[0] = distance(input.Frames[0], template.Frames[0]);
        for (int i = 1; i < rows; i++)
            c1[i] = c1[i - 1] + distance(input.Frames[0], template.Frames[i]);
        for (int i = 1; i < columns; i++)
        {
            c2[0] = distance(input.Frames[i], template.Frames[0]) + c1[0];
            c2[1] = distance(input.Frames[i], template.Frames[1]) + Math.Min(c1[0], c1[1]); 
            // Calculating first 2 elements of the array before the loop
            //since they both need special conditions
            for (int j = 2; j < rows; j++)
                c2[j] = Math.Min(c1[j], Math.Min(c1[j - 1], c1[j - 2])) + distance(input.Frames[i], template.Frames[j]);

            if (i != columns - 1) // Swapping addresses of c1 & c2
            {
                temp = c2;
                c2 = c1;
                c1 = temp;
            }
        }
        return c2[rows - 1] / (0.5 * (input.Frames.Length + template.Frames.Length)); // Normalization: Dividing edit distance by average of input length & template length
    }
于 2015-12-27T15:01:32.983 回答