这是对这个问题的“头脑风暴”回答。
基本上,这计算了句子 2 与句子 1 的距离,作为与句子 1 的笛卡尔距离(假设在原点),其中距离是 2 个句子中单词之间的最小 Levenshtein 差异的总和。它具有 2 个相等的句子给出 0 距离的特性。
如果这种方法已在其他地方发表,我不知道。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Diagnostics;
namespace ConsoleApplication1
{
class Program
{
static void Main(string[] args)
{
string str1 = "The cat sat on the mat";
string str2 = "The quick brown fox jumped over the lazy cow";
ReportDifference(str1, str1);
ReportDifference(str2, str2);
ReportDifference(str1, str2);
ReportDifference(str2, str1);
}
/// <summary>
/// Quick test andisplay routine
/// </summary>
/// <param name="str1">First sentence to test with</param>
/// <param name="str2">Second sentence to test with</param>
static void ReportDifference(string str1, string str2)
{
Debug.WriteLine(
String.Format("difference between \"{0}\" and \"{1}\" is {2}",
str1, str2, Difference(str1, str2)));
}
/// <summary>
/// This does the hard work.
/// Basically, what it does is:
/// 1) Split the stings into tokens/words
/// 2) Form a cartesian product of the 2 lists of words.
/// 3) Calculate the Levenshtein Distance between each word.
/// 4) Group on the words from the first sentance
/// 5) Get the min distance between the word in first sentence and all of the words from the second
/// 6) Square the distances for each word.
/// (based on the distance betwen 2 points is the sqrt of the sum of the x,y,... axises distances
/// what this assumes is the first word is the origin)
/// 7) take the square root of sum
/// </summary>
/// <param name="str1">sentence 1 compare</param>
/// <param name="str2">sentence 2 compare</param>
/// <returns>distance calculated</returns>
static double Difference(string str1, string str2)
{
string[] splitters = { " " };
var a = Math.Sqrt(
(from x in str1.Split(splitters, StringSplitOptions.RemoveEmptyEntries)
from y in str2.Split(splitters, StringSplitOptions.RemoveEmptyEntries)
select new {x, y, ld = Distance.LD(x,y)} )
.GroupBy(x => x.x)
.Select(q => new { q.Key, min_match = q.Min(p => p.ld) })
.Sum(s => (double)(s.min_match * s.min_match )));
return a;
}
}
/// <summary>
/// Lifted from http://www.merriampark.com/ldcsharp.htm
/// </summary>
public class Distance
{
/// <summary>
/// Compute Levenshtein distance
/// </summary>
/// <param name="s">String 1</param>
/// <param name="t">String 2</param>
/// <returns>Distance between the two strings.
/// The larger the number, the bigger the difference.
/// </returns>
public static int LD(string s, string t)
{
int n = s.Length; //length of s
int m = t.Length; //length of t
int[,] d = new int[n + 1, m + 1]; // matrix
int cost; // cost
// Step 1
if (n == 0) return m;
if (m == 0) return n;
// Step 2
for (int i = 0; i <= n; d[i, 0] = i++) ;
for (int j = 0; j <= m; d[0, j] = j++) ;
// Step 3
for (int i = 1; i <= n; i++)
{
//Step 4
for (int j = 1; j <= m; j++)
{
// Step 5
cost = (t.Substring(j - 1, 1) == s.Substring(i - 1, 1) ? 0 : 1);
// Step 6
d[i, j] = System.Math.Min(System.Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1),
d[i - 1, j - 1] + cost);
}
}
// Step 7
return d[n, m];
}
}
}