我有一个数据库表,其中有一列 SQLServer Soundex 编码的姓氏 + 名字。在我的 C# 程序中,我想使用 soundex 转换一个字符串以用于我的查询。
dotnet 库中是否有用于 soundex 的标准字符串函数,或者是实现它的开源库(可能作为字符串的扩展方法)?
我知道这个回答来得有点晚,但我也需要类似的功能(尽管不涉及数据库),而现有的唯一答案并不准确(对 'Tymczak' 和 'Pfister' 会给出错误的结果)。
这就是我想出的:
/// <summary>
/// Smoke test for <c>Soundex.Generate</c>: runs a fixed list of names through the
/// encoder and asserts the expected four-character code for each one.
/// </summary>
class Program
{
    /// <summary>
    /// Entry point; the first failing assertion aborts the run.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    public static void Main(string[] args)
    {
        // A single letter is padded with zeros.
        Check("H", "H000");

        // Names that sound alike share a code.
        Check("Robert", "R163");
        Check("Rupert", "R163");
        Check("Rubin", "R150");

        // Same-coded letters separated by H are encoded once.
        Check("Ashcraft", "A261");
        Check("Ashcroft", "A261");

        // Same-coded letters separated by a vowel are encoded twice.
        Check("Tymczak", "T522");

        // The first letter's own code suppresses an identical following code.
        Check("Pfister", "P236");

        Check("Gutierrez", "G362");
        Check("Jackson", "J250");
        Check("VanDeusen", "V532");
        Check("Deusen", "D250");

        // W is dropped without acting as a separator.
        Check("Sword", "S630");
        Check("Sord", "S630");

        // Non-alphabetic characters are ignored.
        Check("Log-out", "L230");
        Check("Logout", "L230");

        // Inputs without any letters produce the empty code.
        Check("123", Soundex.Empty);
        Check("", Soundex.Empty);
        Check(null, Soundex.Empty);
    }

    /// <summary>
    /// Asserts that encoding <paramref name="input"/> yields <paramref name="expected"/>.
    /// </summary>
    /// <param name="input">Text to encode; may be null.</param>
    /// <param name="expected">The Soundex code the encoder should return.</param>
    private static void Check(string input, string expected)
    {
        // The expected value goes first so that a failure message labels the
        // expected and actual values correctly.
        Assert.AreEqual(expected, Soundex.Generate(input));
    }
}
/// <summary>
/// Generates four-character Soundex codes (one letter followed by three digits).
/// </summary>
public static class Soundex
{
    /// <summary>Code returned when the input contains no letters A-Z.</summary>
    public const string Empty = "0000";

    // Matches every character that is not an upper-case ASCII letter.
    private static readonly Regex Sanitiser = new Regex(@"[^A-Z]", RegexOptions.Compiled);

    // First alternative: a digit followed by any number of repeats of that same digit,
    // each optionally preceded by H/W; the whole run is replaced by the single digit.
    // Second alternative: a stand-alone run of H/W, which is replaced by nothing because
    // group 1 did not participate in the match.
    private static readonly Regex CollapseRepeatedNumbers = new Regex(@"(\d)(?:[WH]*\1)*|[WH]+", RegexOptions.Compiled);

    // Letters that carry no code but still separate equal codes.
    private static readonly Regex RemoveVowelSounds = new Regex(@"[AEIOUY]", RegexOptions.Compiled);

    /// <summary>
    /// Computes the Soundex code of <paramref name="Phrase"/>.
    /// </summary>
    /// <param name="Phrase">Text to encode; may be null. Characters other than A-Z (after upper-casing) are ignored.</param>
    /// <returns>
    /// The first letter followed by three digits, or <see cref="Empty"/> when the
    /// input is null, empty, or contains no letters.
    /// </returns>
    public static string Generate(string Phrase)
    {
        // Upper-case with the invariant culture so that the result does not depend on the
        // current culture (a Turkish locale upper-cases 'i' to a dotted capital that the
        // sanitiser would strip), then drop everything that is not A-Z.
        string letters = Sanitiser.Replace((Phrase ?? string.Empty).ToUpperInvariant(), string.Empty);

        // Nothing left to encode.
        if (letters.Length == 0)
        {
            return Empty;
        }

        // Replace consonants by their digit; vowels, H, W and Y stay as letters for now.
        string digits = Numify(letters);

        // Merge runs of the same digit (even across H/W) and drop H/W themselves.
        digits = CollapseRepeatedNumbers.Replace(digits, "$1");

        // The first letter is emitted verbatim, so its own code (or the letter itself,
        // when it has no code) must not be emitted a second time.
        if (digits.Length > 0 && digits[0] == Numify(letters[0]))
        {
            digits = digits.Substring(1);
        }

        // Vowels were only needed as separators up to this point.
        digits = RemoveVowelSounds.Replace(digits, string.Empty);

        // Concatenate, then pad and trim to the X### format.
        return (letters[0] + digits).PadRight(4, '0').Substring(0, 4);
    }

    /// <summary>
    /// Maps every character of <paramref name="Phrase"/> through the single-character overload.
    /// </summary>
    private static string Numify(string Phrase)
    {
        char[] mapped = new char[Phrase.Length];
        for (int i = 0; i < mapped.Length; i++)
        {
            mapped[i] = Numify(Phrase[i]);
        }
        return new string(mapped);
    }

    /// <summary>
    /// Returns the Soundex digit for an upper-case consonant, or the character itself
    /// when it has no digit (vowels, H, W, Y and anything else).
    /// </summary>
    private static char Numify(char Character)
    {
        switch (Character)
        {
            case 'B': case 'F': case 'P': case 'V':
                return '1';
            case 'C': case 'G': case 'J': case 'K': case 'Q': case 'S': case 'X': case 'Z':
                return '2';
            case 'D': case 'T':
                return '3';
            case 'L':
                return '4';
            case 'M': case 'N':
                return '5';
            case 'R':
                return '6';
            default:
                return Character;
        }
    }
}
根据Dotnet Services和tigrou的回答,我已经更正了算法,以反映Wikipedia中描述的功能。
Ashcraft = A226、Tymczak = T522、Pfister = P236 和 Honeyman = H555 等测试用例现在可以正常工作。
/// <summary>
/// Computes a four-character Soundex-style code for <paramref name="data"/>.
/// </summary>
/// <remarks>
/// Every character without a code (vowels, 'h', 'w' and non-letters alike) resets the
/// previous code, so equal codes separated by any such character are emitted twice;
/// for example "Ashcraft" yields "A226".
/// </remarks>
/// <param name="data">Text to encode; may be null or empty.</param>
/// <returns>
/// The first character of <paramref name="data"/> upper-cased, followed by up to three
/// digits and padded with '0' to four characters; "0000" for null or empty input.
/// </returns>
public static string Soundex(string data)
{
    StringBuilder result = new StringBuilder(4);

    if (!string.IsNullOrEmpty(data))
    {
        // The initial character is kept verbatim (it is upper-cased at the end).
        result.Append(data[0]);

        // '\0' stands for "no code".
        char previousCode = '\0';

        // Start at index 0 so that the first letter seeds previousCode; this is what
        // makes "Pf..." encode correctly.
        for (int i = 0; i < data.Length && result.Length < 4; i++)
        {
            char currentCode;
            // Invariant lower-casing keeps the classification independent of the
            // current culture.
            switch (char.ToLowerInvariant(data[i]))
            {
                case 'b': case 'f': case 'p': case 'v':
                    currentCode = '1';
                    break;
                case 'c': case 'g': case 'j': case 'k': case 'q': case 's': case 'x': case 'z':
                    currentCode = '2';
                    break;
                case 'd': case 't':
                    currentCode = '3';
                    break;
                case 'l':
                    currentCode = '4';
                    break;
                case 'm': case 'n':
                    currentCode = '5';
                    break;
                case 'r':
                    currentCode = '6';
                    break;
                default:
                    currentCode = '\0';
                    break;
            }

            // The first letter's code is never emitted; it only suppresses an identical
            // code that follows it directly.
            if (i > 0 && currentCode != '\0' && currentCode != previousCode)
            {
                result.Append(currentCode);
            }

            // Always remember the previous code, even when it is "no code".
            previousCode = currentCode;
        }
    }

    // Invariant upper-casing: under a Turkish culture ToUpper() would turn an initial
    // 'i' into a dotted capital I.
    return result.ToString().ToUpperInvariant().PadRight(4, '0');
}
/// <summary>
/// Computes the four-character Soundex code of <paramref name="word"/>.
/// </summary>
/// <remarks>
/// The code is built in a single pass: the first character is kept, each following
/// consonant contributes its digit unless it repeats the previous code, vowels and
/// other uncoded characters separate equal codes, and H/W are skipped without
/// separating them.
/// </remarks>
/// <param name="word">Text to encode; may be null or empty.</param>
/// <returns>
/// The upper-cased first character followed by three digits (padded with '0'),
/// or "0000" when <paramref name="word"/> is null or empty.
/// </returns>
private string Soundex(string word)
{
    // Guard against null/empty input instead of failing on word[0].
    if (string.IsNullOrEmpty(word))
    {
        return "0000";
    }

    // Culture-independent upper-casing so that the result is the same in every locale.
    word = word.ToUpperInvariant();

    // Pre-filled with the padding zeros; slot 0 holds the initial character.
    char[] code = { word[0], '0', '0', '0' };
    int filled = 1;

    // '\0' stands for "no code".
    char previous = '\0';

    // Start at index 0 so that the first letter's code seeds `previous`; otherwise
    // "Pfister" would wrongly emit the code of 'F'.
    for (int i = 0; i < word.Length && filled < code.Length; i++)
    {
        char digit;
        switch (word[i])
        {
            case 'B': case 'F': case 'P': case 'V':
                digit = '1';
                break;
            case 'C': case 'G': case 'J': case 'K': case 'Q': case 'S': case 'X': case 'Z':
                digit = '2';
                break;
            case 'D': case 'T':
                digit = '3';
                break;
            case 'L':
                digit = '4';
                break;
            case 'M': case 'N':
                digit = '5';
                break;
            case 'R':
                digit = '6';
                break;
            case 'H': case 'W':
                // Skipped entirely: equal codes on both sides still count as one.
                continue;
            default:
                // Vowels and anything else carry no code but do separate equal codes.
                digit = '\0';
                break;
        }

        if (i > 0 && digit != '\0' && digit != previous)
        {
            code[filled++] = digit;
        }
        previous = digit;
    }

    return new string(code);
}
你可以在 C# 中使用类似下面的实现,它仿照了 SQL 中的 SOUNDEX 函数:
/// <summary>
/// Computes a four-character Soundex code for <paramref name="data"/>.
/// </summary>
/// <remarks>
/// The first letter's own code suppresses an identical code that follows it, vowels
/// (a, e, i, o, u, y) separate equal codes, and every other uncoded character
/// ('h', 'w', non-letters) is skipped without separating them.
/// </remarks>
/// <param name="data">Text to encode; may be null or empty.</param>
/// <returns>
/// The first character of <paramref name="data"/> upper-cased, followed by up to three
/// digits and padded with '0' to four characters; "0000" for null or empty input.
/// </returns>
public static string Soundex(string data)
{
    StringBuilder result = new StringBuilder(4);

    if (!string.IsNullOrEmpty(data))
    {
        // The initial character is kept verbatim (it is upper-cased at the end).
        result.Append(data[0]);

        // '\0' stands for "no code".
        char previousCode = '\0';

        // Start at index 0 so that the first letter seeds previousCode; without this,
        // "Pfister" would emit the code of 'f' and come out as P123 instead of P236.
        for (int i = 0; i < data.Length && result.Length < 4; i++)
        {
            char currentCode;
            // Invariant lower-casing keeps the classification independent of the
            // current culture.
            switch (char.ToLowerInvariant(data[i]))
            {
                case 'b': case 'f': case 'p': case 'v':
                    currentCode = '1';
                    break;
                case 'c': case 'g': case 'j': case 'k': case 'q': case 's': case 'x': case 'z':
                    currentCode = '2';
                    break;
                case 'd': case 't':
                    currentCode = '3';
                    break;
                case 'l':
                    currentCode = '4';
                    break;
                case 'm': case 'n':
                    currentCode = '5';
                    break;
                case 'r':
                    currentCode = '6';
                    break;
                case 'a': case 'e': case 'i': case 'o': case 'u': case 'y':
                    // A vowel separates equal codes, e.g. the 'a' in "Tymczak" lets the
                    // final 'k' be emitted even though 'z' has the same code.
                    previousCode = '\0';
                    continue;
                default:
                    // 'h', 'w' and non-letters are transparent: the previous code is kept.
                    continue;
            }

            // The first letter's code is only remembered, never emitted.
            if (i > 0 && currentCode != previousCode)
            {
                result.Append(currentCode);
            }
            previousCode = currentCode;
        }
    }

    // Invariant upper-casing: under a Turkish culture ToUpper() would turn an initial
    // 'i' into a dotted capital I.
    return result.ToString().ToUpperInvariant().PadRight(4, '0');
}