我做了一个基于 Lingua::EN::NameCase的快速 C# 端口https://github.com/tamtamchik/namecase 。
public static class CIQNameCase
{
static Dictionary<string, string> _exceptions = new Dictionary<string, string>
{
{@"\bMacEdo" ,"Macedo"},
{@"\bMacEvicius" ,"Macevicius"},
{@"\bMacHado" ,"Machado"},
{@"\bMacHar" ,"Machar"},
{@"\bMacHin" ,"Machin"},
{@"\bMacHlin" ,"Machlin"},
{@"\bMacIas" ,"Macias"},
{@"\bMacIulis" ,"Maciulis"},
{@"\bMacKie" ,"Mackie"},
{@"\bMacKle" ,"Mackle"},
{@"\bMacKlin" ,"Macklin"},
{@"\bMacKmin" ,"Mackmin"},
{@"\bMacQuarie" ,"Macquarie"}
};
static Dictionary<string, string> _replacements = new Dictionary<string, string>
{
{@"\bAl(?=\s+\w)" , @"al"}, // al Arabic or forename Al.
{@"\b(Bin|Binti|Binte)\b" , @"bin"}, // bin, binti, binte Arabic
{@"\bAp\b" , @"ap"}, // ap Welsh.
{@"\bBen(?=\s+\w)" , @"ben"}, // ben Hebrew or forename Ben.
{@"\bDell([ae])\b" , @"dell$1"}, // della and delle Italian.
{@"\bD([aeiou])\b" , @"d$1"}, // da, de, di Italian; du French; do Brasil
{@"\bD([ao]s)\b" , @"d$1"}, // das, dos Brasileiros
{@"\bDe([lrn])\b" , @"de$1"}, // del Italian; der/den Dutch/Flemish.
{@"\bEl\b" , @"el"}, // el Greek or El Spanish.
{@"\bLa\b" , @"la"}, // la French or La Spanish.
{@"\bL([eo])\b" , @"l$1"}, // lo Italian; le French.
{@"\bVan(?=\s+\w)" , @"van"}, // van German or forename Van.
{@"\bVon\b" , @"von"} // von Dutch/Flemish
};
static string[] _conjunctions = { "Y", "E", "I" };
static string _romanRegex = @"\b((?:[Xx]{1,3}|[Xx][Ll]|[Ll][Xx]{0,3})?(?:[Ii]{1,3}|[Ii][VvXx]|[Vv][Ii]{0,3})?)\b";
/// <summary>
/// Case a name field into its appropriate case format
/// e.g. Smith, de la Cruz, Mary-Jane, O'Brien, McTaggart
/// </summary>
/// <param name="nameString"></param>
/// <returns></returns>
public static string NameCase(string nameString)
{
// Capitalize
nameString = Capitalize(nameString);
nameString = UpdateIrish(nameString);
// Fixes for "son (daughter) of" etc
foreach (var replacement in _replacements.Keys)
{
if (Regex.IsMatch(nameString, replacement))
{
Regex rgx = new Regex(replacement);
nameString = rgx.Replace(nameString, _replacements[replacement]);
}
}
nameString = UpdateRoman(nameString);
nameString = FixConjunction(nameString);
return nameString;
}
/// <summary>
/// Capitalize first letters.
/// </summary>
/// <param name="nameString"></param>
/// <returns></returns>
private static string Capitalize(string nameString)
{
nameString = nameString.ToLower();
nameString = Regex.Replace(nameString, @"\b\w", x => x.ToString().ToUpper());
nameString = Regex.Replace(nameString, @"'\w\b", x => x.ToString().ToLower()); // Lowercase 's
return nameString;
}
/// <summary>
/// Update for Irish names.
/// </summary>
/// <param name="nameString"></param>
/// <returns></returns>
private static string UpdateIrish(string nameString)
{
if(Regex.IsMatch(nameString, @".*?\bMac[A-Za-z^aciozj]{2,}\b") || Regex.IsMatch(nameString, @".*?\bMc"))
{
nameString = UpdateMac(nameString);
}
return nameString;
}
/// <summary>
/// Updates irish Mac & Mc.
/// </summary>
/// <param name="nameString"></param>
/// <returns></returns>
private static string UpdateMac(string nameString)
{
MatchCollection matches = Regex.Matches(nameString, @"\b(Ma?c)([A-Za-z]+)");
if(matches.Count == 1 && matches[0].Groups.Count == 3)
{
string replacement = matches[0].Groups[1].Value;
replacement += matches[0].Groups[2].Value.Substring(0, 1).ToUpper();
replacement += matches[0].Groups[2].Value.Substring(1);
nameString = nameString.Replace(matches[0].Groups[0].Value, replacement);
// Now fix "Mac" exceptions
foreach (var exception in _exceptions.Keys)
{
nameString = Regex.Replace(nameString, exception, _exceptions[exception]);
}
}
return nameString;
}
/// <summary>
/// Fix roman numeral names.
/// </summary>
/// <param name="nameString"></param>
/// <returns></returns>
private static string UpdateRoman(string nameString)
{
MatchCollection matches = Regex.Matches(nameString, _romanRegex);
if (matches.Count > 1)
{
foreach(Match match in matches)
{
if(!string.IsNullOrEmpty(match.Value))
{
nameString = Regex.Replace(nameString, match.Value, x => x.ToString().ToUpper());
}
}
}
return nameString;
}
/// <summary>
/// Fix Spanish conjunctions.
/// </summary>
/// <param name=""></param>
/// <returns></returns>
private static string FixConjunction(string nameString)
{
foreach (var conjunction in _conjunctions)
{
nameString = Regex.Replace(nameString, @"\b" + conjunction + @"\b", x => x.ToString().ToLower());
}
return nameString;
}
}
用法
string name_cased = CIQNameCase.NameCase("McCarthy");
这是我的测试方法,一切似乎都通过了:
[TestMethod]
public void Test_NameCase_1()
{
string[] names = {
"Keith", "Yuri's", "Leigh-Williams", "McCarthy",
// Mac exceptions
"Machin", "Machlin", "Machar",
"Mackle", "Macklin", "Mackie",
"Macquarie", "Machado", "Macevicius",
"Maciulis", "Macias", "MacMurdo",
// General
"O'Callaghan", "St. John", "von Streit",
"van Dyke", "Van", "ap Llwyd Dafydd",
"al Fahd", "Al",
"el Grecco",
"ben Gurion", "Ben",
"da Vinci",
"di Caprio", "du Pont", "de Legate",
"del Crond", "der Sind", "van der Post", "van den Thillart",
"von Trapp", "la Poisson", "le Figaro",
"Mack Knife", "Dougal MacDonald",
"Ruiz y Picasso", "Dato e Iradier", "Mas i Gavarró",
// Roman numerals
"Henry VIII", "Louis III", "Louis XIV",
"Charles II", "Fred XLIX", "Yusof bin Ishak",
};
foreach(string name in names)
{
string name_upper = name.ToUpper();
string name_cased = CIQNameCase.NameCase(name_upper);
Console.WriteLine(string.Format("name: {0} -> {1} -> {2}", name, name_upper, name_cased));
Assert.IsTrue(name == name_cased);
}
}