2

我有以下字符串

an-ca
an-ca

如果你仔细观察,你会发现它们是不同的!

为了比较像这样的2个字符串,我找到了这个解决方案:

if (String.Compare(str1, str2, StringComparison.InvariantCulture) == 0) ...

所以我有2个问题

  1. W3C 处理字符的方式在不同的语言中是否相同?(这关系到上面的字符串比较问题)

  2. 如何在 C# 中将字符串转换为“InvariantCulture”形式?

编辑:

链接可以更好地解释问题

提前致谢

4

2 回答 2

3

不匹配的情况有很多,尤其是那些外观上没有明显差异的字符。例如,我们通常假设空格总是字节 32 (0x20);但当文本来自网络时,空格也可能是 160 (0xa0),即“&nbsp;”实体,而且还存在更多种类的空格。连字符、点、逗号等也可能出现同样的情况。请查看此页面以获取更多信息,尤其是列出这类不匹配最常见情况的那个页面。利用其中的信息,您可以编写一个函数(最好写得直截了当以获得更好的性能,也就是一个很长的 switch 语句),根据已经过 HTML 解码的字符串重新生成一个规范化的字符串。

作为概念证明,我将包含用于创建友好 URL 的代码,同时保留用于根据标题进行标记的字典。它仅涵盖 Windows 西欧和中欧字符集,包括 thorn 和 eth。如您所见,有一些棘手的部分,当然还有很大的定制空间。这个意大利面条代码只对初始字符串进行一次传递。

/// <summary>
/// Builds a lower-case, ASCII-only, hyphen-separated "friendly URL" slug from
/// <paramref name="s"/> in a single pass over the text, and counts every word of the
/// resulting slug in <paramref name="dict"/>.
/// </summary>
/// <remarks>
/// Rules applied to each input character:
/// <list type="bullet">
/// <item>ASCII digits and lower-case letters are copied; upper-case letters are lower-cased.</item>
/// <item>The accented Latin letters listed in <c>LatinTransliterate</c> become their base
/// letter(s); any other non-ASCII character is dropped.</item>
/// <item><c>#</c> becomes "sharp" after c, j or f, "no" before a digit, otherwise a separator.</item>
/// <item><c>+</c> becomes "plus"; <c>.</c> and <c>-</c> are dropped.</item>
/// <item>Every other ASCII character, the no-break space and U+2000..U+200D become a
/// separator. A separator is never emitted first or twice in a row, and a trailing one
/// is removed.</item>
/// </list>
/// </remarks>
/// <param name="s">The text (for example a title) to convert.</param>
/// <param name="dict">
/// Word counters, keyed by slug word. Each word of the result increments its entry.
/// When the referenced dictionary is <c>null</c> a new one is created.
/// </param>
/// <returns>The slug; an empty string when no character of the input survives.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="s"/> is <c>null</c>.</exception>
public static string Latin(this string s, ref Dictionary<string, int> dict)
{
    if (s == null)
    {
        throw new System.ArgumentNullException("s");
    }
    if (dict == null)
    {
        dict = new Dictionary<string, int>();
    }

    StringBuilder slug = new StringBuilder(s.Length);
    int wordStart = 0; // index in slug at which the word currently being built begins

    for (int index = 0; index < s.Length; index++)
    {
        char ch = s[index];

        if (ch > '\x007f')
        {
            string replacement = LatinTransliterate(ch);
            if (replacement == "-")
            {
                LatinAppendSeparator(slug);
            }
            else if (replacement != null)
            {
                slug.Append(replacement);
            }
            // A null replacement means the character is unknown and is dropped.
        }
        else if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'z'))
        {
            slug.Append(ch);
        }
        else if (ch >= 'A' && ch <= 'Z')
        {
            // Setting bit 0x20 (the space character) lower-cases an ASCII letter.
            slug.Append((char)(ch | ' '));
        }
        else
        {
            switch (ch)
            {
                case '#':
                    // Length > 0 (not > 1) so that "C#" at the very start is recognised too.
                    if (slug.Length > 0
                        && (slug[slug.Length - 1] == 'c' || slug[slug.Length - 1] == 'j' || slug[slug.Length - 1] == 'f'))
                    {
                        slug.Append("sharp");
                    }
                    else if (index + 1 < s.Length && s[index + 1] >= '0' && s[index + 1] <= '9')
                    {
                        slug.Append("no");
                    }
                    else
                    {
                        LatinAppendSeparator(slug);
                    }
                    break;
                case '+':
                    slug.Append("plus");
                    break;
                case '.':
                case '-':
                    // Dropped without emitting a separator.
                    break;
                default:
                    LatinAppendSeparator(slug);
                    break;
            }
        }

        // A separator that was just appended closes the word in front of it.
        // The Length check keeps this safe while nothing has been emitted yet.
        int last = slug.Length - 1;
        if (last >= 0 && slug[last] == '-' && wordStart < last)
        {
            LatinCountWord(dict, slug.ToString(wordStart, last - wordStart));
            wordStart = last + 1;
        }
    }

    if (slug.Length > 0 && slug[slug.Length - 1] == '-')
    {
        // The word before a trailing separator was already counted; just trim the separator.
        slug.Length--;
    }
    else if (wordStart < slug.Length)
    {
        // The final word has no separator after it, so it is counted here.
        LatinCountWord(dict, slug.ToString(wordStart, slug.Length - wordStart));
    }

    return slug.ToString();
}

/// <summary>Appends '-' unless the slug is still empty or already ends with '-'.</summary>
private static void LatinAppendSeparator(StringBuilder slug)
{
    if (slug.Length > 0 && slug[slug.Length - 1] != '-')
    {
        slug.Append('-');
    }
}

/// <summary>Increments the counter of <paramref name="word"/>, creating it at 1 when absent.</summary>
private static void LatinCountWord(Dictionary<string, int> counts, string word)
{
    int seen;
    counts.TryGetValue(word, out seen); // leaves seen at 0 when the word is new
    counts[word] = seen + 1;
}

/// <summary>
/// Maps a non-ASCII character to its slug replacement: the base letter(s) for the listed
/// Latin letters, "-" for the listed space characters, or <c>null</c> for anything else.
/// </summary>
private static string LatinTransliterate(char ch)
{
    switch (ch)
    {
        case 'á': case 'Á': case 'à': case 'À': case 'â': case 'Â':
        case 'ä': case 'Ä': case 'ă': case 'Ă': case 'ā': case 'Ā':
        case 'ã': case 'Ã': case 'å': case 'Å': case 'ą': case 'Ą':
            return "a";
        case 'æ': case 'Æ':
            return "ae";
        case 'ć': case 'Ć': case 'č': case 'Č': case 'ç': case 'Ç':
            return "c";
        case 'ď': case 'Ď': case 'đ': case 'Đ':
            return "d";
        case 'é': case 'É': case 'è': case 'È': case 'ė': case 'Ė':
        case 'ê': case 'Ê': case 'ë': case 'Ë': case 'ě': case 'Ě':
        case 'ē': case 'Ē': case 'ę': case 'Ę':
            return "e";
        case 'ğ': case 'Ğ': case 'ģ': case 'Ģ':
            return "g";
        case 'ı': case 'í': case 'Í': case 'ì': case 'Ì': case 'İ':
        case 'î': case 'Î': case 'ï': case 'Ï': case 'ī': case 'Ī':
        case 'į': case 'Į':
            return "i";
        case 'ķ': case 'Ķ':
            return "k";
        case 'ĺ': case 'Ĺ': case 'ľ': case 'Ľ': case 'ļ': case 'Ļ':
        case 'ł': case 'Ł':
            return "l";
        case 'ń': case 'Ń': case 'ň': case 'Ň': case 'ñ': case 'Ñ':
        case 'ņ': case 'Ņ':
            return "n";
        case 'ó': case 'Ó': case 'ò': case 'Ò': case 'ô': case 'Ô':
        case 'ö': case 'Ö': case 'ō': case 'Ō': case 'õ': case 'Õ':
        case 'ő': case 'Ő': case 'ø': case 'Ø':
            return "o";
        case 'œ': case 'Œ':
            return "oe";
        case 'ŕ': case 'Ŕ': case 'ř': case 'Ř': case 'ŗ': case 'Ŗ':
            return "r";
        case 'ś': case 'Ś': case 'š': case 'Š': case 'ş': case 'Ş':
            return "s";
        case 'ß':
            return "ss";
        case 'ť': case 'Ť': case 'ţ': case 'Ţ':
            return "t";
        case 'ð': case 'Ð': case 'þ': case 'Þ':
            return "th";
        case 'ú': case 'Ú': case 'ù': case 'Ù': case 'û': case 'Û':
        case 'ü': case 'Ü': case 'ū': case 'Ū': case 'ů': case 'Ů':
        case 'ų': case 'Ų': case 'ű': case 'Ű':
            return "u";
        case 'ý': case 'Ý': case 'ÿ': case 'Ÿ':
            return "y";
        case 'ź': case 'Ź': case 'ż': case 'Ż': case 'ž': case 'Ž':
            return "z";
        case '\x00a0': // no-break space
        case '\x2000': // other spaces
        case '\x2001':
        case '\x2002':
        case '\x2003':
        case '\x2004':
        case '\x2005':
        case '\x2006':
        case '\x2007':
        case '\x2008':
        case '\x2009':
        case '\x200a':
        case '\x200b':
        case '\x200c':
        case '\x200d':
            return "-";
        default:
            return null;
    }
}

将其用作您初始要求的指南。
顺便说明,对于“C/C++ articles”,它生成的是“c-cplusplus-articles”,而不是“cc---articles”或“cc-articles”。

于 2011-12-11T14:18:58.790 回答
2

尽管它不是万无一失的,并且仍在进行中,但这里有一些我们创建的扩展方法,用于将字符从 UTF 转换为 ASCII。

/// <summary>
/// Best-effort conversion of Unicode text to ASCII text: known accented letters and
/// typographic punctuation are replaced by an ASCII look-alike, and everything that is
/// still not ASCII afterwards becomes a question mark.
/// </summary>
public static class UnicodeToAsciiConverter
{
    #region Mappings

    /// <summary>
    /// A string-to-string dictionary for mapping unicode text element keys to their
    /// equivalent ASCII text element values. Keys may consist of more than one
    /// <see cref="char"/> (a base letter followed by combining marks).
    /// </summary>
    /// <remarks>
    /// The table is loaded through <see cref="BuildConversions"/> instead of a collection
    /// initializer so that two keys that turn out to be the same string cannot make the
    /// type initializer throw.
    /// </remarks>
    private static readonly IDictionary<string, string> UnicodeToAsciiConversions = BuildConversions(new string[,]
    {
        { "ʼ", SingleQuote }, { "‘", SingleQuote }, { "’", SingleQuote },  { "ʻ", SingleQuote }, { "–", "-" },
        // "\u200E" is the invisible LEFT-TO-RIGHT MARK; written as an escape so it cannot be lost in an editor.
        { "\u200E", string.Empty }, { "¯", "_"}, { "—", "-"},

        { "á", a }, { "Á", A }, { "à", a }, { "À", A }, { "â", a }, { "Â", A }, { "ä", a }, { "Ä", A },
        { "ă", a }, { "Ă", A }, { "ā", a }, { "Ā", A }, { "ã", a }, { "Ã", A }, { "å", a }, { "Å", A },
        { "ầ", a }, { "Ầ", A }, { "ắ", a }, { "Ắ", A }, { "ằ", a }, { "Ằ", A }, { "ẵ", a }, { "Ẵ", A },
        { "ả", a }, { "Ả", A }, { "ạ", a }, { "Ạ", A }, { "ậ", a }, { "Ậ", A }, { "ấ", a }, { "Ấ", A },
        { "ą", a }, { "Ą", A },

        { "æ", "ae" }, { "Æ", "AE" }, { "ǣ", "ae" }, { "Ǣ", "AE" },
        // NOTE(review): sharp s and thorn are mapped to "b" by visual resemblance only — confirm this is intended.
        { "ß", b }, { "þ", b }, { "Þ", B },

        { "ć", c }, { "Ć", C }, { "č", c }, { "Č", C }, { "ç", c }, { "Ç", C }, { "ĉ", c }, { "Ĉ", C },
        { "ċ", c }, { "Ċ", C },
        { "ḑ", d }, { "Ḑ", D }, { "đ", d }, { "Đ", D }, { "ð", d }, { "Ð", D }, { "ḍ", d }, { "Ḍ", D },
        { "ď", d }, { "Ď", D }, { "ḑ", d }, { "Ḑ", D },

        { "é", e }, { "É", E }, { "è", e }, { "È", E }, { "ė", e }, { "Ė", E }, { "ê", e }, { "Ê", E },
        { "ë", e }, { "Ë", E }, { "ě", e }, { "Ě", E }, { "ĕ", e }, { "Ĕ", E }, { "ē", e }, { "Ē", E },
        { "ę", e }, { "Ę", E }, { "ế", e }, { "Ế", E }, { "ề", e }, { "Ề", E }, { "ệ", e }, { "Ệ", E },
        { "ǝ", e }, { "Ǝ", E }, { "ə", e }, { "Ə", E }, { "ể", e }, { "Ể", E }, { "ễ", e }, { "Ễ", E },
        { "ẹ̀", e }, { "Ẹ̀", E }, { "ɛ́", e }, { "ɛ", e }, { "Ɛ", E },

        { "ğ", g }, { "Ğ", G }, { "ĝ", g }, { "Ĝ", G }, { "ġ", g }, { "Ġ", G }, { "ģ", g }, { "Ģ", G },
        { "ḩ", h }, { "Ḩ", H }, { "ħ", h }, { "Ħ", H }, { "ḥ", h }, { "Ḥ", H }, { "ĥ", h }, { "Ĥ", H },
        { "ẖ", h }, { "H̱", H }, { "h̲", h }, { "H̲", H }, { "ḩ", h }, { "Ḩ", H },

        // The former entries keyed by "ı".ToUpper() and "İ".ToLower() are intentionally gone:
        // their keys depended on the current culture and were either plain "I"/"i" (a no-op
        // mapping) or a repeat of the neighbouring "ı"/"İ" key.
        { "ı", i }, { "í", i }, { "Í", I }, { "ì", i }, { "Ì", I }, { "İ", I },
        { "î", i }, { "Î", I }, { "ï", i }, { "Ï", I }, { "ĭ", i }, { "Ĭ", I }, { "ī", i }, { "Ī", I },
        { "ĩ", i }, { "Ĩ", I }, { "ỉ", i }, { "Ỉ", I }, { "ị", i }, { "Ị", I },
        { "ķ", k }, { "Ķ", K },
        { "ļ", l }, { "Ļ", L }, { "ł", l }, { "Ł", L }, { "ľ", l }, { "Ľ", L },
        { "ň", n }, { "Ň", N }, { "ñ", n }, { "Ñ", N }, { "ń", n }, { "Ń", N }, { "ŋ", n }, { "Ŋ", N },
        { "ņ", n }, { "Ņ", N },

        { "ó", o }, { "Ó", O }, { "ò", o }, { "Ò", O }, { "ô", o }, { "Ô", O }, { "ö", o }, { "Ö", O },
        { "ŏ", o }, { "Ŏ", O }, { "ō", o }, { "Ō", O }, { "õ", o }, { "Õ", O }, { "ő", o }, { "Ő", O },
        { "ố", o }, { "Ố", O }, { "ồ", o }, { "Ồ", O }, { "ø", o }, { "Ø", O }, { "ơ", o }, { "Ơ", O },
        { "ọ", o }, { "Ọ", O }, { "ớ", o }, { "Ớ", O }, { "ộ", o }, { "Ộ", O }, { "ɔ", o }, { "Ɔ", O },
        { "ɔ́", o }, { "Ɔ́", O }, { "ổ", o }, { "Ổ", O }, { "ỏ", o }, { "Ỏ", O },

        { "œ", oe }, { "Œ", OE }, { "œ̆", oe }, { "Œ̆", OE },

        { "ř", r }, { "Ř", R },
        { "ś", s }, { "Ś", S }, { "š", s }, { "Š", S }, { "ş", s }, { "Ş", S }, { "ṣ", s }, { "Ṣ", S },
        { "ŝ", s }, { "Ŝ", S }, { "ș", s }, { "s̲", s }, { "S̲", S },
        { "ţ", t }, { "Ţ", T }, { "ṭ", t }, { "Ṭ", T }, { "ŧ", t }, { "Ŧ", T }, { "ț", t }, { "ť", t }, { "Ť", T },

        { "ú", u }, { "Ú", U }, { "ù", u }, { "Ù", U }, { "ü", u }, { "Ü", U }, { "ŭ", u }, { "Ŭ", U },
        { "ū", u }, { "Ū", U }, { "ũ", u }, { "Ũ", U }, { "ų", u }, { "Ų", U }, { "ủ", u }, { "Ủ", U },
        { "ư", u }, { "Ư", U }, { "ừ", u }, { "Ừ", U }, { "û", u }, { "Û", U }, { "ự", u }, { "Ự", U },
        { "ů", u }, { "Ů", U }, { "ụ", u }, { "Ụ", U }, { "ṳ", u }, { "Ṳ", U }, { "ứ", u }, { "Ứ", U },
        { "ŵ", w }, { "Ŵ", W },
        { "ý", y }, { "Ý", Y }, { "ỹ", y }, { "Ỹ", Y }, { "ỳ", y }, { "Ỳ", Y },
        { "ź", z }, { "Ź", Z }, { "ž", z }, { "Ž", Z }, { "z̄", z }, { "Z̄", Z }, { "z̧", z }, { "Z̧", Z },
        { "ż", z }, { "Ż", Z }, { "ẕ", z }, { "Ẕ", Z },

    });

    /// <summary>
    /// The entries of <see cref="UnicodeToAsciiConversions"/> ordered so that longer keys
    /// come first. A key made of a base letter plus a combining mark must be replaced
    /// before the key that is the bare base letter, otherwise the base letter is replaced
    /// on its own and the combining mark is left behind.
    /// </summary>
    private static readonly KeyValuePair<string, string>[] ConversionsLongestKeyFirst =
        UnicodeToAsciiConversions.OrderByDescending(conversion => conversion.Key.Length).ToArray();

    /// <summary>
    /// Loads a table of { key, value } rows into a dictionary.
    /// </summary>
    /// <param name="pairs">Rows of two strings: the unicode key and its ASCII value.</param>
    /// <returns>The dictionary holding one entry per distinct key.</returns>
    private static IDictionary<string, string> BuildConversions(string[,] pairs)
    {
        var rowCount = pairs.GetLength(0);
        var conversions = new Dictionary<string, string>(rowCount);
        for (var row = 0; row < rowCount; row++)
        {
            // Indexer assignment, not Add: a repeated key overwrites instead of throwing.
            conversions[pairs[row, 0]] = pairs[row, 1];
        }
        return conversions;
    }

    #region ASCII Constants

    // ReSharper disable InconsistentNaming
    // Short named constants keep the rows of the mapping table compact and readable.
    private const string SingleQuote = "'";
    private const string a = "a";
    private const string A = "A";
    private const string b = "b";
    private const string B = "B";
    private const string c = "c";
    private const string C = "C";
    private const string d = "d";
    private const string D = "D";
    private const string e = "e";
    private const string E = "E";
    private const string g = "g";
    private const string G = "G";
    private const string h = "h";
    private const string H = "H";
    private const string i = "i";
    private const string I = "I";
    private const string k = "k";
    private const string K = "K";
    private const string l = "l";
    private const string L = "L";
    private const string n = "n";
    private const string N = "N";
    private const string o = "o";
    private const string O = "O";
    private const string oe = "oe";
    private const string OE = "OE";
    private const string r = "r";
    private const string R = "R";
    private const string s = "s";
    private const string S = "S";
    private const string t = "t";
    private const string T = "T";
    private const string u = "u";
    private const string U = "U";
    private const string w = "w";
    private const string W = "W";
    private const string y = "y";
    private const string Y = "Y";
    private const string z = "z";
    private const string Z = "Z";
    // ReSharper restore InconsistentNaming

    #endregion
    #endregion
    #region Conversion

    /// <summary>
    /// Converts unicode text to its ASCII equivalent using a 2-pass algorithm.
    /// <para>
    /// In the first pass, the <paramref name="unicodeText"/> parameter is searched
    /// for text present in the <see cref="UnicodeToAsciiConversions"/> dictionary,
    /// longest keys first. For each match, the unicode text is replaced with its
    /// equivalent ASCII text from the dictionary. The middle dot, the left-to-right
    /// mark and the combining acute accent (U+0301) are then removed.
    /// <em>It cannot be guaranteed that each non-ASCII character will be converted
    /// after the first pass.</em>
    /// </para>
    /// <para>
    /// The second pass guarantees that the converted text will be ASCII-compatible.
    /// This is achieved by replacing all incompatible characters with a question mark
    /// (?) character.
    /// </para>
    /// </summary>
    /// <param name="unicodeText">
    /// The unicode text to convert to an ASCII equivalent.
    /// </param>
    /// <returns>
    /// The ASCII equivalent of the <paramref name="unicodeText"/> value. A null, empty
    /// or whitespace-only value is returned unchanged.
    /// </returns>
    private static string Convert(string unicodeText)
    {
        if (string.IsNullOrWhiteSpace(unicodeText))
            return unicodeText;

        // Pass 1: table-driven replacement.
        var asciiBuilder = new StringBuilder(unicodeText);
        foreach (var conversion in ConversionsLongestKeyFirst)
        {
            if (unicodeText.Contains(conversion.Key))
            {
                asciiBuilder.Replace(conversion.Key, conversion.Value);
            }
        }
        asciiBuilder.Replace("·", string.Empty);
        asciiBuilder.Replace("\u200E", string.Empty);

        // Drop every combining acute accent that is left over. This is done on the
        // text itself so that all occurrences are removed, wherever they appear.
        asciiBuilder.Replace("\u0301", string.Empty);

        // Pass 2: whatever is still outside ASCII is turned into '?' by the encoder.
        var utf8Encoding = new UTF8Encoding();
        var asciiEncoding = new ASCIIEncoding();
        var utfBytes = utf8Encoding.GetBytes(asciiBuilder.ToString());
        var asciiBytes = Encoding.Convert(utf8Encoding, asciiEncoding, utfBytes);
        return asciiEncoding.GetString(asciiBytes);
    }

    #endregion
    #region String Extension Methods

    /// <summary>
    /// Converts unicode text to its ASCII equivalent using the
    /// <see cref="UnicodeToAsciiConverter.Convert(System.String)"/> implementation.
    /// </summary>
    /// <remarks>
    /// This is simply a shortcut to provide a more fluent API when converting
    /// unicode text values to their ASCII equivalents.
    /// </remarks>
    /// <param name="unicodeText">The unicode text to convert.</param>
    /// <returns>
    /// The ASCII equivalent of this text value.
    /// </returns>
    public static string ConvertToAscii(this string unicodeText)
    {
        return Convert(unicodeText);
    }

    /// <summary>
    /// The characters accepted by <see cref="ContainsOnlyQuestionMarksAndWhiteSpace"/>:
    /// the question mark, the space and a fixed set of ASCII punctuation.
    /// </summary>
    private const string QuestionMarkSpaceAndPunctuation = "? '-_(),/.&\"";

    /// <summary>
    /// Determines whether a string of text consists only of question marks, space
    /// characters (U+0020) and the punctuation characters <c>' - _ ( ) , / . &amp; "</c>.
    /// This is useful in determining whether a conversion from unicode to ASCII failed
    /// completely, as often happens with languages like Arabic.
    /// </summary>
    /// <param name="text">The text to inspect; must not be null.</param>
    /// <returns>
    /// <code>True</code> if every character of this text is one of the characters listed
    /// above (which includes the case of an empty string), otherwise <code>false</code>.
    /// </returns>
    public static bool ContainsOnlyQuestionMarksAndWhiteSpace(this string text)
    {
        return text.All(character => QuestionMarkSpaceAndPunctuation.IndexOf(character) >= 0);
    }

    #endregion

}
于 2011-12-11T13:44:04.497 回答