0

我从 MySQL 中读取数据,但数据库中的数据不是以 UTF-8 保存的(Unicode 字符被当作 Latin 字符保存,例如 Unicode 字符串 Đỗ Tiến(正确形式)被保存为 Äá»— Tiến)。如果用 PHP 把数据输出到 HTML,我只需设置 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />,网页就能正确显示。如果不设置该 meta 标签,用 Chrome 打开时,Chrome 会检测为 windows-1258 编码;手动改为 Unicode (utf-8) 后,网页也能正确显示。

问题是:当我使用 jdbc 从 mysql 中选择数据时,我会这样转换:

    // Attempt to undo the mis-decoding: encode the garbled text back to bytes with a
    // single-byte charset (Cp1258, then ISO-8859-1) and decode those bytes as UTF-8.
    byte[] asciiBytes1 = "Äá»— tiến".getBytes("Cp1258");
    byte[] asciiBytes2 = "Äá»— tiến".getBytes("ISO-8859-1");
    String unicode1 = new String(asciiBytes1, "UTF-8");
    String unicode2 = new String(asciiBytes2, "UTF-8");
    // The trailing comments are the outputs reported by the author; neither attempt
    // produces the expected text "Đỗ tiến".
    System.out.println(unicode1);//�?ỗ tiến
    System.out.println(unicode2);//Đ�? tiến

结果,Java 不能正确转换。我尝试了 http://docs.oracle.com/javase/1.4.2/docs/guide/intl/encoding.doc.html 中列出的许多编码,不仅是 Cp1258 和 ISO-8859-1,但没有一个有效。有两种简单的转换方法:一是使用我之前提到的、包含字符串 Äá»— tiến 的 HTML 文件;二是使用 Notepad++,将编码设置为 ANSI,粘贴字符串 Äá»— tiến,然后改为 UTF-8,它就会显示 Đỗ Tiến(这正是我想要的正确字符串)。

4

2 回答 2

1

This is somewhat complicated: the text is in a modified Windows-1252, in which the bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D, which are normally unassigned, are mapped to the corresponding C1 control characters. Java does not seem to take this into account by default when using Windows-1252.

It is easiest to just fix your database and use UTF-8 everywhere.

Here's the code anyway

public static byte[] getBytesModifiedW1252( String str ) {
    final int[] windows1252 = {
            0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007,0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F
            ,0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017,0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F
            ,0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F
            ,0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F
            ,0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F
            ,0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F
            ,0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F
            ,0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F
            ,0x20AC,0x0081,0x201A,0x0192,0x201E,0x2026,0x2020,0x2021,0x02C6,0x2030,0x0160,0x2039,0x0152,0x008D,0x017D,0x008F
            ,0x0090,0x2018,0x2019,0x201C,0x201D,0x2022,0x2013,0x2014,0x02DC,0x2122,0x0161,0x203A,0x0153,0x009D,0x017E,0x0178
            ,0x00A0,0x00A1,0x00A2,0x00A3,0x00A4,0x00A5,0x00A6,0x00A7,0x00A8,0x00A9,0x00AA,0x00AB,0x00AC,0x00AD,0x00AE,0x00AF
            ,0x00B0,0x00B1,0x00B2,0x00B3,0x00B4,0x00B5,0x00B6,0x00B7,0x00B8,0x00B9,0x00BA,0x00BB,0x00BC,0x00BD,0x00BE,0x00BF
            ,0x00C0,0x00C1,0x00C2,0x00C3,0x00C4,0x00C5,0x00C6,0x00C7,0x00C8,0x00C9,0x00CA,0x00CB,0x00CC,0x00CD,0x00CE,0x00CF
            ,0x00D0,0x00D1,0x00D2,0x00D3,0x00D4,0x00D5,0x00D6,0x00D7,0x00D8,0x00D9,0x00DA,0x00DB,0x00DC,0x00DD,0x00DE,0x00DF
            ,0x00E0,0x00E1,0x00E2,0x00E3,0x00E4,0x00E5,0x00E6,0x00E7,0x00E8,0x00E9,0x00EA,0x00EB,0x00EC,0x00ED,0x00EE,0x00EF
            ,0x00F0,0x00F1,0x00F2,0x00F3,0x00F4,0x00F5,0x00F6,0x00F7,0x00F8,0x00F9,0x00FA,0x00FB,0x00FC,0x00FD,0x00FE,0x00FF
        };
    Map<Integer, Integer> map = new HashMap<Integer, Integer>();

    for( int i = 0; i < windows1252.length; ++i ) {
        map.put( windows1252[i], i);
    }
    byte replacement = (byte)0x003F;

    byte[] ret = new byte[str.length()];

    for( int i = 0; i < str.length(); ++i ) {
        int cp = str.charAt(i);
        Integer w1252 = map.get(cp);
        ret[i] = w1252 == null ? replacement : (byte)(int)w1252;
    }

    return ret;
}

/**
 * Demo: repairs text that was UTF-8 encoded and then mis-decoded as modified
 * Windows-1252, by encoding it back to the original bytes and decoding them as UTF-8.
 *
 * @param args unused
 * @throws UnsupportedEncodingException declared by the charset-name String constructor
 */
public static void main(String args[]) throws UnsupportedEncodingException {
    // The mis-decoded form of "Đỗ tiến". It is written with Unicode escapes because it
    // contains the invisible control character U+0090, which is easily lost when the
    // text is copied or the source file is re-encoded.
    String mojibake = "\u00C4\u0090\u00E1\u00BB\u2014 ti\u00E1\u00BA\u00BFn";
    byte[] bytes = getBytesModifiedW1252( mojibake );
    System.out.println(new String(bytes, "UTF-8"));
    //Đỗ tiến
}

Here's the opposite:

/**
 * Characters assigned to the bytes 0x80-0x9F (index 0 is byte 0x80) in "modified"
 * Windows-1252. The bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D, which plain Windows-1252
 * leaves unassigned, map to the C1 control character with the same value. Every other
 * byte (0x00-0x7F and 0xA0-0xFF) decodes to the character equal to its own value.
 */
private static final char[] W1252_DECODER_HIGH_CHARS = {
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
};

/**
 * Decodes bytes as "modified" Windows-1252. Every byte value has a mapping, so each
 * input byte yields exactly one {@code char}.
 *
 * @param bytes the bytes to decode; must not be {@code null}
 * @return a string of {@code bytes.length} chars
 * @throws NullPointerException if {@code bytes} is {@code null}
 */
public static String getStringModifiedW1252( byte[] bytes ) {
    final char[] decoded = new char[bytes.length];

    for (int i = 0; i < bytes.length; ++i) {
        // Java bytes are signed; mask to get the unsigned value 0-255.
        final int unsigned = bytes[i] & 0xFF;
        decoded[i] = (unsigned >= 0x80 && unsigned < 0xA0)
                ? W1252_DECODER_HIGH_CHARS[unsigned - 0x80]
                : (char) unsigned;
    }

    return new String(decoded);
}

/**
 * Demo of the reverse direction: takes the UTF-8 bytes of a correct string and decodes
 * them as modified Windows-1252, reproducing the garbled form stored in the database.
 *
 * @param args unused
 * @throws UnsupportedEncodingException declared by the charset-name getBytes overload
 */
public static void main(String args[]) throws UnsupportedEncodingException {
    final byte[] utf8Bytes = "Đỗ tiến".getBytes("UTF-8");
    final String garbled = getStringModifiedW1252( utf8Bytes );
    System.out.println(garbled);
    // Prints the mis-decoded form: Ä, then U+0090 (invisible), then á»— tiáº¿n
}

You will probably want to store the map and the array somewhere (for example in static fields) instead of re-creating them every time the methods are called.

于 2013-01-13T11:52:17.233 回答
-1

尝试这个

// Encodes the garbled text with Cp1258 and with ISO-8859-1, then decodes each byte
// array with the SAME charset that produced it.
// NOTE(review): a same-charset round trip presumably cannot repair the text; confirm
// before relying on this suggestion.
byte[] asciiBytes1 = "Äá»— tiến".getBytes("Cp1258");
byte[] asciiBytes2 = "Äá»— tiến".getBytes("ISO-8859-1");
String unicode1 = new String(asciiBytes1, "Cp1258");
String unicode2 = new String(asciiBytes2, "ISO-8859-1");
// NOTE(review): the trailing outputs match those in the question's snippet and look
// copied from it rather than produced by this code; verify.
System.out.println(unicode1);//�?ỗ tiến
System.out.println(unicode2);//Đ�? tiến
于 2013-01-13T11:27:48.290 回答