delphi - 使用 UTF8String 转换非规范化字符

Question

在将 UTF-8 编码的表情符号转换为字符串时，我们没有使用 UTF8ToString 获得正确的字符。我们从外部接口接收这些 UTF8 字符。我们使用在线 UTF8 解码器测试了 UTF 字符，发现它们包含正确的字符。我怀疑这些是复合字符。

procedure TestUTF8Convertion;
{ Decodes two hard-coded byte sequences with UTF8ToString and appends each
  decoded string to Memo1 as a separate line.
  Sample 0: three emoji, each stored as two 3-byte encoded surrogates
            (6 bytes per emoji), separated by spaces.
  Sample 1: the same three emoji stored as regular 4-byte UTF-8 sequences. }
const
  Samples: array[0..1] of RawByteString = (
    #$ED#$A0#$BD#$ED#$B8#$85#$20 + #$ED#$A0#$BD#$ED#$B8#$86#$20 + #$ED#$A0#$BD#$ED#$B8#$8A,
    #$F0#$9F#$98#$85 + #$F0#$9F#$98#$86 + #$F0#$9F#$98#$8A);
var
  idx: Integer;
begin
  for idx := Low(Samples) to High(Samples) do
    Memo1.Lines.Add(UTF8ToString(Samples[idx]));
end;

Memo1 中的输出：

非规范化：��

规范化：😅😆😊

基于 WinApi 函数 MultiByteToWideChar 编写自己的转换函数也没有解决这个问题。

function UTF8DenormalizedToString(s: PAnsiChar): string;
{ Converts a null-terminated UTF-8 buffer to a string using the Windows
  MultiByteToWideChar API.

  s       - pointer to the null-terminated source bytes; nil is treated as empty.
  Returns - the decoded text without the terminator, or '' when the input is
            empty or the API call fails.

  Note: this only decodes well-formed UTF-8. Surrogates encoded as 3-byte
  sequences (CESU-8) are not valid UTF-8 and are not recombined into one
  code point by this routine. }
var
  srcLen, dstLen: Integer;
begin
  Result := '';
  if s = nil then
    Exit;

  // Pass the explicit byte count (terminator excluded) so the terminator is
  // neither converted nor counted in the returned length.
  srcLen := Length(s);
  if srcLen = 0 then
    Exit;

  // With CP_UTF8 the flags argument must be 0 or MB_ERR_INVALID_CHARS;
  // MB_PRECOMPOSED makes the call fail with ERROR_INVALID_FLAGS.
  // First call: ask how many UTF-16 code units are required.
  dstLen := MultiByteToWideChar(CP_UTF8, 0, s, srcLen, nil, 0);
  if dstLen <= 0 then
    Exit;

  // Convert straight into the result string: no manual GetMem/FreeMem, so
  // nothing can leak if an exception is raised.
  SetLength(Result, dstLen);
  if MultiByteToWideChar(CP_UTF8, 0, s, srcLen, PWideChar(Result), dstLen) = 0 then
    Result := '';
end;

score 3 · Accepted Answer

3

于 2020-08-25T16:25:25.107 回答

score 3 · Accepted Answer

If you have CESU-8 data in a buffer and you need to convert it to UTF-8 you can replace the surrogate pairs with a single UTF-8 encoded char. The rest of the data can be left unchanged.

In this case, your emoji is this :

code point : 01 F6 05
UTF-8 : F0 9F 98 85
UTF-16 : D8 3D DE 05
CESU-8 : ED A0 BD ED B8 85

The high surrogate in CESU-8 has this data : $003D

And the low surrogate in CESU-8 has this data : $0205

As Remy and AmigoJack pointed out you'll find these values when you decode the UTF-16 version of the emoji.

In the case of UTF-16 you will also need to multiply the $003D value by $400 (shl 10), add the result to $0205 and then add $10000 to the final result to get the code point.

Once you have the code point you can convert it to a 4-byte UTF-8 set of values.

function ValidHighSurrogate(const aBuffer: array of AnsiChar; i: integer): boolean;
{ Returns True when the three bytes starting at aBuffer[i] are a UTF-16 high
  surrogate (U+D800..U+DBFF) encoded as a 3-byte sequence:
      $ED  1010xxxx  10xxxxxx
  Returns False for any other byte pattern, or when fewer than three bytes
  are available at index i. }
begin
  Result := False;

  // All three bytes must lie inside the buffer.
  if (i < 0) or (i + 2 > High(aBuffer)) then
    exit;

  if (ord(aBuffer[i]) <> $ED) then
    exit;

  // The upper nibble must be exactly $A. A mask test such as
  // "(n and $A) = $A" would also accept $B (low surrogates), $E and $F.
  if ((ord(aBuffer[i + 1]) and $F0) <> $A0) then
    exit;

  // Continuation byte: the top two bits must be exactly 10.
  Result := ((ord(aBuffer[i + 2]) and $C0) = $80);
end;

function ValidLowSurrogate(const aBuffer: array of AnsiChar; i: integer): boolean;
{ Returns True when the three bytes starting at aBuffer[i] are a UTF-16 low
  surrogate (U+DC00..U+DFFF) encoded as a 3-byte sequence:
      $ED  1011xxxx  10xxxxxx
  Returns False for any other byte pattern, or when fewer than three bytes
  are available at index i. }
begin
  Result := False;

  // All three bytes must lie inside the buffer.
  if (i < 0) or (i + 2 > High(aBuffer)) then
    exit;

  if (ord(aBuffer[i]) <> $ED) then
    exit;

  // The upper nibble must be exactly $B. A mask test such as
  // "(n and $B) = $B" would also accept $F.
  if ((ord(aBuffer[i + 1]) and $F0) <> $B0) then
    exit;

  // Continuation byte: the top two bits must be exactly 10.
  Result := ((ord(aBuffer[i + 2]) and $C0) = $80);
end;

function GetRawSurrogateValue(const aBuffer: array of AnsiChar; i: integer): integer;
{ Extracts the 10 payload bits of a surrogate stored as three bytes starting
  at aBuffer[i]: the low 4 bits of the second byte become bits 9..6 and the
  low 6 bits of the third byte become bits 5..0. The first byte is not read. }
var
  upperBits, lowerBits: integer;
begin
  lowerBits := ord(aBuffer[i + 2]) and $3F;
  upperBits := (ord(aBuffer[i + 1]) and $0F) shl 6;

  Result := upperBits or lowerBits;
end;

function CESU8ToUTF8(const aBuffer: array of AnsiChar): boolean;
{ Copies aBuffer into a temporary buffer, replacing every encoded surrogate
  pair (3-byte high surrogate immediately followed by a 3-byte low surrogate,
  6 bytes in total) with the 4-byte UTF-8 encoding of the code point the pair
  represents. All other bytes are copied unchanged.

  The final step (storing TempBuffer and producing the boolean result) is a
  placeholder that the caller of this snippet has to fill in. }
var
  TempBuffer: array of AnsiChar;
  i, j, TempLen: integer;
  TempHigh, TempLow, TempCodePoint: integer;
begin
  TempLen := length(aBuffer);
  // The output can never be longer than the input (6 bytes shrink to 4).
  SetLength(TempBuffer, TempLen);

  i := 0;  // read position in aBuffer
  j := 0;  // write position in TempBuffer
  while (i < TempLen) do
    // A pair occupies indices i..i+5, so i+5 must still be a valid index.
    if (i + 5 < TempLen) and ValidHighSurrogate(aBuffer, i) and
      ValidLowSurrogate(aBuffer, i + 3) then
    begin
      TempHigh := GetRawSurrogateValue(aBuffer, i);
      TempLow := GetRawSurrogateValue(aBuffer, i + 3);
      // Standard surrogate-pair decoding: 10 high bits, 10 low bits, +$10000.
      TempCodePoint := (TempHigh shl 10) + TempLow + $10000;
      // 4-byte UTF-8 layout: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      TempBuffer[j] := AnsiChar($F0 + ((TempCodePoint and $1C0000) shr 18));
      TempBuffer[j + 1] := AnsiChar($80 + ((TempCodePoint and $3F000) shr 12));
      TempBuffer[j + 2] := AnsiChar($80 + ((TempCodePoint and $FC0) shr 6));
      TempBuffer[j + 3] := AnsiChar($80 + (TempCodePoint and $3F));
      inc(j, 4);
      inc(i, 6);
    end
    else
    begin
      TempBuffer[j] := aBuffer[i];
      inc(i);
      inc(j);
    end;

  // Only the first j bytes were written; drop the unused tail so that whatever
  // stores the buffer does not also store stale trailing bytes.
  SetLength(TempBuffer, j);

  Result := < save the buffer here >;
end;

score 2 · Accepted Answer

UTF-8 consists of 1, 2, 3, or 4 bytes per character. The codepoint U+1F605 is correctly encoded as #$F0#$9F#$98#$85.
UTF-16 consists of 2 or 4 bytes per character. The 4 byte sequences are needed to encode codepoints beyond U+FFFF (such as most Emojis). Only UCS-2 is limited to codepoints U+0000 to U+FFFF (this applies to Windows NT versions before 2000).
A sequence like #$ED#$A0#$BD#$ED#$B8#$85 (a UTF-8-encoded high surrogate followed by a UTF-8-encoded low surrogate) is not valid UTF-8 but CESU-8. It results from a naive, and therefore improper, translation from UTF-16 to UTF-8: instead of recognizing a 4 byte UTF-16 sequence (which encodes one codepoint) and translating it into a single 4 byte UTF-8 sequence, each 2 byte code unit is translated on its own, turning 2x2 bytes into an invalid 6 byte UTF-8 sequence.

Converting your valid UTF-8 sequence #$F0#$9F#$98#$85 into the valid UTF-16 sequence #$3d#$d8#$05#$de works for me. Of course, make sure you use a proper font which is actually able to render Emojis:

// const CP_UTF8= 65001;

function Utf8ToUtf16( const sIn: AnsiString; iSrcCodePage: DWord= CP_UTF8 ): WideString;
{ Converts the bytes of sIn, interpreted in code page iSrcCodePage (UTF-8 by
  default), to UTF-16 via MultiByteToWideChar.
  Returns an empty string when the sizing call reports zero characters or when
  the conversion call itself fails. }
var
  pSrc: PAnsiChar;
  cbSrc, cchDest: Integer;
begin
  pSrc:= PAnsiChar(sIn);
  cbSrc:= Length( sIn );

  // Sizing pass: no destination buffer, the API only reports the required length
  cchDest:= MultiByteToWideChar( iSrcCodePage, 0, pSrc, cbSrc, nil, 0 );
  SetLength( result, cchDest );

  // Nothing to convert; a second call with a zero-sized buffer could raise
  // ERROR_INVALID_PARAMETER
  if cchDest<= 0 then exit;

  // Conversion pass into the exactly sized result
  if MultiByteToWideChar( iSrcCodePage, 0, pSrc, cbSrc, PWideChar(result), cchDest )= 0 then
    result:= '';  // GetLastError() would give the reason
end;

...
  // Usage example: select a font that is able to render emoji, then show the
  // conversion result of a valid sequence next to an invalid one.
  Edit1.Font.Name:= 'Segoe UI Symbol';  // Already available in Win7
  // First literal: valid 4-byte UTF-8 for U+1F605.
  // Second literal: the same emoji as two 3-byte encoded surrogates (CESU-8), which is not valid UTF-8.
  Edit1.Text:= Utf8ToUtf16( AnsiString(#$F0#$9F#$98#$85' vs. '#$ED#$A0#$BD#$ED#$B8#$85) );
  // Should display: 😅 vs. ����

To my knowledge Windows neither has a codepage for CESU-8, nor for WTF-8 and as such won't deal with your invalid UTF-8. Also the usage of MB_PRECOMPOSED is discouraged and does not apply to this case anyway.

Talk to whoever gives you the invalid UTF-8 and ask them to fix their output (or to give you the UTF-16 right away). Otherwise you must pre-process the incoming UTF-8 by scanning it for matching surrogate pairs and replacing those bytes with a proper 4-byte sequence. Not impossible, not even that difficult, but dull work that takes patience.

delphi - 使用 UTF8String 转换非规范化字符

3 回答 3

Related

Reference