If you have CESU-8 data in a buffer and you need to convert it to UTF-8 you can replace the surrogate pairs with a single UTF-8 encoded char. The rest of the data can be left unchanged.
In this case, your emoji has the following representations:
- code point : U+1F605 (bytes 01 F6 05)
- UTF-8 : F0 9F 98 85
- UTF-16 : D8 3D DE 05
- CESU-8 : ED A0 BD ED B8 85
The high surrogate in CESU-8 (ED A0 BD) carries the 10-bit payload $003D, i.e. $D83D - $D800.
The low surrogate in CESU-8 (ED B8 85) carries the 10-bit payload $0205, i.e. $DE05 - $DC00.
As Remy and AmigoJack pointed out you'll find these values when you decode the UTF-16 version of the emoji.
As with UTF-16, to get the code point you multiply the high surrogate value $003D by $400 (shl 10), add the low surrogate value $0205 and then add $10000 to the result: ($003D shl 10) + $0205 + $10000 = $1F605.
Once you have the code point you can convert it to a 4-byte UTF-8 set of values.
// Returns True when the three bytes starting at aBuffer[i] are the CESU-8
// encoding of a UTF-16 high surrogate (U+D800..U+DBFF), i.e. the byte
// pattern ED, A0..AF, 80..BF.
//
// aBuffer : bytes to inspect.
// i       : zero-based index of the first byte of the candidate sequence.
//
// Returns False when fewer than three bytes are available at index i.
function ValidHighSurrogate(const aBuffer: array of AnsiChar; i: integer): boolean;
begin
  Result := False;

  // Three bytes are read (i, i + 1, i + 2); refuse to read past the buffer.
  if (i < 0) or (i + 2 > High(aBuffer)) then
    exit;

  // Every surrogate encoded as a 3-byte sequence starts with ED.
  if ord(aBuffer[i]) <> $ED then
    exit;

  // The second byte must be exactly 1010xxxx. A subset test such as
  // "(n and $A) = $A" would also accept 1011xxxx (a low surrogate),
  // so compare the masked value for equality instead.
  if (ord(aBuffer[i + 1]) and $F0) <> $A0 then
    exit;

  // The third byte must be a continuation byte: exactly 10xxxxxx.
  Result := (ord(aBuffer[i + 2]) and $C0) = $80;
end;
// Returns True when the three bytes starting at aBuffer[i] are the CESU-8
// encoding of a UTF-16 low surrogate (U+DC00..U+DFFF), i.e. the byte
// pattern ED, B0..BF, 80..BF.
//
// aBuffer : bytes to inspect.
// i       : zero-based index of the first byte of the candidate sequence.
//
// Returns False when fewer than three bytes are available at index i.
function ValidLowSurrogate(const aBuffer: array of AnsiChar; i: integer): boolean;
begin
  Result := False;

  // Three bytes are read (i, i + 1, i + 2); refuse to read past the buffer.
  if (i < 0) or (i + 2 > High(aBuffer)) then
    exit;

  // Every surrogate encoded as a 3-byte sequence starts with ED.
  if ord(aBuffer[i]) <> $ED then
    exit;

  // The second byte must be exactly 1011xxxx. A subset test such as
  // "(n and $B) = $B" would also accept 1111xxxx, so compare the masked
  // value for equality instead.
  if (ord(aBuffer[i + 1]) and $F0) <> $B0 then
    exit;

  // The third byte must be a continuation byte: exactly 10xxxxxx.
  Result := (ord(aBuffer[i + 2]) and $C0) = $80;
end;
// Extracts the 10 payload bits of a surrogate stored as a 3-byte sequence
// starting at aBuffer[i]: the low 4 bits of the second byte become the
// upper part of the result and the low 6 bits of the third byte the lower
// part. The first byte is not inspected and no validation is performed.
function GetRawSurrogateValue(const aBuffer: array of AnsiChar; i: integer): integer;
const
  SecondByteMask = $0F; // payload bits carried by the second byte
  ThirdByteMask = $3F;  // payload bits carried by the third byte
var
  SecondByte, ThirdByte: integer;
begin
  SecondByte := ord(aBuffer[i + 1]);
  ThirdByte := ord(aBuffer[i + 2]);
  Result := ((SecondByte and SecondByteMask) shl 6) or (ThirdByte and ThirdByteMask);
end;
// Walks aBuffer and builds a converted copy in a local buffer: every 6-byte
// run that ValidHighSurrogate / ValidLowSurrogate accept as a high surrogate
// followed by a low surrogate is replaced by one 4-byte UTF-8 sequence; all
// other bytes are copied through unchanged.
//
// NOTE(review): the final assignment to Result is a placeholder in the
// original text and is not valid Pascal. The converted data lives only in the
// local TempBuffer, so the caller must replace the placeholder with code that
// stores the first j bytes of TempBuffer and sets the boolean result; what
// True/False should mean is not defined here.
function CESU8ToUTF8(const aBuffer: array of AnsiChar): boolean;
var
TempBuffer: array of AnsiChar;
i, j, TempLen: integer;
TempHigh, TempLow, TempCodePoint: integer;
begin
TempLen := length(aBuffer);
// The output is never longer than the input (6 bytes shrink to 4, everything
// else is copied 1:1), so the input length is a sufficient capacity.
SetLength(TempBuffer, TempLen);
// i is the read position in aBuffer, j the write position in TempBuffer.
i := 0;
j := 0;
while (i < TempLen) do
// i + 5 < TempLen guarantees that all six bytes i..i + 5 exist.
if (i + 5 < TempLen) and ValidHighSurrogate(aBuffer, i) and
ValidLowSurrogate(aBuffer, i + 3) then
begin
TempHigh := GetRawSurrogateValue(aBuffer, i);
TempLow := GetRawSurrogateValue(aBuffer, i + 3);
// Combine the two 10-bit surrogate payloads and add the $10000 offset.
TempCodePoint := (TempHigh shl 10) + TempLow + $10000;
// Emit the 4-byte form 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// (3 + 6 + 6 + 6 bits of the code point).
TempBuffer[j] := AnsiChar($F0 + ((TempCodePoint and $1C0000) shr 18));
TempBuffer[j + 1] := AnsiChar($80 + ((TempCodePoint and $3F000) shr 12));
TempBuffer[j + 2] := AnsiChar($80 + ((TempCodePoint and $FC0) shr 6));
TempBuffer[j + 3] := AnsiChar($80 + (TempCodePoint and $3F));
inc(j, 4);
inc(i, 6);
end
else
begin
// Not a surrogate pair at this position: copy a single byte as-is.
TempBuffer[j] := aBuffer[i];
inc(i);
inc(j);
end;
// At this point j is the number of bytes written; TempBuffer[0..j - 1] holds
// the converted data and the remaining entries are unused.
Result := < save the buffer here >;
end;