首先,查找编码错误,然后查找不需要的代码点。
后者很容易,因为有 Unicode 属性可以识别它们。(见下文)
要准确报告错误,您可能需要编写自己的解码器来查找 UTF-8 错误。
sub bytes_to_hex { join ' ', map { sprintf '%02X', $_ } unpack 'C*', $_[0] }
my @errors;
my @warns;
my $output = '';
for ($input) {
while (!/\G \z /xgc) {
my $pos = pos;
if (/\G (
(?: [\x00-\x7F]
| [\xC0-\xDF][\x80-\xBF]
| [\xE0-\xEF][\x80-\xBF]{2}
| [\xF0-\xF7][\x80-\xBF]{3}
| [\xF8-\xFB][\x80-\xBF]{4}
| [\xFC-\xFD][\x80-\xBF]{5}
)
) /xgc) {
my $bytes = $1;
my @bytes = unpack 'C*', $bytes;
my $hex_bytes = bytes_to_hex($bytes);
if ($bytes =~ /^
(?: [\xC0-\xC1]
| \xE0[\x80-\x9F]
| \xF0[\x80-\x8F]
| \xF8[\x80-\x87]
| \xFC[\x80-\x83]
)
/x) {
push @warns, "Overlong encoding $hex_bytes at pos $pos";
}
if ($bytes =~ /^[\xF8-\xFD]/) {
push @warns, "Defunct 5 or 6 byte sequence $hex_bytes at pos $pos";
}
my $code_point_ord = @bytes == 1
? $bytes[0]
: $bytes[0] & ( 0x7F >> @bytes );
$code_point_ord = ( $code_point_ord << 6 ) | ( $_ & 0x3F )
for @bytes[ 1..$#bytes ];
my $code_point_hex = sprintf('U+%05X', $code_point_ord);
my $code_point = chr($code_point_ord);
if ($code_point_ord >= 0x110000) {
push @errors, "Non-Unicode $code_point_hex at pos $pos";
} else {
push @warns, "Surrogate $code_point_hex at pos $pos"
if $code_point =~ /\p{Cs}/;
push @warns, "Private use $code_point_hex at pos $pos"
if $code_point =~ /\p{Co}/;
push @warns, "Unassigned $code_point_hex at pos $pos"
if $code_point =~ /\p{Cn}/;
$output .= $code_point;
}
}
elsif (/\G (
(?: [\xC0-\xDF]
| [\xE0-\xEF][\x80-\xBF]{0,1}
| [\xF0-\xF7][\x80-\xBF]{0,2}
| [\xF8-\xFB][\x80-\xBF]{0,3}
| [\xFC-\xFD][\x80-\xBF]{0,4}
)
) /xgc) {
my $bytes = $1;
my $hex_bytes = bytes_to_hex($bytes);
push @errors, "Incomplete sequence $hex_bytes at pos $pos";
}
elsif (/\G ( [\x80-\xBF] ) /xgc) {
my $byte = $1;
my $hex_byte = bytes_to_hex($byte);
push @errors, "Unexpected continuation byte $hex_byte at pos $pos";
}
elsif (/\G ( [\xFE-\xFF] ) /xgc) {
my $byte = $1;
my $hex_byte = bytes_to_hex($byte);
push @errors, "Invalid byte $hex_byte at pos $pos";
}
else {
die "Bug";
}
}
}