javascript - 按 utf-8 字节位置提取子字符串

Question

我有一个字符串以及用于提取子字符串的开头和长度。两个位置（开始和长度）都基于原始 UTF8 字符串中的字节偏移量。

但是，有一个问题：

开始和长度以字节为单位，所以我不能使用“子字符串”。UTF8 字符串包含几个多字节字符。有没有一种超高效的方法来做到这一点？（我不需要解码字节......）

例子： var orig = '你好吗？'

起始位置和长度可以是 3,3，用来提取第二个字符（好）。我想要的是类似下面这样的调用：

var result = orig.substringBytes(3,3);

帮助！

更新 #1：在 C/C++ 中，我只需把它转换为字节数组，但不确定 JavaScript 中是否有等价的做法。顺便说一句，是的，我们可以把它解析为字节数组，再解析回字符串，但似乎应该有一种快速的方法，直接在正确的位置把它切开。想象一下 “orig” 有 1000000 个字符，s = 6 个字节，l = 3 个字节。

更新 #2：感谢 zerkms 提供的有用链接，我最终写出了下面的代码，但它不能正常工作——对多字节字符有效，对单字节字符却出错了。

/**
 * Attempts to extract a substring of `str` addressed by byte offsets.
 *
 * NOTE(review): this is the asker's non-working attempt, kept exactly as
 * posted. Problems visible in the code:
 *   - the loop condition `0 < str.length` never involves `i`, so for a
 *     non-empty string the loop can only end through the `return` inside it;
 *   - the width of a character is measured by shifting its UTF-16 code unit
 *     right 8 bits at a time, which counts 8-bit groups of the UTF-16 value
 *     (at most 2), not the number of UTF-8 bytes;
 *   - `endIx` is incremented once in `startIx = endIx++` and again for every
 *     pass of the do/while loop, so each character is counted as one more
 *     than its number of 8-bit groups (an ASCII character counts as 2).
 *
 * @param {string} str    source string
 * @param {number} start  intended start offset, in bytes
 * @param {number} length intended length, in bytes
 * @returns {string|undefined} the characters collected in `re` when the
 *     running end offset exceeds `start + length`; undefined when the loop
 *     body never runs (empty `str`)
 */
function substrBytes(str, start, length)
{
    var ch, startIx = 0, endIx = 0, re = '';
    // NOTE(review): condition does not depend on `i`; see the header comment.
    for (var i = 0; 0 < str.length; i++)
    {
        // Remember where this character starts, then advance the end offset.
        startIx = endIx++;

        ch = str.charCodeAt(i);
        // Add one to endIx per 8-bit group of the UTF-16 code unit.
        do {
            ch = ch >> 8;   // a better way may exist to measure ch len
            endIx++;
        }
        while (ch);

        // Stop as soon as this character would end beyond the requested range.
        if (endIx > start + length)
        {
            return re;
        }
        // Otherwise keep the character if it starts at or after `start`.
        else if (startIx >= start)
        {
            re += str[i];
        }
    }
}

更新 #3：我认为对字符代码进行移位实际上行不通。正确答案是三个字节时，我读到的却是两个字节……不知为何我总是忘记这一点。UTF-8 和 UTF-16 的码点相同，但编码所占的字节数取决于编码方式！所以这不是正确的方法。

score 11 · Accepted Answer

我玩得很开心。希望这可以帮助。

因为 Javascript 不允许对字符串进行直接字节访问，所以找到起始位置的唯一方法是前向扫描。

更新 #3：我认为对字符代码进行移位实际上行不通。正确答案是三个字节时，我读到的却是两个字节……不知为何我总是忘记这一点。UTF-8 和 UTF-16 的码点相同，但编码所占的字节数取决于编码方式！所以这不是正确的方法。

这是不正确的 - 实际上 javascript 中没有 UTF-8 字符串。根据 ECMAScript 262 规范，所有字符串 - 无论输入编码如何 - 都必须在内部存储为 UTF-16（“[sequence of] 16-bit unsigned integers”）。

考虑到这一点，8 位移位是正确的（但没有必要）。

假设您的字符存储为 3 字节序列是错误的……
事实上，JS（ECMA-262）字符串中的所有字符都是 16 位（2 字节）长。

这可以通过手动将多字节字符转换为 utf-8 来解决，如下面的代码所示。

请参阅我的示例代码中解释的详细信息：

/**
 * Converts a string into a "byte string": one character per UTF-8 byte,
 * each with a char code in the range 0-255.
 *
 * encodeURIComponent() percent-encodes the UTF-8 bytes of every character
 * it escapes; turning each "%XX" escape back into a single character
 * yields the raw byte sequence.
 *
 * @param {string} s text to encode
 * @returns {string} byte string whose length is the UTF-8 byte length of s
 * @throws {URIError} if s contains a lone surrogate
 */
function encode_utf8( s )
{
  var percentEncoded = encodeURIComponent( s );
  return percentEncoded.replace( /%([0-9A-F]{2})/g, function ( match, hex ) {
    return String.fromCharCode( parseInt( hex, 16 ) );
  } );
}

/**
 * Returns the substring of `str` selected by a start offset and a length
 * that are both expressed in bytes of the UTF-8 encoding of `str`.
 *
 * JavaScript strings are sequences of 16-bit UTF-16 code units and offer no
 * byte-level access, so the string is scanned forward and the UTF-8 width of
 * every character is derived from its code value:
 *
 *     U+0000  .. U+007F    1 byte   ("a")
 *     U+0080  .. U+07FF    2 bytes  ("ü")
 *     U+0800  .. U+FFFF    3 bytes  ("你")
 *     U+10000 .. U+10FFFF  4 bytes  (stored as a surrogate pair, i.e. two
 *                                    UTF-16 code units)
 *
 * Offsets that fall inside a character are rounded up to the next character
 * boundary: the start moves to the first character beginning at or after
 * `startInBytes`, and characters are taken until at least `lengthInBytes`
 * bytes have been collected. Ranges reaching past the end of the string are
 * clamped to the end.
 *
 * @param {string} str           source string
 * @param {number} startInBytes  start offset in UTF-8 bytes
 * @param {number} lengthInBytes length in UTF-8 bytes
 * @returns {string} the selected substring ('' when nothing is selected)
 */
function substr_utf8_bytes(str, startInBytes, lengthInBytes) {

    var strLength = str.length;

    // UTF-8 byte count of the character that starts at UTF-16 index `index`.
    // A result of 4 means the character occupies two UTF-16 code units.
    function utf8BytesAt(index) {
        var code = str.charCodeAt(index);
        if (code < 0x80) {
            return 1;
        }
        if (code < 0x800) {
            return 2;
        }
        if (code >= 0xD800 && code <= 0xDBFF && index + 1 < strLength) {
            var next = str.charCodeAt(index + 1);
            if (next >= 0xDC00 && next <= 0xDFFF) {
                return 4;
            }
        }
        // Remaining BMP characters; an unpaired surrogate is counted as
        // 3 bytes as well instead of raising an error.
        return 3;
    }

    var width;

    // scan forward to convert the start position in bytes into a start
    // position in UTF-16 code units
    var startInChars = 0;
    var bytePos = 0;
    while (bytePos < startInBytes && startInChars < strLength) {
        width = utf8BytesAt(startInChars);
        bytePos += width;
        startInChars += (width === 4) ? 2 : 1;
    }

    // keep consuming characters until the requested number of bytes has
    // been covered or the string is exhausted
    var endInChars = startInChars;
    var bytesTaken = 0;
    while (bytesTaken < lengthInBytes && endInChars < strLength) {
        width = utf8BytesAt(endInChars);
        bytesTaken += width;
        endInChars += (width === 4) ? 2 : 1;
    }

    return str.substring(startInChars, endInChars);
}

// Usage examples: the second and third arguments are a start offset and a
// length, both counted in UTF-8 bytes ("abc" is 3 bytes, each of the
// following characters is 3 bytes).
var orig = 'abc你好吗？';

alert('res: ' + substr_utf8_bytes(orig, 0, 2)); // alerts: "ab"
alert('res: ' + substr_utf8_bytes(orig, 2, 1)); // alerts: "c"
alert('res: ' + substr_utf8_bytes(orig, 3, 3)); // alerts: "你"
alert('res: ' + substr_utf8_bytes(orig, 6, 6)); // alerts: "好吗"

score 8 · Accepted Answer

@Kaii的答案几乎是正确的，但其中有一个错误。它无法处理 Unicode 从 128 到 255 的字符。这是修改后的版本（只需将 256 更改为 128）：

/**
 * Re-encodes a string so that every character of the result stands for one
 * byte of the input's UTF-8 representation.
 *
 * @param {string} s text to encode
 * @returns {string} string of characters with codes 0-255, one per UTF-8 byte
 * @throws {URIError} if s contains a lone surrogate
 */
function encode_utf8( s )
{
  // Step 1: percent-encode; non-ASCII characters become "%XX" UTF-8 bytes.
  var percentEncoded = encodeURIComponent( s );
  // Step 2: collapse every "%XX" escape into the single character XX.
  var byteString = unescape( percentEncoded );
  return byteString;
}

/**
 * Extracts a substring using a start position and a length that are both
 * measured in bytes of the UTF-8 encoding of `str`.
 *
 * Strings are stored as 16-bit UTF-16 code units and cannot be addressed by
 * byte, so the string is walked from the beginning while the UTF-8 width of
 * each character is added up. The width follows from the code value:
 *
 *     below 0x80            1 byte   ("a")
 *     below 0x800           2 bytes  ("ü", "©")
 *     other BMP characters  3 bytes  ("你")
 *     surrogate pair        4 bytes  (one character, two code units)
 *
 * A position inside a character is rounded up to the next character
 * boundary, so the result always consists of whole characters; a range that
 * extends beyond the string is cut off at its end.
 *
 * @param {string} str           source string
 * @param {number} startInBytes  start offset in UTF-8 bytes
 * @param {number} lengthInBytes length in UTF-8 bytes
 * @returns {string} the selected substring ('' when nothing is selected)
 */
function substr_utf8_bytes(str, startInBytes, lengthInBytes) {

    var total = str.length;

    // Starting at code-unit index `from`, step over whole characters until
    // at least `byteBudget` UTF-8 bytes have been passed or the string ends.
    // Returns the code-unit index that was reached.
    function skipBytes(from, byteBudget) {
        var index = from;
        var used = 0;
        var code, next;

        while (used < byteBudget && index < total) {
            code = str.charCodeAt(index);

            if (code < 0x80) {
                used += 1;
            } else if (code < 0x800) {
                used += 2;
            } else if (code >= 0xD800 && code <= 0xDBFF && index + 1 < total) {
                next = str.charCodeAt(index + 1);
                if (next >= 0xDC00 && next <= 0xDFFF) {
                    // valid surrogate pair: 4 bytes, two code units
                    used += 4;
                    index++;
                } else {
                    // unpaired high surrogate: counted as 3 bytes
                    used += 3;
                }
            } else {
                used += 3;
            }

            index++;
        }

        return index;
    }

    // convert the start position in bytes into a position in code units,
    // then measure the requested byte length from there
    var startInChars = skipBytes(0, startInBytes);
    var endInChars = skipBytes(startInChars, lengthInBytes);

    return str.substring(startInChars, endInChars);
}

// Usage examples: offsets and lengths are counted in UTF-8 bytes. "abc" is
// 3 bytes, the four following characters are 3 bytes each, and the trailing
// "©" (char code between 128 and 255) is 2 bytes starting at byte 15.
var orig = 'abc你好吗？©';

alert('res: ' + substr_utf8_bytes(orig, 0, 2)); // alerts: "ab"
alert('res: ' + substr_utf8_bytes(orig, 2, 1)); // alerts: "c"
alert('res: ' + substr_utf8_bytes(orig, 3, 3)); // alerts: "你"
alert('res: ' + substr_utf8_bytes(orig, 6, 6)); // alerts: "好吗"
alert('res: ' + substr_utf8_bytes(orig, 15, 2)); // alerts: "©"

顺便说一句，这是一个错误修复，它应该对有同样问题的人有用。为什么审稿人会因为修改“太多”或“太小”而拒绝我的编辑建议？@Adam Eberlin @Kjuly @Jasonw

score 2 · Accepted Answer

/**
 * Returns the part of `str` that occupies `length` bytes starting at byte
 * offset `start` of its UTF-8 encoding (Node.js only; relies on Buffer).
 *
 * The string is encoded to UTF-8, the byte range is selected and decoded
 * back. If the range cuts through a multi-byte character, the incomplete
 * bytes decode to U+FFFD replacement characters.
 *
 * Buffer.from() replaces the deprecated `new Buffer(string)` constructor,
 * and subarray() replaces the deprecated Buffer#slice(); both select the
 * same bytes as before.
 *
 * @param {string} str    source string
 * @param {number} start  start offset in UTF-8 bytes
 * @param {number} length length in UTF-8 bytes
 * @returns {string} decoded text of the selected byte range
 */
function substrBytes(str, start, length)
{
    var buf = Buffer.from(str, 'utf8');
    return buf.subarray(start, start + length).toString('utf8');
}

AYB

score 1 · Accepted Answer

对于 IE 用户，上述答案中的代码会输出 undefined，因为 IE 不支持 str[n]，也就是说不能把字符串当作数组来索引。您需要把 str[n] 替换为 str.charAt(n)。代码应该是：

/**
 * Produces the UTF-8 "byte string" of `s`: a string with one character
 * (char code 0-255) per byte of the UTF-8 encoding of `s`.
 *
 * @param {string} s text to encode
 * @returns {string} byte string; its length is the UTF-8 byte length of s
 * @throws {URIError} if s contains a lone surrogate
 */
function encode_utf8( s ) {
  // encodeURIComponent() emits escaped characters as "%XX" UTF-8 bytes and
  // leaves unreserved ASCII characters untouched.
  var escaped = encodeURIComponent( s );
  var bytes = '';

  for ( var i = 0; i < escaped.length; i++ ) {
    if ( escaped.charAt( i ) === '%' ) {
      // "%XX" -> the single character with code 0xXX
      bytes += String.fromCharCode( parseInt( escaped.substring( i + 1, i + 3 ), 16 ) );
      i += 2;
    } else {
      bytes += escaped.charAt( i );
    }
  }

  return bytes;
}

/**
 * Returns the substring of `str` described by a start offset and a length
 * given in bytes of the UTF-8 encoding of `str`.
 *
 * Only charCodeAt() and substring() are used - no `str[n]` indexing - so
 * the function also works in old Internet Explorer versions.
 *
 * The UTF-8 width of each character is derived from its UTF-16 code value:
 * 1 byte below 0x80, 2 bytes below 0x800, 4 bytes for a surrogate pair
 * (one character stored in two code units) and 3 bytes otherwise.
 *
 * Offsets that fall inside a character are rounded up to the next character
 * boundary, and ranges reaching beyond the string are clamped to its end.
 *
 * @param {string} str           source string
 * @param {number} startInBytes  start offset in UTF-8 bytes
 * @param {number} lengthInBytes length in UTF-8 bytes
 * @returns {string} the selected substring ('' when nothing is selected)
 */
function substr_utf8_bytes(str, startInBytes, lengthInBytes) {

    var strLength = str.length;

    // Number of UTF-16 code units (1 or 2) used by the character at `pos`.
    function unitsAt(pos) {
        var lead = str.charCodeAt(pos);
        var trail;
        if (lead >= 0xD800 && lead <= 0xDBFF && pos + 1 < strLength) {
            trail = str.charCodeAt(pos + 1);
            if (trail >= 0xDC00 && trail <= 0xDFFF) {
                return 2;
            }
        }
        return 1;
    }

    // UTF-8 byte count of the character at `pos`, given its `units`.
    function bytesAt(pos, units) {
        var code;
        if (units === 2) {
            return 4;
        }
        code = str.charCodeAt(pos);
        // an unpaired surrogate falls through to 3 bytes instead of throwing
        return (code < 0x80) ? 1 : (code < 0x800) ? 2 : 3;
    }

    var units;

    // translate the byte offset of the start into a code-unit index
    var startInChars = 0;
    for (var bytePos = 0; bytePos < startInBytes && startInChars < strLength; startInChars += units) {
        units = unitsAt(startInChars);
        bytePos += bytesAt(startInChars, units);
    }

    // collect whole characters until the byte length is covered
    var endInChars = startInChars;
    for (var taken = 0; taken < lengthInBytes && endInChars < strLength; endInChars += units) {
        units = unitsAt(endInChars);
        taken += bytesAt(endInChars, units);
    }

    return str.substring(startInChars, endInChars);
}

score 1 · Accepted Answer

也许可以用下面的函数来计算字节数，后面附有用法示例。与 @Kaii 的函数不同，它把“你”这样的字符算作 2 个字节，而不是 3 个字节：

/**
 * Computes a display-width style "byte length" of a value: every UTF-16
 * code unit counts as 1 when it is in one of the half-width ranges listed
 * below and as 2 otherwise. Note that this is not the UTF-8 byte length
 * (a character such as "你" counts as 2 here).
 *
 * @param {*} target value to measure; converted with jQuery.castString()
 * @returns {number} the computed length, or 0 if an exception was caught
 *     (the exception is passed to jQuery.showErrorDetail())
 */
jQuery.byteLength = function(target) {
    try {
        var text = jQuery.castString(target);
        var total = 0;

        for (var pos = 0, len = text.length; pos < len; pos++) {
            // take the code unit of one character
            var code = text.charCodeAt(pos);

            // half-width code units: 0x0 - 0x80, 0xf8f0, 0xff61 - 0xff9f,
            // 0xf8f1 - 0xf8f3
            var isHalfWidth =
                (code >= 0x0 && code < 0x81) ||
                (code == 0xf8f0) ||
                (code > 0xff60 && code < 0xffa0) ||
                (code > 0xf8f0 && code < 0xf8f4);

            // 1-byte character, otherwise 2-byte character
            total += isHalfWidth ? 1 : 2;
        }

        return total;
    } catch (e) {
        jQuery.showErrorDetail(e, 'byteLength');
        return 0;
    }
};

// Usage example: look for the shortest prefix of `value` whose $.byteLength
// is exactly 106 and write that prefix into the element wrapped by $(this).
// NOTE(review): `value` and `this` come from enclosing code that is not
// shown here.
// NOTE(review): the length grows by 1 or 2 per character, so it can step
// from 105 to 107 and skip 106, in which case nothing is written - confirm
// whether `>=` handling is needed.
for (var j = 1, len = value.length; j <= len; j++) {
    // prefix consisting of the first j UTF-16 code units
    var slice = value.slice(0, j);
    var slength = $.byteLength(slice);
    if ( slength == 106 ) {
        $(this).val(slice);
        break;
    }
}

score 0 · Accepted Answer

System.ArraySegment 很有用，但您需要用输入数组、偏移量和索引器来调用它的构造函数。

javascript - 按 utf-8 字节位置提取子字符串

6 回答 6

Related

Reference