c++ - 在 C++ 中检测无效的 URL/URI 编码序列

Question

我正在使用此代码的 CPP 版本http://www.w3.org/International/unescape.java将 url 编码字符串解码回其原始形式。我希望能够检测到无效的 uri/url 编码序列，例如 %s、%p、%ya、%ax 等。但是如果我传递字符串 'Skywalker%s is back' ，它会返回 'Skywalker is back'。它切断了无效序列，知道如何正确处理..？或者 CPP 中的 URLDecoder 函数在这样的问题上很酷......？

    std::wstring URIDecodeAsWStr(std::string _tmpStrToEncode)
    {
        std::wstring _WStrDecPath = L"";
        int l  = _tmpStrToEncode.length();
        int ch = -1 ;
        int b, sumb = 0;
        for (int i = 0, more = -1 ; i < l ; i++)
        {
            ch = _tmpStrToEncode.at(i);
            switch (ch)
            {
                case '%':
                ++i;
                if(i >= l)
                {
                    b = ch;
                    break;
                }
                ch = _tmpStrToEncode.at(i);
                int hb;
                if(isdigit((unsigned char)ch))
                {
                    hb = ch - '0';
                }
                else
                {
                    hb = 10 + tolower(ch) - 'a';
                }
                hb = hb & 0xF;
                ++i;
                if(i >= l)
                {
                    b = ch;
                    break;
                }
                ch = _tmpStrToEncode.at(i);
                int lb;
                if(isdigit((unsigned char)ch))
                {
                    lb = ch - '0';
                }
                else
                {
                    lb = 10 + tolower(ch) - 'a';
                }
                lb = lb & 0xF;
                b = (hb << 4) | lb ;
                break ;

                default:
                b = ch ;
            }
            if ((b & 0xc0) == 0x80) // 10xxxxxx (continuation byte)
            {
                sumb = (sumb << 6) | (b & 0x3f) ;   // Add 6 bits to sumb
                if (--more == 0)
                {
                WCHAR temp [2];
                temp [0]= sumb;
                temp [1]= L'\0';
                _WStrDecPath.append(temp);              // Add char to sbuf
                }
            } else if ((b & 0x80) == 0x00){ // 0xxxxxxx (yields 7 bits)
                WCHAR temp [2];
                temp [0]= b;
                temp [1]= L'\0';
                _WStrDecPath.append(temp);  // Store in sbuf
            } else if ((b & 0xe0) == 0xc0) {    // 110xxxxx (yields 5 bits)
                sumb = b & 0x1f;
                more = 1;   // Expect 1 more byte
            } else if ((b & 0xf0) == 0xe0) {    // 1110xxxx (yields 4 bits)
                sumb = b & 0x0f;
                more = 2;   // Expect 2 more bytes
            } else if ((b & 0xf8) == 0xf0) {    // 11110xxx (yields 3 bits)
                sumb = b & 0x07;
                more = 3;   // Expect 3 more bytes
            } else if ((b & 0xfc) == 0xf8) {    // 111110xx (yields 2 bits)
                sumb = b & 0x03;
                more = 4;   // Expect 4 more bytes
            } else /*if ((b & 0xfe) == 0xfc)*/ {    // 1111110x (yields 1 bit)
                sumb = b & 0x01;
                more = 5;   // Expect 5 more bytes
            } /* No need to test if the UTF-8 encoding is well-formed */
        }
        return _WStrDecPath;
    }

还有一件事，输入应该是字符串，返回必须是 wstring。

c++ - 在 C++ 中检测无效的 URL/URI 编码序列

0 回答 0

Related

Reference