如jfs 的回答中所述,在提供无效的错误处理程序时,您很可能不会收到错误,因为当解码没有错误时,Python 不会检查错误处理程序是否有效。
然而,值得注意的是,这种行为是依赖于实现的。如您所见,在 CPython 中,encode 和 decode 函数在遇到错误之前不会检查错误处理程序是否存在。
相反,在 IronPython 中,encode 和 decode 函数在尝试编码/解码之前就会检查指定的错误处理程序是否存在,因此,您提供的示例代码会产生如下错误:
Traceback (most recent call last):
File ".\code.py", line 6, in <module>
LookupError: unknown error handler name 'foo23'
当然,在这种情况下,其他 Python 实现可能会有不同的行为。
我想验证 CPython 确实会等到遇到解码错误时才验证错误处理程序,而 IronPython 不会等待,所以我检查了这两种实现的源代码。
CPython
以下是 Python 2.6.2 的 unicodeobject.c 文件中 PyUnicode_DecodeUTF8Stateful 函数的代码。
这个函数似乎完成了解码 UTF-8 编码字节的大部分工作。
/* Decode `size` bytes of UTF-8 starting at `s` into a new unicode object.
 *
 * errors   - error handler name; it is only passed on to
 *            unicode_decode_call_errorhandler() under the utf8Error label,
 *            so it is not examined while the input decodes cleanly.
 * consumed - if non-NULL, a truncated trailing sequence stops decoding
 *            instead of raising an error, and *consumed receives the
 *            number of input bytes that were decoded.
 *
 * Returns the decoded unicode object, or NULL after the onError cleanup.
 */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed)
{
const char *starts = s;
int n;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
Py_ssize_t outpos;
const char *e;
PyUnicodeObject *unicode;
Py_UNICODE *p;
const char *errmsg = "";
/* Stays NULL until the first decoding error; see utf8Error below. */
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
/* Note: size will always be longer than the resulting Unicode
character count */
unicode = _PyUnicode_New(size);
if (!unicode)
return NULL;
/* Empty input: return the freshly created object as-is. */
if (size == 0) {
if (consumed)
*consumed = 0;
return (PyObject *)unicode;
}
/* Unpack UTF-8 encoded data */
p = unicode->str;
e = s + size;
while (s < e) {
Py_UCS4 ch = (unsigned char)*s;
/* Single-byte (ASCII) fast path. */
if (ch < 0x80) {
*p++ = (Py_UNICODE)ch;
s++;
continue;
}
/* Sequence length for this lead byte; the table is defined outside
this excerpt. */
n = utf8_code_length[ch];
/* The sequence would run past the end of the input. */
if (s + n > e) {
if (consumed)
break;
else {
errmsg = "unexpected end of data";
startinpos = s-starts;
endinpos = size;
goto utf8Error;
}
}
switch (n) {
case 0:
errmsg = "unexpected code byte";
startinpos = s-starts;
endinpos = startinpos+1;
goto utf8Error;
case 1:
errmsg = "internal error";
startinpos = s-starts;
endinpos = startinpos+1;
goto utf8Error;
case 2:
/* The continuation byte must have the form 10xxxxxx. */
if ((s[1] & 0xc0) != 0x80) {
errmsg = "invalid data";
startinpos = s-starts;
endinpos = startinpos+2;
goto utf8Error;
}
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
/* Reject values that would have fit in a single byte. */
if (ch < 0x80) {
startinpos = s-starts;
endinpos = startinpos+2;
errmsg = "illegal encoding";
goto utf8Error;
}
else
*p++ = (Py_UNICODE)ch;
break;
case 3:
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80) {
errmsg = "invalid data";
startinpos = s-starts;
endinpos = startinpos+3;
goto utf8Error;
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
if (ch < 0x0800) {
/* Note: UTF-8 encodings of surrogates are considered
legal UTF-8 sequences;
XXX For wide builds (UCS-4) we should probably try
to recombine the surrogates into a single code
unit.
*/
errmsg = "illegal encoding";
startinpos = s-starts;
endinpos = startinpos+3;
goto utf8Error;
}
else
*p++ = (Py_UNICODE)ch;
break;
case 4:
if ((s[1] & 0xc0) != 0x80 ||
(s[2] & 0xc0) != 0x80 ||
(s[3] & 0xc0) != 0x80) {
errmsg = "invalid data";
startinpos = s-starts;
endinpos = startinpos+4;
goto utf8Error;
}
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
/* validate and convert to UTF-16 */
if ((ch < 0x10000) /* minimum value allowed for 4
byte encoding */
|| (ch > 0x10ffff)) /* maximum value allowed for
UTF-16 */
{
errmsg = "illegal encoding";
startinpos = s-starts;
endinpos = startinpos+4;
goto utf8Error;
}
#ifdef Py_UNICODE_WIDE
*p++ = (Py_UNICODE)ch;
#else
/* compute and append the two surrogates: */
/* translate from 10000..10FFFF to 0..FFFF */
ch -= 0x10000;
/* high surrogate = top 10 bits added to D800 */
*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
/* low surrogate = bottom 10 bits added to DC00 */
*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
break;
default:
/* Other sizes are only needed for UCS-4 */
errmsg = "unsupported Unicode code range";
startinpos = s-starts;
endinpos = startinpos+n;
goto utf8Error;
}
/* Successful decode of an n-byte sequence: advance and skip the
error block below. */
s += n;
continue;
/* Only reached via goto on a decoding error. This is the sole place in
this function where `errors` is used, so the handler name is first
resolved here. The callee receives the addresses of s, unicode and p
and may update them. */
utf8Error:
outpos = p-PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
"utf8", errmsg,
starts, size, &startinpos, &endinpos, &exc, &s,
&unicode, &outpos, &p))
goto onError;
}
if (consumed)
*consumed = s-starts;
/* Adjust length */
if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
goto onError;
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return (PyObject *)unicode;
/* Failure path: drop the references held here and discard the partial
result. */
onError:
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
Py_DECREF(unicode);
return NULL;
}
我们可以看到这个函数调用了另一个函数 unicode_decode_call_errorhandler,
它才是实际使用错误处理程序的函数。该函数的代码如下:
/* Invoke the decoding error handler for one malformed input range.
 *
 * - Resolves the handler named by `errors` through PyCodec_LookupError()
 *   the first time it is needed (when *errorHandler is NULL) and stores
 *   it in *errorHandler for later calls.
 * - Creates *exceptionObject on first use, otherwise updates its start,
 *   end and reason.
 * - Calls the handler, which must return a (unicode, int) tuple: the
 *   replacement text and the input position at which to resume.
 * - Copies the replacement into *output (resizing it when required) and
 *   updates *outptr, *outpos, *endinpos and *inptr.
 *
 * Returns 0 on success and -1 on any failure.
 */
static
int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
const char *encoding, const char *reason,
const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
{
static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
PyObject *restuple = NULL;
PyObject *repunicode = NULL;
Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Py_ssize_t requiredsize;
Py_ssize_t newpos;
Py_UNICODE *repptr;
Py_ssize_t repsize;
int res = -1;
/* Lazy lookup: a failed lookup of the handler name is reported here,
not before decoding started. */
if (*errorHandler == NULL) {
*errorHandler = PyCodec_LookupError(errors);
if (*errorHandler == NULL)
goto onError;
}
/* Create the exception on the first error; reuse and update it on
later ones. */
if (*exceptionObject == NULL) {
*exceptionObject = PyUnicodeDecodeError_Create(
encoding, input, insize, *startinpos, *endinpos, reason);
if (*exceptionObject == NULL)
goto onError;
}
else {
if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
goto onError;
if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
goto onError;
if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
goto onError;
}
restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
if (restuple == NULL)
goto onError;
if (!PyTuple_Check(restuple)) {
/* &argparse[4] skips the "O!n;" prefix, leaving only the message. */
PyErr_SetString(PyExc_TypeError, &argparse[4]);
goto onError;
}
if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
goto onError;
/* A negative position counts back from the end of the input. */
if (newpos<0)
newpos = insize+newpos;
if (newpos<0 || newpos>insize) {
PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
goto onError;
}
/* need more space? (at least enough for what we
have+the replacement+the rest of the string (starting
at the new input position), so we won't have to check space
when there are no errors in the rest of the string) */
repptr = PyUnicode_AS_UNICODE(repunicode);
repsize = PyUnicode_GET_SIZE(repunicode);
requiredsize = *outpos + repsize + insize-newpos;
if (requiredsize > outsize) {
/* Grow to at least twice the current size. */
if (requiredsize<2*outsize)
requiredsize = 2*outsize;
if (_PyUnicode_Resize(output, requiredsize) < 0)
goto onError;
/* Recompute the write pointer from the (possibly new) buffer. */
*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
}
*endinpos = newpos;
*inptr = input + newpos;
Py_UNICODE_COPY(*outptr, repptr, repsize);
*outptr += repsize;
*outpos += repsize;
/* we made it! */
res = 0;
/* Shared exit for success and failure; res tells them apart. */
onError:
Py_XDECREF(restuple);
return res;
}
由于 PyUnicode_DecodeUTF8Stateful 在调用 unicode_decode_call_errorhandler 时传入的错误处理程序为 NULL,
unicode_decode_call_errorhandler 会调用 PyCodec_LookupError,最终由后者验证所提供的错误处理程序。请参阅下面的代码。
PyObject *PyCodec_LookupError(const char *name)
{
PyObject *handler = NULL;
PyInterpreterState *interp = PyThreadState_GET()->interp;
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
return NULL;
if (name==NULL)
name = "strict";
handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
if (!handler)
PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
else
Py_INCREF(handler);
return handler;
}
请注意,PyUnicode_DecodeUTF8Stateful 中调用 unicode_decode_call_errorhandler 的代码位于
utf8Error 标签下,只有在解码中遇到错误时才能到达该标签。
IronPython
在 IronPython 2.7.9 中,解码由下面的 StringOps.DoDecode 函数(位于 StringOps.cs 中)处理。
/// <summary>
/// Decodes the bytes of <paramref name="s"/> with encoding <paramref name="e"/>.
/// When FEATURE_ENCODING is defined, the decoder fallback is chosen from
/// <paramref name="errors"/> before any decoding takes place; a name that is
/// not one of the five handled explicitly is resolved through
/// PythonOps.LookupEncodingError at that point.
/// </summary>
/// <param name="numBytes">
/// Set to the byte count after the starting offset, then reduced by the
/// length of any bad bytes recorded by an ExceptionFallBack.
/// </param>
internal static string DoDecode(CodeContext context, string s, string errors, string encoding, Encoding e, bool final, out int numBytes) {
    byte[] raw = s.MakeByteArray();
    int offset = GetStartingOffset(e, raw);
    numBytes = raw.Length - offset;

#if FEATURE_ENCODING
    // CLR decoder exceptions map 1-1 onto Python's, so work on a clone of
    // the encoding and install a throwing fallback for strict mode.
    e = (Encoding)e.Clone();
    if (errors == "backslashreplace" || errors == "xmlcharrefreplace" || errors == "strict") {
        if (final) {
            e.DecoderFallback = DecoderFallback.ExceptionFallback;
        } else {
            e.DecoderFallback = new ExceptionFallBack(numBytes, e is UTF8Encoding);
        }
    } else if (errors == "replace") {
        e.DecoderFallback = ReplacementFallback;
    } else if (errors == "ignore") {
        e.DecoderFallback = new PythonDecoderFallback(encoding, s, null);
    } else {
        // Any other name is resolved now, before decoding starts.
        var handler = LightExceptions.CheckAndThrow(PythonOps.LookupEncodingError(context, errors));
        e.DecoderFallback = new PythonDecoderFallback(encoding, s, handler);
    }
#endif

    string text = e.GetString(raw, offset, numBytes);

#if FEATURE_ENCODING
    // Exclude bytes the non-final strict fallback recorded as bad.
    var strictFallback = e.DecoderFallback as ExceptionFallBack;
    if (strictFallback != null) {
        byte[] rejected = strictFallback.buffer.badBytes;
        if (rejected != null) {
            numBytes -= rejected.Length;
        }
    }
#endif

    return text;
}
这里,DoDecode 函数在解码之前就在 switch 语句中创建了错误处理程序。如果 errors 中的错误处理程序名称不是 switch 中列出的名称之一,
DoDecode 会创建一个 PythonDecoderFallback 对象,其错误处理程序通过 PythonOps.LookupEncodingError(如下所示)查找得到。
/// <summary>
/// Returns the error handler stored under <paramref name="name"/> in the
/// language context's ErrorHandlers dictionary. When there is no such entry,
/// returns the result of LightExceptions.Throw wrapping a LookupError
/// instead.
/// </summary>
[LightThrowing]
internal static object LookupEncodingError(CodeContext/*!*/ context, string name) {
    Dictionary<string, object> registry = context.LanguageContext.ErrorHandlers;
    lock (registry) {
        object handler;
        if (!registry.TryGetValue(name, out handler)) {
            return LightExceptions.Throw(PythonOps.LookupError("unknown error handler name '{0}'", name));
        }
        return handler;
    }
}
当 LookupEncodingError 在 errorHandlers 字典中找不到具有给定 name 的错误处理程序时,它会“抛出”一个 LookupError 的 LightException——即,它创建一个 LightException 对象并返回它。然后该对象由 LightExceptions.CheckAndThrow 函数检查,最终在 IronPython 中使用无效错误处理程序调用 decode 时产生“未知错误处理程序名称”错误。
同样,所有这些都发生在 DoDecode 调用 Encoding 对象的 GetString 方法之前,因此无论是否存在解码错误,IronPython 都会在使用无效的错误处理程序时产生错误。