我正在编写一个 Web 服务,它返回包含很长列表的对象,并以 JSON 编码。当然,我们希望使用迭代器而不是 Python 列表,这样就可以从数据库中流式传输对象;不幸的是,标准库的 json.JSONEncoder(参见其内部的 _iterencode_list)只会把列表和元组编码为 JSON 列表。
文档字符串建议覆盖 default 方法以将对象转换为列表,但这意味着我们失去了流式传输的好处。以前,我们覆盖了一个私有方法,但是(正如预期的那样)它在编码器被重构时失效了。
以流方式将迭代器序列化为 Python 中的 JSON 列表的最佳方法是什么?
我正好也需要这个功能。我尝试的第一种方法是覆盖 JSONEncoder.iterencode() 方法。然而,这不起作用,因为一旦迭代器不在顶层,_iterencode() 函数内部的逻辑就会接管。
在对代码进行了一些研究之后,我发现了一个非常 hacky 的解决方案,但它确实有效。仅限 Python 3,但我确信在 Python 2 中也可以实现同样的魔法(只是魔法方法的名称不同):
import collections.abc
import json
import itertools
import sys
import resource
import time
# Wall-clock reference points shared by every log_memory() call.
starttime = time.time()
lasttime = None  # timestamp of the previous log_memory() call, None before the first


def log_memory():
    """Print the peak resident memory of this process and elapsed times.

    Prints one line containing ``ru_maxrss`` converted to megabytes, the
    seconds elapsed since the module was loaded and, from the second call
    on, the seconds elapsed since the previous call.  Updates the
    module-level ``lasttime`` as a side effect.
    """
    global lasttime

    # The divisors assume ru_maxrss is reported in kilobytes on Linux and in
    # bytes on other platforms (as on macOS) -- see getrusage(2).
    if "linux" in sys.platform.lower():
        to_MB = 1024
    else:
        to_MB = 1024 * 1024

    now = time.time()
    if lasttime is None:
        since_last = ""
    else:
        since_last = "; since last call: %.1f sec" % (now - lasttime)

    print("Memory: %.1f MB, time since start: %.1f sec%s" % (
        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / to_MB,
        now - starttime,
        since_last,
    ))
    lasttime = time.time()
class IterEncoder(json.JSONEncoder):
    """JSON encoder that also serializes arbitrary iterables as JSON arrays.

    Any object that reaches ``default()`` and is an instance of
    ``collections.abc.Iterable`` (generators, ``range``, sets, ...) is wrapped
    in a ``FakeListIterator`` so the encoder treats it as a list.

    Write directly to a file (``json.dump``) to use minimal memory.
    """

    class FakeListIterator(list):
        """A ``list`` subclass that only pretends to be a list.

        It passes the encoder's ``isinstance(x, (list, tuple))`` check but
        holds no elements itself: it only answers "am I empty?" via
        ``__bool__`` and hands out the wrapped iterator via ``__iter__``.
        Length and random access are deliberately disabled.
        """

        def __init__(self, iterable):
            # The underlying list storage is intentionally left empty.
            self.iterable = iter(iterable)
            # Pull one element ahead so emptiness is known without consuming
            # the rest of the stream.
            try:
                self.firstitem = next(self.iterable)
            except StopIteration:
                self.truthy = False
            else:
                self.truthy = True

        def __iter__(self):
            if not self.truthy:
                return iter([])
            # Put the pre-fetched element back in front of the remaining stream.
            return itertools.chain([self.firstitem], self.iterable)

        def __len__(self):
            raise NotImplementedError("Fakelist has no length")

        def __getitem__(self, i):
            raise NotImplementedError("Fakelist has no getitem")

        def __setitem__(self, i, value):
            # Needs the (index, value) signature; otherwise ``fake[i] = v``
            # fails with a TypeError about arguments instead of this error.
            raise NotImplementedError("Fakelist has no setitem")

        def __bool__(self):
            return self.truthy

    def default(self, o):
        """Wrap iterables in a ``FakeListIterator``; defer everything else.

        Raises whatever ``json.JSONEncoder.default`` raises (``TypeError``)
        for objects that are not iterable.
        """
        if isinstance(o, collections.abc.Iterable):
            return type(self).FakeListIterator(o)
        return super().default(o)
# Demo: non-empty and empty generators, at top level and nested in a dict.
print(json.dumps((i for i in range(10)), cls=IterEncoder))
print(json.dumps((i for i in range(0)), cls=IterEncoder))
print(json.dumps({"a": (i for i in range(10))}, cls=IterEncoder))
print(json.dumps({"a": (i for i in range(0))}, cls=IterEncoder))

# Memory comparison: encode a range() through IterEncoder ...
log_memory()
print("dumping 10M numbers as incrementally")
with open("/dev/null", "wt") as fp:
    # range() is not a list or tuple, so it reaches IterEncoder.default().
    json.dump(range(10000000), fp, cls=IterEncoder)
log_memory()

# ... versus a fully materialized list through the built-in encoder.
print("dumping 10M numbers built in encoder")
with open("/dev/null", "wt") as fp:
    json.dump(list(range(10000000)), fp)
log_memory()
结果:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[]
{"a": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}
{"a": []}
Memory: 8.4 MB, time since start: 0.0 sec
dumping 10M numbers as incrementally
Memory: 9.0 MB, time since start: 8.6 sec; since last call: 8.6 sec
dumping 10M numbers built in encoder
Memory: 395.5 MB, time since start: 17.1 sec; since last call: 8.5 sec
很明显,IterEncoder 不需要内存来存储 10M 个整数,同时保持相同的编码速度。
这个(hacky 的)技巧在于:_iterencode_list 实际上并不需要任何列表特有的功能。它只是想知道列表是否为空(__bool__),然后获取它的迭代器。但是,只有当 isinstance(x, (list, tuple)) 返回 True 时,才会执行到这段代码。
所以我把迭代器包装进一个 list 子类,禁用所有随机访问,并预先取出第一个元素,以便知道它是否为空,然后再把迭代器交还出去。最后,default 方法在遇到迭代器时返回这个假列表。
将其保存到模块文件中并将其导入或直接粘贴到您的代码中。
'''
Copied from Python 2.7.8 json.encoder lib, diff follows:
@@ -331,6 +331,8 @@
chunks = _iterencode(value, _current_indent_level)
for chunk in chunks:
yield chunk
+ if first:
+ yield buf
if newline_indent is not None:
_current_indent_level -= 1
yield '\n' + (' ' * (_indent * _current_indent_level))
@@ -427,12 +429,12 @@
yield str(o)
elif isinstance(o, float):
yield _floatstr(o)
- elif isinstance(o, (list, tuple)):
- for chunk in _iterencode_list(o, _current_indent_level):
- yield chunk
elif isinstance(o, dict):
for chunk in _iterencode_dict(o, _current_indent_level):
yield chunk
+ elif hasattr(o, '__iter__'):
+ for chunk in _iterencode_list(o, _current_indent_level):
+ yield chunk
else:
if markers is not None:
markerid = id(o)
'''
from json import encoder

# Python 2 provides ``basestring`` and ``long``; on Python 3 those names do
# not exist and ``str`` / ``int`` take their place.
try:
    _STRING_TYPES = basestring
    _LONG_TYPE = long
except NameError:
    _STRING_TYPES = str
    _LONG_TYPE = int


def _make_iterencode(markers, _default, _encoder, _indent, _floatstr,
        _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot,
        ## HACK: hand-optimized bytecode; turn globals into locals
        ValueError=ValueError,
        basestring=_STRING_TYPES,
        dict=dict,
        float=float,
        id=id,
        int=int,
        isinstance=isinstance,
        list=list,
        long=_LONG_TYPE,
        str=str,
        tuple=tuple,
    ):
    """Build the streaming ``_iterencode`` generator function.

    Replacement for ``json.encoder._make_iterencode`` in which any object
    that has an ``__iter__`` attribute (and is not a string or a dict) is
    emitted as a JSON array, so generators and other iterators can be
    encoded without first being turned into a list.

    Parameters:
        markers: dict used for circular-reference detection, or None to
            disable the check.
        _default: callable invoked for objects that are not otherwise
            encodable; its result is encoded in place of the object.
        _encoder: callable turning a string into its JSON representation.
        _indent: indentation per nesting level (an integer number of spaces
            or an indent string), or None for compact output.
        _floatstr: callable turning a float into its JSON representation.
        _key_separator, _item_separator: separator strings.
        _sort_keys: when true, dict items are emitted sorted by key.
        _skipkeys: when true, dict keys of unsupported type are skipped
            instead of raising TypeError.
        _one_shot: accepted for signature compatibility; not used here.

    Returns:
        A generator function ``_iterencode(o, _current_indent_level)``
        yielding string chunks.
    """
    # Accept both an integer width and a ready-made indent string.
    if _indent is not None and not isinstance(_indent, basestring):
        _indent = ' ' * _indent

    def _iterencode_list(lst, _current_indent_level):
        # Iterators and generators are always truthy, so this shortcut only
        # fires for real empty sequences.
        if not lst:
            yield '[]'
            return
        if markers is not None:
            markerid = id(lst)
            if markerid in markers:
                raise ValueError("Circular reference detected")
            markers[markerid] = lst
        buf = '['
        if _indent is not None:
            _current_indent_level += 1
            newline_indent = '\n' + _indent * _current_indent_level
            separator = _item_separator + newline_indent
            buf += newline_indent
        else:
            newline_indent = None
            separator = _item_separator
        first = True
        for value in lst:
            if first:
                first = False
            else:
                buf = separator
            if isinstance(value, basestring):
                yield buf + _encoder(value)
            elif value is None:
                yield buf + 'null'
            elif value is True:
                yield buf + 'true'
            elif value is False:
                yield buf + 'false'
            elif isinstance(value, (int, long)):
                yield buf + str(value)
            elif isinstance(value, float):
                yield buf + _floatstr(value)
            else:
                yield buf
                if isinstance(value, (list, tuple)):
                    chunks = _iterencode_list(value, _current_indent_level)
                elif isinstance(value, dict):
                    chunks = _iterencode_dict(value, _current_indent_level)
                else:
                    chunks = _iterencode(value, _current_indent_level)
                for chunk in chunks:
                    yield chunk
        if first:
            # The iterable produced nothing, so the opening bracket held in
            # ``buf`` has not been emitted yet.
            yield buf
        if newline_indent is not None:
            _current_indent_level -= 1
            yield '\n' + _indent * _current_indent_level
        yield ']'
        if markers is not None:
            del markers[markerid]

    def _iterencode_dict(dct, _current_indent_level):
        if not dct:
            yield '{}'
            return
        if markers is not None:
            markerid = id(dct)
            if markerid in markers:
                raise ValueError("Circular reference detected")
            markers[markerid] = dct
        yield '{'
        if _indent is not None:
            _current_indent_level += 1
            newline_indent = '\n' + _indent * _current_indent_level
            item_separator = _item_separator + newline_indent
            yield newline_indent
        else:
            newline_indent = None
            item_separator = _item_separator
        first = True
        if _sort_keys:
            items = sorted(dct.items(), key=lambda kv: kv[0])
        elif hasattr(dct, 'iteritems'):
            # Python 2 dicts: iterate lazily.
            items = dct.iteritems()
        else:
            items = dct.items()
        for key, value in items:
            if isinstance(key, basestring):
                pass
            # JavaScript is weakly typed for these, so it makes sense to
            # also allow them.  Many encoders seem to do something like this.
            elif isinstance(key, float):
                key = _floatstr(key)
            elif key is True:
                key = 'true'
            elif key is False:
                key = 'false'
            elif key is None:
                key = 'null'
            elif isinstance(key, (int, long)):
                key = str(key)
            elif _skipkeys:
                continue
            else:
                raise TypeError("key " + repr(key) + " is not a string")
            if first:
                first = False
            else:
                yield item_separator
            yield _encoder(key)
            yield _key_separator
            if isinstance(value, basestring):
                yield _encoder(value)
            elif value is None:
                yield 'null'
            elif value is True:
                yield 'true'
            elif value is False:
                yield 'false'
            elif isinstance(value, (int, long)):
                yield str(value)
            elif isinstance(value, float):
                yield _floatstr(value)
            else:
                if isinstance(value, (list, tuple)):
                    chunks = _iterencode_list(value, _current_indent_level)
                elif isinstance(value, dict):
                    chunks = _iterencode_dict(value, _current_indent_level)
                else:
                    chunks = _iterencode(value, _current_indent_level)
                for chunk in chunks:
                    yield chunk
        if newline_indent is not None:
            _current_indent_level -= 1
            yield '\n' + _indent * _current_indent_level
        yield '}'
        if markers is not None:
            del markers[markerid]

    def _iterencode(o, _current_indent_level):
        if isinstance(o, basestring):
            yield _encoder(o)
        elif o is None:
            yield 'null'
        elif o is True:
            yield 'true'
        elif o is False:
            yield 'false'
        elif isinstance(o, (int, long)):
            yield str(o)
        elif isinstance(o, float):
            yield _floatstr(o)
        elif isinstance(o, dict):
            for chunk in _iterencode_dict(o, _current_indent_level):
                yield chunk
        elif hasattr(o, '__iter__'):
            # Checked after strings and dicts: every remaining iterable
            # (list, tuple, generator, ...) is emitted as a JSON array.
            for chunk in _iterencode_list(o, _current_indent_level):
                yield chunk
        else:
            if markers is not None:
                markerid = id(o)
                if markerid in markers:
                    raise ValueError("Circular reference detected")
                markers[markerid] = o
            o = _default(o)
            for chunk in _iterencode(o, _current_indent_level):
                yield chunk
            if markers is not None:
                del markers[markerid]

    return _iterencode


# Monkeypatch the standard library so json.dump()/iterencode() pick up the
# iterator-aware implementation.
encoder._make_iterencode = _make_iterencode
json 并不能很好地支持真正的流式传输,因为这也意味着客户端应用程序同样必须支持流式传输。有一些 Java 库支持读取流式的 json 数据,但并不是很通用。还有一些针对 yajl 的 Python 绑定,yajl 是一个支持流式处理的 C 库。
也许您可以使用 YAML 而不是 JSON。YAML 是 JSON 的超集,它对双方的流式传输都有更好的支持,并且任何 json 消息仍然是有效的 yaml。
但是在您的情况下,将对象流拆分为由多条单独的 json 消息组成的流可能要简单得多。
另请参阅此处的讨论,哪些客户端库支持流式传输:是否有 JSON 的流式传输 API?
没那么简单。WSGI(这是大多数人使用的)协议不支持流式传输。支持它的服务器违反了规范。
即使您使用不符合规范的服务器,也必须使用 ijson 之类的东西。另外也可以看看这位和您遇到同样问题的人写的文章:http://www.enricozini.org/2011/tips/python-stream-json/
编辑:然后这一切都归结为客户端,我想它将用Javascript(?)编写。但我看不出如何从不完整的 JSON 块中构造 javascript(或任何语言)对象。我唯一能想到的就是手动将长 JSON 分解为更小的 JSON 对象(在服务器端),然后将其一一传输到客户端。但这需要 websockets 而不是无状态的 http 请求/响应。如果 Web 服务是指 REST API,那么我想这不是您想要的。