我有10 万个非常大的 JSON 文件,需要对特定元素进行处理。为了避免内存过载,我使用了一个名为ijson的 python 库,当我处理每个对象时,它工作得很好,f.seek(0)
但它使我的处理非常慢。另外,如果我删除此f.seek(0)
输出,则会出现错误
过早的EOF
部分 JSON:
{
"info": {
"added": 1638217153.782366,
"started": 1638261651.130148,
"duration": 15,
"ended": 1638261666.212257,
"owner": null,
"score": 0.2,
"id": 5062,
"category": "file",
"git": {
"head": "13cbe0d9e457be3673304533043e992ead1ea9b2",
"fetch_head": "13cbe0d9e457be3673304533043e992ead1ea9b2"
},
"monitor": "2deb9ccd75d5a7a3fe05b2625b03a8639d6ee36b",
"package": "dll",
"route": "internet",
"custom": null,
"machine": {
"status": "stopped",
"name": "192.168.56.1012",
"label": "192.168.56.1012",
"manager": "VirtualBox",
"started_on": "2021-11-30 08:40:51",
"shutdown_on": "2021-11-30 08:41:06"
},
"platform": "windows",
"version": "2.0.7",
"options": "procmemdump=yes,route=internet"
},
"network": {
"domains": [
{
"ip": "87.101.200.41",
"domain": "www.msftncsi.com"
},
{
"ip": "131.107.255.255",
"domain": "dns.msftncsi.com"
}
]
},
"signatures": [
{
"families": [],
"description": "This executable has a PDB path",
"severity": 1,
"ttp": {},
"markcount": 1,
"references": [],
"marks": [
{
"category": "pdb_path",
"ioc": "MsiHnd.pdb",
"type": "ioc",
"description": null
}
],
"name": "has_pdb"
}
],
"behavior": {
"generic": [
{
"process_path": "C:\\Windows\\System32\\lsass.exe",
"process_name": "lsass.exe",
"pid": 496,
"summary": {},
"first_seen": 1638224353.328125,
"ppid": 380
}
],
"apistats": {
"2336": {
"NtQueryValueKey": 2,
"LdrUnloadDll": 1,
"NtCreateSection": 1,
"LoadStringW": 2,
"CreateActCtxW": 4,
"NtOpenKey": 2,
"NtUnmapViewOfSection": 4,
"MessageBoxTimeoutW": 1,
"SetUnhandledExceptionFilter": 1,
"SetErrorMode": 1,
"NtCreateFile": 1,
"NtClose": 17,
"GetSystemTimeAsFileTime": 1,
"LdrLoadDll": 1,
"NtTerminateProcess": 3,
"GetFileAttributesW": 2,
"NtMapViewOfSection": 1
}
},
"summary": {
"file_opened": [
"C:\\Windows\\System32\\en-US\\KERNELBASE.dll.mui"
]
}
}
}
当前代码:(多个f.seek(0)
)不想使用:)
my_file_list = [f for f in glob.glob("data/jsons/test.json")]
final_result = []
i = 0
for filename in my_file_list:
try:
with open(filename, 'r', encoding='utf8', errors='ignore') as f:
row = {}
parse_events = ijson.parse(f, use_float=True)
for prefix, event, value in parse_events:
if prefix == 'info.added':
row['added'] = value
elif prefix == 'info.started':
row['started'] = value
elif prefix == 'info.duration':
row['duration'] = value
elif prefix == 'info.ended':
row['ended'] = value
elif prefix == 'info' and event == 'end_map':
break
f.seek(0)
row['AF-DomainCount'] = sum(1 for _ in ijson.items(f, 'network.domains.item'))
f.seek(0)
row['AG-SignatureCount'] = sum(1 for _ in ijson.items(f, 'signatures.item'))
f.seek(0)
row['AH-GenericCount'] = sum(1 for _ in ijson.items(f, 'behavior.generic.item'))
f.seek(0)
row['AI-ApistatCount'] = sum(1 for _ in ijson.items(f, 'behavior.apistats'))
f.seek(0)
row['AJ-ProcessCount'] = sum(1 for _ in ijson.items(f, 'behavior.processes.item'))
f.seek(0)
row['AK-SummaryCount'] = sum(1 for _ in ijson.items(f, 'behavior.summary'))
f.seek(0)
apistats_element = ijson.items(f, 'behavior.apistats')
for inner_apistats in apistats_element:
for index, inner_fields in inner_apistats.items():
row = dict(Counter(row) + Counter(inner_fields))
row['AA-Filename'] = os.path.basename(filename)
i+=1
print(f"processed file {i}", end='\r')
except Exception as e:
#pass
print(f"Filename {filename} has issue with {e}")
row = {}
if row:
final_result.append(row)
输出:行
{
'added': 1638217153.782366,
'started': 1638261651.130148,
'duration': 15,
'ended': 1638261666.212257,
'AF-DomainCount': 2,
'AG-SignatureCount': 1,
'AH-GenericCount': 1,
'AI-ApistatCount': 1,
'AK-SummaryCount': 1,
'NtQueryValueKey': 2,
'LdrUnloadDll': 1,
'NtCreateSection': 1,
'LoadStringW': 2,
'CreateActCtxW': 4,
'NtOpenKey': 2,
'NtUnmapViewOfSection': 4,
'MessageBoxTimeoutW': 1,
'SetUnhandledExceptionFilter': 1,
'SetErrorMode': 1,
'NtCreateFile': 1,
'NtClose': 17,
'GetSystemTimeAsFileTime': 1,
'LdrLoadDll': 1,
'NtTerminateProcess': 3,
'GetFileAttributesW': 2,
'NtMapViewOfSection': 1,
'AA-Filename': 'test.json'
}
不确定这是 使用 python ijson 读取具有多个 json 对象的大型 json 文件的原因,而 ijson 无法同时处理多个 json 元素。
另外,让我知道任何其他 python 包或任何可以处理大尺寸 JSON 而没有内存问题的示例示例。
编辑:如果我不使用parse_eventsf.seek(0)
,则仅row['AF-DomainCount']
返回正确值,其他行数为 0
row['AF-DomainCount'] = sum(1 for _ in ijson.items(parse_events, 'network.domains.item'))
row['AG-SignatureCount'] = sum(1 for _ in ijson.items(parse_events, 'signatures.item'))
row['AH-GenericCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.generic.item'))
row['AI-ApistatCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.apistats'))
row['AJ-ProcessCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.processes.item'))
row['AK-SummaryCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.summary'))
注意:这不是作业,而是我面临的现实生活问题。基本上,我需要某种解决方案来避免f.seek(0)
多次使用 ijson 使我的脚本更快