python - 使用 ijson 和 f.seek(0) 读取大文件花费太多时间

Question

我有10 万个非常大的 JSON 文件，需要对特定元素进行处理。为了避免内存过载，我使用了一个名为ijson的 python 库，当我处理每个对象时，它工作得很好，f.seek(0)但它使我的处理非常慢。另外，如果我删除此f.seek(0)输出，则会出现错误

过早的EOF

部分 JSON：

{
"info": {
    "added": 1638217153.782366, 
    "started": 1638261651.130148, 
    "duration": 15, 
    "ended": 1638261666.212257, 
    "owner": null, 
    "score": 0.2, 
    "id": 5062, 
    "category": "file", 
    "git": {
        "head": "13cbe0d9e457be3673304533043e992ead1ea9b2", 
        "fetch_head": "13cbe0d9e457be3673304533043e992ead1ea9b2"
    }, 
    "monitor": "2deb9ccd75d5a7a3fe05b2625b03a8639d6ee36b", 
    "package": "dll", 
    "route": "internet", 
    "custom": null, 
    "machine": {
        "status": "stopped", 
        "name": "192.168.56.1012", 
        "label": "192.168.56.1012", 
        "manager": "VirtualBox", 
        "started_on": "2021-11-30 08:40:51", 
        "shutdown_on": "2021-11-30 08:41:06"
    }, 
    "platform": "windows", 
    "version": "2.0.7", 
    "options": "procmemdump=yes,route=internet"
}, 
"network": {
    "domains": [
        {
            "ip": "87.101.200.41", 
            "domain": "www.msftncsi.com"
        }, 
        {
            "ip": "131.107.255.255", 
            "domain": "dns.msftncsi.com"
        }
    ]
}, 
"signatures": [
    {
        "families": [], 
        "description": "This executable has a PDB path", 
        "severity": 1, 
        "ttp": {}, 
        "markcount": 1, 
        "references": [], 
        "marks": [
            {
                "category": "pdb_path", 
                "ioc": "MsiHnd.pdb", 
                "type": "ioc", 
                "description": null
            }
        ], 
        "name": "has_pdb"
    }
],
"behavior": {
    "generic": [
        {
            "process_path": "C:\\Windows\\System32\\lsass.exe", 
            "process_name": "lsass.exe", 
            "pid": 496, 
            "summary": {}, 
            "first_seen": 1638224353.328125, 
            "ppid": 380
        }
    ], 
    "apistats": {
        "2336": {
            "NtQueryValueKey": 2, 
            "LdrUnloadDll": 1, 
            "NtCreateSection": 1, 
            "LoadStringW": 2, 
            "CreateActCtxW": 4, 
            "NtOpenKey": 2, 
            "NtUnmapViewOfSection": 4, 
            "MessageBoxTimeoutW": 1, 
            "SetUnhandledExceptionFilter": 1, 
            "SetErrorMode": 1, 
            "NtCreateFile": 1, 
            "NtClose": 17, 
            "GetSystemTimeAsFileTime": 1, 
            "LdrLoadDll": 1, 
            "NtTerminateProcess": 3, 
            "GetFileAttributesW": 2, 
            "NtMapViewOfSection": 1
        }
    },
    "summary": {
        "file_opened": [
            "C:\\Windows\\System32\\en-US\\KERNELBASE.dll.mui"
        ]
    }
}

}

当前代码：（多个f.seek(0)）不想使用:)

my_file_list = [f for f in glob.glob("data/jsons/test.json")]
final_result = []
i = 0
for filename in my_file_list:
    try:
        with open(filename, 'r', encoding='utf8', errors='ignore') as f:
            row = {}
            parse_events = ijson.parse(f, use_float=True)
            for prefix, event, value in parse_events:
                if prefix == 'info.added':
                    row['added'] = value
                elif prefix == 'info.started':
                    row['started'] = value
                elif prefix == 'info.duration':
                     row['duration'] = value
                elif prefix == 'info.ended':
                     row['ended'] = value
                elif prefix == 'info' and event == 'end_map':
                    break
            
            f.seek(0)
            row['AF-DomainCount'] = sum(1 for _ in ijson.items(f, 'network.domains.item'))
            f.seek(0)
            row['AG-SignatureCount'] = sum(1 for _ in ijson.items(f, 'signatures.item'))
            f.seek(0)
            row['AH-GenericCount'] = sum(1 for _ in ijson.items(f, 'behavior.generic.item'))
            f.seek(0)
            row['AI-ApistatCount'] = sum(1 for _ in ijson.items(f, 'behavior.apistats'))
            f.seek(0)
            row['AJ-ProcessCount'] = sum(1 for _ in ijson.items(f, 'behavior.processes.item'))
            f.seek(0)
            row['AK-SummaryCount'] = sum(1 for _ in ijson.items(f, 'behavior.summary'))

            f.seek(0)
            apistats_element = ijson.items(f, 'behavior.apistats')
            for inner_apistats in apistats_element:
                for index, inner_fields in inner_apistats.items():
                    row = dict(Counter(row) + Counter(inner_fields))
            
            row['AA-Filename'] = os.path.basename(filename)
            i+=1
            print(f"processed file {i}", end='\r')
            
    except Exception as e:
        #pass
        print(f"Filename {filename} has issue with {e}")
        row = {}
    
    if row:        
        final_result.append(row)

输出：行

{
 'added': 1638217153.782366,
 'started': 1638261651.130148,
 'duration': 15,
 'ended': 1638261666.212257,
 'AF-DomainCount': 2,
 'AG-SignatureCount': 1,
 'AH-GenericCount': 1,
 'AI-ApistatCount': 1,
 'AK-SummaryCount': 1,
 'NtQueryValueKey': 2,
 'LdrUnloadDll': 1,
 'NtCreateSection': 1,
 'LoadStringW': 2,
 'CreateActCtxW': 4,
 'NtOpenKey': 2,
 'NtUnmapViewOfSection': 4,
 'MessageBoxTimeoutW': 1,
 'SetUnhandledExceptionFilter': 1,
 'SetErrorMode': 1,
 'NtCreateFile': 1,
 'NtClose': 17,
 'GetSystemTimeAsFileTime': 1,
 'LdrLoadDll': 1,
 'NtTerminateProcess': 3,
 'GetFileAttributesW': 2,
 'NtMapViewOfSection': 1,
 'AA-Filename': 'test.json'
}

不确定这是 使用 python ijson 读取具有多个 json 对象的大型 json 文件的原因，而 ijson 无法同时处理多个 json 元素。

另外，让我知道任何其他 python 包或任何可以处理大尺寸 JSON 而没有内存问题的示例示例。

编辑：如果我不使用parse_eventsf.seek(0)，则仅row['AF-DomainCount']返回正确值，其他行数为 0

row['AF-DomainCount'] = sum(1 for _ in ijson.items(parse_events, 'network.domains.item'))
row['AG-SignatureCount'] = sum(1 for _ in ijson.items(parse_events, 'signatures.item'))
row['AH-GenericCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.generic.item'))
row['AI-ApistatCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.apistats'))
row['AJ-ProcessCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.processes.item'))
row['AK-SummaryCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.summary'))

注意：这不是作业，而是我面临的现实生活问题。基本上，我需要某种解决方案来避免f.seek(0)多次使用 ijson 使我的脚本更快

python - 使用 ijson 和 f.seek(0) 读取大文件花费太多时间

0 回答 0

Related

Reference