我目前正在根据结束时间对 JSON 文件中的 netflow 数据进行排序。我把所有数据放入一个字典中,其中键是结束时间(只保留小时和分钟,因此多条数据会归入同一个时间键)。然而,这需要一点时间——不超过几秒钟,但仍然太长了。有什么好方法可以降低它的时间复杂度(大 O)?我目前的做法是:先逐行读取文件,提取每行的结束时间,并创建一个字典,其键是结束时间的小时/分钟,值是空集合;然后再遍历这个字典,把结束时间与键相符的行添加到该键对应的集合中。
编辑:下面是 JSON 数据的一个示例,即文件中的一行。我正在处理的文件有将近 300,000 行。
{
"@timestamp": "2015-05-18T19:26:08.000Z",
"netflow": {
"version": "9",
"flow_seq_num": "188185",
"flowset_id": "257",
"last_switched": "2015-05-15T14:28:02.999Z",
"first_switched": "2015-05-15T14:27:38.999Z",
"in_bytes": "71",
"in_pkts": "1",
"input_snmp": "5",
"output_snmp": "4",
"ipv4_src_addr": "192.1.44.133",
"ipv4_dst_addr": "10.10.1.4",
"protocol": "6",
"src_tos": "0",
"dst_tos": "2",
"l4_src_port": "12373",
"l4_dst_port": "80",
"flow_sampler_id": "0",
"ipv4_next_hop": "10.10.1.5",
"dst_mask": "2",
"src_mask": "31",
"tcp_flags": "6",
"direction": "0"
},
"@version": "1",
"host": "192.168.19.202",
"src_host_name": "",
"dst_host_name": "",
"app_name": "",
"tcp_flags_str": "",
"dscp": "",
"highval": "",
"src_blacklisted": "0",
"dst_blacklisted": "0",
"invalid_ToS": "0",
"bytes_per_packet": 71,
"tcp_nominal_payload": "0",
"malformed_ip": "0",
"empty_tcp": "0",
"short_tcp_handshake": "0",
"icmp_malformed_packets": "0",
"snort_attack_flow": "0",
"empty_udp": "0",
"short_udp": "0",
"short_tcp_rstack": "0",
"short_tcp_pansf": "0",
"short_tcp_synack": "0",
"short_tcp_synrst": "0",
"short_tcp_finack": "0",
"short_tcp_pna": "0",
"non_unicast_src": "0",
"multicast": "0",
"broadcast": "0",
"network": "0",
"tcp_urg": "0",
"land_attack": "0",
"short_tcp_ack": "0",
"tcp_synfin": "0",
"tcp_fin": "0",
"malformed_tcp": "1",
"tcp_xmas": "0",
"udp_echo_req": "0",
"tcp_null": "0",
"tcp_syn": "0",
"malformed_udp": "0",
"tcp_rst": "0",
"icmp_request": "0",
"icmp_response": "0",
"icmp_port_unreachable": "0",
"icmp_host_unreachable": "0",
"icmp_unreachable_for_Tos": "0",
"icmp_network_unreachable": "0",
"icmp_redirects": "0",
"icmp_time_exceeded_flows": "0",
"icmp_parameter_problem_flows": "0",
"icmp_trace_route": "0",
"icmp_datagram": "0",
"udp_echo_chargen_broadcast": "0",
"udp_chargen_echo_broadcast": "0",
"icmp_src_quench": "0",
"icmp_proto_unreachable": "0",
"udp_echo_broadcast": "0",
"udp_echo_rsp": "0"
}
至于我尝试过的代码,目前我只是把这些行转换成字典,以便访问我想用来排序的各个值。这真的很简单,我只是使用 json.loads 之类的函数来创建字典。什么样的数据结构最适合组织这类数据?我现在用的是字典,但有没有更合适的数据结构?