我编写了一段代码,用来构建一个包含 379613734 条边的图。
但是由于内存不足,代码无法运行完成。当它处理到第 6200 万行时,已经占用了服务器约 97% 的内存,所以我只好终止了该进程。
import os, sys
import time
import networkx as nx
# Build an undirected graph from a whitespace-separated edge list (the first
# two fields of each line are treated as "follower" and "followee"), report
# loading progress every million lines, then print summary statistics.
G = nx.Graph()
ptime = time.time()

# Alternate small input for testing: "./test_network.txt"
# The context manager guarantees the file handle is closed even if graph
# construction fails part-way through the input.
with open("./US_Health_Links.txt", 'r') as edge_file:
    for j, line in enumerate(edge_file, 1):
        # split() with no arguments already discards surrounding whitespace,
        # so the line only needs to be tokenized once.
        fields = line.split()

        # Skip blank or truncated lines instead of raising IndexError after
        # a potentially very long load.
        if len(fields) >= 2:
            follower, followee = fields[0], fields[1]
            G.add_edge(follower, followee)

        if j % 1000000 == 0:
            # Single pre-formatted argument so the output is the same under
            # both the print statement (Python 2) and the print function.
            print("%s million lines done %s"
                  % (j * 1.0 / 1000000, time.time() - ptime))
            ptime = time.time()

# NOTE(review): G is undirected, so (per the networkx documentation)
# to_directed() returns a copy containing both (u, v) and (v, u) for every
# edge. That duplicates the graph in memory, and the in/out degrees taken
# from DG below will equal each other rather than reflect the
# follower -> followee direction. Confirm whether a DiGraph built directly
# from the edge list is what is actually wanted.
DG = G.to_directed()

Nn_G = G.number_of_nodes()
N_CC = nx.number_connected_components(G)

# Indexing with [0] requires a networkx release in which
# connected_component_subgraphs returns a list; presumably the first entry
# is the largest component -- TODO confirm for the installed version.
LCC = nx.connected_component_subgraphs(G)[0]
n_LCC = LCC.nodes()
Nn_LCC = LCC.number_of_nodes()

inDegree = DG.in_degree()
outDegree = DG.out_degree()
Density = nx.density(G)

# Currently disabled metrics:
# Diameter = nx.diameter(G)
# Centrality = nx.betweenness_centrality(PDG, normalized=True, weighted_edges=False)
# Clustering = nx.average_clustering(G)

summary = (
    "number of nodes in G\t" + str(Nn_G) + '\n'
    + "number of CC in G\t" + str(N_CC) + '\n'
    + "number of nodes in LCC\t" + str(Nn_LCC) + '\n'
    + "Density of G\t" + str(Density) + '\n'
)
print(summary)
1000 1001
1000245 1020191
1000 10267352
1000653 10957902
1000 11039092
1000 1118691
10346 11882
1000 1228281
1000 1247041
1000 12965332
121340 13027572
1000 13075072
1000 13183162
1000 13250162
1214 13326292
1000 13452672
1000 13844892
1000 14061830
12340 1406481
1000 14134703
1000 14216951
1000 14254402
12134 14258044
1000 14270791
1000 14278978
12134 14313332
1000 14392970
1000 14441172
1000 14497568
1000 14502775
1000 14595635
1000 14620544
1000 14632615
10234 14680596
1000 14956164
10230 14998341
112000 15132211
1000 15145450
100 15285998
1000 15288974
1000 15300187
1000 1532061
1000 15326300
最后,有没有人有分析 Twitter 链接(关注关系)数据的经验?我在构建有向图并计算节点的平均/中位数入度和出度时遇到了困难。有什么建议或思路吗?