我有一个 MDB 数据库,其中包含有关论坛帖子的以下属性:
thread
author (posted in the thread)
children (a list of authors who replied to the post)
child_count (number of children in the list)
我正在尝试使用以下节点构建图表:
thread
author
child authors
我的数据库中的不同作者总数超过 30,000,但生成的作者数约为 3000。或者,在总共 33000 个节点中,以下代码生成大约 5000 个。这是怎么回事?
for doc in coll.find():
thread = doc['thread'].encode('utf-8')
author_parent = doc['author'].encode('utf-8')
children = doc['children']
children_count = len(children)
#print G.nodes()
#print post_parent, author, doc['thread']
try:
if thread in G:
continue
else:
G.add_node(thread, color='red')
thread_count+=1
if author_parent in G:
G.add_edge(author_parent, thread)
else:
G.add_node(author_parent, color='green')
G.add_edge(author_parent, thread, weight=0)
author_count+=1
if doc['child_count']!=0:
for doc in children:
if doc['author'].encode("utf-8") in G:
print doc['author'].encode("utf-8"), 'in G'
G.add_edge(doc['author'].encode("utf-8"), author_parent)
else:
G.add_node(doc['author'].encode("utf-8"),color='green')
G.add_edge(doc['author'].encode("utf-8"), author_parent, weight=0)
author_count+=1
except:
print "failed"
nx.write_dot(G,PATH)
print thread_count, author_count, children_count