python - 在 python 中使用 networkx 创建图形和执行链接预测时出错

Question

我正在尝试根据一个 csv 文件构建图，该文件包含节点的边、职业和年龄信息。我为每个节点分配社区，然后执行链接预测。

import networkx as nx
import csv
# One list of names per (profession, age bracket). The numeric suffix is the
# bracket: 1 = 13-17, 2 = 18-29, 3 = 30-49, 4 = 50-64, 5 = 65 and above.
engineers1, engineers2, engineers3, engineers4, engineers5 = [], [], [], [], []
actors1, actors2, actors3, actors4, actors5 = [], [], [], [], []
writers1, writers2, writers3, writers4, writers5 = [], [], [], [], []
doctors1, doctors2, doctors3, doctors4, doctors5 = [], [], [], [], []
drivers1, drivers2, drivers3, drivers4, drivers5 = [], [], [], [], []
teachers1, teachers2, teachers3, teachers4, teachers5 = [], [], [], [], []
nodes = []
g = nx.Graph()

# Pre-create nodes with the integer ids 0..4037.
g.add_nodes_from(range(4038))

# Add one edge per CSV row, joining the 'first' and 'second' columns.
# NOTE(review): DictReader yields strings, so these endpoints are str while
# the nodes pre-created earlier are ints -- confirm the intended id type.
with open("asd1.csv",'r') as csv_file:
    g.add_edges_from(
        (row['first'], row['second']) for row in csv.DictReader(csv_file)
    )

# Sort every CSV row's name into the list for its (profession, age bracket).
#
# Fixes relative to the previous version:
#   * every test now runs once per row (the checks had drifted out of the
#     loop body because of broken indentation);
#   * ages are compared as integers, not as strings ('9' >= '65' is true
#     lexicographically);
#   * 50-64 drivers go to drivers4 and 50-64 doctors go to doctors4 (the two
#     targets were swapped).

# Inclusive (low, high) age bounds; None means "no upper limit".
age_brackets = ((13, 17), (18, 29), (30, 49), (50, 64), (65, None))

# Profession value as stored in the CSV -> its five bracket lists, in the
# same order as age_brackets.
buckets_by_profession = {
    'actor': (actors1, actors2, actors3, actors4, actors5),
    'eng': (engineers1, engineers2, engineers3, engineers4, engineers5),
    'teacher': (teachers1, teachers2, teachers3, teachers4, teachers5),
    'driver': (drivers1, drivers2, drivers3, drivers4, drivers5),
    'doctor': (doctors1, doctors2, doctors3, doctors4, doctors5),
}

# The with-statement closes the file; no explicit close() is needed.
with open("asd1.csv", 'r') as csv_file:
    for line in csv.DictReader(csv_file):
        buckets = buckets_by_profession.get(line['profession'])
        if buckets is None:
            continue  # profession that has no bracket lists here
        try:
            age = int(line['age'])
        except (TypeError, ValueError):
            continue  # blank or non-numeric age cannot be bracketed
        for (low, high), bucket in zip(age_brackets, buckets):
            if age >= low and (high is None or age <= high):
                bucket.append(line['name'])
                break  # brackets do not overlap

# Print the collected names: one line per age bracket, then a blank
# separator after each profession (print('\n') emits two newlines).
bracket_phrases = (
    "between 13 and 17",
    "between 18 and 29",
    "between 30 and 49",
    "between 50 and 64",
    "65 and above",
)
report_sections = (
    ("actors", (actors1, actors2, actors3, actors4, actors5)),
    ("engineers", (engineers1, engineers2, engineers3, engineers4, engineers5)),
    ("teachers", (teachers1, teachers2, teachers3, teachers4, teachers5)),
    ("drivers", (drivers1, drivers2, drivers3, drivers4, drivers5)),
    ("doctors", (doctors1, doctors2, doctors3, doctors4, doctors5)),
)
for profession, buckets in report_sections:
    for phrase, names in zip(bracket_phrases, buckets):
        print(profession + " having age " + phrase + ": ", names)
    print('\n')

# Give every node a default community first. cn_soundarajan_hopcroft needs
# the 'community' attribute on each node it scores, and nodes created
# implicitly by add_edge() do not have it. Graph.node was removed in
# NetworkX 2.4, so the node view g.nodes is used throughout.
for node in g.nodes:
    g.nodes[node]['community'] = 0

# Community ids 0-24 are the positions in this tuple: five age brackets for
# each of actors, engineers, teachers, drivers and doctors, in that order.
community_groups = (
    actors1, actors2, actors3, actors4, actors5,
    engineers1, engineers2, engineers3, engineers4, engineers5,
    teachers1, teachers2, teachers3, teachers4, teachers5,
    drivers1, drivers2, drivers3, drivers4, drivers5,
    doctors1, doctors2, doctors3, doctors4, doctors5,
)
for community, members in enumerate(community_groups):
    for name in members:
        # Raises KeyError when `name` is not a node id in g.
        # NOTE(review): the lists hold the CSV 'name' column while edges use
        # 'first'/'second' -- confirm these share one id space.
        g.nodes[name]['community'] = community

print(g.nodes())
# Score every non-adjacent node pair; materialise the iterator for printing.
l = list(nx.cn_soundarajan_hopcroft(g))
print(l)

score 0 · Accepted Answer

序幕

我强烈建议您阅读任何解释算法的优秀编程书籍。你的问题可以用几行代码来解决。

第一幕

看看你的问题。你有若干职业、若干年龄组，以及作为唯一标识符的姓名，你想把它们彼此区分开来。现在看看你的代码。为了解决这个问题，你为每个“年龄-职业”组合都创建了一个单独的列表。这是最难修改的一种结构。如果你必须再添加五个职业（而职业有成千上万种），你就得把代码量翻倍。此外，复制粘贴时很容易出错：只要把 merchandiser4 误写成 merchandiser3，就足以让你在接下来的一两个小时里盯着屏幕、双眼通红地排查。看，你的代码中已经有这样的错误了！

if (line['profession'] == 'doctor' and line['age'] >= '13' and line['age'] <= '17'):
    doctors1.append(line['name'])
if (line['profession'] == 'doctor' and line['age'] >= '18' and line['age'] <= '29'):
    doctors2.append(line['name'])
if (line['profession'] == 'doctor' and line['age'] >= '30' and line['age'] <= '49'):
    doctors3.append(line['name'])
if (line['profession'] == 'doctor' and line['age'] >= '50' and line['age'] <= '64'):
    # Hello, guys! I am ready to torture his brain and eyes for hours!!
    drivers4.append(line['name'])
if (line['profession'] == 'doctor' and line['age'] >= '65'):
    doctors5.append(line['name'])

而且，作为头脑中的最后一枪，你并不真的需要所有这些列表。例如，您可以为每个职业创建一个字典。或者是其他东西。但您可以注意到，您的数据对每个人都有非常频繁的模式。姓名、年龄、职业……等等，我们从哪里获取数据？CSV 文件？什么是CSV 文件？

是的。

桌子。

第二幕

如果您从表中读取数据，最好将此数据存储在表中！（嗯，大部分时间......）Python 有一个很棒的表格库——Pandas。您所有的数百行代码都可以减少到一二十行！现在仔细看看我的手，魔法开始了……

零。我们导入 pandas：

import pandas as pd

第一的。我们为年龄聚类创建了单独的函数。如果我们的大老板要我们处理 11 岁的神经科学家，我们将做好充分准备：

def get_age_cluster(age):
    """Return the age-bracket label for *age*, or None when it is negative."""
    years = int(age)  # accepts ints and numeric strings
    if years < 0:
        return None
    # (inclusive upper bound, label), ascending; first match wins
    brackets = (
        (12, '<13'), (17, '13-17'), (29, '18-29'),
        (49, '30-49'), (64, '50-64'),
    )
    for upper, label in brackets:
        if years <= upper:
            return label
    return '>64'

第二。我们读取 CSV。你在手动完成这件事：逐行处理，处理每一种可能的组合……为什么？！这是再常规不过的操作！别人早就写好了！偷个懒吧！

（多年积蓄在心里的老师傅的忠告！笑话。我没有心。）

df=pd.read_csv('TF.csv')

是的，就是这样。是的。真的。一行。二十四个字符（记住这个数字！！）。现在让我们和我们的十个小可爱成为朋友：

我们只是加载了 CSV，但我们没有转换age列。它包含年龄，但应该包含集群。不是问题！

df['age'] = df['age'].apply(get_age_cluster)

完毕！您可以将任何转换函数应用于表中的行或列。所以我们不需要对年龄进行排序，对年龄进行排序，对年龄进行排序，对aegs进行排序……我们可以写一个漂亮的单行。结果如下：

你可以注意到我们有一些垃圾列。不是问题！

# Drop the columns that are not needed (one statement per line).
df = df.drop('waka', axis=1)
df = df.drop('we_dont_need_this_column', axis=1)

我们有一张漂亮的小桌子：

现在到主要任务。根据每个职业和年龄获取所有名称。Pandas 有很多很多的分组功能。让我们使用最简单的：

grouped = df.groupby(['profession', 'age'])  # one group per (profession, age bracket) pair
for group in grouped.groups:  # each key is a (profession, age) tuple
    print(group, list(grouped.get_group(group)['name']))  # names in that group

我们得到带有职业年龄组的分组结构：grouped = df.groupby(['profession', 'age'])，并且对于此结构中的每个组：for group in grouped.groups:我们打印：print()每个组中“名称”列的列表：grouped.get_group(group)['name'])。结果如下：

('eng', '30-49') ['Cthulhu']
('driver', '18-29') ['John Doe 3']
('actor', '13-17') ['John Doe 4']
('actor', '18-29') ['Yog-Sothoth']
('teacher', '18-29') ['John Doe 2', 'Shub-Niggurath']
('eng', '>64') ['Fblthp the Lost']
('driver', '<13') ['Azathoth']
('doctor', '18-29') ['Nyarlathotep']
('doctor', '30-49') ['John Doe 1']

这是整个代码：

import pandas as pd

def get_age_cluster(age):
    """Return the age-bracket label for *age*, or None when it is negative."""
    years = int(age)  # accepts ints and numeric strings
    if years < 0:
        return None
    # (inclusive upper bound, label), ascending; first match wins
    brackets = (
        (12, '<13'), (17, '13-17'), (29, '18-29'),
        (49, '30-49'), (64, '50-64'),
    )
    for upper, label in brackets:
        if years <= upper:
            return label
    return '>64'

df=pd.read_csv('TF.csv')  # load the whole table
df['age'] = df['age'].apply(get_age_cluster)  # replace raw ages with bracket labels
df = df.drop('waka', axis=1)  # discard columns that are not needed
df = df.drop('we_dont_need_this_column', axis=1)
grouped = df.groupby(['profession', 'age'])  # one group per (profession, bracket) pair
for group in grouped.groups:  # each key is a (profession, age) tuple
    print(group, list(grouped.get_group(group)['name']))  # names in that group

二十四行。我想我们现在可以称自己为“了不起的二十四”。它就像神奇四侠，但神奇二十四。但是我们的 Graph Doom 还活着……

第三幕

我们创建了表格，进行了一些转换，对其进行了排序和过滤。但是您还有另一个问题 - 图表。而这个问题比第一个更难。

你正在从同一个文件中读取节点（人）和边（我不确定它们到底代表什么。关系？）。这给你的图带来了很强的限制——节点的数量等于边的数量。这种情况非常罕见。我认为你在开始编写这个脚本之前就已经在某处做错了。我建议你为节点和边使用不同的文件（或者至少使用同一个文件中的不同部分）。但是！让我们假设这正是你想要的，每个人（当然也包括克苏鲁！）只有一条边。在这种情况下，我们只用两行代码就可以构建出图：

G = nx.Graph()  # empty undirected graph
G.add_edges_from(df[['first', 'second']].values)  # one edge per row: first -- second

搞定！我们完成了。现在让我们来看看这个奇怪又复杂的东西：

为每个节点设置社区（注意，该算法需要这个属性）：

for n in G.nodes:
    G.nodes[n]['community'] = 0  # put every node into the same community (0)

并计算这个：

csh = nx.cn_soundarajan_hopcroft(G)  # iterator of (u, v, score) triples

我们得到一个迭代器。将其转换为列表并获得结果：

[(1, 8, 2),
 (1, 9, 0),
 (1, 2, 4),
 (1, 4, 0),
 (1, 6, 2),
 (2, 8, 2),
 (2, 9, 2),
 (2, 5, 0),
 (2, 6, 2),
 (3, 9, 0),
 (3, 4, 2),
 (3, 5, 2),
 (3, 6, 0),
 (3, 7, 4),
 (4, 8, 0),
 (4, 5, 2),
 (4, 7, 2),
 (5, 8, 0),
 (5, 9, 0),
 (5, 7, 2),
 (6, 8, 0),
 (6, 9, 2),
 (6, 7, 0),
 (7, 8, 0),
 (7, 9, 0),
 (8, 9, 0)]

总决赛

我希望你喜欢我为你写的这出小音乐剧 :) 我推荐你读一些好的 Python 编程书和算法书。祝你好运！

python - 在 python 中使用 networkx 创建图形和执行链接预测时出错

1 回答 1

Related

Reference