我问了一个关于将一堆 RDF 三元组转换为多路数组表示的问题。我的解决方案看起来笨拙而且......丑陋。我尝试在我的解决方案中使用函数,但我发现这是不必要的,因为使用函数的优势并不明显。我想也许我的编程思想是面向过程的,而不是面向对象的。
这是我的解决方案(这是没有使用任何函数的原始版本):
import numpy as np
from numpy import ones
from scipy.sparse import coo_matrix
# Parallel lists holding the subject, predicate and object of every triple.
s = []
p = []
o = []
# Each line of 'test.txt' has the form "(subject, predicate, object)".
# The `with` block closes the file handle as soon as parsing is finished.
with open('test.txt') as triples_file:
    for row in triples_file:
        if not row.strip():
            # Blank lines (e.g. a trailing newline) carry no triple.
            continue
        s.append(row.split('(')[1].split(',')[0])
        p.append(row.split('(')[1].split(', ')[1])
        o.append(row.split(')')[0].split(', ')[2])

# Subjects and objects share one index space; predicates get their own.
l = s + o
mapping = {v: i for (i, v) in enumerate(sorted(set(l)))}
mapping_p = {v: i for (i, v) in enumerate(sorted(set(p)))}

# [index, name] pairs for every entry of s, p and o.
n_s = [[mapping[v], v] for v in s]
n_p = [[mapping_p[v], v] for v in p]
n_o = [[mapping[v], v] for v in o]

# Index-only views of the pairs above.
cood_s = [pair[0] for pair in n_s]
cood_p = [pair[0] for pair in n_p]
cood_o = [pair[0] for pair in n_o]

# One [subject_index, object_index, predicate_index] row per triple.
cood = zip(cood_s, cood_o, cood_p)
data = [list(i) for i in cood]

m = len(set(p))  # m is the number of predicates
n = len(set(l))  # n is the number of entities

# data1[k] is an ndarray holding the rows of `data` whose predicate index is k.
data1 = [
    np.array([triple for triple in data if triple[2] == k])
    for k in range(m)
]

# T[k] is the dense n-by-n array for predicate k, built from a COO matrix
# whose values are ones placed at the (subject_index, object_index) pairs.
T = [
    np.array(
        coo_matrix(
            (ones(len(coords)), (coords[:, 0], coords[:, 1])),
            shape=(n, n),
        ).todense()
    )
    for coords in data1
]
好的,希望你不会觉得无聊。接下来我用函数把一些语句组合起来,如下:
def split_data_matrix(filename):
    """Parse a triple file into subject, object and predicate lists.

    Each non-blank line is read as ``predicate("subject", "object")``: the
    subject and object are the first and second double-quoted fields, and
    the predicate is everything before the first ``(``. Blank lines are
    skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(s, o, p, l)`` where ``s``, ``o`` and ``p`` are parallel
        lists of subjects, objects and predicates, and ``l`` is ``s + o``.

    Raises:
        IndexError: If a non-blank line lacks the expected quoted fields.
    """
    s = []
    p = []
    o = []
    # The context manager guarantees the file handle is closed, even when a
    # malformed line raises part-way through.
    with open(filename) as triples_file:
        for row in triples_file:
            if not row.strip():
                continue
            quoted = row.split('"')  # split once, reuse for subject and object
            s.append(quoted[1])
            o.append(quoted[3])
            p.append(row.split('(')[0])
    return s, o, p, s + o
# Parse 'test.txt' into subject, object and predicate lists plus the combined
# subject+object list.
s, o, p, l = split_data_matrix('test.txt')
def generate_mapping(l, p, s=None, o=None):
    """Attach a dense integer index to every subject, predicate and object.

    Entities (subjects and objects) share one index space, numbered in
    sorted order of the distinct values in ``l``; predicates are numbered
    separately in sorted order of the distinct values in ``p``.

    Args:
        l: All entities, laid out as the subjects followed by the objects.
        p: The predicates, one per triple.
        s: Optional explicit subject list. When omitted it is taken to be
            the first ``len(p)`` items of ``l``.
        o: Optional explicit object list. When omitted it is taken to be
            the items of ``l`` after the first ``len(p)``.

    Returns:
        A tuple ``(n_s, n_p, n_o)`` of lists of ``[index, name]`` pairs, in
        the same order as ``s``, ``p`` and ``o``.

    Raises:
        ValueError: If ``s`` or ``o`` must be recovered from ``l`` but ``l``
            does not hold exactly two entities per predicate.
        KeyError: If an explicit ``s`` or ``o`` contains a value absent
            from ``l``.
    """
    if s is None or o is None:
        # Recover the two halves from ``l`` instead of silently reading
        # module-level variables.
        half = len(p)
        if len(l) != 2 * half:
            raise ValueError(
                "l must contain the subjects followed by the objects "
                "(two entities per predicate)"
            )
        if s is None:
            s = l[:half]
        if o is None:
            o = l[half:]
    mapping = {v: i for (i, v) in enumerate(sorted(set(l)))}
    mapping_p = {v: i for (i, v) in enumerate(sorted(set(p)))}
    n_s = [[mapping[v], v] for v in s]
    n_p = [[mapping_p[v], v] for v in p]
    n_o = [[mapping[v], v] for v in o]
    return n_s, n_p, n_o
# Pair every subject, predicate and object with its integer index.
n_s, n_p, n_o = generate_mapping(l, p)
def generate_index(n_s, n_p, n_o):
    """Combine per-triple index pairs into coordinate rows.

    Args:
        n_s: ``[index, name]`` pairs for the subjects.
        n_p: ``[index, name]`` pairs for the predicates.
        n_o: ``[index, name]`` pairs for the objects.

    Returns:
        A list with one ``[subject_index, object_index, predicate_index]``
        row per entry of ``n_s``.
    """
    # Iteration is driven by n_s; the other two lists are indexed by the same
    # position, so a shorter n_p or n_o raises IndexError.
    return [
        [subject_pair[0], n_o[position][0], n_p[position][0]]
        for position, subject_pair in enumerate(n_s)
    ]
# Build one [subject_index, object_index, predicate_index] row per triple.
data = generate_index(n_s, n_p, n_o)
def generate_coordinate(data, m=None):
    """Group coordinate rows by predicate index.

    Args:
        data: Rows of ``[subject_index, object_index, predicate_index]``.
        m: Number of predicates. When omitted it is inferred from ``data``
            as the largest predicate index plus one (0 for empty ``data``),
            so the function does not depend on module-level state.

    Returns:
        A list of ``m`` ndarrays; element ``k`` holds, in their original
        order, the rows of ``data`` whose predicate index equals ``k``.
    """
    if m is None:
        m = max(row[2] for row in data) + 1 if data else 0
    # Single pass over the rows instead of rescanning them once per predicate.
    rows_by_predicate = {}
    for row in data:
        rows_by_predicate.setdefault(row[2], []).append(row)
    return [np.array(rows_by_predicate.get(k, [])) for k in range(m)]
# Group the coordinate rows by predicate index, one ndarray per predicate.
data1 = generate_coordinate(data)
def generate_ndarrays(data1, n=None):
    """Build one dense entity-by-entity array per predicate.

    Args:
        data1: One coordinate block per predicate; each block holds rows of
            ``[subject_index, object_index, predicate_index]``.
        n: Number of entities, i.e. the side length of every output array.
            When omitted it is inferred as the largest subject/object index
            found in ``data1`` plus one, so the function does not depend on
            module-level state.

    Returns:
        A list with one ``(n, n)`` ndarray per block of ``data1``. Each
        array is the dense form of a COO matrix whose values are ones placed
        at the block's ``(subject_index, object_index)`` coordinates; a
        block with no rows yields an all-zero array.
    """
    blocks = [np.asarray(block) for block in data1]
    if n is None:
        n = max(
            (int(block[:, :2].max()) + 1 for block in blocks if block.size),
            default=0,
        )
    T = []
    for coords in blocks:
        if coords.size == 0:
            # An empty block cannot be column-sliced; it has no entries.
            T.append(np.zeros((n, n)))
            continue
        sparse = coo_matrix(
            (ones(len(coords)), (coords[:, 0], coords[:, 1])),
            shape=(n, n),
        )
        T.append(sparse.toarray())
    return T
# Build the list of dense per-predicate arrays.
T = generate_ndarrays(data1)
我编写函数的过程很简单:我只是把功能相似的代码行组合在一起,然后把它们定义为一个函数。我不太确定这种做法是否合理。有人可以告诉我应该如何处理吗?欢迎任何意见。