0

我问了一个关于将一堆 RDF 三元组转换为多路数组表示的问题。我的解决方案看起来笨拙而且......丑陋。我尝试在我的解决方案中使用函数,但我发现这是不必要的,因为使用函数的优势并不明显。我想也许我的编程思想是面向过程的,而不是面向对象的。

这是我的解决方案(这是没有使用任何函数的原始版本):

import numpy as np
from numpy import ones
from scipy.sparse import coo_matrix

# Parallel lists: s[i], p[i] and o[i] are the subject, predicate and object
# of the i-th triple read from the file.
s = []
p = []
o = []

# Each line of 'test.txt' has the form "(subject, predicate, object)".
# The `with` block closes the file as soon as parsing is finished.
with open('test.txt') as triples_file:
    for row in triples_file:
        if not row.strip():
            continue    # ignore blank lines instead of failing with IndexError
        inner = row.split('(')[1]    # everything after the opening parenthesis
        s.append(inner.split(',')[0])
        p.append(inner.split(', ')[1])
        o.append(row.split(')')[0].split(', ')[2])

# Map every entity (subject or object) and every predicate to an integer
# index. Sorting the distinct values makes the numbering deterministic.
l = s + o
mapping = {v: i for (i, v) in enumerate(sorted(set(l)))}
mapping_p = {v: i for (i, v) in enumerate(sorted(set(p)))}

# [index, value] pairs parallel to s, p and o
n_s = [[mapping[v], v] for v in s]
n_p = [[mapping_p[v], v] for v in p]
n_o = [[mapping[v], v] for v in o]

# The bare indexes of s[i], p[i] and o[i], looked up directly in the mappings
cood_s = [mapping[v] for v in s]
cood_p = [mapping_p[v] for v in p]
cood_o = [mapping[v] for v in o]

# data[i] is [subject index, object index, predicate index] of the i-th triple
cood = zip(cood_s, cood_o, cood_p)
data = [list(i) for i in cood]

m = len(mapping_p)    # m is the number of predicates
n = len(mapping)      # n is the number of entities

# data1[k] is an ndarray holding the coordinate rows of predicate k.
# A single pass over data replaces one full scan per predicate; the rows of
# each group stay in file order.
grouped = [[] for _ in range(m)]
for triple in data:
    grouped[triple[2]].append(triple)
data1 = [np.array(rows) for rows in grouped]

# T[k] is the dense n x n entity-to-entity array of predicate k, with rows
# indexed by subject and columns by object.
# NOTE: coo_matrix adds up duplicate coordinates, so a triple that occurs
# several times in the file produces its count rather than 1.
T = [
    np.array(coo_matrix((ones(len(block)), (block[:, 0], block[:, 1])), shape=(n, n)).todense())
    for block in data1
]

好的,希望你不会觉得无聊。接下来我用函数把其中一些语句组织起来,如下:

def split_data_matrix(filename):
    """Read triples from a text file into parallel lists.

    For every non-blank line, the predicate is the text before the first
    '(' and the subject and object are the first and second double-quoted
    strings, e.g. ``knows("alice", "bob")``.

    Args:
        filename: path of the file to read.

    Returns:
        A tuple ``(s, o, p, l)``: the subjects, objects and predicates in
        file order, and ``l = s + o``, the list of all entities.

    Raises:
        IndexError: if a non-blank line holds fewer than two quoted strings.
    """
    s = []
    p = []
    o = []
    # `with` closes the file even if a malformed line raises
    with open(filename) as triples_file:
        for row in triples_file:
            if not row.strip():
                continue    # skip blank lines, e.g. a trailing empty line
            quoted = row.split('"')    # odd positions hold the quoted strings
            s.append(quoted[1])
            o.append(quoted[3])
            p.append(row.split('(')[0])
    return s, o, p, s + o
# Parse 'test.txt' into subjects, objects, predicates and the combined
# entity list (subjects followed by objects).
s, o, p, l = split_data_matrix('test.txt')

def generate_mapping(l, p, s=None, o=None):
    """Pair every subject, predicate and object with its integer index.

    Entities (subjects and objects) share one index space, numbered in
    sorted order of the distinct values in ``l``; predicates are numbered
    separately from the distinct values in ``p``.

    Args:
        l: all entities, subjects first and then objects (``s + o``).
        p: the predicate of every triple.
        s: the subject of every triple. When omitted, the first ``len(p)``
            items of ``l`` are used.
        o: the object of every triple. When omitted, the items of ``l``
            after the first ``len(p)`` are used.

    Returns:
        Three lists ``n_s, n_p, n_o`` of ``[index, value]`` pairs, parallel
        to the subjects, predicates and objects.

    Raises:
        ValueError: if ``s`` or ``o`` is omitted and ``l`` is not exactly
            twice as long as ``p``.
        KeyError: if a subject or object does not occur in ``l``.
    """
    if s is None or o is None:
        # The function used to read the module-level s and o. They are
        # recovered from the arguments instead: l is s + o and every triple
        # contributes one subject, one object and one predicate.
        if len(l) != 2 * len(p):
            raise ValueError(
                'l must hold the subjects followed by the objects '
                '(2 * len(p) items) when s and o are not given')
        if s is None:
            s = l[:len(p)]
        if o is None:
            o = l[len(p):]

    mapping = {v: i for (i, v) in enumerate(sorted(set(l)))}
    mapping_p = {v: i for (i, v) in enumerate(sorted(set(p)))}
    n_s = [[mapping[v], v] for v in s]
    n_p = [[mapping_p[v], v] for v in p]
    n_o = [[mapping[v], v] for v in o]
    return n_s, n_p, n_o
# Pair every subject, predicate and object with its integer index.
n_s, n_p, n_o = generate_mapping(l, p)

def generate_index(n_s, n_p, n_o):
    """Build one coordinate row per triple from the indexed lists.

    Args:
        n_s: ``[index, value]`` pairs of the subjects.
        n_p: ``[index, value]`` pairs of the predicates.
        n_o: ``[index, value]`` pairs of the objects.

    Returns:
        A list with one ``[subject index, object index, predicate index]``
        list for every entry of ``n_s``.

    Raises:
        IndexError: if ``n_p`` or ``n_o`` is shorter than ``n_s``.
    """
    # Only the leading index of each pair is kept. The predicate index goes
    # last so the first two columns are the (subject, object) coordinates.
    return [
        [subject[0], n_o[pos][0], n_p[pos][0]]
        for pos, subject in enumerate(n_s)
    ]
# One [subject index, object index, predicate index] row per triple.
data = generate_index(n_s, n_p, n_o)

def generate_coordinate(data, m=None):
    """Group coordinate rows by predicate index.

    Args:
        data: rows of ``[subject index, object index, predicate index]``.
        m: number of predicates. When omitted it is derived from ``data``
            as the largest predicate index plus one, so the function no
            longer depends on a module-level predicate list.

    Returns:
        A list of ``m`` ndarrays; element ``k`` holds, in their original
        order, the rows of ``data`` whose predicate index is ``k``. Rows
        whose predicate index lies outside ``0 .. m-1`` are left out.
    """
    if m is None:
        m = max(row[2] for row in data) + 1 if len(data) else 0

    # One pass over the rows instead of one full scan per predicate
    groups = [[] for _ in range(m)]
    for row in data:
        k = row[2]
        if 0 <= k < m:
            groups[k].append(row)
    return [np.array(rows) for rows in groups]
# Split the coordinate rows into one array per predicate index.
data1 = generate_coordinate(data)

def generate_ndarrays(data1, n=None):
    """Turn the per-predicate coordinate arrays into dense square arrays.

    Args:
        data1: one 2-D array per predicate; column 0 holds subject indexes
            and column 1 object indexes.
        n: number of entities, i.e. the side length of every output array.
            When omitted it is derived from ``data1`` as the largest
            subject/object index plus one, so the function no longer
            depends on module-level lists.

    Returns:
        A list with one ``n x n`` float ndarray per element of ``data1``.
        Entry ``[i, j]`` counts the rows ``(i, j)`` of that predicate,
        because coo_matrix adds up duplicate coordinates.
    """
    blocks = [np.asarray(block) for block in data1]

    if n is None:
        n = 0
        for block in blocks:
            if block.size:
                n = max(n, int(block[:, :2].max()) + 1)

    T = []
    for block in blocks:
        if block.size == 0:
            # An empty group cannot be sliced by column; it is an all-zero matrix.
            T.append(np.zeros((n, n)))
            continue
        sparse = coo_matrix((ones(len(block)), (block[:, 0], block[:, 1])), shape=(n, n))
        T.append(np.array(sparse.todense()))
    return T
# One dense entity-by-entity array per predicate.
T = generate_ndarrays(data1)

我定义函数的方式很简单:只是把功能相近的几行代码放在一起,然后把它们封装成一个函数。我不太确定这种做法是否合理。有人可以告诉我应该如何处理吗?欢迎任何意见。

4

0 回答 0