我有如下代码:
import pandas as pd
import os
import jellyfish
import numpy as np
# Toy vocabulary for exercising the Distance pipeline; note 'france'
# appears twice, so unique() downstream has a duplicate to drop.
words = [
    'dog', 'cat', 'tree', 'slow', 'fast', 'hurry', 'hello', 'world',
    'germany', 'france', 'rahul', 'india', 'pakisthan', 'bangla',
    'australia', 'newzealand', 'united kingdom', 'france', 'spain',
    'belgium', 'bangladesh', 'west indies', 'USA', 'canada',
    'afghanisthan', 'columbia', 'tamilnadu', 'telangana', 'hyderabad',
    'khanapur', 'warangal',
]
a = {'c': words}
df = pd.DataFrame(a)
我在类结构中有以下代码:
class Distance:
    """Pairwise Jaro-Winkler distance matrix built block-by-block.

    The input word list is split into partitions of ``self.partitions``
    elements each.  Every partition-pair similarity block is computed once
    and cached to a ``.npy`` file, so the full matrix never has to be held
    in RAM while it is being computed; ``read_distances`` stitches the
    cached blocks back into the complete matrix.
    """

    def __init__(self, partitions):
        # Chunk size: number of elements per partition (not the number
        # of partitions).
        self.partitions = partitions

    def partitionlist(self, list_ofelements, num_of_divisions):
        """Yield successive chunks of ``num_of_divisions`` elements from the
        numpy array ``list_ofelements``; the last chunk may be shorter."""
        for start in range(0, list_ofelements.size, num_of_divisions):
            yield list_ofelements[start:start + num_of_divisions]

    def fuzzy_match(self, terms):
        """Jaro-Winkler similarity for a pair of strings.

        ``terms`` is a frozenset: two identical strings collapse to a
        single element, which is the compare-with-itself case and is by
        definition a perfect match.
        """
        if len(terms) == 1:
            return 1.0  # was int 1; float keeps the result dtype uniform
        # BUG FIX: original called jellyfish.jaro_winkler((*terms)), which
        # is a SyntaxError in Python 3 ("cannot use starred expression
        # here"); the unpacking must go directly into the call.
        # NOTE(review): newer jellyfish releases rename this to
        # jaro_winkler_similarity — confirm the installed version.
        return jellyfish.jaro_winkler(*terms)

    def distance_measure(self, x, y):
        """Similarity matrix of shape (len(x), len(y)) between two string
        arrays, rounded to 2 decimals as float16 to keep blocks small."""
        term1 = np.repeat(x, len(y))                # x0 x0 .. x1 x1 ..
        term2 = np.tile(y, (len(x), 1)).flatten()   # y0 y1 .. y0 y1 ..
        pairs = list(zip(term1, term2))
        resu = [self.fuzzy_match(frozenset(t)) for t in pairs]
        final_ = np.reshape(resu, (len(x), len(y)))
        return np.around(final_.astype('float16'), 2)

    def dist_calculation(self, list_of_companies):
        """Compute and cache every upper-triangular partition block.

        Only ``index <= i`` blocks are computed; the lower triangle is
        recovered by transposition in ``read_distances``.  Blocks already
        on disk are skipped, so the run is resumable.

        NOTE(review): each (index, i) block is independent of the others —
        this loop is the natural unit to fan out with
        concurrent.futures.ProcessPoolExecutor (or ray tasks), saving each
        block from its worker exactly as done here.
        """
        # Hoisted out of the loops: the original re-ran os.listdir() on
        # every inner iteration.  Filenames are unique per (index, i), so a
        # snapshot plus local additions stays accurate.
        existing = set(os.listdir())
        for index, _ in enumerate(list_of_companies):
            for i in range(len(list_of_companies)):
                filename = 'result_for_partition' + str(index) + str('_') + str(i) + '.npy'
                if filename not in existing and index <= i:
                    print('for partition: ' + str(index) + str('_') + str(i))
                    partition_result = self.distance_measure(
                        list_of_companies[index], list_of_companies[i])
                    np.save(filename, partition_result)
                    existing.add(filename)
        return self

    def read_distances(self, list_of_companies):
        """Stitch the cached blocks into the full matrix and return 1 - S.

        Each block-row is rebuilt by horizontally stacking the saved blocks
        (mirror blocks from the lower triangle are loaded transposed), rows
        are stacked vertically, the complete similarity matrix is saved
        once as ``jaro_distance.npy``, and the distance matrix
        (1 - similarity) is returned.
        """
        files = os.listdir()
        fullfilename = 'jaro_distance' + '.npy'
        if fullfilename not in files:
            previous_rows = None
            for index, _ in enumerate(list_of_companies):
                row = None
                for i in range(len(list_of_companies)):
                    if i == 0 and index <= i:
                        # First block of the first row: load as-is.
                        filename = 'result_for_partition' + str(index) + str('_') + str(i) + str('.npy')
                        row = np.load(filename)
                    elif index <= i:
                        # Upper triangle: block was saved as (index, i).
                        print(index, i)
                        filename = 'result_for_partition' + str(index) + str('_') + str(i) + str('.npy')
                        block = np.load(filename)
                        row = np.hstack((row, block))
                    elif i == 0 and index > i:
                        # Lower triangle, row start: mirror block (i, index),
                        # transposed to the right orientation.
                        print(index, i)
                        filename = 'result_for_partition' + str(i) + str('_') + str(index) + str('.npy')
                        row = np.load(filename).T
                    else:
                        # Lower triangle continuation: mirror block, transposed.
                        print(index, i)
                        filename = 'result_for_partition' + str(i) + str('_') + str(index) + str('.npy')
                        block = np.load(filename).T
                        row = np.hstack((row, block))
                if index > 0:
                    # Grow the matrix downward, one block-row at a time.
                    row = np.concatenate((previous_rows, row), axis=0)
                    print(row.shape)
                previous_rows = row
                if index == len(list_of_companies) - 1:
                    matrix = row
                    np.save(fullfilename, matrix)
                    print("saved in:")
                    print(fullfilename)
                    distances = 1 - np.load(fullfilename)
        else:
            # Full matrix already cached from a previous run.
            distances = 1 - np.load(fullfilename)
        return distances

    def dist_matrix(self, dataset):
        """Full pipeline: unique names -> partitions -> cached blocks ->
        assembled distance matrix.  ``dataset`` must have a 'c' column."""
        company_names = dataset['c'].unique()
        print('length of unique company name:', company_names.size)
        elements_list = list(self.partitionlist(company_names, self.partitions))
        print(elements_list)
        self.dist_calculation(elements_list)
        distance_mat = self.read_distances(elements_list)
        return distance_mat
# Driver: partition the toy vocabulary into chunks of 6 unique names and
# build the pairwise distance matrix from the cached partition blocks.
obj = Distance(6)
matrix = obj.dist_matrix(df)
我实际上是在尝试识别文本之间的相似性。在这里,我使用了一个玩具数据集,但在我的情况下,我有一个巨大的文本数据,我必须在其中找到相似性。我无法将整个数据放入 RAM 中,因此决定划分数据并构建距离矩阵。但是它花费了太多时间,并且希望通过使用多处理来更快地运行它。我尝试了几种方法,但我陷入了僵局。另外,我找到了 ray 库,但无法将其集成到此代码中(尝试但未能减少时间)。有没有办法在处理方面对其进行优化。