0

我有如下代码:

import pandas as pd
import os
import jellyfish
import numpy as np
# Toy dataset: a single column 'c' of short strings. 'france' appears twice,
# so the column holds 31 entries but only 30 distinct values.
a = {
    'c': [
        'dog', 'cat', 'tree', 'slow', 'fast', 'hurry', 'hello', 'world',
        'germany', 'france', 'rahul', 'india',
        'pakisthan', 'bangla', 'australia', 'newzealand', 'united kingdom',
        'france', 'spain', 'belgium',
        'bangladesh', 'west indies', 'USA', 'canada', 'afghanisthan',
        'columbia', 'tamilnadu', 'telangana', 'hyderabad',
        'khanapur', 'warangal',
    ],
}
df = pd.DataFrame(a)

我把以下代码封装在一个类中:

class Distance:
    """Build a pairwise Jaro-Winkler distance matrix block by block.

    The unique values of a column are split into fixed-size chunks. For every
    pair of chunks in the upper triangle (row chunk index <= column chunk
    index) a similarity block is computed and written to
    ``result_for_partition<row>_<col>.npy`` in the current working directory.
    The blocks are later stitched together (lower-triangle blocks are the
    transposes of their upper-triangle counterparts) into
    ``jaro_distance.npy``.

    NOTE: existing ``.npy`` files are reused as a cache and are never checked
    against the input, so files left over from a run on different data will
    be picked up as-is.
    """

    def __init__(self, partitions):
        """Store the chunk size used when splitting the unique values.

        Args:
            partitions: number of elements per chunk (it is passed to
                ``partitionlist`` as the slice width, not as a chunk count).
        """
        self.partitions = partitions

    def partitionlist(self, list_ofelements, num_of_divisions):
        """Yield consecutive slices of ``list_ofelements``.

        Args:
            list_ofelements: sliceable sequence with a length (1-D numpy
                array, list, ...).
            num_of_divisions: maximum number of elements per yielded slice;
                the final slice may be shorter.

        Yields:
            Slices ``list_ofelements[start:start + num_of_divisions]``.
        """
        for start in range(0, len(list_ofelements), num_of_divisions):
            yield list_ofelements[start:start + num_of_divisions]

    def fuzzy_match(self, terms):
        """Return the Jaro-Winkler similarity of the strings in ``terms``.

        Args:
            terms: collection holding the strings to compare. A collection
                with a single element means both strings were identical
                (``distance_measure`` passes a frozenset), so the similarity
                is 1 without calling jellyfish.

        Returns:
            1 for a single-element collection, otherwise the value returned
            by jellyfish for the two strings.
        """
        if len(terms) == 1:
            return 1
        # Newer jellyfish releases expose the metric as
        # ``jaro_winkler_similarity``; fall back to the legacy name so older
        # installations keep working.
        similarity = getattr(jellyfish, 'jaro_winkler_similarity', None)
        if similarity is None:
            similarity = jellyfish.jaro_winkler
        return similarity(*terms)

    def distance_measure(self, x, y):
        """Compute the similarity block between two chunks of strings.

        Args:
            x: sequence of strings forming the rows of the block.
            y: sequence of strings forming the columns of the block.

        Returns:
            A ``(len(x), len(y))`` float16 array of similarities rounded to
            two decimals; entry ``[r, c]`` compares ``x[r]`` with ``y[c]``.
        """
        # Row-major order: every element of x is paired with all of y in turn.
        scores = [
            self.fuzzy_match(frozenset((left, right)))
            for left in x
            for right in y
        ]
        grid = np.reshape(scores, (len(x), len(y)))
        return np.around(grid.astype('float16'), 2)

    @staticmethod
    def _block_filename(row, col):
        """Return the cache file name for the block at (row, col)."""
        return 'result_for_partition' + str(row) + '_' + str(col) + '.npy'

    def _load_block(self, row, col):
        """Load the block at (row, col), transposing the mirrored block when
        (row, col) lies below the diagonal (only the upper triangle is stored).
        """
        if row <= col:
            return np.load(self._block_filename(row, col))
        return np.load(self._block_filename(col, row)).T

    def dist_calculation(self, list_of_companies):
        """Compute and save every missing upper-triangle block.

        Args:
            list_of_companies: list of chunks as produced by
                ``partitionlist``.

        Returns:
            ``self``, so calls can be chained.
        """
        total = len(list_of_companies)
        for index in range(total):
            # Only blocks with index <= i are stored; the rest are mirrored
            # when the full matrix is assembled.
            for i in range(index, total):
                filename = self._block_filename(index, i)
                if os.path.exists(filename):
                    continue  # already computed by a previous run
                print('for partition: ' + str(index) + str('_') + str(i))
                partition_result = self.distance_measure(
                    list_of_companies[index], list_of_companies[i])
                np.save(filename, partition_result)
        return self

    def read_distances(self, list_of_companies):
        """Assemble the full distance matrix from the saved blocks.

        If ``jaro_distance.npy`` already exists it is loaded directly;
        otherwise the per-block files are stitched together, the resulting
        similarity matrix is saved under that name, and the distances are
        derived from it.

        Args:
            list_of_companies: list of chunks as produced by
                ``partitionlist``; only its length is used here.

        Returns:
            Array of distances, computed as ``1 - similarity``. An empty
            ``(0, 0)`` array is returned (and nothing is saved) when there
            are no chunks.
        """
        fullfilename = 'jaro_distance' + '.npy'
        if os.path.exists(fullfilename):
            return 1 - np.load(fullfilename)

        total = len(list_of_companies)
        if total == 0:
            return np.empty((0, 0), dtype='float16')

        rows = [
            np.hstack([self._load_block(index, i) for i in range(total)])
            for index in range(total)
        ]
        matrix = np.concatenate(rows, axis=0)
        np.save(fullfilename, matrix)
        print("saved in:")
        print(fullfilename)
        return 1 - matrix

    def dist_matrix(self, dataset):
        """Run the whole pipeline on the unique values of ``dataset['c']``.

        Args:
            dataset: object whose ``'c'`` column supports ``.unique()``
                (a pandas DataFrame in the accompanying script).

        Returns:
            The distance matrix produced by ``read_distances``.
        """
        company_names = dataset['c'].unique()
        print('length of unique company name:', company_names.size)
        elements_list = list(self.partitionlist(company_names, self.partitions))
        print(elements_list)
        self.dist_calculation(elements_list)
        distance_mat = self.read_distances(elements_list)
        return distance_mat


# Split the unique names into chunks of 6 and build the full distance matrix.
obj = Distance(partitions=6)
matrix = obj.dist_matrix(dataset=df)

我实际上是想计算文本之间的相似度。这里用的是一个玩具数据集,但在实际场景中我有海量的文本数据,需要在其中找出相似的条目。由于无法把全部数据放进内存(RAM),我决定把数据分块,逐块构建距离矩阵。但这样耗时太长,我希望借助多进程(multiprocessing)让它运行得更快。我尝试过几种方法,但都陷入了僵局。另外,我也发现了 ray 库,但没能把它整合进这段代码(尝试过,但运行时间并没有减少)。有没有办法优化这段代码的处理速度?

4

0 回答 0