arrays - 查找组合在 numpy 二维数组中出现的次数

Question

我有一个 2D numpy 数组，我想要一个函数在数组的 col1 和 col2 上运行，如果“M”是 col1 中唯一值的数量，“N”是 col2 中唯一值的数量，则输出 1D数组的大小为 (M * N) 例如，假设 col1 中有 3 个唯一值：A1、A2 和 A3，col2 中有 2 个唯一值：X1 和 X2。那么，可能的组合是：（A1 X1），（A1 X2），（A2 X1），（A2 X2），（A3 X1），（A3 X2）。现在我想找出每个组合在同一行中一起出现的次数，即有多少行包含组合 (A1,X1) 等等。我想将计数作为一维数组返回。这是我的代码：

import numpy as np
#@profile
def myfunc(arr1,arr2):
    unique_arr1 = np.unique(arr1)
    unique_arr2 = np.unique(arr2)
    pdt = len(unique_arr1)*len(unique_arr2)
    count = np.zeros(pdt).astype(int)

## getting the number of possible combinations and storing them in arr1_n and arr2_n
    if ((len(unique_arr2)>0) and (len(unique_arr1)>0)):
        arr1_n = unique_arr1.repeat(len(unique_arr2))
        arr2_n = np.tile(unique_arr2,len(unique_arr1))
## Finding the number of times a particular combination has occured
    for i in np.arange(0,pdt):
        pos1 = np.where(arr1==arr1_n[i])[0]
        pos2 = np.where(arr2==arr2_n[i])[0]
        count[i] = len(np.intersect1d(pos1,pos2))
return count

np.random.seed(1)
myarr = np.random.randint(20,size=(80000,4))
a = myfunc(myarr[:,1],myarr[:,2])

以下是我在此代码上运行 line_profiler 时的分析结果。

定时器单位：1e-06 s

总时间：18.1849 s 文件：testcode3.py 功能：第 2 行的 myfunc

Line # Hits Time Per Hit % Time Line Contents

 2                                           @profile
 3                                           def myfunc(arr1,arr2):
 4         1      74549.0  74549.0      0.4      unique_arr1 = np.unique(arr1)
 5         1      72970.0  72970.0      0.4      unique_arr2 = np.unique(arr2)
 6         1          9.0      9.0      0.0      pdt = len(unique_arr1)*len(unique_arr2)
 7         1         48.0     48.0      0.0      count = np.zeros(pdt).astype(int)
 8                                           
 9         1          5.0      5.0      0.0      if ((len(unique_arr2)>0) and (len(unique_arr1)>0)):
10         1         16.0     16.0      0.0          arr1_n = unique_arr1.repeat(len(unique_arr2))
11         1        105.0    105.0      0.0          arr2_n = np.tile(unique_arr2,len(unique_arr1))
12       401       5200.0     13.0      0.0      for i in np.arange(0,pdt):
13       400    6870931.0  17177.3     37.8          pos1 = np.where(arr1==arr1_n[i])[0]
14       400    6844999.0  17112.5     37.6          pos2 = np.where(arr2==arr2_n[i])[0]
15       400    4316035.0  10790.1     23.7          count[i] = len(np.intersect1d(pos1,pos2))
16         1          4.0      4.0      0.0      return count

如您所见，np.where 和 np.intersect1D 占用了大量时间。谁能建议更快的方法来做到这一点？将来我将不得不使用比这个大得多的真实数据，因此我需要优化这段代码。

score 0 · Accepted Answer

了解您可以使用的列的最大可能值：

def myfunc2(arr1,arr2):
    # The *100 depends on your maximum possible value
    complete_arr = myarr[:,1]*100 + myarr[:,2]
    unique_elements, counts_elements = np.unique(complete_arr, return_counts=True)

return counts_elements

8·10e5 和 8·10e6 行的结果：

N: 800000, myfucn2 time: 78.287 ms, myfucn time: 6556.748 ms
Equal?: True
N: 8000000, myfucn2 time: 736.020 ms, myfucn time: 100544.354 ms
Equal?: True

测试代码：

times_f1 = []
times_f2 = []
ns = 8*10**np.linspace(3, 6, 10)

for i in ns:
    np.random.seed(1)
    myarr = np.random.randint(20,size=(int(i),4))

    start1 = time.time()
    a = myfunc2(myarr[:,1],myarr[:,2])
    end1 = time.time() 
    times_f2.append(end1-start1)

    start2 = time.time()
    b = myfunc(myarr[:,1],myarr[:,2])
    end2 = time.time()
    times_f1.append(end2-start2)

    print("N: {:1>d}, myfucn2 time: {:.3f} ms, myfucn time: {:.3f} ms".format(int(i), (end1-start1)*1000.0, (end2-start2)*1000.0))
    print("Equal?: " + str(np.array_equal(a,b)))

两种情况的时间复杂度似乎都是 O(n)：

score 0 · Accepted Answer

为了满足 Bidisha Das 的要求：

代码：

def myfunc3(arr1, arr2):

    order_mag = 10**(int(math.log10(np.amax([arr1, arr2]))) + 1)

    #complete_arr = arr1*order_mag + arr2
    complete_arr = np.multiply(arr1, order_mag) + arr2

    unique_elements, counts_elements = np.unique(complete_arr, return_counts=True)

    unique_arr1 = np.unique(arr1)
    unique_arr2 = np.unique(arr2)

    r = np.zeros((len(unique_arr1), len(unique_arr2)))

    for i in range(len(unique_elements)):
        i1 = np.where(unique_arr1==int(unique_elements[i]/order_mag))
        i2 = np.where(unique_arr2==(unique_elements[i]%order_mag))

        r[i1,i2] += counts_elements[i]

    r = r.flatten()

    return r

测试代码：

times_f3 = []
times_f1 = []
ns = 8*10**np.linspace(3, 6, 10)

for i in ns:
    np.random.seed(1)
    myarr = np.random.randint(20,size=(int(i),4))

    start1 = time.time()
    a = myfunc3(myarr[:,1],myarr[:,2])
    end1 = time.time() 
    times_f3.append(end1-start1)

    start2 = time.time()
    b = myfunc(myarr[:,1],myarr[:,2])
    end2 = time.time()
    times_f1.append(end2-start2)

    print("N: {:1>d}, myfucn2 time: {:.3f} ms, myfucn time: {:.3f} ms".format(int(i), (end1-start1)*1000.0, (end2-start2)*1000.0))
    print("Equal?: " + str(np.array_equal(a,b)))

arrays - 查找组合在 numpy 二维数组中出现的次数

Line # Hits Time Per Hit % Time Line Contents

2 回答 2

代码：

测试代码：

结果：

Related

Reference