python - 如何加速组合算法？

Question

B下面的代码找到形成 string的最少列表项A。让我们假设A='hello world how are you doing'和B=['hello world how', 'hello are' ,'hello', 'hello are you doing']。然后，由于索引为 0 和 3 的项目包含 stringA的所有单词，因此答案将为 2。

我将所有字符串转换为整数以加快算法速度，但由于测试用例更大更复杂，我需要更优化的算法。我想知道如何加快这个算法。

import itertools

A='hello world how are you doing'
B=['hello world how', 'hello are' ,'hello', 'hello are you doing']

d = {}
res_A = [d.setdefault(word, len(d)+1) for word in A.lower().split()]
mapping = dict(zip(A.split(), range(1, len(A) + 1)))

# find mappings of words in B
res_B = [[mapping[word] for word in s.split()] for s in B]

set_a = set(res_A)
solved = False

for L in range(0, len(res_B)+1):
    for subset in itertools.combinations(res_B, L):
        s = set(item for sublist in subset for item in sublist)
        if set_a.issubset(s):
            print(f'{L}')
            solved = True
            break
    if solved: break

score 0 · Accepted Answer

我在 remove_sub 上有一个逻辑错误，不知道为什么它仍然有效

尝试清理数据并从 b 中减少尽可能多的项目

import itertools as it
import time
import numpy as np
from collections import Counter, defaultdict as dd
import copy

A='hello world how are you doing'
B=['hello world how', 'hello are' ,'hello', 'hello are you doing']

d = {}
res_A = [d.setdefault(word, len(d)+1) for word in A.lower().split()
mapping = dict(zip(A.split(), range(1, len(A) + 1)))

# find mappings of words in B
res_B = [[mapping[word] for word in s.split()] for s in B]
set_a = set(res_A)

# my adding works on list of sets
for i in range(len(res_B)):
    res_B[i] = set(res_B[i])

# a is a list of numbers, b is a list of sets of numbers, we are trying to cover a using min items from b
a = np.random.randint(0,50,size = 30)
np_set_a  = set(a)
b = []
for i in range(200):
    size = np.random.randint(0,20)
    b.append(set(np.random.choice(a,size)))

# till here, created a,b for larger data test

def f1(set_a, b):
    solved = False
    for L in range(0, len(b)+1):
        for subset in it.combinations(b, L):
            s = set(item for sublist in subset for item in sublist)
            if set_a.issubset(s):
                print(f'{L}','**************f1')
                solved = True
                break
        if solved: break

def rare(b):
    c = Counter() #a dict where the key is a num and the value is how many times this num appears on all b sets
    items = dd(list) # dict where the key is num and value is list of index where this num exist in b
    for i in range(len(b)):
        c.update(b[i]) 
        for num in b[i]:
            items[num].append(i)

    rare = set()
    common = c.most_common() #return sorted list of tuples with a number and how many times it appear
    for i in range(1,len(common)-1): #take all the numbers that appear only once on b, these items will have to be on the final combination so you can remove them from b and their numbers from a because those numbers are covered
        if common[-i][1] ==1:
            rare.add(common[0])
            continue
        break

    rare_items = {} # a set of all index that have rare number in them
    for k in rare: 
        rare_items.update(items[k])

    values_from_rare_items = set() # a set of all the numbers in the items with the rare numbers
    for i in rare_items:
        values_from_rare_items.update(b[i])

    for i in reversed(sorted(rare_items)): #remove from b all the items with rare numbers, because they have to be on the final combination, you dont need to check them
        b.pop(i)
    return values_from_rare_items,b, len(rare_items)

#check sets on b, if 2 are equal remove 1, if 1 is a subset of the other, remove it 
def remove_sub(b):
    to_pop = set()
    t = copy.deepcopy(b)
    for i in range(len(b)):
        for j in range(len(t)):
            if i ==j:
                continue
            if b[i] == t[j]:
                to_pop.add(i)
                continue
            if b[i].issubset(t[j]):
                to_pop.add(i)
            if t[j].issubset(b[i]):
                to_pop.add(j)
    for i in reversed(sorted(to_pop)):
        b.pop(i)
    return b

def f2(set_a, b):
    b1 = remove_sub(b)
    values_from_rare_items,b2, num_rare_items = rare(b)
    a_without_rare = set_a-values_from_rare_items #remove from a all the number you added with the rare unique numbers, they are already covered
    

    solved = False
    for L in range(0, len(b2)+1):
        for subset in it.combinations(b2, L):
            s = set(item for sublist in subset for item in sublist)
            if a_without_rare.issubset(s):
                length = L+num_rare_items 
                print(f'{length}', "*********f2")
                solved = True
                break
        if solved: break


s = time.time()
f1(set_a,b)
print(time.time()-s,'********************f1')

s = time.time()
f2(set_a,b)
print(time.time()-s,'******************f2')


s = time.time()
f1(set_a,res_B)
print(time.time()-s,'********************f1')

s = time.time()
f2(set_a,res_B)
print(time.time()-s,'******************f2')

this is the out put

2 **************f1
0.16755199432373047 ********************f1 num_array

2 *********f2
0.09078240394592285 ******************f2 num_array

2 **************f1
0.0009989738464355469 ********************f1 your_data

2 *********f2
0.0009975433349609375 ******************f2 your_data

您可以通过获取仅出现几次的所有项目来进一步改进它，并将它们视为它们出现一次，在极少数情况下它不会是真正的最小数字，但时间改进是显着的

python - 如何加速组合算法？

1 回答 1

Related

Reference