我在 remove_sub 上有一个逻辑错误,不知道为什么它仍然有效
尝试清理数据并从 b 中减少尽可能多的项目
import itertools as it
import time
import numpy as np
from collections import Counter, defaultdict as dd
import copy
A='hello world how are you doing'
B=['hello world how', 'hello are' ,'hello', 'hello are you doing']
d = {}
res_A = [d.setdefault(word, len(d)+1) for word in A.lower().split()
mapping = dict(zip(A.split(), range(1, len(A) + 1)))
# find mappings of words in B
res_B = [[mapping[word] for word in s.split()] for s in B]
set_a = set(res_A)
# my adding works on list of sets
for i in range(len(res_B)):
res_B[i] = set(res_B[i])
# a is a list of numbers, b is a list of sets of numbers, we are trying to cover a using min items from b
a = np.random.randint(0,50,size = 30)
np_set_a = set(a)
b = []
for i in range(200):
size = np.random.randint(0,20)
b.append(set(np.random.choice(a,size)))
# till here, created a,b for larger data test
def f1(set_a, b):
solved = False
for L in range(0, len(b)+1):
for subset in it.combinations(b, L):
s = set(item for sublist in subset for item in sublist)
if set_a.issubset(s):
print(f'{L}','**************f1')
solved = True
break
if solved: break
def rare(b):
c = Counter() #a dict where the key is a num and the value is how many times this num appears on all b sets
items = dd(list) # dict where the key is num and value is list of index where this num exist in b
for i in range(len(b)):
c.update(b[i])
for num in b[i]:
items[num].append(i)
rare = set()
common = c.most_common() #return sorted list of tuples with a number and how many times it appear
for i in range(1,len(common)-1): #take all the numbers that appear only once on b, these items will have to be on the final combination so you can remove them from b and their numbers from a because those numbers are covered
if common[-i][1] ==1:
rare.add(common[0])
continue
break
rare_items = {} # a set of all index that have rare number in them
for k in rare:
rare_items.update(items[k])
values_from_rare_items = set() # a set of all the numbers in the items with the rare numbers
for i in rare_items:
values_from_rare_items.update(b[i])
for i in reversed(sorted(rare_items)): #remove from b all the items with rare numbers, because they have to be on the final combination, you dont need to check them
b.pop(i)
return values_from_rare_items,b, len(rare_items)
#check sets on b, if 2 are equal remove 1, if 1 is a subset of the other, remove it
def remove_sub(b):
to_pop = set()
t = copy.deepcopy(b)
for i in range(len(b)):
for j in range(len(t)):
if i ==j:
continue
if b[i] == t[j]:
to_pop.add(i)
continue
if b[i].issubset(t[j]):
to_pop.add(i)
if t[j].issubset(b[i]):
to_pop.add(j)
for i in reversed(sorted(to_pop)):
b.pop(i)
return b
def f2(set_a, b):
b1 = remove_sub(b)
values_from_rare_items,b2, num_rare_items = rare(b)
a_without_rare = set_a-values_from_rare_items #remove from a all the number you added with the rare unique numbers, they are already covered
solved = False
for L in range(0, len(b2)+1):
for subset in it.combinations(b2, L):
s = set(item for sublist in subset for item in sublist)
if a_without_rare.issubset(s):
length = L+num_rare_items
print(f'{length}', "*********f2")
solved = True
break
if solved: break
s = time.time()
f1(set_a,b)
print(time.time()-s,'********************f1')
s = time.time()
f2(set_a,b)
print(time.time()-s,'******************f2')
s = time.time()
f1(set_a,res_B)
print(time.time()-s,'********************f1')
s = time.time()
f2(set_a,res_B)
print(time.time()-s,'******************f2')
this is the out put
2 **************f1
0.16755199432373047 ********************f1 num_array
2 *********f2
0.09078240394592285 ******************f2 num_array
2 **************f1
0.0009989738464355469 ********************f1 your_data
2 *********f2
0.0009975433349609375 ******************f2 your_data
您可以通过获取仅出现几次的所有项目来进一步改进它,并将它们视为它们出现一次,在极少数情况下它不会是真正的最小数字,但时间改进是显着的