python - 查找所有重叠的字典键组

Question

假设我有一个 Python 字典，其中每个值都是一个列表。我想找到所有具有共同项目的键组，以及每个这样的组所共有的项目。

例如，假设项目是简单的整数：

# Example input: every key maps to the list of items stored under it.
dct = {
    'a': [0, 5, 7],
    'b': [1, 2, 5],
    'c': [3, 2],
    'd': [3],
    'e': [0, 5],
}

这些组将是：

# Expected result: each numbered group lists keys whose lists overlap.
groups = {
    0: ['a', 'e'],
    1: ['b', 'c'],
    2: ['c', 'd'],
    3: ['a', 'b', 'e'],
}

这些组各自共有的项目是：

# Expected result: for each group number, the items shared by its keys.
common = {
    0: [0, 5],
    1: [2],
    2: [3],
    3: [5],
}

为了解决这个问题，我相信构建一个像下面这样的矩阵是有价值的，但我不知道如何从这一点着手。是否有任何有助于解决此类问题的 Python 库？

   | a  b  c  d  e |
|a|  x  x        x |
|b|  x  x  x     x |
|c|     x  x  x    |
|d|        x  x    |
|e|  x  x        x |

更新

我试图把 @NickBurns 提供的解决方案封装成一个函数，但在重现其结果时遇到了问题：

# Example input: every key maps to the list of items stored under it.
dct = {'a': [0, 5, 7], 'b': [1, 2, 5], 'c': [3, 2], 'd': [3], 'e': [0, 5]}

groups, common_items = get_groups(dct)
# A single pre-formatted argument prints identically on Python 2 and 3;
# the bare ``print x, y`` statement is a syntax error on Python 3.
print('Groups {0}'.format(groups))
print('Common items {0}'.format(common_items))

我得到：

Groups: defaultdict(<type 'list'>, {0: ['a', 'e'], 2: ['c', 'b'], 3: ['c', 'd'], 5: ['a', 'b', 'e']})                                                        

Common items: {0: None, 2: None, 3: None, 5: None}

这是函数：

from collections import defaultdict
def common(query_group, dct):
    """Return the items shared by every key in ``query_group``.

    Args:
        query_group: Sequence of keys of ``dct`` that form one group.
        dct: Mapping from key to a list of hashable items.

    Returns:
        A list of the items that occur in the list of every key of the
        group, ordered as they first appear under the group's first key.
        A group with fewer than two keys has nothing to share, so an
        empty list is returned.

    Raises:
        KeyError: If a key of ``query_group`` is missing from ``dct``.
    """
    if len(query_group) <= 1:
        return []

    # Intersect the item sets of all keys, not just consecutive pairs.
    shared = set(dct[query_group[0]])
    for key in query_group[1:]:
        shared.intersection_update(dct[key])

    # Walk the first key's list so the output order is deterministic;
    # discarding as we go reports a duplicated item only once.
    result = []
    for item in dct[query_group[0]]:
        if item in shared:
            shared.discard(item)
            result.append(item)
    return result


def get_groups(dct):
    """Group the keys of ``dct`` by the items they have in common.

    Args:
        dct: Mapping from key to a list of hashable items.

    Returns:
        A ``(groups, common_items)`` tuple.  ``groups`` is a
        ``defaultdict(list)`` mapping each item that occurs under more than
        one key to the list of those keys.  ``common_items`` is a dict with
        the same keys whose values are whatever ``common`` returns for the
        corresponding group.
    """
    # Invert the mapping: item -> keys whose list contains that item.
    membership = defaultdict(list)
    for key, values in dct.items():
        for value in values:
            membership[value].append(key)

    # Keep only the items that appear under more than one key.  A new
    # mapping is built instead of deleting entries while looping over the
    # live keys, which raises RuntimeError on Python 3.
    groups = defaultdict(list)
    for value, keys in membership.items():
        if len(keys) > 1:
            groups[value] = keys

    # Identify the common elements of every remaining group.
    common_items = {value: common(keys, dct) for value, keys in groups.items()}

    return groups, common_items

score 3 · Accepted Answer

我会尝试创建第二个字典（groups），用它来表示原始 dct 中各个列表之间的交集。例如，您可以使用 defaultdict 来执行此操作：

from collections import defaultdict
dct = { 'a' : [0, 5, 7], 'b' : [1, 2, 5], 'c' : [3, 2], 'd' : [3], 'e' : [0, 5]}

# Invert dct: map every item to the keys whose list contains it.
groups = defaultdict(list)
for name, items in dct.items():
    for item in items:
        groups[item].append(name)

# Report only the items that occur under more than one key.
for item, names in groups.items():
    if len(names) < 2:
        continue
    print(item, names)

(0, ['a', 'e'])
(2, ['c', 'b'])
(3, ['c', 'd'])
(5, ['a', 'b', 'e'])

找到共同的元素有点麻烦：你需要遍历每个组，并在原始 dct 中求出对应列表的交集。也许像这样的递归例程会起作用：

def common(query_group, dct, have_common=None):
    """Collect the items shared by every key in ``query_group``.

    Args:
        query_group: Sequence of keys of ``dct`` that form one group.
        dct: Mapping from key to a list of hashable items.
        have_common: Optional list to which the shared items are appended.
            A fresh list is created when omitted, so results never leak
            from one call into the next.

    Returns:
        ``have_common`` (the list passed in, or a new one) extended with the
        items present in the list of every key of the group, ordered as
        they first appear under the group's first key.  Groups with fewer
        than two keys add nothing.

    Raises:
        KeyError: If a key of ``query_group`` is missing from ``dct``.
    """
    if have_common is None:
        have_common = []

    if len(query_group) <= 1:
        return have_common

    # Intersect the item sets of all keys at once.  Intersecting only
    # consecutive pairs would report items that just two of four keys share
    # and would ignore the last key of an odd-sized group.
    shared = set(dct[query_group[0]])
    for key in query_group[1:]:
        shared.intersection_update(dct[key])

    # Walk the first key's list so the output order is deterministic;
    # discarding as we go reports a duplicated item only once.
    for item in dct[query_group[0]]:
        if item in shared:
            shared.discard(item)
            have_common.append(item)

    return have_common

# Show the shared items of every group that has at least two keys.
for members in groups.values():
    if len(members) < 2:
        continue
    shared_items = common(members, dct, have_common=[])
    print(members, '=>', shared_items)

['e', 'a'] => [0, 5]    
['b', 'c'] => [2]    
['d', 'c'] => [3]    
['e', 'b', 'a'] => [5]

显然它需要一些更漂亮的格式，但我认为它可以完成工作。希望这会有所帮助。

score 2 · Accepted Answer

这非常接近您的要求 - 看看它，看看它是否足够接近。

from collections import defaultdict

# Example input: every key maps to the list of items stored under it.
dct = dict()
dct['a'] = [0, 5, 7]
dct['b'] = [1, 2, 5]
dct['c'] = [3, 2]
dct['d'] = [3]
dct['e'] = [0, 5]

# Invert dct: map every item to the keys whose list contains it.
inverseDict = defaultdict(list)
for key in dct:
    for item in dct[key]:
        inverseDict[item].append(key)

# Drop the items held by fewer than two keys.  list(...) snapshots the keys
# first: deleting entries while iterating over the live dictionary raises
# RuntimeError on Python 3.
for item in list(inverseDict):
    if len(inverseDict[item]) < 2:
        del inverseDict[item]

# A single pre-formatted argument prints identically on Python 2 and 3.
for item in inverseDict:
    print('{0} : {1}'.format(item, inverseDict[item]))

输出：

0 : ['a', 'e']
2 : ['c', 'b']
3 : ['c', 'd']
5 : ['a', 'b', 'e']

score 2 · Accepted Answer

您可以使用NetworkX库来获取该矩阵（邻接矩阵）表示：

import networkx as nx
dct = { 'a' : [0, 5, 7], 'b' : [1, 2, 5], 'c' : [3, 2], 'd' : [3], 'e' : [0, 5]}
nodes = sorted(dct)

# Build an undirected graph with one node per key of dct.
G = nx.Graph()
for node in nodes:
    attached_nodes = dct[node]
    G.add_node(node)
    for nod in attached_nodes:
        # Each item is used as a position in the sorted key list; items
        # outside that range are skipped.
        # NOTE(review): this links keys by index, not by shared items --
        # presumably not the overlap relation the question asks about; verify.
        if 0 <= nod < len(nodes):
            G.add_edge(node, nodes[nod])

# Parenthesised single-argument print works on both Python 2 and 3; the
# bare print statement is a syntax error on Python 3.
print(G.nodes())
print(G.edges())
print(G.has_edge('a','b'))
print(G.has_edge('b','c'))

输出：

['a', 'c', 'b', 'e', 'd']
[('a', 'a'), ('a', 'e'), ('c', 'c'), ('c', 'b'), ('c', 'd'), ('b', 'b'), ('d', 'd')]
False
True

score 1 · Accepted Answer

这是一个大麻烦，但它有效。它基本上是在构建一个这样的数组：

  | 0 1 2 3 4 5 6 7 |
  +-----------------+
|a| 1 0 0 0 0 1 0 1 |
|b| 0 1 1 0 0 1 0 0 |
|c| 0 0 1 1 0 0 0 0 |
|d| 0 0 0 1 0 0 0 0 |
|e| 1 0 0 0 0 1 0 0 |

这些组就是含有多个 1 的互不相同的列。要查找某个组的所有公共元素，可以找出在该组定义为 1 的位置上同样为 1 的那些列。用 Python 把它写出来，并巧妙地使用 scipy 的稀疏矩阵来构建上述数组，我得到以下结果：

import numpy as np
import scipy.sparse as sps

dct = {'a' : [0, 5, 7], 'b' : [1, 2, 5], 'c' : [3, 2],
       'd' : [3], 'e' : [0, 5]}

# Flatten dct into parallel sequences: the keys, the length of each key's
# list, and all items concatenated in key order.
keys = []
lens = []
vals = []

for key, items in dct.items():
    keys.append(key)
    lens.append(len(items))
    vals.extend(items)

keys = np.array(keys)
lens = np.array(lens)
vals = np.array(vals)
# val_idx maps every entry of vals to its position in unique_values.
unique_values, val_idx = np.unique(vals, return_inverse=True)

# CSR components: row i (one row per key) owns the column indices
# indices[indptr[i]:indptr[i + 1]], each holding a 1.
data = np.ones_like(val_idx)
indices = val_idx
indptr = np.concatenate(([0], np.cumsum(lens)))

dct_array = sps.csr_matrix((data, indices, indptr))
# After the transpose there is one row per unique item and one column per key.
dct_array = dct_array.T.toarray()
# Keep only the items that occur under at least two keys.
mask = dct_array.sum(axis=-1) >= 2
# The builtin bool replaces the np.bool alias, which NumPy 1.24 removed.
dct_array = dct_array[mask].astype(bool)
unique_values = unique_values[mask]

# View each row as one opaque void scalar so np.unique de-duplicates whole
# rows; grp_idx holds the index of the first row for every distinct pattern.
dct_array = np.ascontiguousarray(dct_array)
dct_array = dct_array.view((np.void,
                            (dct_array.dtype.itemsize *
                             len(keys)))).ravel()
groups, grp_idx = np.unique(dct_array,
                            return_index=True)
# Back to boolean matrices with one column per key.
groups = groups.view(bool).reshape(-1, len(keys))
dct_array = dct_array.view(bool).reshape(-1, len(keys))

for group, idx in zip(groups, grp_idx) :
    # A single pre-formatted argument prints identically on Python 2 and 3.
    print('group {0}'.format(keys[group]))
    # An item is common to the group when its row is True in every column
    # where the group's own row is True.
    common = unique_values[np.all(np.logical_and(dct_array[idx],
                                                 dct_array) ==
                                  dct_array[idx], axis=-1)]
    print('common {0}'.format(common))

这打印出来：

group ['c' 'd']
common [3]
group ['c' 'b']
common [2]
group ['a' 'e']
common [0 5]
group ['a' 'b' 'e']
common [5]

python - 查找所有重叠的字典键组

更新

4 回答 4

Related

Reference