0

我是 dedupe 库的新手，正在自己的一个项目中使用 pandas_dedupe。

我想对 pandas DataFrame 中的联系人做简单的去重，去重所依据的多个列以参数形式传给函数。

import pandas as pd
from pandas_dedupe import dedupe_dataframe

df = pd.DataFrame.from_dict({'name':['john', 'mark', 'frank', 'jon', 'john'], 'zip':['11', '22', '33', '11', '11']})

dd = dedupe_dataframe(df, ['name', 'zip'], canonicalize=True, sample_size=1)

Importing data ...
Reading from dedupe_dataframe_learned_settings
Clustering...
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-4-4ba151f21074> in <module>
----> 1 dd = dedupe_dataframe(df, ['name', 'zip'], canonicalize=True, sample_size=1)

~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/pandas_dedupe/dedupe_dataframe.py in dedupe_dataframe(df, field_properties, canonicalize, config_name, update_model, threshold, sample_size, n_cores)
    250 
    251     # Cluster the records
--> 252     clustered_df = _cluster(deduper, data_d, threshold, canonicalize)
    253     results = df.join(clustered_df, how='left')
    254     results.drop(['dictionary'], axis=1, inplace=True)

~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/pandas_dedupe/dedupe_dataframe.py in _cluster(deduper, data, threshold, canonicalize)
    144     # ## Clustering
    145     print('Clustering...')
--> 146     clustered_dupes = deduper.partition(data, threshold)
    147 
    148     print('# duplicate sets', len(clustered_dupes))

~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/dedupe/api.py in partition(self, data, threshold)
    172         """
    173         pairs = self.pairs(data)
--> 174         pair_scores = self.score(pairs)
    175         clusters = self.cluster(pair_scores, threshold)
    176 

~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/dedupe/api.py in score(self, pairs)
    108                                            self.data_model,
    109                                            self.classifier,
--> 110                                            self.num_cores)
    111         except RuntimeError:
    112             raise RuntimeError('''

~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/dedupe/core.py in scoreDuplicates(record_pairs, data_model, classifier, num_cores)
    180         from .backport import Process, Queue  # type: ignore
    181 
--> 182     first, record_pairs = peek(record_pairs)
    183     if first is None:
    184         raise BlockingError("No records have been blocked together. "

~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/dedupe/core.py in peek(seq)
    330 def peek(seq: Iterator) -> Tuple[Optional[Any], Iterator]:
    331     try:
--> 332         first = next(seq)
    333     except TypeError as e:
    334         if "not an iterator" not in str(e):

~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/dedupe/api.py in pairs(self, data)
    247 
    248             con.executemany("INSERT INTO blocking_map values (?, ?)",
--> 249                             self.fingerprinter(data.items()))
    250 
    251             self.fingerprinter.reset_indices()

~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/dedupe/blocking.py in __call__(self, records, target)
     95 
     96             for pred_id, predicate in predicates:
---> 97                 block_keys = predicate(instance, target=target)
     98                 for block_key in block_keys:
     99                     yield block_key + pred_id, record_id

~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/dedupe/predicates.py in __call__(self, record, **kwargs)
     82 class StringPredicate(SimplePredicate):
     83     def __call__(self, record: RecordDict, **kwargs):
---> 84         column = record[self.field]
     85         if column:
     86             return self.func(" ".join(strip_punc(column).split()))

KeyError: 'street'

我不明白这里的 'street' 列是什么意思：我的 DataFrame 中并没有这样的列。我查了很多资料，但没有找到任何答案。任何帮助都将不胜感激。

4

0 回答 0