我刚开始接触 dedupe 库,并在自己的一个项目中使用了 pandas_dedupe。
我想做的是对 pandas DataFrame 中的联系人进行简单的去重,去重所依据的多个列以参数的形式传给函数。
import pandas as pd
from pandas_dedupe import dedupe_dataframe
df = pd.DataFrame.from_dict({'name':['john', 'mark', 'frank', 'jon', 'john'], 'zip':['11', '22', '33', '11', '11']})
dd = dedupe_dataframe(df, ['name', 'zip'], canonicalize=True, sample_size=1)
Importing data ...
Reading from dedupe_dataframe_learned_settings
Clustering...
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-4-4ba151f21074> in <module>
----> 1 dd = dedupe_dataframe(df, ['name', 'zip'], canonicalize=True, sample_size=1)
~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/pandas_dedupe/dedupe_dataframe.py in dedupe_dataframe(df, field_properties, canonicalize, config_name, update_model, threshold, sample_size, n_cores)
250
251 # Cluster the records
--> 252 clustered_df = _cluster(deduper, data_d, threshold, canonicalize)
253 results = df.join(clustered_df, how='left')
254 results.drop(['dictionary'], axis=1, inplace=True)
~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/pandas_dedupe/dedupe_dataframe.py in _cluster(deduper, data, threshold, canonicalize)
144 # ## Clustering
145 print('Clustering...')
--> 146 clustered_dupes = deduper.partition(data, threshold)
147
148 print('# duplicate sets', len(clustered_dupes))
~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/dedupe/api.py in partition(self, data, threshold)
172 """
173 pairs = self.pairs(data)
--> 174 pair_scores = self.score(pairs)
175 clusters = self.cluster(pair_scores, threshold)
176
~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/dedupe/api.py in score(self, pairs)
108 self.data_model,
109 self.classifier,
--> 110 self.num_cores)
111 except RuntimeError:
112 raise RuntimeError('''
~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/dedupe/core.py in scoreDuplicates(record_pairs, data_model, classifier, num_cores)
180 from .backport import Process, Queue # type: ignore
181
--> 182 first, record_pairs = peek(record_pairs)
183 if first is None:
184 raise BlockingError("No records have been blocked together. "
~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/dedupe/core.py in peek(seq)
330 def peek(seq: Iterator) -> Tuple[Optional[Any], Iterator]:
331 try:
--> 332 first = next(seq)
333 except TypeError as e:
334 if "not an iterator" not in str(e):
~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/dedupe/api.py in pairs(self, data)
247
248 con.executemany("INSERT INTO blocking_map values (?, ?)",
--> 249 self.fingerprinter(data.items()))
250
251 self.fingerprinter.reset_indices()
~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/dedupe/blocking.py in __call__(self, records, target)
95
96 for pred_id, predicate in predicates:
---> 97 block_keys = predicate(instance, target=target)
98 for block_key in block_keys:
99 yield block_key + pred_id, record_id
~/opt/anaconda3/envs/pegaclean/lib/python3.6/site-packages/dedupe/predicates.py in __call__(self, record, **kwargs)
82 class StringPredicate(SimplePredicate):
83 def __call__(self, record: RecordDict, **kwargs):
---> 84 column = record[self.field]
85 if column:
86 return self.func(" ".join(strip_punc(column).split()))
KeyError: 'street'
我不明白这里的 'street' 列是什么意思:我的数据框中并没有这样的列。
我查了很多资料,但没有找到任何线索,任何帮助都将不胜感激。