2

我无法运行 dedupe。我正在尝试使用这个库从大量地址中删除重复项。这是我的代码:

import collections
import csv
import logging
import optparse
import os
import re

import dedupe
from numpy import nan
from unidecode import unidecode

# Command-line options: every -v on the command line bumps the log verbosity.
optp = optparse.OptionParser()
optp.add_option('-v', '--verbose', dest='verbose', action='count', default=0,
                help='Increase verbosity (specify multiple times for more)'
                )
(opts, args) = optp.parse_args()

# default=0 above keeps opts.verbose an int even when -v is never given, so
# the comparisons below never depend on how None orders against a number.
if opts.verbose >= 2:
    log_level = logging.DEBUG
elif opts.verbose == 1:
    log_level = logging.INFO
else:
    log_level = logging.WARNING
logging.getLogger().setLevel(log_level)

# Input data, followed by the files this script writes.
input_file = 'H:/My Documents/Python Scripts/Dedupe/DupeTester.csv'
output_file = 'csv_example_output.csv'
settings_file = 'csv_example_learned_settings'
training_file = 'csv_example_training.json'

def preProcess(column):
    """Normalise one raw CSV value before it is handed to dedupe.

    Byte strings are decoded as UTF-8; values that are already text are used
    as they are. The text is then passed through ``unidecode``, runs of two
    or more spaces are collapsed to one, newlines become spaces, and
    surrounding whitespace and quote characters are stripped. The result is
    lower-cased.

    Returns the cleaned string (possibly empty).
    """
    # Decode only byte strings: calling .decode() on text that is already
    # unicode makes Python 2 encode it as ASCII first, which fails on any
    # non-ASCII character.
    if isinstance(column, bytes):
        column = column.decode("utf8")
    column = unidecode(column)
    # Spaces are collapsed before newlines are replaced, so a newline next to
    # a space still leaves two spaces behind; the order is kept as it was.
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column

def readData(filename):
    """Load the CSV at *filename* into a dict of cleaned records.

    Every value of every row goes through ``preProcess``. Records are keyed
    by the integer found in the column whose header is the empty string.

    Returns a dict mapping that integer id to a ``{column name: value}`` dict.
    """
    records = {}
    with open(filename) as csv_file:
        for raw in csv.DictReader(csv_file):
            # Clean the whole row first, then work out its id.
            cleaned = dict((name, preProcess(value))
                           for (name, value) in raw.items())
            records[int(raw[''])] = cleaned
    return records


print 'importing data ...'
data_d = readData(input_file)

if os.path.exists(settings_file):
    print 'reading from', settings_file
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)

else:
    fields = [
        {"field" : "fulladdr", "type" : "Address"},
        {"field" : "zip", "type" : "ShortString"},
             ]

deduper = dedupe.Dedupe(fields)

deduper.sample(data_d, 200)

if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        with open(training_file, 'rb') as f:
            deduper.readTraining(f)

print 'starting active labeling...'

dedupe.consoleLabel(deduper)

deduper.train()

with open(training_file, 'w') as tf :
        deduper.writeTraining(tf)

with open(settings_file, 'w') as sf :
        deduper.writeSettings(sf)

print 'blocking...'



threshold = deduper.threshold(data_d, recall_weight=2)



print 'clustering...'
clustered_dupes = deduper.match(data_d, threshold)

print '# duplicate sets', len(clustered_dupes)




cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores) :
        cluster_membership[record_id] = {
            "cluster id" : cluster_id,
            "canonical representation" : canonical_rep,
            "confidence": score
        }

singleton_id = cluster_id + 1

# Copy the input CSV to the output, prefixing each row with its cluster id and
# confidence and appending the cluster's canonical values.
# The output is opened in binary mode because Python 2's csv writer emits its
# own '\r\n' row endings; a text-mode file on Windows would turn them into
# '\r\r\n'. The input keeps the mode it is read with elsewhere in this script.
with open(output_file, 'wb') as f_output, open(input_file) as f_input:
    writer = csv.writer(f_output)
    reader = csv.reader(f_input)

    # canonical_rep is only bound when at least one cluster was found; with
    # no clusters there are simply no canonical columns to add.
    canonical_keys = list(canonical_rep.keys()) if cluster_membership else []

    heading_row = next(reader)
    writer.writerow(['Cluster ID', 'confidence_score']
                    + heading_row
                    + ['canonical_' + key for key in canonical_keys])

    for row in reader:
        # The record id is taken from the first column of each row.
        row_id = int(row[0])
        membership = cluster_membership.get(row_id)
        if membership is not None:
            canonical = membership["canonical representation"]
            leading = [membership["cluster id"], membership['confidence']]
            trailing = [canonical[key].encode('utf8')
                        for key in canonical_keys]
        else:
            # Unclustered rows each get a fresh id and empty extra columns.
            leading = [singleton_id, None]
            singleton_id += 1
            trailing = [None] * len(canonical_keys)
        writer.writerow(leading + row + trailing)

具体来说,当我运行它时,我得到以下信息:

C:\Anaconda\lib\site-packages\dedupe\core.py:18: UserWarning: There may be duplicates in the sample
  warnings.warn("There may be duplicates in the sample")
Traceback (most recent call last):

  File "<ipython-input-1-33e46d604c5f>", line 1, in <module>
    runfile('H:/My Documents/Python Scripts/Dedupe/dupetestscript.py', wdir='H:/My Documents/Python Scripts/Dedupe')

  File "C:\Anaconda\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 580, in runfile
    execfile(filename, namespace)

  File "H:/My Documents/Python Scripts/Dedupe/dupetestscript.py", line 67, in <module>
    deduper.sample(data_d, 200)

  File "C:\Anaconda\lib\site-packages\dedupe\api.py", line 924, in sample
    random_sample_size))

TypeError: unhashable type: 'numpy.ndarray'
4

1 回答 1

2

numpy 数组是可以被修改的(也就是说,它是“可变的”)。Python 通过使用键的哈希值而不是键本身来加速字典访问。

因此,只有像数字、字符串或元组这样的可散列对象才能用作字典中的键。以下是 Python 术语表中对 hashable(可哈希)的定义:

如果一个对象的哈希值在其生命周期内永远不会改变(它需要一个 __hash__() 方法),并且可以与其他对象进行比较(它需要一个 __eq__() 方法),那么它就是可哈希的。比较相等的可散列对象必须具有相同的散列值。

哈希性使对象可用作字典键和集合成员,因为这些数据结构在内部使用哈希值。

Python 的所有不可变内置对象都是可散列的,而可变容器(例如列表或字典)都不可散列。默认情况下,作为用户定义类实例的对象是可散列的;它们彼此比较都不相等(与自身比较除外),它们的哈希值是从它们的 id() 中得出的。

于 2015-01-16T21:19:34.107 回答