0

嗨,我有以下两个文件,我想在这两个文件之间找到匹配项。Test1.csv 中的每条记录最多可以匹配 Test2.csv 中的一条记录,但 Test1.csv 中的多条记录可以匹配 Test2.csv 中的同一条记录。我匹配名称和 domainWithExtension 列。

Test1.csv

Test2.csv

这是代码:

import csv
import logging
import optparse
import os
import re

import dedupe
from unidecode import unidecode

def preProcess(column):
  """Normalise one raw CSV cell for comparison by dedupe.

  Transliterates to ASCII, removes/replaces noise characters,
  collapses runs of spaces and lower-cases the result.

  Returns None when the cleaned value is empty — dedupe's convention
  for a missing value (matches 'has missing': True in the field spec).
  """
  # Robustness fix: csv.DictReader yields None for cells missing from a
  # short row (its restval default), and unidecode(None) would raise a
  # TypeError.  Treat such cells as missing values up front.
  if column is None:
      return None
  column = unidecode(column)
  # Raw strings for the regex patterns; substitutions are unchanged.
  column = re.sub(r'\n', ' ', column)
  column = re.sub(r'-', '', column)
  column = re.sub(r'/', ' ', column)
  column = re.sub(r"'", '', column)
  column = re.sub(r',', '', column)
  column = re.sub(r':', ' ', column)
  column = re.sub(r'  +', ' ', column)
  column = column.strip().strip('"').strip("'").lower().strip()
  if not column:
      column = None
  return column

def readData(filename):
  """
  Read in our data from a CSV file and create a dictionary of records,
  where the key is a unique record ID.

  Keys are the filename concatenated with the row index, so records from
  the two input files can never collide in the combined key space.
  """

  data_d = {}
  # newline='' is what the csv module documentation prescribes when
  # passing a file object to a reader; without it, embedded newlines
  # inside quoted fields can be mis-parsed on some platforms.
  with open(filename, encoding='utf-8', newline='') as f:
      reader = csv.DictReader(f)
      for i, row in enumerate(reader):
          # The original wrapped the comprehension in an extra dict()
          # call — a redundant second copy of an already-fresh dict.
          data_d[filename + str(i)] = {k: preProcess(v) for k, v in row.items()}

  return data_d



if __name__ == '__main__':

    # ---- command-line options / logging verbosity -----------------------
    optp = optparse.OptionParser()
    optp.add_option('-v', '--verbose', dest='verbose', action='count',
                    help='Increase verbosity (specify multiple times for more)'
                    )
    (opts, args) = optp.parse_args()
    log_level = logging.WARNING
    if opts.verbose:
        if opts.verbose == 1:
            log_level = logging.INFO
        elif opts.verbose >= 2:
            log_level = logging.DEBUG
    logging.getLogger().setLevel(log_level)

    output_file = 'data_matching_output.csv'
    settings_file = 'data_matching_learned_settings'
    training_file = 'data_matching_training.json'

    left_file = 'Test1.csv'
    right_file = 'Test2.csv'

    print('importing data ...')
    data_1 = readData(left_file)
    data_2 = readData(right_file)

    if os.path.exists(settings_file):
        # A previously trained model is on disk: load it and skip training.
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            linker = dedupe.StaticRecordLink(sf)

    else:
        # The two columns compared across the files; both may be missing.
        fields = [
            {'field': 'name', 'type': 'String', 'has missing': True},
            {'field': 'domainWithExtension', 'type': 'String', 'has missing': True},
        ]

        linker = dedupe.RecordLink(fields)

        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                linker.prepare_training(data_1,
                                        data_2,
                                        training_file=tf,
                                        sample_size=15000)
        else:
            linker.prepare_training(data_1, data_2, sample_size=15000)

        # BUG FIX: labeling, train() and the settings/training writes used
        # to sit inside the `else` branch above, so when a training file
        # already existed the linker was never trained before join() was
        # called.  Per the official dedupe record-linkage example, these
        # steps must run in BOTH branches (console_label lets you add more
        # labels on top of previously saved ones).
        print('starting active labeling...')

        dedupe.console_label(linker)

        linker.train()

        with open(training_file, 'w') as tf:
            linker.write_training(tf)

        with open(settings_file, 'wb') as sf:
            linker.write_settings(sf)

    print('clustering...')
    # many-to-one: several Test1 records may link to one Test2 record.
    linked_records = linker.join(data_1, data_2, 0.5, constraint='many-to-one')
    print(linked_records)
    print('# duplicate sets', len(linked_records))

    # Map each record id (filename + row index, as built by readData) to
    # its cluster id and link score for the output report.
    cluster_membership = {}
    for cluster_id, (cluster, score) in enumerate(linked_records):
        for record_id in cluster:
            cluster_membership[record_id] = {'Cluster ID': cluster_id,
                                             'Link Score': score}
    print(cluster_membership)

    # ---- write both input files back out, annotated with cluster info ---
    with open(output_file, 'w', encoding="utf-8") as f:

        header_unwritten = True

        for fileno, filename in enumerate((left_file, right_file)):
            with open(filename, encoding="utf-8") as f_input:
                reader = csv.DictReader(f_input)

                if header_unwritten:
                    # Header comes from the first file; both files are
                    # assumed to share the same columns.
                    fieldnames = (['Cluster ID', 'Link Score', 'source file'] +
                                  reader.fieldnames)

                    writer = csv.DictWriter(f, fieldnames=fieldnames)
                    writer.writeheader()

                    header_unwritten = False

                for row_id, row in enumerate(reader):

                    record_id = filename + str(row_id)
                    # Unmatched records get no cluster columns ({}).
                    cluster_details = cluster_membership.get(record_id, {})
                    row['source file'] = fileno
                    row.update(cluster_details)

                    writer.writerow(row)
这有效并给出以下结果:

输出

“Boxaround”的记录在 Test1.csv 中出现两次。因此,我希望这两个记录都与 Test2.csv 中的“Boxaround”记录匹配,并且输出中应该具有相同的集群 ID,但是输出中的集群 ID 4 只有两条记录,另一条“Boxaround”记录的集群 ID 为 0。我希望所有三个“Boxaround”记录都具有相同的集群 ID 4。我怎样才能做到这一点?请帮忙。

4

0 回答 0