我有一个名为 customer 的自定义对象,其中包含 Customer_Name、Address_Line_1、Post_Code 等字段。
我想遍历所有记录,并比较 Customer_Name 的相似度(基于模糊搜索或 Levenshtein 距离)。如果两个名称之间的编辑距离低于某个阈值(即相似度足够高),则更新自定义字段 (Possible_Duplicate_Customer_ID__c),以标识可能的重复项。
我已经设法实现了这一点,但我遇到了两个问题:
1) 超出了 Salesforce 的 governor 限制(脚本语句过多:200001),这可能是由 Levenshtein 距离算法所需的大量循环引起的。2) 我提交更新的列表(newList)中也包含重复的 ID。
private static List<Customer__c> newList = new List<Customer__c>();
webService static Integer findDupes() {
Integer returnCount = 0;
Double cost = 0;
Integer COST_THRESHOLD = 5;
Map<id,Customer__c> cMap = new Map<id,Customer__c>([
select ID, Name, Customer_Name__c, Possible_Duplicate_Customer_ID__c
from Customer__c
]);
List<Customer__c> custList1 = cMap.values();
List<Customer__c> custList2 = custList1.clone();
for (Customer__c cust1 :custList1) {
for (Customer__c cust2 :custList2) {
cost = LevenshteinDistance.computeLevenshteinDistance(
cust1.Customer_Name__c, cust2.Customer_Name__c);
if(cost<COST_THRESHOLD && cost != 0) {
Customer__c c = new Customer__c(
id = cust2.Id,
Possible_Duplicate_Customer_ID__c = cust1.Name
);
newList.add(c);
}
System.debug(cost+' edits to transform '
+cust1.Customer_Name__c+' to '+cust2.Customer_Name__c);
}
}
returnCount = newList.size();
update newList;
return returnCount;
}