我有类似的要求,但我想保留最新的条目。以下查询适用于我的包含数百万条记录和重复项的集合。
/** Create a array to store all duplicate records ids*/
var duplicates = [];
/** Start Aggregation pipeline*/
db.collection.aggregate([
{
$match: { /** Add any filter here. Add index for filter keys*/
filterKey: {
$exists: false
}
}
},
{
$sort: { /** Sort it in such a way that you want to retain first element*/
createdAt: -1
}
},
{
$group: {
_id: {
key1: "$key1", key2:"$key2" /** These are the keys which define the duplicate. Here document with same value for key1 and key2 will be considered duplicate*/
},
dups: {
$push: {
_id: "$_id"
}
},
count: {
$sum: 1
}
}
},
{
$match: {
count: {
"$gt": 1
}
}
}
],
{
allowDiskUse: true
}).forEach(function(doc){
doc.dups.shift();
doc.dups.forEach(function(dupId){
duplicates.push(dupId._id);
})
})
/** Delete the duplicates*/
var i,j,temparray,chunk = 100000;
for (i=0,j=duplicates.length; i<j; i+=chunk) {
temparray = duplicates.slice(i,i+chunk);
db.collection.bulkWrite([{deleteMany:{"filter":{"_id":{"$in":temparray}}}}])
}