1

我有一个将数据保存到 MongoDB 的 Node.js 应用程序。给定一个文档,我想在数据库中找到最相似的文档。

我的想法是实现某种最近邻算法,将所有记录作为训练序列并返回最相似的文档(包括关于这两个文档相似程度的某种百分比。)

例如,在我的数据库中有这些记录...

{ name: "Bill",   age: 10,  pc: "Mac",      ip: "68.23.13.8" }
{ name: "Alice",  age: 22,  pc: "Windows",  ip: "193.186.11.3" }
{ name: "Bob",    age: 12,  pc: "Windows",  ip: "56.89.22.1" }

...我想找到最接近这个的文件

{ name: "Tom", age: 10, pc: "Mac", ip: "68.23.13.10" }
// algorithm returns "Bill", .76 

是否有任何节点模块/实现采用任何类型的对象/参数并返回其最近的邻居?

4

2 回答 2

2

这是一些示例代码。它假定您可以对每个请求运行搜索。如果要修改它,请确保所有相似度函数都返回一个介于 0 和 1 之间的数字。

function tokenize(string) {
  var tokens = [];
  for (var i = 0; i < string.length-1; i++) {
    tokens.push(string.substr(i,2));
  }

  return tokens.sort();
}

function intersect(a, b)
{
  var ai=0, bi=0;
  var result = new Array();

  while( ai < a.length && bi < b.length )
  {
     if      (a[ai] < b[bi] ){ ai++; }
     else if (a[ai] > b[bi] ){ bi++; }
     else /* they're equal */
     {
       result.push(a[ai]);
       ai++;
       bi++;
     }
  }

  return result;
}

function sum(items) {
  var sum = 0;
  for (var i = 0; i < items.length; i++) {
    sum += items[i];
  }

  return sum;
}

function wordSimilarity(a, b) {
  var left   = tokenize(a);
  var right  = tokenize(b);
  var middle = intersect(left, right);

  return (2*middle.length) / (left.length + right.length);
}

function ipSimilarity(a, b) {
  var left  = a.split('.');
  var right = b.split('.');

  var diffs = [];
  for (var i = 0; i < 4; i++) {
    var diff1 = 255-left[i];
    var diff2 = 255-right[i];
    var diff  = Math.abs(diff2-diff1);

    diffs[i] = diff;
  }

  var distance = sum(diffs)/(255*4);

  return 1 - distance;
}

function ageSimilarity(a, b) {
  var maxAge   = 100;
  var diff1    = maxAge-a;
  var diff2    = maxAge-b;
  var diff     = Math.abs(diff2-diff1);
  var distance = diff / maxAge;

  return 1-distance;
}

function recordSimilarity(a, b) {
  var fields = [
    {name:'name', measure:wordSimilarity},
    {name:'age',  measure:ageSimilarity},
    {name:'pc',   measure:wordSimilarity},
    {name:'ip',   measure:ipSimilarity}
  ];

  var sum = 0;
  for (var i = 0; i < fields.length; i++) {
    var field   = fields[i];
    var name    = field.name;
    var measure = field.measure;
    var sim     = measure(a[name], b[name]);

    sum += sim;
  }

  return sum / fields.length;
}

function findMostSimilar(items, query) {
  var maxSim = 0;
  var result = null;

  for (var i = 0; i < items.length; i++) {
    var item = items[i];
    var sim  = recordSimilarity(item, query);

    if (sim > maxSim) {
      maxSim = sim;
      result = item;
    }
  }

  return result
}

var items = [
  { name: "Bill",   age: 10,  pc: "Mac",      ip: "68.23.13.8" },
  { name: "Alice",  age: 22,  pc: "Windows",  ip: "193.186.11.3" },
  { name: "Bob",    age: 12,  pc: "Windows",  ip: "56.89.22.1" }
];

var query  = { name: "Tom", age: 10, pc: "Mac", ip: "68.23.13.10" };
var result = findMostSimilar(items, query);

console.log(result);
于 2013-01-15T00:09:27.747 回答
0

一种直接的方法是计算两个文档之间的差异,差异越大,距离越大。您可以使用最大可能的差异对差异进行归一化,这应该为您提供可以相互比较的相对距离。

看看这个问题以计算 json 文档的差异。

JSON 对象的增量编码

于 2013-01-14T22:11:18.953 回答