algorithm - 按相似度排序

Question

我有一系列订单。

[a, b]
[a, b, c]
[a, b, c, d]
[a, b, c, d]
[b, c]
[c, d]

其中a、b、c、d是SKU，大箱子里装满了。并且有数千个订单和数百个可能的 SKU。

现在想象一下，在打包这些订单时，如果一个订单缺少前一个订单中的商品，您必须将该 SKU 的盒子收起来（同样取出一个您没有的盒子）。

您如何对此进行排序，以便有最少数量的盒子更改？或者，用更程序化的术语来说：如何最小化累积汉明距离/最大化集合中相邻项目之间的相交？

我真的不知道从哪里开始。是否已经有一些算法呢？有没有合适的近似值？

score 5 · Accepted Answer

确实@irrelephant 是正确的。这是一个无向哈密顿路径问题。将其建模为一个完整的无向图，其中节点是 sku 集，每条边的权重是各个集之间的汉明距离。那么找到一个打包顺序就相当于找到了一个恰好触及每个节点一次的路径。这是一条哈密顿路径（HP）。你想要最小的重量HP。

坏消息是找到最小权重 HP 是 NP 完全的，这意味着最佳解决方案通常需要指数时间。

好消息是有合理的近似算法。显而易见的贪心算法给出的答案不比最优 HP 的两倍差。这是：

create the graph of Hamming distances
sort the edges by weight in increasing order: e0, e1, ...
set C = emptyset
for e in sequence e0, e1, ...
   if C union {e} does not cause a cycle nor a vertex with degree more than 2 in C
      set C = C union {e}
return C

请注意，if使用经典的不相交集并集查找算法和顶点中的事件边计数器可以在几乎恒定的时间内实现语句测试。

因此，假设计算汉明距离是恒定时间，那么对于 n sku 集，这里的运行时间可以是 O(n^2 log n)。

如果您的词汇表中没有图表，请考虑一个三角形表，其中每对 sku 集都有一个条目。表中的条目是汉明距离。您想要对表条目进行排序，然后将 sku 集对以排序顺序一一添加到您的计划中，跳过会导致“分叉”或“循环”的对。叉子是一组像 (a,b), (b,c), (b,d) 这样的对。循环将是（a，b），（b，c），（c，a）。

还有更复杂的多项式时间算法可以达到 3/2 的近似值。

score 1 · Accepted Answer

我非常喜欢这个问题，我无法抗拒对上面建议的算法进行编码。代码有点长，所以我把它放在一个单独的响应中。

它在示例中提出了这个序列。

Step 1: c d
Step 2: b c
Step 3: a b c
Step 4: a b c d
Step 5: a b c d
Step 6: a b

请注意，此算法忽略了初始设置和最终拆卸成本。它只考虑设置间的距离。这里的汉明距离是 2 + 1 + 1 + 0 + 2 = 6。这与问题中给出的顺序相同的总距离。

#include <stdio.h>
#include <stdlib.h>

// With these data types we can have up to 64k items and 64k sets of items,
// But then the table of pairs is about 20Gb!
typedef unsigned short ITEM, INDEX;

// A sku set in the problem.
struct set {
  INDEX n_elts;
  ITEM *elts;
};

// A pair of sku sets and associated info.
struct pair {
  INDEX i, j;         // Indices of sets.
  ITEM dist;          // Hamming distance between sets.
  INDEX rank, parent; // Disjoint set union/find fields.
};

// For a given set, the adjacent ones along the path under construction.
struct adjacent {
  unsigned char n;  // 0, 1, or 2.
  INDEX elts[2];    // Indices of n adjacent sets.
};

// Some tracing functions for fun.
void print_pair(struct pair *pairs, int i)
{
  struct pair *p = pairs + i;
  printf("%d:(%d,%d@%d)[%d->%d]\n", i, p->i, p->j, p->dist, p->rank, p->parent);
}

void print_adjacent(struct adjacent *adjs, int i)
{
  struct adjacent *a = adjs + i;
  switch (a->n) {
    case 0: printf("%d:o", i); break;
    case 1: printf("%d:o->%d\n", i, a->elts[0]); break;
    default: printf("%d:%d<-o->%d\n", i, a->elts[0], a->elts[1]); break;
  }
}

// Compute the Hamming distance between two sets. Assumes elements are sorted.
// Works a bit like merging.
ITEM hamming_distance(struct set *a, struct set *b)
{
  int ia = 0, ib = 0;
  ITEM d = 0;   
  while (ia < a->n_elts && ib < b->n_elts) {
    if (a->elts[ia] < b->elts[ib]) {
      ++d;
      ++ia;
    }
    else if (a->elts[ia] > b->elts[ib]) {
      ++d;
      ++ib;
    }
    else {
      ++ia;
      ++ib;
    }
  }
  return d + (a->n_elts - ia) + (b->n_elts - ib);
}

// Classic disjoint set find operation.
INDEX find(struct pair *pairs, INDEX x)
{
  if (pairs[x].parent != x)
    pairs[x].parent = find(pairs, pairs[x].parent);
  return pairs[x].parent;
}

// Classic disjoint set union. Assumes x and y are canonical.
void do_union(struct pair *pairs, INDEX x, INDEX y)
{
  if (x == y) return;
  if (pairs[x].rank < pairs[y].rank)
    pairs[x].parent = y;
  else if (pairs[x].rank > pairs[y].rank)
    pairs[y].parent = x;
  else {
    pairs[y].parent = x;
    pairs[x].rank++;
  }
}

// Sort predicate to sort pairs by Hamming distance.
int by_dist(const void *va, const void *vb)
{
  const struct pair *a = va, *b = vb;
  return a->dist < b->dist ? -1 : a->dist > b->dist ? +1 : 0;
}

// Return a plan with greedily found least Hamming distance sum.
// Just an array of indices into the given table of sets.
// TODO: Deal with calloc/malloc failure!
INDEX *make_plan(struct set *sets, INDEX n_sets) 
{
  // Allocate enough space for all the pairs taking care for overflow. 
  // This grows as the square of n_sets!
  size_t n_pairs = (n_sets & 1) ? n_sets / 2 * n_sets : n_sets / 2 * (n_sets - 1);
  struct pair *pairs = calloc(n_pairs, sizeof(struct pair));

  // Initialize the pairs.
  int ip = 0;
  for (int j = 1; j < n_sets; j++) {
    for (int i = 0; i < j; i++) {
      struct pair *p = pairs + ip++;
      p->i = i;
      p->j = j;
      p->dist = hamming_distance(sets + i, sets + j);
    }
  }

  // Sort by Hamming distance.
  qsort(pairs, n_pairs, sizeof pairs[0], by_dist);

  // Initialize the disjoint sets.
  for (int i = 0; i < n_pairs; i++) {
    struct pair *p = pairs + i;
    p->rank = 0;
    p->parent = i;
  }

  // Greedily add pairs to the Hamiltonian path so long as they don't cause a non-path!
  ip = 0;
  struct adjacent *adjs = calloc(n_sets, sizeof(struct adjacent));
  for (int i = 0; i < n_pairs; i++) {
    struct pair *p = pairs + i;
    struct adjacent *ai = adjs + p->i, *aj = adjs + p->j; 

    // Continue if we'd get a vertex with degree 3 by adding this edge.
    if (ai->n == 2 || aj->n == 2) continue;

    // Find (possibly) disjoint sets of pair's elements.
    INDEX i_set = find(pairs, p->i);
    INDEX j_set = find(pairs, p->j);

    // Continue if we'd form a cycle by adding this edge.
    if (i_set == j_set) continue;

    // Otherwise add this edge.
    do_union(pairs, i_set, j_set);
    ai->elts[ai->n++] = p->j;
    aj->elts[aj->n++] = p->i;

    // Done after we've added enough pairs to touch all sets in a path.
    if (++ip == n_sets - 1) break;
  }

  // Find a set with only one adjacency, the path start.
  int p = -1;
  for (int i = 0; i < n_sets; ++i) 
    if (adjs[i].n == 1) {
      p = i;
      break;
    }

  // A plan will be an ordering of sets.
  INDEX *plan = malloc(n_sets * sizeof(INDEX));

  // Walk along the path to get the ordering.
  for (int i = 0; i < n_sets; i++) {
    plan[i] = p;
    struct adjacent *a = adjs + p;
    // This logic figures out which adjacency takes us forward.
    p = a->elts[ a->n > 1 && a->elts[1] != plan[i-1] ];
  }

  // Done with intermediate data structures.
  free(pairs);
  free(adjs);

  return plan;
}

// A tiny test case.  Much more testing needed!

#define ARRAY_SIZE(A) (sizeof A / sizeof A[0])
#define SET(Elts) { ARRAY_SIZE(Elts), Elts }

// Items must be in ascending order for Hamming distance calculation.
ITEM a1[] = { 'a', 'b' };
ITEM a2[] = { 'a', 'b', 'c' };
ITEM a3[] = { 'a', 'b', 'c', 'd' };
ITEM a4[] = { 'a', 'b', 'c', 'd' };
ITEM a5[] = { 'b', 'c' };
ITEM a6[] = { 'c', 'd' };

// Out of order to see how we do.
struct set sets[] = { SET(a3), SET(a6), SET(a1), SET(a4), SET(a5), SET(a2) };

int main(void)
{
  int n_sets = ARRAY_SIZE(sets);
  INDEX *plan = make_plan(sets, n_sets);

  for (int i = 0; i < n_sets; i++) {
    struct set *s = sets + plan[i];
    printf("Step %d: ", i+1);
    for (int j = 0; j < s->n_elts; j++) printf("%c ", (char)s->elts[j]);
    printf("\n");
  }
  return 0;
}

algorithm - 按相似度排序

2 回答 2

Related

Reference