algorithm - 发现长模式

Question

给定一个排序的数字列表，我想找到最长的子序列，其中连续元素之间的差异呈几何增加。所以如果列表是

 1, 2, 3, 4, 7, 15, 27, 30, 31, 81

那么子序列是1, 3, 7, 15, 31。或者考虑1, 2, 5, 6, 11, 15, 23, 41, 47哪个具有5, 11, 23, 47a = 3 和 k = 2 的子序列。

这可以在 O( n ² ) 时间内解决吗？其中 n 是列表的长度。

我对差异级数为ak、ak ²、ak ³等的一般情况感兴趣，其中a和k都是整数，以及a = 1的特殊情况，因此差异级数为k , k ² , k ³等。

score 11 · Accepted Answer

更新

我对算法进行了改进，平均需要 O(M + N^2) 和 O(M+N) 的内存需求。主要与下面描述的协议相同，但为了计算 ech 差异 D 的可能因素 A、K，我预加载了一个表。对于 M=10^7，此表的构建时间不到一秒。

我做了一个 C 实现，用不到 10 分钟的时间来解决 N=10^5 不同的随机整数元素。

以下是 C 语言的源代码：只需执行： gcc -O3 -o findgeo findgeo.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <memory.h>
#include <time.h>

struct Factor {
    int a;
    int k;
    struct Factor *next;
};

struct Factor *factors = 0;
int factorsL=0;

void ConstructFactors(int R) {
    int a,k,C;
    int R2;
    struct Factor *f;
    float seconds;
    clock_t end;
    clock_t start = clock();

    if (factors) free(factors);
    factors = malloc (sizeof(struct Factor) *((R>>1) + 1));
    R2 = R>>1 ;
    for (a=0;a<=R2;a++) {
        factors[a].a= a;
        factors[a].k=1;
        factors[a].next=NULL;
    }
    factorsL=R2+1;
    R2 = floor(sqrt(R));
    for (k=2; k<=R2; k++) {
        a=1;
        C=a*k*(k+1);
        while (C<R) {
            C >>= 1;
            f=malloc(sizeof(struct Factor));
            *f=factors[C];
            factors[C].a=a;
            factors[C].k=k;
            factors[C].next=f;
            a++;
            C=a*k*(k+1);
        }
    }

    end = clock();
    seconds = (float)(end - start) / CLOCKS_PER_SEC;
    printf("Construct Table: %f\n",seconds);
}

void DestructFactors() {
    int i;
    struct Factor *f;
    for (i=0;i<factorsL;i++) {
        while (factors[i].next) {
            f=factors[i].next->next;
            free(factors[i].next);
            factors[i].next=f;
        }
    }
    free(factors);
    factors=NULL;
    factorsL=0;
}

int ipow(int base, int exp)
{
    int result = 1;
    while (exp)
    {
        if (exp & 1)
            result *= base;
        exp >>= 1;
        base *= base;
    }

    return result;
}

void findGeo(int **bestSolution, int *bestSolutionL,int *Arr, int L) {
    int i,j,D;
    int mustExistToBeBetter;
    int R=Arr[L-1]-Arr[0];
    int *possibleSolution;
    int possibleSolutionL=0;
    int exp;
    int NextVal;
    int idx;
    int kMax,aMax;
    float seconds;
    clock_t end;
    clock_t start = clock();


    kMax = floor(sqrt(R));
    aMax = floor(R/2);
    ConstructFactors(R);
    *bestSolutionL=2;
    *bestSolution=malloc(0);

    possibleSolution = malloc(sizeof(int)*(R+1));

    struct Factor *f;
    int *H=malloc(sizeof(int)*(R+1));
    memset(H,0, sizeof(int)*(R+1));
    for (i=0;i<L;i++) {
        H[ Arr[i]-Arr[0] ]=1;
    }
    for (i=0; i<L-2;i++) {
        for (j=i+2; j<L; j++) {
            D=Arr[j]-Arr[i];
            if (D & 1) continue;
            f = factors + (D >>1);
            while (f) {
                idx=Arr[i] + f->a * f->k  - Arr[0];
                if ((f->k <= kMax)&& (f->a<aMax)&&(idx<=R)&&H[idx]) {
                    if (f->k ==1) {
                        mustExistToBeBetter = Arr[i] + f->a * (*bestSolutionL);
                    } else {
                        mustExistToBeBetter = Arr[i] + f->a * f->k * (ipow(f->k,*bestSolutionL) - 1)/(f->k-1);
                    }
                    if (mustExistToBeBetter< Arr[L-1]+1) {
                        idx=  floor(mustExistToBeBetter - Arr[0]);
                    } else {
                        idx = R+1;
                    }
                    if ((idx<=R)&&H[idx]) {
                        possibleSolution[0]=Arr[i];
                        possibleSolution[1]=Arr[i] + f->a*f->k;
                        possibleSolution[2]=Arr[j];
                        possibleSolutionL=3;
                        exp = f->k * f->k * f->k;
                        NextVal = Arr[j] + f->a * exp;
                        idx=NextVal - Arr[0];
                        while ( (idx<=R) && H[idx]) {
                            possibleSolution[possibleSolutionL]=NextVal;
                            possibleSolutionL++;
                            exp = exp * f->k;
                            NextVal = NextVal + f->a * exp;
                            idx=NextVal - Arr[0];
                        }

                        if (possibleSolutionL > *bestSolutionL) {
                            free(*bestSolution);
                            *bestSolution = possibleSolution;
                            possibleSolution = malloc(sizeof(int)*(R+1));
                            *bestSolutionL=possibleSolutionL;
                            kMax= floor( pow (R, 1/ (*bestSolutionL) ));
                            aMax= floor(R /  (*bestSolutionL));
                        }
                    }
                }
                f=f->next;
            }
        }
    }

    if (*bestSolutionL == 2) {
        free(*bestSolution);
        possibleSolutionL=0;
        for (i=0; (i<2)&&(i<L); i++ ) {
            possibleSolution[possibleSolutionL]=Arr[i];
            possibleSolutionL++;
        }
        *bestSolution = possibleSolution;
        *bestSolutionL=possibleSolutionL;
    } else {
        free(possibleSolution);
    }
    DestructFactors();
    free(H);

    end = clock();
    seconds = (float)(end - start) / CLOCKS_PER_SEC;
    printf("findGeo: %f\n",seconds);
}

int compareInt (const void * a, const void * b)
{
    return *(int *)a - *(int *)b;
}

int main(void) {
    int N=100000;
    int R=10000000;
    int *A = malloc(sizeof(int)*N);
    int *Sol;
    int SolL;
    int i;


    int *S=malloc(sizeof(int)*R);
    for (i=0;i<R;i++) S[i]=i+1;

    for (i=0;i<N;i++) {
        int r = rand() % (R-i);
        A[i]=S[r];
        S[r]=S[R-i-1];
    }

    free(S);
    qsort(A,N,sizeof(int),compareInt);

/*
    int step = floor(R/N);
    A[0]=1;
    for (i=1;i<N;i++) {
        A[i]=A[i-1]+step;
    }
*/

    findGeo(&Sol,&SolL,A,N);

    printf("[");
    for (i=0;i<SolL;i++) {
        if (i>0) printf(",");
        printf("%d",Sol[i]);
    }
    printf("]\n");
    printf("Size: %d\n",SolL);

    free(Sol);
    free(A);
    return EXIT_SUCCESS;
}

演示

我将尝试证明我提出的算法是 O(N`2+M) 平均分布的随机序列。我不是数学家，也不习惯做这种演示，所以请随意填写以纠正我看到的任何错误。

有 4 个缩进循环，两个第一个是 N^2 因子。M 用于计算可能因素表）。

第三个循环平均每对只执行一次。你可以看到这个检查预先计算的因子表的大小。当 N->inf 时，它的大小是 M。所以每对的平均步数是 M/M=1。

所以证明恰好检查了第四个循环。（对于所有对，遍历好的序列的执行小于或等于 O(N^2)。

为了证明这一点，我将考虑两种情况：一种是 M>>N，另一种是 M ~= N。其中 M 是初始数组的最大差值：M= S(n)-S(1)。

对于第一种情况，(M>>N) 找到重合的概率是 p=N/M。要开始一个序列，它必须与第二个元素和 b+1 个元素重合，其中 b 是迄今为止最好的序列的长度。所以循环将进入 N^2*(N/M)^2 时间。这个系列的平均长度（假设一个无限系列）是 p/(1-p) = N/(MN) 。所以循环将被执行的总次数是 N^2 * (N/M)^2 * N/(MN) 。当 M>>N 时，这接近于 0。这里的问题是当 M~=N 时。

现在让我们考虑一下 M~=N 的情况。让我们认为 b 是迄今为止最好的序列长度。对于 A=k=1 的情况，则序列必须在 Nb 之前开始，因此序列的数量将为 Nb，循环的次数将最大为 (Nb)*b。

对于 A>1 和 k=1，我们可以推断 (NA*b/d)*b 其中 d 是 M/N（数字之间的平均距离）。如果我们添加从 1 到 dN/b 的所有 A，那么我们会看到上限：

$\sum_{A=1}^{dN/b}\left ( N-\frac{Ab}{d} \right )b=\frac{N^2d}{2}$

对于 k>=2 的情况，我们看到序列必须在之前开始 NA*k^b/d ，因此循环将输入 A*k^b/d)*b 从 1 到 dN/k^b 的所有 As 的平均值和相加，它给出的限制为

$\sum_{A=1}^{dN/k^b}\left ( N-\frac{Ak^b}{d} \right )b=\frac{bN^2d}{2k^b}$

这里，最坏的情况是当 b 最小时。因为我们正在考虑最小系列，让我们考虑一个非常糟糕的情况 b = 2 所以给定 k 的第 4 次循环的通过次数将小于

$\frac{dN^2}{k^2}$ .

如果我们将所有 k 从 2 添加到无穷大将是：

$\sum_{k=2}^{\infty } \frac{dN^2}{k^2} = dN^2 \left ( \frac{\pi ^2}{6} -1\right )$

因此，将 k=1 和 k>=2 的所有通道相加，我们有最大值：

$\frac{N^2d}{2} +N^2d \left ( \frac{\pi ^2}{6} -1\right ) = N^2d\left ( \frac{\pi ^2}{6 } - \frac{1}{2}\right ) \simeq 1.45N^2d$

请注意，d=M/N=1/p。

所以我们有两个限制，一个当 d=1/p=M/N 变为 1 时变为无限，另一个当 d 变为无限时变为无限。所以我们的极限是两者的最小值，最坏的情况是当两个方程交叉时。所以如果我们解方程：

$N^2d\left ( \frac{\pi ^2}{6} - \frac{1}{2}\right ) = N^2\left ( \frac{N}{M} \right )^2\ frac{N}{MN} =N^2\left ( \frac{1}{d} \right )^2\frac{1}{d-1}$

我们看到最大值是在 d=1.353 时

因此证明第四个循环总共将被处理少于 1.55N^2 次。

当然，这是针对一般情况的。在最坏的情况下，我无法找到一种方法来生成第四个循环高于 O(N^2) 的系列，并且我坚信它们不存在，但我不是证明它的数学家。

旧答案

这是平均 O((n^2)*cube_root(M)) 的解决方案，其中 M 是数组的第一个元素和最后一个元素之间的差。和 O(M+N) 的内存需求。

1.- 构造一个长度为 M 的数组 H，如果 i 存在于初始数组中，则 M[i - S[0]]=true，如果不存在，则为 false。

2.- 对于数组 S[j] 中的每一对，S[i] 执行：

2.1 检查它是否可以成为可能解决方案的第一和第三要素。为此，请计算满足等式 S(i) = S(j) + AK + AK^2 的所有可能的 A、K 对。检查此 SO 问题以了解如何解决此问题。并检查是否存在第二个元素：S[i]+ A*K

2.2 进一步检查是否存在元素位置是我们拥有的最佳解决方案。例如，如果到目前为止我们拥有的最佳解决方案是 4 个元素，则检查是否存在元素 A[j] + A K + A K^2 + A K^3 + A K^4

2.3 如果 2.1 和 2.2 为真，则迭代这个系列有多长并设置为 bestSolution 直到现在比最后一个更长。

这是javascript中的代码：

function getAKs(A) {
    if (A / 2 != Math.floor(A / 2)) return [];
    var solution = [];
    var i;
    var SR3 = Math.pow(A, 1 / 3);
    for (i = 1; i <= SR3; i++) {
        var B, C;
        C = i;
        B = A / (C * (C + 1));
        if (B == Math.floor(B)) {
            solution.push([B, C]);
        }

        B = i;
        C = (-1 + Math.sqrt(1 + 4 * A / B)) / 2;
        if (C == Math.floor(C)) {
            solution.push([B, C]);
        }
    }

    return solution;
}

function getBestGeometricSequence(S) {
    var i, j, k;

    var bestSolution = [];

    var H = Array(S[S.length-1]-S[0]);
    for (i = 0; i < S.length; i++) H[S[i] - S[0]] = true;

    for (i = 0; i < S.length; i++) {
        for (j = 0; j < i; j++) {
            var PossibleAKs = getAKs(S[i] - S[j]);
            for (k = 0; k < PossibleAKs.length; k++) {
                var A = PossibleAKs[k][0];
                var K = PossibleAKs[k][17];

                var mustExistToBeBetter;
                if (K==1) {
                    mustExistToBeBetter = S[j] + A * bestSolution.length;
                } else {
                    mustExistToBeBetter = S[j] + A * K * (Math.pow(K,bestSolution.length) - 1)/(K-1);
                }

                if ((H[S[j] + A * K - S[0]]) && (H[mustExistToBeBetter - S[0]])) {
                    var possibleSolution=[S[j],S[j] + A * K,S[i]];
                    exp = K * K * K;
                    var NextVal = S[i] + A * exp;
                    while (H[NextVal - S[0]] === true) {
                        possibleSolution.push(NextVal);
                        exp = exp * K;
                        NextVal = NextVal + A * exp;
                    }

                    if (possibleSolution.length > bestSolution.length) {
                        bestSolution = possibleSolution;
                    }
                }
            }
        }
    }
    return bestSolution;
}

//var A= [ 1, 2, 3,5,7, 15, 27, 30,31, 81];
var A=[];
for (i=1;i<=3000;i++) {
    A.push(i);
}
var sol=getBestGeometricSequence(A);

$("#result").html(JSON.stringify(sol));

你可以在这里查看代码：http: //jsfiddle.net/6yHyR/1/

我维持另一种解决方案，因为我相信与 N 相比，当 M 非常大时它仍然更好。

score 1 · Accepted Answer

只是从一些东西开始，这是一个简单的 JavaScript 解决方案：

var input = [0.7, 1, 2, 3, 4, 7, 15, 27, 30, 31, 81], 
    output = [], indexes, values, i, index, value, i_max_length,
    i1, i2, i3, j1, j2, j3, difference12a, difference23a, difference12b, difference23b,
    scale_factor, common_ratio_a, common_ratio_b, common_ratio_c,
    error, EPSILON = 1e-9, common_ratio_is_integer,
    resultDiv = $("#result");

for (i1 = 0; i1 < input.length - 2; ++i1) {
    for (i2 = i1 + 1; i2 < input.length - 1; ++i2) {
        scale_factor = difference12a = input[i2] - input[i1];
        for (i3 = i2 + 1; i3 < input.length; ++i3) {
            difference23a = input[i3] - input[i2];
            common_ratio_1a = difference23a / difference12a;
            common_ratio_2a = Math.round(common_ratio_1a);
            error = Math.abs((common_ratio_2a - common_ratio_1a) / common_ratio_1a);
            common_ratio_is_integer = error < EPSILON;
            if (common_ratio_2a > 1 && common_ratio_is_integer) {
                indexes = [i1, i2, i3];
                j1 = i2;
                j2 = i3
                difference12b = difference23a;
                for (j3 = j2 + 1; j3 < input.length; ++j3) {
                    difference23b = input[j3] - input[j2];
                    common_ratio_1b = difference23b / difference12b;
                    common_ratio_2b = Math.round(common_ratio_1b);
                    error = Math.abs((common_ratio_2b - common_ratio_1b) / common_ratio_1b);
                    common_ratio_is_integer = error < EPSILON;
                    if (common_ratio_is_integer && common_ratio_2a === common_ratio_2b) {
                        indexes.push(j3);
                        j1 = j2;
                        j2 = j3
                        difference12b = difference23b;
                    }
                }
                values = [];
                for (i = 0; i < indexes.length; ++i) {
                    index = indexes[i];
                    value = input[index];
                    values.push(value);
                }
                output.push(values);
            }
        }
    }
}
if (output !== []) {
    i_max_length = 0;
    for (i = 1; i < output.length; ++i) {
        if (output[i_max_length].length < output[i].length)
            i_max_length = i;
    }
    for (i = 0; i < output.length; ++i) {
        if (output[i_max_length].length == output[i].length)
            resultDiv.append("<p>[" + output[i] + "]</p>");
    }
}

输出：

[1, 3, 7, 15, 31]

我找到每个候选子序列的前三个项目，从中计算比例因子和公比，如果公比是整数，则在第三个元素之后迭代剩余元素，并将它们添加到子序列中，即符合前三项定义的几何级数。作为最后一步，我选择具有最大长度的 sebsequence/s。

score 0 · Accepted Answer

这是我在 Javascript 中的解决方案。它应该接近 O(n^2)，除非在某些病理情况下。

function bsearch(Arr,Val, left,right) {
    if (left == right) return left;
    var m=Math.floor((left + right) /2);
    if (Val <= Arr[m]) {
        return bsearch(Arr,Val,left,m);
    } else {
        return bsearch(Arr,Val,m+1,right);
    }
}

function findLongestGeometricSequence(S) {
    var bestSolution=[];

    var i,j,k;
    var H={};
    for (i=0;i<S.length;i++) H[S[i]]=true;

    for (i=0;i<S.length;i++) {
        for (j=0;j<i;j++) {
            for (k=j+1;k<i;) {
                var possibleSolution=[S[j],S[k],S[i]];

                var K = (S[i] - S[k]) / (S[k] - S[j]);
                var A = (S[k] - S[j]) * (S[k] - S[j]) / (S[i] - S[k]); 

                if ((Math.floor(K) == K) && (Math.floor(A)==A)) {
                    exp= K*K*K;
                    var NextVal= S[i] + A * exp;
                    while (H[NextVal] === true) {
                        possibleSolution.push(NextVal);
                        exp = exp * K;
                        NextVal= NextVal + A * exp;                
                    }

                    if (possibleSolution.length > bestSolution.length)
                        bestSolution=possibleSolution;

                    K--;
                } else {
                    K=Math.floor(K);
                }

                if (K>0) {
                    var NextPossibleMidValue= (S[i] + K*S[j]) / (K +1); 
                    k++;
                    if (S[k]<NextPossibleMidValue) {
                        k=bsearch(S,NextPossibleMidValue, k+1, i);
                    }
                } else {
                    k=i;
                }
            }        
        }
    }
    return bestSolution;
}


function Run() {
    var MyS= [0.7, 1, 2, 3, 4, 5,6,7, 15, 27, 30,31, 81];

    var sol = findLongestGeometricSequence(MyS); 

    alert(JSON.stringify(sol));
}

小解释

如果我们取数组 S(j) < S(k) < S(i) 的 3 个数字，那么您可以计算 a 和 k 使得： S(k) = S(j) + a*k 和 S(i) = S(k) + a*k^2（2 个方程和 2 个隐身）。考虑到这一点，您可以检查数组中是否存在 S(next) = S(i) + a*k^3 的数字。如果是这种情况，则继续检查 S(next2) = S(next) + a*k^4 等等。

这将是一个 O(n^3) 解决方案，但您可以利用 k 必须是整数以限制所选 S(k) 点的优势。

如果 a 已知，那么您可以计算 a(k)，并且您只需要在第三个循环中检查一个数字，因此这种情况显然是 O(n^2)。

score 0 · Accepted Answer

Python：

def subseq(a):
    seq = []
    aset = set(a)
    for i, x in enumerate(a):
        # elements after x
        for j, x2 in enumerate(a[i+1:]):
            j += i + 1  # enumerate starts j at 0, we want a[j] = x2
            bk = x2 - x  # b*k (assuming k and k's exponent start at 1)

            # given b*k, bruteforce values of k
            for k in range(1, bk + 1):
                items = [x, x2]  # our subsequence so far
                nextdist = bk * k # what x3 - x2 should look like

                while items[-1] + nextdist in aset:
                    items.append(items[-1] + nextdist)
                    nextdist *= k

                if len(items) > len(seq):
                    seq = items

    return seq

运行时间是O(dn^3)，d两个元素之间的（平均？）距离在哪里n，当然是len(a)。

score 0 · Accepted Answer

我认为这个任务与不久前发布的 Longest equal-spaced subsequence相关。我刚刚在 Python 中稍微修改了我的算法：

from math import sqrt

def add_precalc(precalc, end, (a, k), count, res, N):
    if end + a * k ** res[1]["count"] > N: return

    x = end + a * k ** count
    if x > N or x < 0: return

    if precalc[x] is None: return

    if (a, k) not in precalc[x]:
        precalc[x][(a, k)] = count

    return

def factors(n):
    res = []
    for x in range(1, int(sqrt(n)) + 1):
        if n % x == 0:
            y = n / x
            res.append((x, y))
            res.append((y, x))
    return res

def work(input):
    precalc = [None] * (max(input) + 1)
    for x in input: precalc[x] = {}

    N = max(input)

    res = ((0, 0), {"end":0, "count":0})
    for i, x in enumerate(input):
        for y in input[i::-1]:
            for a, k in factors(x - y):
                if (a, k) in precalc[x]: continue
                add_precalc(precalc, x, (a, k), 2, res, N)

        for step, count in precalc[x].iteritems():
            count += 1
            if count > res[1]["count"]: res = (step, {"end":x, "count":count})
            add_precalc(precalc, x, step, count, res, N)
        precalc[x] = None

    d = [res[1]["end"]]
    for x in range(res[1]["count"] - 1, 0, -1):
        d.append(d[-1] - res[0][0] * res[0][1] ** x)
    d.reverse()
    return d

解释

遍历数组
对于数组的每个前一个元素，计算当前元素和采用的前一个元素之间的差异因子，然后预先计算序列的下一个可能元素并将其保存到 precalc 数组
因此，当到达元素时，precalc 数组中i已经有所有可能的元素序列i，所以我们必须计算下一个可能的元素并将其保存到 precalc。

目前，算法中有一个地方可能很慢 - 每个先前数字的因式分解。我认为可以通过两种优化使其更快：

更有效的分解算法
使用数组已排序并且已经有预先计算的序列的事实，找到一种不查看数组的每个元素的方法

score 0 · Accepted Answer

实际上，这与最长等距子序列完全相同，您只需要考虑数据的对数即可。如果序列为a, ak, ak^2, ak^3，则对数值为ln(a), ln(a) + ln(k), ln(a)+2ln(k), ln(a)+ 3ln(k)，所以它是等距的。反之亦然。上面的问题中有很多不同的代码。

我不认为特殊情况 a=1 可以比上述算法的改编更有效地解决。

algorithm - 发现长模式

6 回答 6

Related

Reference