SRSwift 的答案可能是针对您所提问题的最佳答案。标准算法的困难在于:您的函数(响应方差)似乎并不只有一个局部/全局最小值,而是有多个局部最小值;而标准算法随数据密度进行调整的灵活性相对较低,因此在这种情况下效果不佳。如果您有很多“年”,那么这种事情很容易解决,您可以一次跳过一年,而不是跳过五年或十年或其他任何时间(以避免局部最小值);但只有几十年的数据时,这样做是不切实际的。
这是机器学习的一个核心应用——对节点进行聚类——并且有多种解决方案。您的问题似乎特别适合其中最简单的一种,这是我几年前在一门课程中学到的;只要把思路想清楚,它很容易实现。
- 定义一个要最小化的函数,例如 minim_f。
- 定义一个函数,例如 modif_f:它获取您的数据,并将某一个集群的质心(或任何用来定义集群的东西)朝某个方向修改。(质心和方向应该作为参数。)
然后你交替调用 minim_f 和 modif_f ;你调用 minim_f,获取它的值,用一组参数调用 modif_f;然后检查 minim_f ,看看它是否更好。如果是这样,请继续朝着这个方向前进。如果不是,则恢复到上一次迭代的原始值并尝试在 modif_f 中进行不同的修改。继续前进,直到找到局部最小值,希望是全局最小值。
其确切机制各不相同;特别是,您可能一次调整一个或多个质心,并且您必须找出正确的方法来继续调整,直到不再进行调整为止。
我为您的数据写了一个小例子;它确实得出了与 SRSwift 相同的答案,尽管 PROC MEANS 计算出的方差与 SRSwift 程序中的方差不同。我不是统计学家,不敢说哪个是正确的,但它们的工作方式显然足够相似,所以这并不重要。我的实现非常简单,还有很大的改进空间,但希望它解释清楚了基本概念。
/* Read the raw (Year, Response) observations from the in-stream records below. */
data maindat;
  infile datalines;
  input Year Response;
  datalines;
1994 -4.300511714
1994 -9.646920963
1994 -15.86956805
1993 -16.14857235
1993 -13.05797186
1993 -13.80941206
1992 -3.521394503
1992 -1.102526302
1992 -0.137573583
1992 2.669238665
1992 -9.540489193
1992 -19.27474303
1992 -3.527077011
1991 1.676464068
1991 -2.238822314
1991 4.663079037
1991 -5.346920963
1990 -8.543723186
1990 0.507460641
1990 0.995302284
1990 0.464194011
1989 4.728791571
1989 5.578685423
1988 2.771297564
1988 7.109159247
1987 15.96059456
1987 2.985292226
1986 -4.301136971
1985 5.854674875
1985 5.797294021
1984 4.393329025
1983 -6.622580905
1982 0.268500302
1977 12.23062252
;
run;
/* Order the observations by year (in place) so the BY-year merges that follow are valid. */
proc sort data=maindat out=maindat;
  by Year;
run;
/* Start us off with a frequency table by year; YEARFREQ also receives the */
/* cumulative columns (CUM_PCT is used to seed the clusters).              */
proc freq data=maindat;
  tables year / outcum out=yearfreq;
run;
/* Starting guess: cut the years into clusters at each third of the cumulative */
/* percentage of observations (clusters are numbered from 1).                  */
data initial_clusters;
  set yearfreq;
  cluster = 1 + floor(cum_pct / 33.334);
run;
/* Attach each year's initial cluster number to the observations. */
data cluster_years;
  merge maindat
        initial_clusters (keep=cluster year);
  by year;
run;
/* Starting variance: one row per cluster in CLUSTER_VAR, with RESPONSE */
/* holding the variance of the response within that cluster.            */
proc means data=cluster_years;
  var response;
  class cluster;
  types cluster;
  output out=cluster_var var=response;
run;
/* Create the starting 'cumulative' history file: a single row holding the */
/* sum of the per-cluster variances, tagged as iteration 1.                */
data cluster_var_tot (keep=total_var iter);
  set cluster_var end=is_last;
  total_var + response;
  if is_last;          /* only the final row, carrying the full sum, is written */
  iter = 1;
run;
/* The current best cluster assignment starts out as the initial guess. */
data current_clusters;
  set initial_clusters;
  output;
run;
* Here is our recursive cluster-testing macro.;
%macro try_cluster(cluster_adj=, cluster_new=, iter=1, max_iter=10);
  /*
   * Try moving one boundary year out of cluster &cluster_adj and into cluster
   * &cluster_new. The move is kept when the summed per-cluster variance of
   * RESPONSE drops, and the macro then recurses in the same direction.
   *
   * cluster_adj : number of the cluster that gives up a year.
   * cluster_new : number of the cluster that receives that year.
   * iter        : recursion depth counter; callers normally leave it at 1.
   * max_iter    : recursion stops once iter reaches this value (default 10).
   *
   * Reads  : MAINDAT, CURRENT_CLUSTERS, CLUSTER_VAR_TOT.
   * Writes : ADJUSTED_CLUSTERS, CLUSTER_YEARS, CLUSTER_VAR, CLUSTER_VAR_TOT,
   *          and CURRENT_CLUSTERS when the move is accepted.
   *
   * Both MODIF_F and MINIM_F live in this one macro, largely because variable
   * scoping is irritating if they are separated. The MINIM_F portion can be
   * swapped for a different minimization function if needed.
   */

  /* Declare and initialise the flag so a failed DATA step cannot leave it */
  /* unresolved (or pick up a stale value) in the %IF further down.        */
  %local smaller;
  %let smaller = 0;

  /* ---- MODIF_F, basically ---- */
  data adjusted_clusters;
    set current_clusters;
    by cluster;
    /* Moving toward a higher-numbered cluster gives up the LAST row of the */
    /* BY group; otherwise the FIRST row is the one that moves.             */
    %if &cluster_adj. < &cluster_new. %then %do;
      if last.cluster
    %end;
    %else %do;
      if first.cluster
    %end;
      and cluster = &cluster_adj. then cluster = &cluster_new.;
  run;

  data cluster_years;
    merge maindat adjusted_clusters(keep=year cluster);
    by year;
  run;
  /* ---- end MODIF_F ---- */

  /* ---- This would be MINIM_F if it were a function of its own ---- */
  /* Calculate variance by cluster. */
  proc means data=cluster_years noprint;
    class cluster;
    types cluster;
    var response;
    output out=cluster_var var=;
  run;

  data cluster_var_tot;
    set cluster_var_tot cluster_var indsname=dsn end=eof;
    retain last_var last_iter;
    if dsn = 'WORK.CLUSTER_VAR_TOT' then do;
      /* Keep the old cluster variances for history. */
      output;
      last_var  = total_var;
      last_iter = _n_;
    end;
    else do;
      /* Sum up the variance for this iteration.                          */
      /* NOTE(review): relies on SET resetting TOTAL_VAR to missing when  */
      /* it switches to CLUSTER_VAR, so the sum restarts here - confirm.  */
      total_var + response;
      /* A cluster with a single observation has a missing variance that  */
      /* the sum statement silently skips; count those so that such a     */
      /* split is not mistaken for an improvement.                        */
      degenerate + missing(response);
      iter = last_iter + 1;
      if eof then do;
        /* Smaller only when the total really dropped and every cluster   */
        /* contributed a non-missing variance.                            */
        smaller = (last_var > total_var and degenerate = 0);
        /* Save the verdict to the macro-local flag. */
        call symputx('smaller', smaller, 'l');
        /* ... and record the new total only when it is an improvement. */
        if smaller = 1 then output;
      end;
    end;
    keep total_var iter;
  run;
  /* ---- End MINIM_F ---- */

  /* If this iteration was better, then keep iterating, otherwise stop. */
  %if &smaller. = 1 %then %do;
    /* Replace old clusters with better clusters. */
    data current_clusters;
      set adjusted_clusters;
    run;
    %if &iter. < &max_iter. %then %do;
      /* %EVAL keeps ITER a plain integer instead of a growing "1+1+1" text. */
      %try_cluster(cluster_adj=&cluster_adj.,
                   cluster_new=&cluster_new.,
                   iter=%eval(&iter. + 1),
                   max_iter=&max_iter.)
    %end;
  %end;
%mend try_cluster;
/* Let us try a few changes (ITER is left at its default of 1). */
%try_cluster(cluster_adj=1, cluster_new=2);
%try_cluster(cluster_adj=2, cluster_new=1);
%try_cluster(cluster_adj=3, cluster_new=2);
/* That was just an example (that happens to work for this data). */
/* This part would be greatly enhanced by some iteration testing  */
/* and/or data-appropriate modifications.                         */
/* Merge the 'current' clusters back on: the CLUSTER_YEARS left behind by the */
/* last trial is actually one step worse than the accepted assignment.        */
data cluster_years;
  merge maindat
        current_clusters (keep=cluster year);
  by year;
run;
/* Print and save the per-cluster variance of the accepted assignment, */
/* just as a verification.                                             */
proc means data=cluster_years;
  var response;
  class cluster;
  types cluster;
  output out=cluster_var var=response;
run;