1

在我当前的项目中,我们遇到过几次必须根据不同条件实现匹配的情况。首先对问题进行更详细的描述。

我们有一张名为 test 的表:
key Value
1 10
1 -10
1 10
1 20
1 -10
1 10
2 10
2 -10

现在我们要应用一个规则,以便在一个组(由键的值定义)中消除总和为 0 的对。

预期的结果是:
key Value
1 10
1 20

排序顺序不相关。

以下代码是我们解决方案的一个示例。我们希望消除 my_id 为 2 和 7 的观测,以及金额为 10 的 3 条观测中的 2 条。

/* Sample input: eight observations, all belonging to the alias "aaa".
   my_id identifies the observation, amount is the value to be matched. */
data test;
  input my_id
        alias $
        amount;
  cards4;
1 aaa 10
2 aaa -10
3 aaa 8000
4 aaa -16000
5 aaa 700
6 aaa 10
7 aaa -10
8 aaa 10
;;;;
run;

/* Build every candidate match as an ordered pair (a_id, b_id): two
   observations of test that share an alias and whose amounts are exact
   opposites. The condition is symmetric, so whenever (x, y) is present
   (y, x) is present as well. */
proc sql noprint;
  create table zwischen_erg as
    select lhs.my_id as a_id,
           rhs.my_id as b_id
    from test as lhs,
         test as rhs
    where lhs.alias = rhs.alias
      and lhs.amount = -rhs.amount;
quit;

/* Rank the partners of each a_id: once the pairs are ordered by a_id and
   b_id, tmp_id1 is the 1-based position of b_id among all b_id values
   paired with the same a_id. */
proc sort data=zwischen_erg;
  by a_id b_id;
run;

data zwischen_erg1;
  set zwischen_erg;
  by a_id;

  /* The sum statement keeps tmp_id1 across observations, so it only has to
     be restarted on the first row of each a_id group. */
  if first.a_id then tmp_id1 = 1;
  else tmp_id1 + 1;
run;


/* Rank the partners of each b_id: once the pairs are ordered by b_id and
   a_id, tmp_id2 is the 1-based position of a_id among all a_id values
   paired with the same b_id. */
proc sort data=zwischen_erg;
  by b_id a_id;
run;

data zwischen_erg2;
  set zwischen_erg;
  by b_id;

  /* The sum statement keeps tmp_id2 across observations, so it only has to
     be restarted on the first row of each b_id group. */
  if first.b_id then tmp_id2 = 1;
  else tmp_id2 + 1;
run;

/* Choose the observations to eliminate.
   zwischen_erg1 carries tmp_id1 (position of b_id among the partners of
   a_id) and zwischen_erg2 carries tmp_id2 (position of a_id among the
   partners of b_id). A pair is kept only when both positions agree, and the
   a_id of every such pair is written to delete_ids as my_id.
   All columns are referenced through the aliases erg1/erg2: once a table
   has a correlation name, PROC SQL no longer resolves the bare table name. */
proc sql;
  create table delete_ids as 
  select erg1.a_id as my_id
  from zwischen_erg1 as erg1 left join 
       zwischen_erg2 as erg2 on 
                   (erg1.a_id = erg2.a_id and 
                    erg1.b_id = erg2.b_id)
  where erg1.tmp_id1 = erg2.tmp_id2
;
quit;

/* Anti-join: keep the observations of test that find no partner row in
   delete_ids (the joined my_id stays missing for those rows). */
proc sql noprint;
  create table erg as
    select src.*
    from test as src
         left join delete_ids as del
           on src.my_id = del.my_id
    where del.my_id = .;
quit;

该算法似乎有效,至少没有人发现导致错误的输入数据。但是没有人能向我解释它为什么起作用,我也不详细了解它是如何工作的。

所以我有几个问题。

  1. 对于输入数据的所有可能组合,该算法是否以正确的方式消除了配对?
  2. 如果它确实工作正确,那么该算法如何详细工作?尤其是
    tmp_id1 = tmp_id2 的部分。
  3. 有没有更好的算法来消除这类相互抵消的配对?

在此先感谢,祝您编码愉快,
迈克尔

4

2 回答 2

1

这是对第三个问题的回答。以下方法在我看来更简单,性能也可能更好(因为没有用到连接 join)。

/* Step 1: for every key and absolute value, compute the surplus of positive
   over negative occurrences by summing sign(Value) over the group. */
proc sql;
    create view V_INTERMEDIATE_VIEW as
        select key,
               abs(Value)       as Value_abs,
               sum(sign(Value)) as balance
        from INPUT_DATA
        group by key, Value_abs;
quit;

/* Reading balance: a positive number means that many positive rows could
   not be paired off, a negative number means that many negative rows were
   left over, and 0 means everything in the group cancelled out. */

/* Step 2: write the unmatched rows back out. */
data OUTPUT_DATA;
    set V_INTERMEDIATE_VIEW;
    keep key Value;

    /* Restore the sign of whichever side is in excess. */
    Value = Value_abs * sign(balance);

    /* One output row per unmatched occurrence; the loop body never runs
       when balance is 0. */
    do i = 1 to abs(balance);
        output;
    end;
run;




如果您只想要纯 SAS(所以没有 proc sql),您可以按如下方式进行。请注意,它背后的想法保持不变。

/* DATA step view that adds the absolute value, so rows can be grouped by it. */
data V_INTERMEDIATE_VIEW / view=V_INTERMEDIATE_VIEW;
    set INPUT_DATA;
    value_abs = abs(value);
run;

/* Put +x and -x of the same key into one BY group. The order of positive
   and negative rows inside a group is irrelevant: only their running
   balance is used below. */
proc sort data=V_INTERMEDIATE_VIEW out=INTERMEDIATE_DATA;
    by key value_abs;
run;

/* Tally sign(value) per (key, value_abs) group and, on the last row of the
   group, write one row for every occurrence that was left unmatched. */
data OUTPUT_DATA;
    set INTERMEDIATE_DATA;
    by key value_abs;
    keep key value;

    /* Start a fresh tally for each group; the sum statement retains balance
       from one observation to the next. */
    if first.value_abs then balance = 0;
    balance + sign(value);

    if last.value_abs then do;
        /* Give the output the sign of whichever side is in excess. */
        value = value_abs * sign(balance);
        do i = 1 to abs(balance);
            output;
        end;
    end;
run;

注意:感谢 Joe 提供的一些有用的性能建议。

于 2013-10-16T15:31:12.363 回答
0

快速阅读后我没有看到任何错误。但是“zwischen_erg”可能有很多不必要的多对多匹配,这将是低效的。

这似乎有效(但不能保证),并且可能更有效。也更短,所以也许更容易看到发生了什么。

/* Sample input: eight observations, all belonging to the alias "aaa".
   my_id identifies the observation, amount is the value to be matched. */
data test;
input my_id
      alias $
      amount;
cards4;
1 aaa 10
2 aaa -10
3 aaa 8000
4 aaa -16000
5 aaa 700
6 aaa 10
7 aaa -10
8 aaa 10
;;;;
run;

/* Order the observations so that equal amounts of one alias are adjacent. */
proc sort data=test;
    by alias amount;
run;

/* occurrence is the 1-based position of an observation among the rows that
   share its alias and amount. */
data zwischen_erg;
    set test;
    by alias amount;
    if first.amount then occurrence = 0;
    occurrence+1;
run;

/* Anti-join: the k-th occurrence of +x is cancelled by the k-th occurrence
   of -x within the same alias; rows that find no such counterpart are kept.
   The alias has to be part of the join key, otherwise amounts belonging to
   different aliases would cancel each other. */
proc sql;
    create table zwischen as
    select
        a.my_id,
        a.alias,
        a.amount
    from zwischen_erg as a
    left join zwischen_erg as b
    on  a.alias = b.alias
    and a.amount = (-1)*b.amount
    and a.occurrence = b.occurrence
    where b.my_id is missing;
quit;
于 2013-10-16T15:42:04.913 回答