0

我想将我在“父”数据集中的观察结果拆分为他们自己独特的“子”数据集。我需要对几个父数据集执行此操作,因此我尝试创建一个内部带有 do 循环的宏来生成这些数据集。但是我的代码不起作用(可能有多种原因)。

这是手动代码作为我想要自动化的示例(此代码工作正常,“父”数据集ta220092在这种情况下有四个观察结果,但在其他“父”数据集中它可能更大或更小):

data ta2200921 ta2200922 ta2200923 ta2200924;
set ta220092;
if _n_ = 1 then output ta2200921;
if _n_ = 2 then output ta2200922;
if _n_ = 3 then output ta2200923;
if _n_ = 4 then output ta2200924;
run;

在试图自动化这一点。我想我应该使用自动“ n ”变量添加到数据集名称和 %to 语句,因为每个“父”数据集中的观察数量各不相同,但我不知道该怎么做。我创建了以下代码,其中有一个问题,我希望有人能提供帮助:

%macro treatmentsplit(j);
%do i = 1 %to &j.;
&j. = _n_;
data tatest220092&i.;
set ta220092 (where = (_n_ = &i.));
run;
%end;
%mend treatmentsplit;
%treatmentsplit;

谢谢你。

除了为了清楚起见编辑上述内容外,我还需要编辑我的问题以解决为什么我不相信这是 Joe 标记的重复问题。他提出的重复问题是分割 sas 数据集以进行批处理的最快方法是什么?

我认为这个问题不是重复的有两个原因。首先,想要分裂的根本原因是不同的。对于我的问题,这不是试图分解大型数据集以进行合理批处理的问题。我将在下一段中解决我想要拆分的根本原因。我不认为这是重复的第二个原因是解决“什么是对 SAS 数据集进行批处理进行分区的最快方法”的代码不适用于我的情况。提供的两个代码答案指定了要拆分父数据集的多个数据集。我事先不知道要拆分的每个数据集的拆分数量,因为每个数据集中的观察数量不同。我试图根据我的情况修改第二个答案(由 RWill),并且到目前为止一直没有成功。这是我迄今为止修改我的情况的第二个答案的最佳尝试(尝试过变体):

%macro nobs(dsn);
%local nobs dsid rc;
%let nobs=0;
%let dsid = %sysfunc(open(&dsn));
%if &dsid %then %do;
  %let nobs = %sysfunc(attrn(&dsid,NOBS));
%end;
%else %put Open for dataset &dsn failed - %sysfunc(sysmsg());
%let rc   = %sysfunc(close(&dsid));
%mend nobs;

%macro batch_process(dsn_in,dsn_out_prefix);
%let dsn_obs = %nobs(&dsn_in);
%let obs_per_dsn = 1;
data
 %do i = 1 %to &dsn_obs;
    &dsn_out_prefix.&i
 %end; ;
 set &dsn_in;
 drop _count;
 retain _count 0;
 _count = _count + 1;
 %do i = 1 %to &dsn_obs;
    if (1 + ((&i - 1) * 1) <= _count <= (&i * 1) then do;
       output &dsn_out_prefix.&i;
    end; 
 %end;
  run;
%mend batch_process;
%batch_process( dsn_in=tmp1.ta220092 , dsn_out_prefix = ta220092);

日志中的错误似乎表明 do 循环中的 DSN_OBS 变量存在问题(第二个宏中的第 5 行):

SYMBOLGEN:宏变量 DSN_OBS 解析为错误:%EVAL 函数没有要计算的表达式,或者 %IF 语句没有条件。错误:%DO I 循环的 %TO 值无效。

为了解决我想要将数据集拆分为每个数据集一个观察值的根本原因,我修改了一个宏,它几乎可以按照我需要的方式工作,但有一个问题。我修改的原始宏用于倾向得分匹配http://www.biostat.umn.edu/~will/6470stuff/Class25-12/PSmatching.sas. 我对其进行了修改以更好地处理我的数据集(更改变量名称),并且我还添加了一个我称为“CC”的方法来计算卡尺,因为我想捕获在我的治疗匹配变量的 10% 或 20% 范围内的所有控件组(将有第二个匹配变量,由最近的邻居选择,但我对该步骤的代码没有问题)。问题在于,在处理数据集(例如上面的 ta220092)中,有两个观察值具有匹配的变量,这些变量具有重叠的计算厚度“区域”——一个具有 62 的资产,一个具有 64 的资产。宏有一个替换选项; 如果我选择“是”,那么我会得到与同一控件匹配的处理 100 次(不是我想要的,我想要计算出的卡尺内的所有控件)。如果我选择“

    %macro Matching(datatreatment=, datacontrol=, method=, numberofcontrols=, caliper=, ccpercent=,
     replacement=, out=);

    /* Create copies of the treated units if N > 1 */;
     data _Treatment0(drop= i);
      set &datatreatment;
      do i= 1 to &numberofcontrols;
      RandomNumber= ranuni(12345);
    output;
    end;
    run;
    /* Randomly sort both datasets */
    proc sort data= _Treatment0 out= _Treatment(drop= RandomNumber);
    by RandomNumber;
    run;
    data _Control0;
    set &datacontrol;
    RandomNumber= ranuni(45678);
    run;
    proc sort data= _Control0 out= _Control(drop= RandomNumber);
    by RandomNumber;
    run;

     data Matched (keep = cikSelectedControl atControl roacontrol roatreat fyear  industry MatchedToTreatcik atTreat);
      length atC 8;
      length cikC 8;
      /* Load Control dataset into the hash object */
      if _N_= 1 then do;
    declare hash h(dataset: "_Control", ordered: 'no');
    declare hiter iter('h');
    h.defineKey('cikC');
    h.defineData('roac','atC','cikC');
    h.defineDone();
    call missing(cikC, atC, roac);
    end;
    /* Open the treatment */
    set _Treatment;
    %if %upcase(&method) ~= RADIUS %then %do;
    retain BestDistance 99;
    %end;
    /* Iterate over the hash */
    rc= iter.first();
    if (rc=0) then BestDistance= 99;
    do while (rc = 0);
    /* Caliper */
    %if %upcase(&method) = CALIPER %then %do;
    if (atT - &caliper) <= atC <= (atT + &caliper) then do;
    ScoreDistance = abs(atT - atC);
    if ScoreDistance < BestDistance then do;
    BestDistance = ScoreDistance;
    cikSelectedControl = cikC;
    atControl =  atC;
    MatchedToTreatcik = cikT;
    atTreat = atT;
    end;
    end;
    %end;
    /* Calculated caliper */
    %if %upcase(&method) = CC %then %do;
    ccdist = &ccpercent*atT;
    if (atT - ccdist) <= atC <= (atT + ccdist) then do;
    ScoreDistance = abs(atT - atC);
    if ScoreDistance < BestDistance then do;
    BestDistance = ScoreDistance;
    cikSelectedControl = cikC;
    atControl =  atC;

    MatchedToTreatcik = cikT;
    atTreat = atT;
    ROAControl = roaC;
    ROATreat=roat;
    end;
    end;
    %end;
    /* NN */
    %if %upcase(&method) = NN %then %do;
    ScoreDistance = abs(atT - atC);
    if ScoreDistance < BestDistance then do;
    BestDistance = ScoreDistance;
    cikSelectedControl = cikC;
    atControl =  atC;
    MatchedToTreatcik = cikT;
    atTreat = atT;
    end;
    %end;

    %if %upcase(&method) = NN or %upcase(&method) = CALIPER or %upcase(&method) = CC      %then %do;
    rc = iter.next();
    /* Output the best control and remove it */
    if (rc ~= 0) and BestDistance ~=99 then do;
    output;
    %if %upcase(&replacement) = NO %then %do;
    rc1 = h.remove(key: cikSelectedControl);
    %end;
    end;
    %end;
    /* Radius */
    %if %upcase(&method) = RADIUS %then %do;
    if (atT - &caliper) <= atC <= (atT + &caliper) then do;
    cikSelectedControl = cikC;
    atControl =  atC;
    MatchedToTreatcik = cikT;
    atTreat = atT;
    output;
    end;
    rc = iter.next();
    %end;
    end;
    run;
    /*to download datasets from wrds to investigate*/
    proc download data=matched; run;
    proc download data=_Control; run;
    /* Delete temporary tables. Quote for debugging */
    proc datasets NOLIST; /*Nolist option should prevent printing of dataset list*/
    delete _:(gennum=all);

    run;
     data &out;
       set Matched;
     run;
    proc datasets NOLIST; /*Nolist option should prevent printing of dataset list*/
    delete Matched;
    %mend Matching;


    %Matching(datatreatment= Ta220092, datacontrol= ca220092, method= cc,
     numberofcontrols= 100, caliper=1, ccpercent=.2, replacement= no, out= matchtest4);

另一个注意事项是,我将通过 WRDS 系统上的 PC SAS 运行这场比赛,这样更快,并且在处理过程中不会冻结我的计算机。

4

1 回答 1

0

我提高了对宏的理解并修改了宏以使其工作。事实证明,计算出的卡尺基本上是具有半径限制的最近邻匹配。因此,当我修改宏以包含计算出的半径时,宏能够匹配我需要的方式(参见上面的问题)。下面是修改后的宏:

/************************************************ 
matching.sas   adapted from

Paper 185-2007  SAS Global Forum 2007
Local and Global Optimal Propensity Score Matching
Marcelo Coca-Perraillon
Health Care Policy Department, Harvard Medical School, Boston, MA

-------------------------------
Treatment and Control observations must be in separate datasets such that 
Control data includes: cikC =  subject_cik, atC = total assets
Treatment data includes: cikT, atT = total assets
cik must be numeric 

method = NN (nearest neighbor), caliper, or radius, or CC or RC --  CC/RC added by 
MRL calcpercent= percentage to be applied to ccvariable or rcvariable to create 
calculated caliper or calculated radius

caliper value = max for matching

replacement = yes/no  whether controls can be matched to more than one case

out = output data set name

example call:

 %Matching(datatreatment= T, datacontrol= C, method= RC,
  numberofcontrols= 1, caliper=, calcpercent=.20, replacement= no, out= matches);

************************************************/
rsubmit;
%macro Matching(datatreatment=, datacontrol=, method=, numberofcontrols=, caliper=,
 calcpercent=, replacement=, out=);

    /* Create copies of the treated units if N > 1 */;
     data _Treatment0(drop= i);
      set &datatreatment;
      do i= 1 to &numberofcontrols;
      RandomNumber= ranuni(12345);
    output;
    end;
    run;
    /* Randomly sort both datasets */
    proc sort data= _Treatment0 out= _Treatment(drop= RandomNumber);
    by RandomNumber;
    run;
    data _Control0;
    set &datacontrol;
    RandomNumber= ranuni(45678);
    run;
    proc sort data= _Control0 out= _Control(drop= RandomNumber);
    by RandomNumber;
    run;

     data Matched (keep = cikSelectedControl atControl roacontrol roatreat fyear industry MatchedToTreatcik atTreat);
      length atC 8;
      length cikC 8;
      /* Load Control dataset into the hash object */
      if _N_= 1 then do;
    declare hash h(dataset: "_Control", ordered: 'no');
    declare hiter iter('h');
    h.defineKey('cikC');
    h.defineData('roac','atC','cikC');
    h.defineDone();
    call missing(cikC, atC, roac);
    end;
    /* Open the treatment */
    set _Treatment;
    %if %upcase(&method) ~= RADIUS or %upcase(&method) ~= CR %then %do;
    retain BestDistance 99;
    %end;
    /* Iterate over the hash */
    rc= iter.first();
    if (rc=0) then BestDistance= 99;
    do while (rc = 0);
    /* Caliper */
    %if %upcase(&method) = CALIPER %then %do;
    if (atT - &caliper) <= atC <= (atT + &caliper) then do;
    ScoreDistance = abs(atT - atC);
    if ScoreDistance < BestDistance then do;
    BestDistance = ScoreDistance;
    cikSelectedControl = cikC;
    atControl =  atC;
    MatchedToTreatcik = cikT;
    atTreat = atT;
    end;
    end;
    %end;
    /* Calculated caliper */
    %if %upcase(&method) = CC %then %do;
    ccdist = &calcpercent*atT;
    if (atT - ccdist) <= atC <= (atT + ccdist) then do;
    ScoreDistance = abs(atT - atC);
    if ScoreDistance < BestDistance then do;
    BestDistance = ScoreDistance;
    cikSelectedControl = cikC;
    atControl =  atC;

    MatchedToTreatcik = cikT;
    atTreat = atT;
    ROAControl = roaC;
    ROATreat=roat;
    end;
    end;
    %end;
    /* NN */
    %if %upcase(&method) = NN %then %do;
    ScoreDistance = abs(atT - atC);
    if ScoreDistance < BestDistance then do;
    BestDistance = ScoreDistance;
    cikSelectedControl = cikC;
    atControl =  atC;
    MatchedToTreatcik = cikT;
    atTreat = atT;
    end;
    %end;

    %if %upcase(&method) = NN or %upcase(&method) = CALIPER or %upcase(&method) = CC %then %do;
    rc = iter.next();
    /* Output the best control and remove it */
    if (rc ~= 0) and BestDistance ~=99 then do;
    output;
    %if %upcase(&replacement) = NO %then %do;
    rc1 = h.remove(key: cikSelectedControl);
    %end;
    end;
    %end;
    /* Radius */
    %if %upcase(&method) = RADIUS %then %do;
    if (atT - &caliper) <= atC <= (atT + &caliper) then do;
    cikSelectedControl = cikC;
    atControl =  atC;
    MatchedToTreatcik = cikT;
    atTreat = atT;
    ROAControl = roaC;
    ROATreat=roat;
    output;
    end;
    rc = iter.next();
    %end;
    /* Calculated Radius */
    %if %upcase(&method) = CR %then %do;
    rcdist = &calcpercent*atT;
    if (atT - rcdist) <= atC <= (atT + rcdist) then do;
    cikSelectedControl = cikC;
    atControl =  atC;
    MatchedToTreatcik = cikT;
    atTreat = atT;
    ROAControl = roaC;
    ROATreat=roat;
    output;
    end;
    rc = iter.next();
    %end;
    end;
    run;
    /*for when testing and  using wrds
    proc download data=matched; run;
    proc download data=_Control; run;*/
    /* Delete temporary tables. Quote for debugging */
    proc datasets NOLIST; /*Nolist option should prevent printing of dataset list*/
    delete _:(gennum=all);

    run;
     data &out;
       set Matched;
     run;
    proc datasets NOLIST; /*Nolist option should prevent printing of dataset list*/
    delete Matched;
    %mend Matching;
于 2014-07-15T11:37:46.830 回答