0

我有一个项目,其中经常用 DATA 步和 SET 语句来合并(堆叠)数据集,例如:

/* Concatenate HAVE1 and HAVE2, in that order, into WANT. */
data want;
  set have1
      have2;
run;

问题是,这些数据集的结构通常相同,但同名字符变量的长度不同,堆叠时会发生数据截断。因此,我开始编写一个宏,根据各输入数据集中字符变量的最长长度动态调整变量长度。到目前为止,我写出了下面这段相当简单的代码:

%Macro Formatting;
/* Formatting: takes no parameters; every data set name below is hard-coded.
   Step 1 (PROC CONTENTS + DATA steps): collect NAME and LENGTH of the
           variables whose FORMAT value is "$" from seven data sets.
   Step 2 (PROC SQL + DATA step): line the lengths up by NAME and keep the
           largest one per variable.
   Step 3 (%do loop): for every NAME, rewrite seven data sets in place with
           that length, format and informat.
   NOTE(review): step 1 reads historic.Money_Market_2016 but step 3 rewrites
   Historic.moneymarket2016 - confirm which member name is intended. */

/* ---- Step 1: variable metadata for each data set ---- */
proc contents data = engdata.assets2 out = Assets1 noprint;
run;

data Assets2;
set Assets1;

keep NAME LENGTH;
/* Subsetting IF: keep only rows whose FORMAT value is exactly "$".
   NOTE(review): presumably meant to select character variables; a character
   variable with no format assigned would not match. TYPE = 2 may be the
   intended test - confirm. The same filter is repeated in every DATA step
   of step 1. */
if FORMAT = "$";
run;

proc contents data = historic.assets2016 out = HAssets1 noprint;
run;

data HAssets2;
set HAssets1;

keep NAME LENGTH;
if FORMAT = "$";
run;

proc contents data = engdata.Liabilities2 out = Liabilities1 noprint;
run;

data Liabilities2;
set Liabilities1;

keep NAME LENGTH;
if FORMAT = "$";

run;

proc contents data = historic.Liabilities2016 out = HLiabilities1 noprint;
run;

data HLiabilities2;
set HLiabilities1;

keep NAME LENGTH;
if FORMAT = "$";

run;

proc contents data = engdata.bonds2 out = bonds1 noprint;
run;

data bonds2;
set bonds1;

keep NAME LENGTH;
if FORMAT = "$";

run;

proc contents data = engdata.Irswaps2 out = Irswaps1 noprint;
run;

data Irswaps2;
set Irswaps1;

keep NAME LENGTH;
if FORMAT = "$";

run;

proc contents data = historic.Money_Market_2016 out = MoneyMarket1 noprint;
run;

data MoneyMarket2;
set MoneyMarket1;

keep NAME LENGTH;
if FORMAT = "$";

run;

/* ---- Step 2: one row per variable name with the length from each source ----
   Liabilities2 is the left side of every join, so only the names found in
   Liabilities2 reach AllLength. LengthN is missing where the joined table
   has no row for that NAME.
   Length1=Liabilities2, Length2=Assets2, Length3=Bonds2, Length4=Irswaps2,
   Length5=HLiabilities2, Length6=HAssets2, Length7=MoneyMarket2. */
proc sql;
    create table AllLength as
    select a.*
          ,a.Length as Length1
          ,b.Length as Length2
          ,c.Length as Length3
          ,d.Length as Length4
          ,e.Length as Length5
          ,f.Length as Length6
          ,g.Length as Length7
    from Liabilities2 as a
    left join Assets2 as b
    on a.Name = b.Name
    left join Bonds2 as c
    on a.Name = c.Name
    left join Irswaps2 as d
    on a.Name = d.Name
    left join HLiabilities2 as e
    on a.Name = e.Name
    left join HAssets2 as f
    on a.Name = f.Name
    left join MoneyMarket2 as g
    on a.Name = g.Name
    order by Name;
quit;

/* largest = maximum of Length1-Length7 for the variable.
   index and Varname are computed but dropped by the KEEP statement. */
data AllLength2;
set AllLength;

array LengthVar Length1-Length7;
largest = max(of LengthVar[*]);
index    = whichn(largest, of LengthVar[*]);
Varname = vname(LengthVar[index]);

keep name largest;

run;

/* Var1 = blank-separated variable names, Var2 = blank-separated largest
   lengths, both read from AllLength2 (presumably in the same row order, so
   the n-th word of each belongs together). */
proc sql noprint;
select name into: Var1 separated by " " from AllLength2;
select largest into: Var2 separated by " " from AllLength2;
quit;

%put &var1;
%put &var2;

/* ---- Step 3: apply the largest length, one variable per iteration ----
   %do %until tests its condition after the body, so the body runs at least
   once even when Var1 holds no names.
   NOTE(review): " " is passed to %scan as the delimiter list; in macro code
   the quote marks are ordinary characters, so they presumably act as
   delimiters too. %str( ) is the usual way to pass a single blank. */
%let index = 1;

%do %until (%Scan(&Var1,&index," ")=);

%let Varr1 = %Scan(&Var1,&index," ");
%let Varr2 = %Scan(&Var2,&index," ");

/* Each DATA step below overwrites its input data set. LENGTH is placed
   before SET so that the new length is in effect before the variable is
   read from the data set.
   NOTE(review): when the data set does not contain the variable, the LENGTH
   statement presumably creates it as a new, empty variable - confirm.
   Every data set is rewritten in full once per variable name. */
data engdata.liabilities2;
length &Varr1 $&Varr2..;
set engdata.liabilities2;

format &Varr1 $&Varr2..;
informat &Varr1 $&Varr2..;

run;

data engdata.assets2;
length &Varr1 $&Varr2..;
set engdata.assets2;

format &Varr1 $&Varr2..;
informat &Varr1 $&Varr2..;

run;

data engdata.bonds2;
length &Varr1 $&Varr2..;
set engdata.bonds2;

format &Varr1 $&Varr2..;
informat &Varr1 $&Varr2..;

run;

data engdata.irswaps2;
length &Varr1 $&Varr2..;
set engdata.irswaps2;

format &Varr1 $&Varr2..;
informat &Varr1 $&Varr2..;

run;

data historic.liabilities2016;
length &Varr1 $&Varr2..;
set historic.liabilities2016;

format &Varr1 $&Varr2..;
informat &Varr1 $&Varr2..;

run;

data historic.assets2016;
length &Varr1 $&Varr2..;
set historic.assets2016;

format &Varr1 $&Varr2..;
informat &Varr1 $&Varr2..;

run;

/* NOTE(review): member name differs from the Money_Market_2016 read in
   step 1 - confirm. */
data Historic.moneymarket2016;
length &Varr1 $&Varr2..;
set Historic.moneymarket2016;

format &Varr1 $&Varr2..;
informat &Varr1 $&Varr2..;

run;

%let index = %eval(&Index + 1);

%end;

%mend;

/* Run the macro. */
%Formatting;

有时,我要设置格式的变量在某些数据集中并不存在,这时我会在日志中看到以下信息:

NOTE: There were 0 observations read from the data set HISTORIC.MONEYMARKET2016.
NOTE: The data set HISTORIC.MONEYMARKET2016 has 0 observations and 11 variables.
NOTE: DATA statement used (Total process time):
  real time           0.02 seconds
  cpu time            0.01 seconds

但当我查看数据集时,数据似乎都还在。我会因此丢失任何工作成果吗?有没有办法在这个循环中,当变量在某个数据集中不存在时直接跳过它?

4

1 回答 1

3

与其逐个改写所有数据集(这会导致大量磁盘读写),不如考虑编写代码来构建一组 ATTRIB 语句,把各变量的属性统一(同质化)起来。构造出的语句放在用于堆叠数据的 SET 语句之前,从而强制 PDV(程序数据向量)采用统一后的属性;这意味着所有传入的数据都符合 PDV 中的长度属性,也不会出现警告。

例如,考虑三个数据集

/* Three sample data sets that share the character variable s, declared or
   created with a different length in each one. */
data one;
  s='aaaa';       /* s is created by assigning a 4-character literal */
  y=4;
  length y 4;     /* numeric y is given length 4 */
run;

data two;
  length s $50;   /* s declared as $50; no value is assigned to it here */
  t = 'for 2';
  y = 1.75;
run;

data three;
  length s $20;   /* s declared as $20; no value is assigned to it here */
  z = -1;
run;

以同质化(属性统一)的方式堆叠:

/* Stack data sets one, two and three into next_big_thing. */
%big_stack_attack (datasets=
  one
  two
  three,
  out=next_big_thing
)

堆叠宏只是一个简单的包装器,唯一多做的一步是:先获取用于统一各数据集变量属性的 ATTRIB 语句。

%macro big_stack_attack(datasets=, out=);

  /* Stack the data sets listed in datasets= into the data set named by out=.

     The ATTRIB statements produced by the homogenize macro are emitted
     ahead of the SET statement, so the variable attributes are already
     defined in the PDV when SET starts placing values into it. */

  %local pdv_attrib;

  /* homogenize stores its result in the macro variable whose NAME is passed */
  %homogenize (datasets=&datasets, result=pdv_attrib);

  data &out;
    &pdv_attrib;
    set &datasets;
  run;

%mend big_stack_attack;

用于构造 ATTRIB 语句的宏会检查各数据集的内容(PROC CONTENTS),并在构造出的 ATTRIB 语句中采用每个变量的最长长度:

%macro homogenize (datasets=, result=);

  /* Build ATTRIB statements that give every variable the longest length
     found for it across the listed data sets.

     datasets = blank-separated list of data set names; one-level and
                two-level (libref.member) names are both accepted
     result   = NAME of a macro variable in the calling scope that receives
                the statements, separated by semicolons

     Character variables (type=2) also get a $w. format of the same width.
     Rows are grouped by name and type; no checks are done for like-named
     variables of differing types.
     Side effect: the PROC CONTENTS output data sets _contents1 ...
     _contentsN are left behind. */

  /* Extract each data set name.
     The delimiter is restricted to a blank: the default %scan delimiter
     list includes the period, which would split libref.member into two
     separate words. */

  %local i N;
  %let i = 1;
  %do %while (%length(%scan(&datasets, &i, %str( ))));
    %local data&i;
    %let data&i = %scan(&datasets, &i, %str( ));
    %let i = %eval(&i + 1);
  %end;
  %let N = %eval (&i - 1);

  /* Nothing to inspect: leave the result macro variable untouched instead
     of generating an empty, invalid FROM clause below. */
  %if &N = 0 %then %do;
    %put WARNING: homogenize received an empty datasets= list.;
    %return;
  %end;

  %* get contents of each data set;

  %do i = 1 %to &N;
    proc contents noprint data=&&data&i out=_contents&i;
    run;
  %end;

  %* construct and concatenate an attrib statement for each variable+type;

  proc sql noprint;
    select "attrib " 
      || trim(name) || " length="
      || case when type=2 then "$" else " " end
      || cats(max(length))
      || case 
          when type=2 then " format=$" || cats(max(length)) || "."
          else " " 
         end
    into
      :&result       %* NOTE: result parameter is name of macro-var in containing scope;
    separated by 
      ';'
    from 
(
  %do i = 1 %to &N;
    %if &i > 1 %then UNION;
    select * from _contents&i
  %end;
)
    group by name, type
    ;
  quit;

%mend homogenize;

同名变量类型不同的情况需要额外的编码,并且需要先明确需求(是尝试把字符变量转换为数值型,还是把数值变量转换为字符型)。

于 2018-06-06T15:33:05.837 回答