这可以通过一些颇具技巧的 DATA 步编程来实现。下面的代码采用贪心方法,按顺序依次处理每次访问(visit),并且只从此前尚未被采样过的 id 中抽样(10% 的抽样比例按该次访问中尚未被采样的 id 数计算)。如果某次访问中超过 90% 的 id 已经被采样过,则该次访问输出的行数会少于其 id 总数的 10%。在极端情况下,当某次访问的每个 id 都已被采样过时,该次访问不会输出任何行。
/* Build a synthetic visit/id table used to exercise the sampler below. */
data test_data;
  /* Fixed seed so the generated table is the same on every run. */
  call streaminit(1);
  do visit = 1 to 1000;
    /* One uniform draw per visit decides how many ids (1..1000) it contains. */
    ids_in_visit = ceil(1000 * rand('uniform'));
    /* Ids are always the consecutive integers 1..ids_in_visit. */
    do id = 1 to ids_in_visit;
      output;
    end;
  end;
  /* Helper only; the output keeps just visit and id. */
  drop ids_in_visit;
run;
/*
  Greedy sampling without replacement across visits.
  Visits are processed in dataset order. For each visit, samprate (10%) of the
  ids that have not been output by an earlier visit are selected at random and
  written to SAMPLE; once an id is written it is never eligible again.
  NOTE(review): the counting below assumes an id occurs at most once within a
  visit -- a repeated id would be counted twice in pass 1. Confirm for real data.
*/
data sample;
  /* Compile-time only: puts the test_data variables (including the hash key id) into the PDV. */
  if 0 then set test_data;

  /* Seed 0 requests no fixed seed, so the sample differs between runs (see SAS docs for non-positive seeds). */
  call streaminit(0);

  /* One hash entry per id seen so far; available = 1 until the id is sampled, 0 afterwards. */
  if _n_ = 1 then do;
    dcl hash h();
    rc = h.definekey('id');
    rc = h.definedata('available');
    rc = h.definedone();
  end;

  /* Pass 1 over the current visit: register unseen ids and count the ids still available. */
  do ids_per_visit = 1 by 1 until (last.visit);
    set test_data;
    by visit;
    /* find() loads the stored flag into available; a non-zero return means the id is new. */
    if h.find() ne 0 then do;
      available = 1;
      rc = h.add();
    end;
    available_per_visit = sum(available_per_visit, available);
  end;

  /* Target sample size for this visit: samprate of the still-available ids, rounded to an integer. */
  samprate = 0.1;
  number_to_sample = round(samprate * available_per_visit, 1);

  /*
    Pass 2: this second SET has its own read position, so it re-reads the same
    ids_per_visit rows. Each available id is taken with probability
    (samples still needed) / (available ids still to be examined).
  */
  do _n_ = 1 to ids_per_visit;
    set test_data;

    /* Nothing left to examine in this visit: just consume the remaining rows. */
    if available_per_visit <= 0 then continue;

    rc = h.find();
    /* Already sampled by an earlier visit: skip without touching the counters. */
    if available ne 1 then continue;

    if rand('uniform') < number_to_sample / available_per_visit then do;
      /* Mark the id as used before writing it out. */
      available = 0;
      rc = h.replace();
      samples_per_visit = sum(samples_per_visit, 1);
      output;
      number_to_sample = number_to_sample - 1;
    end;
    /* One fewer available id remains to be examined, whether or not it was taken. */
    available_per_visit = available_per_visit - 1;
  end;
run;
/*
  Duplicate check: NODUPKEY keeps one row per id in sample_dedup, so if no id
  was sampled twice, sample_dedup has the same number of rows as sample.
*/
proc sort nodupkey data=sample out=sample_dedup;
  by id;
run;