我有以下数据和代码(用于匹配两个数据集data1和data2),我想知道我是否可以改进代码以实现效率。
kk<-structure(list(dummy = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), exact = c(4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), common1 = c(87L,
79L, 82L, 87L, 94L, 68L, 67L, 83L, 73L, 83L, 83L, 87L, 66L, 87L,
77L, 84L, 62L, 80L, 75L, 76L, 80L, 74L, 83L, 81L, 93L, 81L, 76L,
84L, 73L, 52L, 73L, 87L, 69L, 81L, 87L, 79L, 66L, 63L, 63L, 83L,
69L, 70L, 44L, 71L, 72L, 80L, 84L, 47L), common2 = c(5.70294879390762,
9.13248693277132, 9.24850283307053, 9.525315331908, 9.7981270368783,
10.2750511089686, 10.5186731916264, 10.2750511089686, 9.30565055178051,
9.47270463644367, 9.74096862303835, 10.3417424834673, 10.0432494949113,
9.99879773234045, 9.99879773234045, 9.30565055178051, 9.03598698483141,
8.88183630500415, 9.74096862303835, 9.5468126085974, 9.90348755253613,
8.9226582995244, 10.1266311038503, 9.7981270368783, 9.39266192877014,
9.7981270368783, 9.21034037197618, 9.5468126085974, 10.3417424834673,
9.5468126085974, 9.62362482913648, 9.61748739820009, 9.21830854162536,
9.2259184019395, 8.75384509275524, 10.4777385781522, 9.51247992951689,
9.07268620667739, 8.06463647577422, 9.7981270368783, 9.5468126085974,
9.68034400122192, 9.04782144247841, 10.4631033404715, 9.21034037197618,
10.2750511089686, 9.10497985631836, 9.04782144247841), y1 = c(NA,
NA, NA, NA, 4400, 1000, 30150, 100, 30, 249000, 38400, 857000,
1930, 18100, 5030, 140000, 380, 300, 120700, 2500, 35500, 200,
500, 6600, 129000, 44000, 1000, 162230, 174010, 700, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 810, 0, 250, 0, 14300, 5200, 19400,
0, 0), y2 = c(NA, NA, NA, NA, 1e+05, 2e+05, 1e+05, 150000, 95000,
1e+05, 50000, 1e+05, 51000, 1e+05, 73000, 125000, 55000, 17000,
3e+05, 3000, 106000, 80000, 150000, 44000, 50000, 55000, 60000,
4e+05, 130000, 60000, NA, NA, NA, NA, NA, NA, NA, NA, NA, 45000,
3000, 45000, 7500, 60000, 120000, 1e+05, 40000, 10000)), .Names = c("dummy",
"exact", "common1", "common2", "y1", "y2"), row.names = 65:112, class = "data.frame")
head(kk)
dummy exact common1 common2 y1 y2
65 1 4 87 5.702949 NA NA
66 1 4 79 9.132487 NA NA
67 1 4 82 9.248503 NA NA
68 1 4 87 9.525315 NA NA
69 0 4 94 9.798127 4400 1e+05
70 0 4 68 10.275051 1000 2e+05
列:Dummy 是一个变量,如果数据为 data1,则其值为 1,如果数据为 data2,则为 0。Common1 和 Common2 是 data1 和 data2 中共有的变量。y1 和 y2 是 data2 独有的变量,因此对于 data1 (dummy==1),这些是 NA。我正在尝试使用 StatMatch 包中的马氏距离(md)来查找基于变量“exact”给出的每个组的公共变量 common1 和 common2 的距离。之后,我试图找到 (md.dif) 的最小距离,然后选择具有 data(dummy==0) 最小值的 data2 (dummy==0) 行。如果有平局,我会从最低限度取样一个。
代码如下:
library(Statmatch) # for mahalanobis distance
for (i in unique(kk$exact)){
cat("number of individuals in data1","\n")
data1.length<-nrow(kk[kk$dummy==1 & kk$exact==i ,])
show(data1.length)
cat("number of individuals in data2","\n")
data2.length<-nrow(kk[kk$dummy==0 & kk$exact==i ,])
show(data2.length)
cat("mahalanobis distance for individuals (data1 and data2) in each exact","\n")
md<-mahalanobis.dist(kk[kk$dummy==1 & kk$exact==i,c("common1","common2")],kk[kk$dummy==0 & kk$exact==i,c("common1","common2")])
show(md)
cat("minimum mahalanobis distance for individuals (data1 and data2) in each exact","\n")
md.dif <-sapply(as.list(rownames(md)),function(x) min(md[x,]))
show(md.dif)
#For each data1 individuals in each exact, there may be more than individuals in data2 that has the same minimum distance (or has same min).
# This reflects the ties
cat("matched data2 individuals for each individuals in data1 in each exact","\n")
nn<-lapply(as.list(rownames(md)),function(x) which(md[x,]==min(md[x,])))
show(nn)
# If there is a tie (more than one individuals in data2 for each individual in data1), sample one of these; if there is no tie, then we have one data2 individual for each data1 individual
cat("matched data2 individuals for each indiviudal in data1 in each exact with sample","\n")
set.seed(123) # for reproducibility
mm<-list()
for (j in (1:length(nn))){
if (length(nn[[j]])>1)
mm[[j]]<-sample(nn[[j]],1,replace=FALSE)
else mm[[j]]<-nn[[j]]
}
#names of mm gives the row index of matched data2 individual for each data1
ss<-sapply(mm,names)
show(ss)
kk[kk$dummy==1 & kk$exact==i ,"data2row"]<-as.numeric(ss)
kk[kk$dummy==1 & kk$exact==i,"md.dif"]<-md.dif
# Imputting the data2 vars (y1 and y2) for matched individuals by creating the new vars; e.g. if data2 var is y1 then data2.y1
# gives imputted y1 for matched data1 individuals
data2vars<-names(kk)[5:6]
cat("imputting the data2 vars (y1 and y2)","\n")
for (k in data2vars){
kk[kk$dummy==1 & kk$exact==i, paste0("data2.",k)]<-kk[[k]][match(as.numeric(ss),rownames(kk))]
}
}
上述代码生成的输出如下:
dummy exact common1 common2 y1 y2 data2row md.dif data2.y1 data2.y2
65 1 4 87 5.702949 NA NA 82 3.7385027 300 17000
66 1 4 79 9.132487 NA NA 82 0.3018370 300 17000
67 1 4 82 9.248503 NA NA 80 0.2422656 140000 125000
68 1 4 87 9.525315 NA NA 92 0.3312446 162230 400000
69 0 4 94 9.798127 4400 1e+05 NA NA NA NA
70 0 4 68 10.275051 1000 2e+05 NA NA NA NA