我正在尝试将几个数据集依次连接(join)起来,并标记出第一个数据集中在后续数据集中找不到匹配项的观测值。下面是一个示例:我模拟了一个原始数据集,以及另外三个要与之连接的数据集。当前的代码能够得到我想要的结果,但效率非常低,对于大型数据集可能需要运行好几天。是否可以用 apply 或其他函数来完成这项任务?
# Toy datasets: x, y, z and w.
# Each table holds an id column, one simulated measurement and a year/month
# pair. The year and month columns are built by cycling a short vector of
# labels until it is as long as the table's id column.

# Dataset x: ids 1-10 followed by 1-100.
id <- c(1:10, 1:100)
X1 <- rnorm(110)
year <- rep_len(c("2004", "2005", "2006", "2001", "2002"), length(id))
month <- rep_len(
  c("Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "Jan", "Feb", "Mar", "Apr"),
  length(id)
)
x <- data.frame(id, X1, month, year)

# Dataset y: ids 1-10 followed by 41-110.
id2 <- c(1:10, 41:110)
Y1 <- rnorm(80)
year <- rep_len(c("2004", "2005", "2006", "2001"), length(id2))
month <- rep_len(
  c("Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "Jan", "Feb", "Mar", "Apr"),
  length(id2)
)
y <- data.frame(id2, Y1, year, month)

# Dataset z: ids 1-60 followed by 401-10000, with a Poisson measurement.
id3 <- c(1:60, 401:10000)
Z1 <- rpois(9660, lambda = 10)
year <- rep_len(c("2004", "2005", "2006", "2002"), length(id3))
month <- rep_len(
  c("Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "Jan", "Feb", "Mar", "Apr"),
  length(id3)
)
z <- data.frame(id3, Z1, year, month)

# Dataset w: ids 1-300 followed by 20-29.
id4 <- c(1:300, 20:29)
W1 <- rnorm(310, mean = 20, sd = 36)
year <- rep_len(c("2004", "2005", "2006", "2000", "2002"), length(id4))
month <- rep_len(
  c("Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "Jan", "Feb", "Mar", "Apr"),
  length(id4)
)
w <- data.frame(id4, W1, year, month)
# Flag, for every row of x, whether a row with the same id, year and month
# exists in y, z and w (flag is 1 when at least one such row exists, else 0).
#
# The lookup is vectorized: each table's id/year/month triple is collapsed into
# one character key per row, and membership is tested with %in% (a hashed
# lookup), so no row-by-row scan of y, z or w is needed.

# Build one character key per row from an id, year and month column.
# Components are converted to character by paste(), so ids stored as numbers
# and years stored as strings compare by their text form. A row with a missing
# component gets an NA key so that it can never be reported as a match.
row_key <- function(id, year, month) {
  # "\r" is used as the separator because it does not occur in the ids, years
  # or month labels, so distinct triples cannot collapse to the same key.
  key <- paste(id, year, month, sep = "\r")
  key[is.na(id) | is.na(year) | is.na(month)] <- NA_character_
  key
}

# Return 1 where `key` is present in `other_key`, 0 otherwise.
# %in% would treat NA as matching NA, hence the explicit !is.na() guard.
match_flag <- function(key, other_key) {
  as.numeric(!is.na(key) & key %in% other_key)
}

x_key <- row_key(x$id, x$year, x$month)

# Columns are appended in the order wflag, zflag, yflag, id2, which is the
# layout that print(x) shows.
x$wflag <- match_flag(x_key, row_key(w$id4, w$year, w$month))
x$zflag <- match_flag(x_key, row_key(z$id3, z$year, z$month))
x$yflag <- match_flag(x_key, row_key(y$id2, y$year, y$month))

# A matching row in y has, by construction of the key, the same id as the row
# of x, and unmatched rows fall back to x's own id, so id2 is x$id in both
# cases.
x$id2 <- x$id

print(x)