这种做法有点笨拙,但确实有效。
这是数据:
# Sample input lines. Each respondent starts with a "[id],name" header row,
# is followed by "question-code,answer" rows, and ends with a lone "," row.
test <- c(
  # first respondent: answers Q-0, Q-1 and Q-2
  "[1234],Bob Smith,",
  "Q-0,Male",
  "Q-1,18-25",
  "Q-2,Computer Science",
  ",",
  # second respondent: no Q-2 answer, header has no trailing comma
  "[5678],Julie Lewis",
  "Q-0,Female",
  "Q-1,18-25",
  ",",
  # third block repeats the first id, this time without a Q-0 answer
  "[1234],Bob Smith,",
  "Q-1,18-25",
  "Q-2,Computer Science",
  ","
)
这是操作代码:
# Reshape the raw lines in `test` into `final`: one row per respondent block,
# with columns name, gender, age, major and id.

# Drop separator rows that hold nothing but a comma.
test <- test[test != ","]

# Header rows are the ones containing a bracketed id ("[1234],Bob Smith,").
# Strip their commas and prefix an "id" label so that every row has the same
# "question,answer" shape.
idcases <- grep("\\[.*\\]", test)
test[idcases] <- paste0("id,", gsub(",", "", test[idcases], fixed = TRUE))

# Respondent counter: goes up by one at every header row. This always yields
# exactly one entry per row, even when rows precede the first header (those
# rows get set 0), so nothing is silently recycled later on.
setid <- cumsum(seq_along(test) %in% idcases)

# Split each row at its FIRST comma only, so an answer that itself contains a
# comma stays in one piece. Trailing commas are dropped from the answer.
result2 <- data.frame(
  set = setid,
  question = sub(",.*$", "", test),
  answer = sub(",+$", "", sub("^[^,]*,", "", test)),
  stringsAsFactors = FALSE
)

# Long -> wide: one row per set, one "answer.<code>" column per question code.
# The leading `set` column is dropped; drop = FALSE keeps a data.frame even if
# only a single column remains.
final <- reshape(
  result2,
  idvar = "set", timevar = "question", direction = "wide"
)[, -1, drop = FALSE]

# Label columns by question code instead of by position: reshape() emits the
# columns in the order the codes first appear, which depends on the data.
# Codes without a label keep their raw code as the column name.
col_labels <- c("id" = "name", "Q-0" = "gender", "Q-1" = "age", "Q-2" = "major")
codes <- sub("^answer\\.", "", names(final))
known <- codes %in% names(col_labels)
codes[known] <- unname(col_labels[codes[known]])
names(final) <- codes

# The header value is "[id]name": split it into separate id and name columns.
id_pattern <- "(\\[.*\\])(.*)"
final$id <- gsub(id_pattern, "\\1", final$name)
final$name <- gsub(id_pattern, "\\2", final$name)
运行后得到:
> final
name gender age major id
1 Bob Smith Male 18-25 Computer Science [1234]
5 Julie Lewis Female 18-25 <NA> [5678]
8 Bob Smith <NA> 18-25 Computer Science [1234]