这个问题似乎有两个关键部分。
- 处理字符串以两种不同方式编码的事实
- 将字符串拼接到适当的数据列中
注意:至于一次对多个值应用函数,许多函数已经可以处理向量。例如str_locate
和substr
。
第 1 部分 - 为 m/f // 01/02 编码清理字符串
# We will be using this library later for str_detect, str_replace, etc
library(stringr)
# first, make sure diagnosis is character (strings) and not factor (category)
diagnosis <- as.character(diagnosis)
# We will use a temporary vector, to preserve the original, but this is not a necessary step.
diagnosisTmp <- diagnosis
males <- str_locate(diagnosisTmp, "_01")
females <- str_locate(diagnosisTmp, "_02")
# NOTE: All of this will work fine as long as '_01'/'_02' appears *__only__* as gender code.
# Therefore, we put in the next two lines to check for errors, make sure we didn't accidentally grab a "_01" from the middle of the string
#-------------------------
if (any(str_length(diagnosisTmp) != males[,2], na.rm=T)) stop ("Error in coding for males")
if (any(str_length(diagnosisTmp) != females[,2], na.rm=T)) stop ("Error in coding for females")
#------------------------
# remove all the '_01'/'_02' (replacing with "")
diagnosisTmp <- str_replace(diagnosisTmp, "_01", "")
diagnosisTmp <- str_replace(diagnosisTmp, "_02", "")
# append to front of string appropriate m/f code
diagnosisTmp[!is.na(males[,1])] <- paste0("m", diagnosisTmp[!is.na(males[,1])])
diagnosisTmp[!is.na(females[,1])] <- paste0("m", diagnosisTmp[!is.na(females[,1])])
# remove superfluous underscores
diagnosisTmp <- str_replace(diagnosisTmp, "_", "")
# display the original next to modified, for visual spot check
cbind(diagnosis, diagnosisTmp)
第 2 部分 - 拼接字符串
# gender is the first char, hospital is the last.
gender <- toupper(str_sub(diagnosisTmp, 1,1))
hosp <- str_sub(diagnosisTmp, -1,-1)
# age, if present is char 2-5. A warning will be shown if values are missing. Age needs to be cleaned up
age <- as.numeric(str_sub(diagnosisTmp, 2,5)) # as.numeric will convert none-numbers to NA
age[!is.na(age)] <- paste(substr(age[!is.na(age)], 1, 2), substr(age[!is.na(age)], 3, 4), sep="-")
# diagnosis is variable length, so we have to find where to start
diagStart <- 2 + 4*(!is.na(age))
diag <- str_sub(diagnosisTmp, diagStart, -2)
# Put it all together into a data frame
dat <- data.frame(diagnosis, hosp, diag, age, gender)
## OR WITHOUT ORIGINAL DIAGNOSIS STRING ##
dat <- data.frame(hosp, diag, age, gender)