我们可以使用 `gsubfn` 来得到 0/1 形式的二元指示变量。先用 `gsub` 删除 'INCOM' 中的 `$` 和 `,`,再用 `gsubfn` 捕获其中的数字,将其转换为 `numeric`,与 35000 进行比较,最后提取出这个二元值。
library(gsubfn)
# Build a 0/1 indicator: 1 when either bound of the income bracket in
# `INCOM` exceeds 35000, 0 otherwise. The work is done inside local() so
# the intermediate vectors do not leak into the calling environment.
df1$ind <- local({
  # Drop "$" and "," so each bound is a plain run of digits.
  income_plain <- gsub("[$,]", "", df1$INCOM)
  # Replace the "<low> - <high>" part with "1" or "0"; the two capture
  # groups are the free variables x and y of the formula.
  flagged <- gsubfn(
    "(\\d+) - (\\d+)",
    ~ as.integer(any(as.numeric(c(x, y)) > 35000)),
    income_plain
  )
  # Strip everything up to the last space (the "(n) " prefix), leaving
  # only the flag, and convert it to integer.
  as.integer(sub(".* ", "", flagged))
})
df1$ind
#[1] 1 0 1
或者使用 `tidyverse` 的一个选项:
library(tidyverse)
library(readr)
# Tidyverse variant: split the two dollar bounds of `INCOM` into temporary
# columns, parse them as numbers, flag rows where either bound exceeds
# 35000 as `Ind` (1/0), then drop the temporary columns.
df1 %>%
  # Capture the lower and upper bound (still containing the "," separator)
  # into col1/col2 while keeping the original INCOM column.
  extract(
    INCOM,
    into = c("col1", "col2"),
    regex = ".*\\$(\\d+,\\d+) - \\$(\\d+,\\d+)",
    remove = FALSE
  ) %>%
  # across() replaces the superseded mutate_at(vars(...), ...) form.
  mutate(across(starts_with("col"), parse_number)) %>%
  mutate(Ind = as.integer(col1 > 35000 | col2 > 35000)) %>%
  select(-col1, -col2)
# Household INCOM Ind
#1 1 (5) $50,000 - $74,999 1
#2 2 (3) $25,000 - $34,99 0
#3 3 (4) $35,000 - $49,999 1
或者另一种选择是
# stringr/purrr variant: remove the "," separators, pull out every run of
# digits that directly follows a "$", and return 1L for each row where
# any extracted amount exceeds 35000 (0L otherwise).
df1$INCOM %>%
  str_remove_all(",") %>%
  str_extract_all("(?<=[$])([0-9]+)") %>%
  map_int(function(amounts) as.integer(any(as.numeric(amounts) > 35000)))
#[1] 1 0 1
数据
# Sample data: three households with an income bracket stored as text in
# the form "(<code>) $<low> - $<high>".
# stringsAsFactors = FALSE keeps INCOM a character vector on every R version.
df1 <- data.frame(
  Household = 1:3,
  INCOM = c(
    "(5) $50,000 - $74,999",
    "(3) $25,000 - $34,99",
    "(4) $35,000 - $49,999"
  ),
  stringsAsFactors = FALSE
)