r - 正则表达式 - 在 R 中查找“C”而不是“JC”的匹配项

Question

设置：

我正在使用正则表达式将棒球阵容组织成一个数据框。

# One string per game. Each string is a sequence of "<position> <player name>"
# entries; note that every string literal ends with a newline character.
LINEUPS <- c('OF Andrew Johnson P Victor Bailey OF Walter Hill 2B Carl Smith 3B Brian Rivera P Joseph Cox 1B Steven Parker SS William Gonzales OF Christopher Taylor C David Washington
',
             'SS J.C. Roberts P Dennis Flores OF Jason Torres 2B Jack Rodriguez OF Randy Baker P Edward Anderson C David Washington 3B Thomas Wilson OF Ryan Walker 1B Robert Harris Jr
',
             '1B J.P. Allen P Philip Hernandez OF Ryan Walker OF Christopher Taylor 2B Jack Rodriguez C Russell James 3B Brian Rivera P Joseph Cox OF Andrew Johnson SS Ralph Martinez
')

# Locate every whole-word position code in each lineup. The result holds the
# match start offsets, with the match lengths in the "match.length" attribute.
# Because "." is a word boundary, this pattern also matches the initials in
# "J.C." and "J.P." -- the problem described below.
mm <- gregexpr("\\b(P|C|OF|SS|1B|2B|3B)\\b", LINEUPS)
# Build one data frame per lineup (x = lineup string, m = its matches,
# i = game index) and stack them with rbind.
players <- do.call("rbind", unname(Map(function(x, m, i) {
  pstart <- m
  # start + length is one character past the end of the match, so the
  # extracted position carries the following character; it is trimmed below.
  pend <- pstart + attr(m, "match.length")
  hstart <- pend + 1
  # A name ends just before the next position starts; the last name runs to
  # the end of the string.
  hend <- c(tail(pstart,-1)-1, nchar(x))
  data.frame(game=i, pos=substring(x, pstart, pend), name=substring(x, hstart, hend))
  
}, LINEUPS, mm, seq_along(LINEUPS))))
# sub() replaces only the first match, so each call removes either a single
# leading whitespace character or the trailing whitespace, not both.
players$pos <- sub("^\\s|\\s+$","", players$pos)
players$name <- sub("^\\s|\\s+$","", players$name)

library(dplyr)
library(tidyr)

# Number the positions that occur more than once within a game
# (P -> P1, P2; OF -> OF1, OF2, OF3), then reshape to one row per game with
# one column per (numbered) position.
players <- players %>%
  group_by(game, pos) %>%
  mutate(pos = if_else(rep(n(), n()) > 1, paste0(pos, row_number()), pos)) %>%
  # Drop the grouping before reshaping so that `pos` is an ordinary column
  # and the result is not left grouped.
  ungroup() %>%
  # Pass `id_cols` by name: recent tidyr releases place `...` directly after
  # `data`, so an unnamed `game` is not reliably matched to `id_cols`.
  pivot_wider(id_cols = game, names_from = pos, values_from = name)

问题：

当球员名字中的首字母缩写恰好与某个位置代码相同时，我遇到了问题。在上面的例子中：SS J.C. Roberts 中的 C 被匹配为位置 C，1B J.P. Allen 中的 P 被匹配为位置 P，导致字符串被错误地分割。

问题：

如何修改当前搜索以排除这些类型的匹配，以便最终得到以下结果：

# Expected result: one vector per output column, each holding one player name
# per lineup, in the same order as LINEUPS.
P1 <- c('Victor Bailey','Dennis Flores','Philip Hernandez')
P2 <- c('Joseph Cox','Edward Anderson','Joseph Cox')
C <- c('David Washington','David Washington','Russell James')
# 1B, 2B and 3B are not syntactic R names, so they are assigned through quoted
# names here and referenced with backticks below.
"1B" <- c('Steven Parker','Robert Harris Jr', 'J.P. Allen')
"2B" <- c('Carl Smith','Jack Rodriguez','Jack Rodriguez')
"3B" <- c('Brian Rivera','Thomas Wilson','Brian Rivera')
SS <- c('William Gonzales','J.C. Roberts','Ralph Martinez')
OF1 <- c('Andrew Johnson','Jason Torres','Ryan Walker')
OF2 <- c('Walter Hill','Randy Baker','Christopher Taylor')
OF3 <- c('Christopher Taylor','Ryan Walker','Andrew Johnson')

# NOTE(review): data.frame() uses check.names = TRUE by default, so the three
# base columns presumably come out as X1B, X2B, X3B -- confirm.
RESULT <- data.frame(P1, P2, C, `1B`, `2B`, `3B`, SS, OF1, OF2, OF3)

score 1 · Accepted Answer

主要技巧：

在正则表达式中使用负向前瞻 (?!<your-pattern>)，禁止单字母位置模式后面紧跟指定的字符——在本例中用 (?!\\.) 禁止其后紧跟一个句点。

辅助函数，最后是处理函数process_lineups()：

require(stringr)

extract_positions <- function(lineups, pos_pattern) {
  # Pull every token matching `pos_pattern` out of each lineup string and
  # strip the whitespace around each token. sapply() simplifies the
  # per-lineup results (to a matrix with one column per lineup when every
  # lineup yields the same number of tokens).
  tokens_per_lineup <- stringr::str_extract_all(lineups, pos_pattern)
  sapply(tokens_per_lineup, stringr::str_trim)
}

extract_names <- function(lineups, pos_pattern) {
  # Split each lineup at the tokens matching `pos_pattern` and trim the
  # pieces, giving the player names in lineup order (one column per lineup
  # when every lineup splits into the same number of pieces).
  res <- sapply(stringr::str_split(lineups, pos_pattern), stringr::str_trim)
  # The first piece is whatever precedes the first position token (an empty
  # string when the lineup starts with a position), so drop row 1.
  # drop = FALSE keeps the result a matrix even when only one row or one
  # column remains (a single lineup, or one position per lineup); callers
  # index it as names[row, column].
  res[-1, , drop = FALSE]
}

get_indexes_matching <- function(pattern, vec) {
  # Positions of the elements of `vec` that `pattern` matches; `pattern` is
  # interpreted as a regular expression.
  which(grepl(pattern, vec))
}

pattern2names <- function(pattern, df) {
  # Derive the column name(s) of a result data frame from a position pattern.
  # The leading "^" and the trailing "$" are removed from `pattern`; when `df`
  # has more than one column the bare name gets the column index appended
  # (e.g. "^P$" -> "P" for one column, "P1" "P2" for two).
  without_tail <- gsub("\\$$", "", pattern)
  base_name <- gsub("^\\^", "", without_tail)
  n_cols <- ncol(df)
  if (n_cols <= 1) {
    return(base_name)
  }
  paste0(base_name, seq_len(n_cols))
}

extract_patterns_to_df <- function(pattern, positions, names) {
  # Collect, for every lineup (column of `positions`), the player names whose
  # position matches `pattern`, and return them as a data frame with one row
  # per lineup. Columns are named after the position: numbered when there are
  # several hits per lineup ("P1" "P2"), plain for a single hit ("C").
  # Stops with an error when the lineups do not all contain the position the
  # same (non-zero) number of times, since no rectangular result exists then.
  res <- sapply(seq_len(ncol(positions)), function(i) {
    names[get_indexes_matching(pattern, positions[, i]), i]
  })
  if (is.matrix(res)) {
    # Several hits per lineup: sapply() gives hits x lineups, so transpose.
    df <- as.data.frame(t(res))
  } else if (is.atomic(res)) {
    # Exactly one hit per lineup: a plain vector becomes a single column.
    df <- data.frame("col" = res)
  } else {
    # sapply() could not simplify: hit counts differ between lineups or are
    # all zero.
    stop("pattern '", pattern, "' must match the same non-zero number of ",
         "positions in every lineup", call. = FALSE)
  }
  names(df) <- pattern2names(pattern, df)
  df
}

process_lineups <- function(LINEUPS, position_pattern, ordered_patterns) {
  # Full pipeline from raw lineup strings to the final wide data frame:
  # extract the positions and the player names, build one data frame per
  # entry of `ordered_patterns`, and bind those column-wise in that order.
  pos_by_lineup <- extract_positions(LINEUPS, position_pattern)
  names_by_lineup <- extract_names(LINEUPS, position_pattern)
  per_pattern <- lapply(ordered_patterns, function(one_pattern) {
    extract_patterns_to_df(one_pattern, pos_by_lineup, names_by_lineup)
  })
  Reduce(cbind, per_pattern)
}

调用函数 process_lineups()：

# Three sample lineups, one string per game, each a sequence of
# "<position> <player name>" entries.
LINEUPS <- c('OF Andrew Johnson P Victor Bailey OF Walter Hill 2B Carl Smith 3B Brian Rivera P Joseph Cox 1B Steven Parker SS William Gonzales OF Christopher Taylor C David Washington',
             'SS J.C. Roberts P Dennis Flores OF Jason Torres 2B Jack Rodriguez OF Randy Baker P Edward Anderson C David Washington 3B Thomas Wilson OF Ryan Walker 1B Robert Harris Jr',
             '1B J.P. Allen P Philip Hernandez OF Ryan Walker OF Christopher Taylor 2B Jack Rodriguez C Russell James 3B Brian Rivera P Joseph Cox OF Andrew Johnson SS Ralph Martinez')
# Use a negative lookahead (?!<pattern>) so that a P or C immediately followed
# by a "." (an initial, as in "J.C." or "J.P.") is not taken as a position.
position_pattern <- "\\b(P(?!\\.)|C(?!\\.)|OF|SS|1B|2B|3B)\\b"
# One anchored pattern per position, in the column order wanted in the result.
ordered_patterns <- c("^P$", "^C$", "^1B$", "^2B$", "^3B$", "^SS$", "^OF$")

res_df <- process_lineups(LINEUPS, position_pattern, ordered_patterns)

结果：

# > res_df
#                 P1              P2                C               1B
# 1    Victor Bailey      Joseph Cox David Washington    Steven Parker
# 2    Dennis Flores Edward Anderson David Washington Robert Harris Jr
# 3 Philip Hernandez      Joseph Cox    Russell James       J.P. Allen
#               2B            3B               SS            OF1
# 1     Carl Smith  Brian Rivera William Gonzales Andrew Johnson
# 2 Jack Rodriguez Thomas Wilson     J.C. Roberts   Jason Torres
# 3 Jack Rodriguez  Brian Rivera   Ralph Martinez    Ryan Walker
#                  OF2                OF3
# 1        Walter Hill Christopher Taylor
# 2        Randy Baker        Ryan Walker
# 3 Christopher Taylor     Andrew Johnson

最后，可以将“1B”、“2B”、“3B”重命名为“X1B”、“X2B”、“X3B”。

score 1 · Accepted Answer

假设你想匹配作为完整单词的 C，但不匹配 J.C. 中的 C。

使用：

\bC\b(?<!\bJ\.C(?=\.))

参见验证示例。套用到您的正则表达式：

\b(P|C|OF|SS|1B|2B|3B)\b(?<!\bJ\.C(?=\.))

请参阅此演示。

在您的代码中：

# perl = TRUE is needed for the lookbehind/lookahead syntax. The trailing
# lookbehind rejects a match that is the "C" of "J.C." -- only that one pair
# of initials is excluded by this pattern.
mm <- gregexpr("\\b(P|C|OF|SS|1B|2B|3B)\\b(?<!\\bJ\\.C(?=\\.))", LINEUPS, perl=TRUE)

score 0 · Accepted Answer

您不是在要求优化，但我忍不住尝试 ;-)

样本数据

# Three sample lineups, one string per game, each a sequence of
# "<position> <player name>" entries.
LINEUPS <- c('OF Andrew Johnson P Victor Bailey OF Walter Hill 2B Carl Smith 3B Brian Rivera P Joseph Cox 1B Steven Parker SS William Gonzales OF Christopher Taylor C David Washington',
             'SS J.C. Roberts P Dennis Flores OF Jason Torres 2B Jack Rodriguez OF Randy Baker P Edward Anderson C David Washington 3B Thomas Wilson OF Ryan Walker 1B Robert Harris Jr',
             '1B J.P. Allen P Philip Hernandez OF Ryan Walker OF Christopher Taylor 2B Jack Rodriguez C Russell James 3B Brian Rivera P Joseph Cox OF Andrew Johnson SS Ralph Martinez')

代码

# Split on the delimiters (position codes) while keeping each delimiter: the
# pattern is zero-width and matches just before a position code, so the code
# stays attached to the piece that follows it.
# The lookahead requires the character after the code not to be ".", which
# keeps initials such as "J.C." from being treated as positions.
# NOTE(review): the leading (?<=.) presumably prevents a split at the very
# start of the string -- confirm.
# Whitespace is then trimmed from every piece with trimws().
pattern <- "(?<=.)(?=\\b(P|C|OF|SS|1B|2B|3B)[^\\.]\\b)"
L <- lapply( strsplit( LINEUPS, pattern, perl = TRUE ), trimws )
# Split each piece after its first word: capture group 1 is the position,
# group 2 is the rest (the player name).
pattern2 <- "^(\\w+)\\s?(.*)$"
# One data frame per lineup; the resulting list is not assigned here.
lapply( L, function(x) {
  data.frame( position = sub( pattern2, "\\1", x ), 
              player   = sub( pattern2, "\\2",x ) )
})

输出

# [[1]]
# position             player
# 1        OF     Andrew Johnson
# 2         P      Victor Bailey
# 3        OF        Walter Hill
# 4        2B         Carl Smith
# 5        3B       Brian Rivera
# 6         P         Joseph Cox
# 7        1B      Steven Parker
# 8        SS   William Gonzales
# 9        OF Christopher Taylor
# 10        C   David Washington
# 
# [[2]]
# position           player
# 1        SS     J.C. Roberts
# 2         P    Dennis Flores
# 3        OF     Jason Torres
# 4        2B   Jack Rodriguez
# 5        OF      Randy Baker
# 6         P  Edward Anderson
# 7         C David Washington
# 8        3B    Thomas Wilson
# 9        OF      Ryan Walker
# 10       1B Robert Harris Jr
# 
# [[3]]
# position             player
# 1        1B         J.P. Allen
# 2         P   Philip Hernandez
# 3        OF        Ryan Walker
# 4        OF Christopher Taylor
# 5        2B     Jack Rodriguez
# 6         C      Russell James
# 7        3B       Brian Rivera
# 8         P         Joseph Cox
# 9        OF     Andrew Johnson
# 10       SS     Ralph Martinez

如果您需要按位置将输出存储为对象，您可以使用list2env

将上述代码的输出存储到ans，然后：

# Stack the per-lineup data frames stored in `ans` into one table, split it by
# the `position` column (keep.by = FALSE leaves that column out of each
# piece), and assign every piece as an object in the global environment.
list2env(
  split(
    data.table::rbindlist( ans, use.names = TRUE ),
    by = "position",
    keep.by = FALSE ),
envir = .GlobalEnv )

score 0 · Accepted Answer

这里有一些很好的解决方案，但我相信我找到了一个更高效的办法：用 LINEUPS <- gsub(".", "", LINEUPS, fixed = TRUE) 把所有的 . 完全去掉。就我的目的而言，名称是否与原始输入数据完全一致并不重要——只要它们以我可以使用的方式组织即可。

简单而实用。:)

r - 正则表达式 - 在 R 中查找“C”而不是“JC”的匹配项

4 回答 4

Related

Reference