0

期望的输出

需要以下输出

# Expected result: one row per record, five fields per row.
# v4 and v5 are character columns because the "Absent" marker shares the
# column with the numeric-looking values.
df2 <- data.frame(
  v1 = c(1100001, 1100002, 1100003, 1100004, 1100005),
  v2 = c("A R", "W R", "A K", "M", "A C"),
  v3 = c("P", "G P", "G P", "P", "P"),
  v4 = c("110", "161", "129", "132", "Absent"),
  v5 = c("55", "80.5", "64.5", "66", "Absent")
)
df2

       v1  v2  v3     v4     v5
1 1100001 A R   P    110     55
2 1100002 W R G P    161   80.5
3 1100003 A K G P    129   64.5
4 1100004   M   P    132     66
5 1100005 A C   P Absent Absent

这是我的原始 data.frame

# Raw input: a one-column data frame. Each `value` string holds the five
# fields of a record, separated by runs of spaces of varying width.
df1 <- data.frame(
  value = c(
    "1100001     A R                P             110    55",
    "1100002     W R                 G P 161    80.5",
    "1100003     A K                  G P 129    64.5",
    "1100004     M                      P             132    66",
    "1100005     A C                     P             Absent    Absent"
  ),
  stringsAsFactors = FALSE
)

df1

                                                              value
1            1100001     A R                P             110    55
2                   1100002     W R                 G P 161    80.5
3                  1100003     A K                  G P 129    64.5
4        1100004     M                      P             132    66
5 1100005     A C                     P             Absent    Absent

我使用了 splitstackshape 包中的 cSplit 函数,但无法获得所需的输出。有什么提示吗?

library(splitstackshape)
# Attempt from the question: split `value` on every single space. This also
# breaks multi-word fields such as "A R" apart, which is the problem being
# asked about.
cSplit(df1, "value", sep = " ")

   value_1 value_2 value_3 value_4 value_5 value_6 value_7
1: 1100001       A       R       P     110      55      NA
2: 1100002       W       R       G       P     161    80.5
3: 1100003       A       K       G       P     129    64.5
4: 1100004       M       P     132      66      NA      NA
5: 1100005       A       P  Absent  Absent      NA      NA
4

2 回答 2

1

我们假设一个新字段在两个或多个空格之后开始,或者在后面紧跟数字的单个空格之后开始(该数字即下一个字段的开头)。先用逗号替换这些分隔符,然后使用 read.table 并设置 sep = "," 来读取。

# Turn every field boundary into a comma, then parse the lines as CSV.
# A boundary is either a run of two or more spaces, or a single space that is
# immediately followed by a digit (the lookahead leaves the digit in place).
# read.table(text = ) is used instead of a hand-made textConnection(): the
# latter stays open after read.table() returns, whereas the connection that
# read.table() creates for `text` is closed by read.table() itself.
df1$value |>
  gsub(pattern = "  +| (?=\\d)", replacement = ",", perl = TRUE) |>
  (\(csv_lines) read.table(text = csv_lines, sep = ","))()

给出这个data.frame:

       V1  V2  V3     V4     V5
1 1100001 A R   P    110     55
2 1100002 W R G P    161   80.5
3 1100003 A K G P    129   64.5
4 1100004   M   P    132     66
5 1100005 A C   P Absent Absent

会话日志

> df1 <-
+   structure(list(value = c(
+ "1100001     A R                P             110    55", 
+ "1100002     W R                 G P 161    80.5", 
+ "1100003     A K                  G P 129    64.5", 
+ "1100004     M                      P             132    66",
+ "1100005     A C                     P             Absent    Absent"
+ )), row.names = c(NA, -5L), class = c("data.frame")
+ )
> 
> df2 <-
+   data.frame(
+     v1 = c(1100001, 1100002, 1100003, 1100004, 1100005)
+   , v2 = c("A R", "W R", "A K", "M", "A C")
+   , v3 = c("P", "G P", "G P", "P", "P")
+   , v4 = c(110, 161, 129, 132, "Absent")
+   , v5 = c(55, 80.5, 64.5, 66,  "Absent")
+     )
> 
> df2a <- df1$value |>
+   gsub(pattern = "  +| (?=\\d)", replacement = ",", perl = TRUE) |>
+   textConnection(name = "") |>
+   read.table(sep = ",")
> 
> all(df2 == df2a)
[1] TRUE
于 2021-10-01T12:53:34.627 回答
1

1. 假设分隔规则是:a) 两个或多个连续空格;b) 字母后跟单个空格再跟数字时,在该空格处拆分

2. 我们先确保所有将要成为变量的字段之间都由一个以上的空格分隔(使用 gsub 将字母和数字之间的单个空格替换为两个空格)

3. 然后我们使用 tidyr::separate,以两个或多个空格作为分隔符,将字符串拆分为各个变量

library(dplyr)
library(tidyr)
# Widen a single space between a letter and a following digit to two spaces,
# so that every field boundary is at least two spaces wide; then split
# `value` into v1..v5 on runs of two or more spaces.
# The letter class is written [A-Za-z]: the range [A-z] would also match the
# punctuation characters that sit between "Z" and "a" ([ \ ] ^ _ `).
df1 %>%
  mutate(value = gsub("([A-Za-z]) ([0-9])", "\\1  \\2", value)) %>%
  separate(value, into = paste0("v", 1:5), sep = "[ ]{2,}")

回报:

       v1  v2  v3     v4     v5
1 1100001 A R   P    110     55
2 1100002 W R G P    161   80.5
3 1100003 A K G P    129   64.5
4 1100004   M   P    132     66
5 1100005 A C   P Absent Absent

编辑

关于原始示例中未包含的新约束(数字和字母之间只有一个空格):

建议的解决方案:

我们只需用“反向”的正则表达式再执行一次添加额外空格的命令。这样,数字和字母之间的任何单个空格也会多出一个空格,随后同样会被 separate 调用拆分。

# Widen single-space boundaries in both directions (letter -> digit, then
# digit -> letter) to two spaces, so that every field boundary is at least two
# spaces wide; then split `value` into v1..v5 on runs of two or more spaces.
# The letter class is written [A-Za-z]: the range [A-z] would also match the
# punctuation characters that sit between "Z" and "a" ([ \ ] ^ _ `).
df1 %>%
  mutate(value = gsub("([A-Za-z]) ([0-9])", "\\1  \\2", value)) %>%
  mutate(value = gsub("([0-9]) ([A-Za-z])", "\\1  \\2", value)) %>%
  separate(value, into = paste0("v", 1:5), sep = "[ ]{2,}")
于 2021-10-01T12:53:47.747 回答