2

如您所见,我正在处理一些严重的脏数据。这段代码有效,但看起来有点笨拙。有没有一种更高效、更动态的方式来实现最终结果而无需那么多编码?

我必须分阶段执行此操作,首先标记内容类型,然后利用内容类型将它们填充到相应的列类型中。

感谢你的帮助

#load library
library(dplyr)
library(stringr)
library(lubridate)

#create sample data
df <- tibble(c1 = c('9996155', '4001096', '4001525', '4000590','2020-01-23', '2019-12-23', '2020-01-20', '2019-12-08'),
             c2 = c('4001902', '5000009', '2020-01-23', '2019-12-23', '2020-01-20', '2019-12-08', '4000461', '4000311'),
             c3 = c('W-7', 'W-8', 'W-9', 'W-2', 'W-1', 'W-1','3.527E+20', '3.498E+20'),
             c4 = c('B09/20', 'B04/20', 'B05/20', 'B10/20', 'B06/20',  '3.408E+20', '3.229E+20', '3.225E+20')
             )

数据看起来像这样

> df
# A tibble: 8 x 4
  c1         c2         c3        c4       
  <chr>      <chr>      <chr>     <chr>    
1 9996155    4001902    W-7       B09/20   
2 4001096    5000009    W-8       B04/20   
3 4001525    2020-01-23 W-9       B05/20   
4 4000590    2019-12-23 W-2       B10/20   
5 2020-01-23 2020-01-20 W-1       B06/20   
6 2019-12-23 2019-12-08 W-1       3.408E+20
7 2020-01-20 4000461    3.527E+20 3.229E+20
8 2019-12-08 4000311    3.498E+20 3.225E+20

我做了这样的事情来让它成形


df %>%
  mutate(across(#flag them now to allow next step for data population
    starts_with('c'),
    ~ case_when(
      is.na(.) ~ NA_character_,
      str_detect(., regex('(^20[1,2][0-9]\\-)|(\\/20[1,2][0-9]$)')) ~ 'date',
      str_detect(., regex('\\d\\.\\d{3}[eE][+-]\\d{2}+')) ~ 'numericScientificNotation',
      str_detect(.,regex('(^[a-zA-Z][0-9]{2}\\/2[0-1]{1}$)|(^[A-Z]{1,2}\\-\\d.*[a-zA-Z]*$)|(^[a-zA-Z][0-9]{2})|(^[A-Z][0-9]$)')) ~ 'batches',
      str_detect(., regex('^-?\\d+$')) ~ 'integers',
      TRUE ~ NA_character_
    ),
    .names = paste0('test', "_{col}")
  )) %>% #casewhen to populate new columns
  mutate(integer = case_when(test_c1 == 'integers' ~ c1,
                             test_c2 == 'integers' ~ c2,
                             test_c3 == 'integers' ~ c3,
                             test_c4 == 'integers' ~ c4),
         date = case_when(test_c1 == 'date' ~ c1,
                             test_c2 == 'date' ~ c2,
                             test_c3 == 'date' ~ c3,
                             test_c4 == 'date' ~ c4),
         batches = case_when(test_c1 == 'batches' ~ c1,
                               test_c2 == 'batches' ~ c2,
                               test_c3 == 'batches' ~ c3,
                               test_c4 == 'batches' ~ c4),
         numericScientificNotation = case_when(test_c1 == 'numericScientificNotation' ~ c1,
                               test_c2 == 'numericScientificNotation' ~ c2,
                               test_c3 == 'numericScientificNotation' ~ c3,
                               test_c4 == 'numericScientificNotation' ~ c4)
         ) %>% 
  select(9:12) #this is all that i need

只需要这个有组织的输出。

谢谢!

# A tibble: 8 x 4
  integer date       batches numericScientificNotation
  <chr>   <chr>      <chr>   <chr>                    
1 9996155 NA         W-7     NA                       
2 4001096 NA         W-8     NA                       
3 4001525 2020-01-23 W-9     NA                       
4 4000590 2019-12-23 W-2     NA                       
5 NA      2020-01-23 W-1     NA                       
6 NA      2019-12-23 W-1     3.408E+20                
7 4000461 2020-01-20 NA      3.527E+20                
8 4000311 2019-12-08 NA      3.498E+20                
4

2 回答 2

3

这是一种简化并减少重复的方法:

library(dplyr)

regex_list <- list(date = '(^20[1,2][0-9]\\-)|(\\/20[1,2][0-9]$)', 
                  numericScientificNotation = '\\d\\.\\d{3}[eE][+-]\\d{2}+', 
                  batches = '(^[a-zA-Z][0-9]{2}\\/2[0-1]{1}$)|(^[A-Z]{1,2}\\-\\d.*[a-zA-Z]*$)|(^[a-zA-Z][0-9]{2})|(^[A-Z][0-9]$)', 
                  integers = '^-?\\d+$')


purrr::imap_dfc(regex_list, function(x, y) 
                  df %>%
                    mutate(across(.fns = ~ifelse(str_detect(.x, x), .x, NA))) %>%
                    transmute(!!y := do.call(coalesce, .)))

#  date       numericScientificNotation batches integers
#  <chr>      <chr>                     <chr>   <chr>   
#1 NA         NA                        W-7     9996155 
#2 NA         NA                        W-8     4001096 
#3 2020-01-23 NA                        W-9     4001525 
#4 2019-12-23 NA                        W-2     4000590 
#5 2020-01-23 NA                        W-1     NA      
#6 2019-12-23 3.408E+20                 W-1     NA      
#7 2020-01-20 3.527E+20                 NA      4000461 
#8 2019-12-08 3.498E+20                 NA      4000311 
于 2021-05-13T06:52:09.313 回答
2

你也可以这样做

df <- data.frame(c1 = c('9996155', '4001096', '4001525', '4000590','2020-01-23', '2019-12-23', '2020-01-20', '2019-12-08'),
             c2 = c('4001902', '5000009', '2020-01-23', '2019-12-23', '2020-01-20', '2019-12-08', '4000461', '4000311'),
             c3 = c('W-7', 'W-8', 'W-9', 'W-2', 'W-1', 'W-1','3.527E+20', '3.498E+20'),
             c4 = c('B09/20', 'B04/20', 'B05/20', 'B10/20', 'B06/20',  '3.408E+20', '3.229E+20', '3.225E+20')
)

library(tidyverse)

df %>% mutate(rowid = row_number()) %>%
  pivot_longer(!rowid) %>%
  mutate(new = case_when(str_detect(value, '(^20[1,2][0-9]\\-)|(\\/20[1,2][0-9]$)') ~ 'date',
                         str_detect(value, '\\d\\.\\d{3}[eE][+-]\\d{2}+') ~ 'numeric',
                         str_detect(value, '(^[a-zA-Z][0-9]{2}\\/2[0-1]{1}$)|(^[A-Z]{1,2}\\-\\d.*[a-zA-Z]*$)|(^[a-zA-Z][0-9]{2})|(^[A-Z][0-9]$)') ~ 'Batches',
                         str_detect(value, '^-?\\d+$') ~ 'Integer',
                         TRUE ~ 'Other')) %>%
  pivot_wider(id_cols = rowid, names_from = new, values_from = value, values_fill = NA, values_fn = first)
#> # A tibble: 8 x 5
#>   rowid Integer Batches date       numeric  
#>   <int> <chr>   <chr>   <chr>      <chr>    
#> 1     1 9996155 W-7     <NA>       <NA>     
#> 2     2 4001096 W-8     <NA>       <NA>     
#> 3     3 4001525 W-9     2020-01-23 <NA>     
#> 4     4 4000590 W-2     2019-12-23 <NA>     
#> 5     5 <NA>    W-1     2020-01-23 <NA>     
#> 6     6 <NA>    W-1     2019-12-23 3.408E+20
#> 7     7 4000461 <NA>    2020-01-20 3.527E+20
#> 8     8 4000311 <NA>    2019-12-08 3.498E+20

reprex 包于 2021-05-13 创建 (v2.0.0 )

于 2021-05-13T06:54:18.273 回答