r - 是否有 R 函数可以删除大型数据集的单元格中的重复值？

Question

我有一个大数据集，问题如下：同一个单元格中包含多个条目。我想删除单元格内部的重复项，但保留各列中不同行之间的重复条目。下面的示例代码会生成一个与我目前情况类似的数据集：

          # Sample table 1: the first half of the messy input.
# Name cells hold several spellings of one person, glued together with
# "|" and "," separators.
id <- c(1, 2, 3, 4, 5, 6, 7)
employee <- c(
  "John_Doe|John Doe,",
  "PeterGynn, Peter Gynn|Peter_Gynn",
  "Jolie Hope",
  "John Doe,",
  "JohnDoe,",
  "John Doe",
  "John Doe"
)
salary <- c(21000, 23400, 26800, 666604, 55, 66, 22)
# Two of the date strings carry stray punctuation (a leading "," and a
# trailing "|"); they are handed to as.Date() exactly as written.
startdate <- as.Date(c(
  "2010-11-1", "2008-3-25", ",2007-3-14", "2007-3-14|",
  "2007-3-14", "2007-3-14", "2007-3-14"
))
employ.data <- data.frame(
  employee = employee,
  id = id,
  salary = salary,
  startdate = startdate
)
View(employ.data)


# Sample table 2: the second half of the messy input, nine rows keyed by id.
id1 <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)
employee1 <- c(
  "John Doe", "Peter Gynn", "Jolie Hope", "test1 test1 test1: test1",
  rep("test2", 5)
)
salary1 <- c(21000, 55, rep(66, 7))
startdate1 <- as.Date(c("2010-11-1", "2008-3-22", rep("2007-3-14", 7)))
employ.data1 <- data.frame(employee1, id1, salary1, startdate1)
# Strip the "1" suffixes in one step so this table uses the column names
# employee / id / salary / startdate.
names(employ.data1) <- c("employee", "id", "salary", "startdate")

# Full outer join of the two sample tables on id; the shared column names
# come back with .x / .y suffixes.
Test <- merge(employ.data, employ.data1, by = "id", all = TRUE)

names(Test)
# Paste each .x/.y pair into a single space-separated cell; this is what
# produces the within-cell duplicates the question is about.
Test$employ.data.z <- with(Test, paste(employee.x, employee.y))
Test$salary.z <- with(Test, paste(salary.x, salary.y))
Test$startdate.z <- with(Test, paste(startdate.x, startdate.y))
# Keep only id and the three pasted columns.
Test <- Test[setdiff(
  names(Test),
  c(
    "employee.x", "employee.y",
    "salary.x", "salary.y",
    "startdate.x", "startdate.y"
  )
)]
names(Test)
View(Test)  # here you get an idea of what my dataset looks like right now
dput(Test)  # probably unnecessary here but added it anyway just in case
Input <- Test


# Desired-result table, part 1: one clean name per cell; dates are kept as
# plain strings here (no as.Date()).
id2 <- c(1, 2, 3, 4, 5, 6, 7)
employee2 <- c("John Doe", "Peter Gynn", "Jolie Hope", rep("John Doe", 4))
salary2 <- c(21000, 23400, 26800, 666604, 55, 66, 22)
startdate2 <- c("2010-11-1", "2008-3-25", ",2007-3-14", rep("2007-3-14", 4))
employ.data2 <- data.frame(employee2, id2, salary2, startdate2)
View(employ.data2)  # shown before the rename, so still with "2"-suffixed names
names(employ.data2) <- c("employee", "id", "salary", "startdate")

# Desired-result table, part 2: a single-space string " " stands in for
# every value that should vanish after de-duplication.
id3 <- c(1, 2, 3, 4, 5, 6, 7, 8, 9)
employee3 <- c(rep(" ", 3), "test1", rep("test2", 5))
# Mixing " " with numbers makes c() coerce the whole vector to character.
salary3 <- c(" ", 55, 66, 66, 66, " ", 66, 66, 66)
startdate3 <- c(" ", "2008-3-22", rep(" ", 7))
employ.data3 <- data.frame(employee3, id3, salary3, startdate3)
names(employ.data3) <- c("employee", "id", "salary", "startdate")

# Build the desired result the same way as Input: outer-join the two clean
# tables on id, paste each .x/.y pair, then drop the suffixed columns.
Output <- merge(employ.data2, employ.data3, by = "id", all = TRUE)
View(Output)
names(Output)
Output$employ.data.z <- with(Output, paste(employee.x, employee.y))
Output$salary.z <- with(Output, paste(salary.x, salary.y))
Output$startdate.z <- with(Output, paste(startdate.x, startdate.y))
# Keep only id and the three pasted columns.
Output <- Output[setdiff(
  names(Output),
  c(
    "employee.x", "employee.y",
    "salary.x", "salary.y",
    "startdate.x", "startdate.y"
  )
)]
View(Input)   # This is an example, similar to what I get with my dataset
View(Output)  # This is what I would want it to be like, if possible

该数据集有 1000 多行和 4 列，其中大部分都存在这个问题，所以手动逐个修正并不可行。

我找不到针对这种单元格内部重复问题的解决方案。整列中的条目都很相似，有时完全相同，有时又有所不同。因此重要的是，不同的行之间仍然可以保留相似的条目。另外还有标点符号的问题，我想稍后再处理，因为目前重复才是更大的问题。

有什么建议么？

亲切的问候，

score 0 · Accepted Answer

我们可以使用 separate_rows 将数据拆分为多行。对于 employ.data.z，数据中有多种分隔符，可以一并写进 sep 参数中。对于 salary.z 和 startdate.z，我们可以按空格把它们拆分成行。一旦数据变成长格式，我们就可以按 id 分组（group_by），为每一列生成一个由去重后的值组成、以逗号分隔的字符串。

library(dplyr)
library(tidyr)

# Collapse the repeated values inside each cell of `Input`, returning one row
# per id. Requires dplyr >= 1.0.0 for across().
# NOTE: pieces are compared verbatim, so values that differ only by
# surrounding whitespace or spelling remain distinct (see the printed result
# below).
Input %>%
  # Names: one row per variant, splitting on the in-cell delimiters
  # "|", "," and ":".
  separate_rows(employ.data.z, sep = "[|,:]") %>%
  # Salary and start date were pasted with a space; split both columns
  # together on whitespace.
  separate_rows(salary.z, startdate.z, sep = "\\s+") %>%
  group_by(id) %>%
  # across() replaces the superseded summarise_at(vars(-group_cols()), ...);
  # everything() already leaves out the grouping column. Each column becomes
  # a comma-separated string of its unique values.
  summarise(across(everything(), ~ toString(unique(.x))))

# A tibble: 9 x 4
#     id employ.data.z                                 salary.z   startdate.z           
#  <dbl> <chr>                                         <chr>      <chr>                 
#1     1 John_Doe, John Doe,  John Doe                 21000      2010-11-01            
#2     2 PeterGynn,  Peter Gynn, Peter_Gynn Peter Gynn 23400, 55  2008-03-25, 2008-03-22
#3     3 Jolie Hope Jolie Hope                         26800, 66  NA, 2007-03-14        
#4     4 John Doe,  test1 test1 test1,  test1          666604, 66 2007-03-14            
#5     5 JohnDoe,  test2                               55, 66     2007-03-14            
#6     6 John Doe test2                                66         2007-03-14            
#7     7 John Doe test2                                22, 66     2007-03-14            
#8     8 NA test2                                      NA, 66     NA, 2007-03-14        
#9     9 NA test2                                      NA, 66     NA, 2007-03-14

r - 是否有 R 函数可以删除大型数据集的单元格中的重复值？

1 回答 1

Related

Reference