r - How can I specify how to choose between duplicate rows in R?

Question

I have a data frame (below) whose first column (ID) I want to use as the row names. However, this column contains duplicates. Rather than just using !duplicated to remove the duplicates, I want to keep the rows that do not have an NA in a specified column, for example the column named UHU in the example data below (or any other column that I pass as an argument). What is the most efficient way of doing this without looping through each duplicated ID and checking each row against its duplicates for an NA? I'd also like to be able to specify the columns to check for NAs in order of preference, i.e. first select the duplicate with no NA in the UHU column and, if the duplicates are tied, check another column, etc.

structure(list(ID = c("A1_0SM", "A1_0SP", "A2_04U", 
"A2_04U", "A2_04U", "A2_04U", "A2_0CM", 
"A2_0CM", "A2_0CM", "A2_0CM", "A2_0CM", 
"A2_0CM", "A2_0D0", "A2_0D0", "A2_0D2", 
"A2_0D2", "A2_0D2", "A2_0SX", "A2_0SX", 
"A2_0SX", "A2_0SX", "A2_0SX", "A2_0T0", 
"A2_0T0", "A2_0T0", "A2_0T2", "A2_0YE", 
"A2_0YE", "A2_0YE", "A2_1G6", "A2_1G6", 
"A2_1G6", "A7_0DA", "A7_0DA", "A7_0DA", 
"A7_26G", "A7_26G", "A8_07C", "A8_07O", 
"A8_08R", "A8_09X", "AC_2QH", "AN_04D", 
"AN_0AL", "AN_0AR", "AN_0AT", "AN_0G0", 
"AN_0XU", "AO_03U", "AO_03U", "AO_03U", 
"AO_03U", "AO_03U", "AO_03U", "AO_0J4", 
"AO_0J4", "AO_0J4", "AO_0J6", "AO_0J6", 
"AO_0J6", "AO_0JL", "AO_0JL", "AO_0JL", 
"AO_0JL", "AO_124", "AO_124", "AO_124", 
"AO_128", "AO_128", "AO_128", "AO_128", 
"AO_129", "AO_129", "AO_129", "AO_12F", 
"AO_12F", "AO_12F", "AO_1MR", "AQ_04J", 
"AQ_04J", "AQ_04J", "AQ_04J", "AQ_04J", 
"AQ_04J", "AR_0TS", "AR_0TU", "AR_0U1", 
"AR_0U4", "AR_1AR", "AR_1AR", "AR_1AY", 
"AR_256", "AR_2LR", "BH_0B3", "BH_0B3", 
"BH_0B3", "BH_0B9", "BH_0B9", "BH_0BG", 
"BH_0BL"), UHU = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, 
TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, 
TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE), days_to_selection_end = c(NA, "172", "196", "119", 
"119", "670", "147", "147", "601", "615", "433", "NA", 
"125", "125", "123", "123", "179", "1359", "132", "1359", "132", 
"NA", "234", "234", "234", "212", "[Completed]", 
"[Completed]", "[Completed]", "172", "119", "119", "198", "107", 
"107", "151", "151", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, "215", "215", "215", "215", "215", "215", "175", "203", "175", 
"172", "116", "116", "178", "493", "122", "122", "125", "125", 
"237", "124", "124", "124", "124", "161", "161", "161", "189", 
"189", "189", NA, "104", "104", "159", "104", "104", "159", NA, 
NA, NA, NA, NA, NA, NA, NA, NA, "213", "115", "115", "111", "111", 
NA, NA), days_to_selection_start = c(NA, "107", "133", "78", 
"78", "480", "98", "98", "391", "391", "391", "234", "62", "62", 
"74", "74", "137", "1289", "62", "1289", "62", "1429", "61", 
"61", "61", "92", "[Completed]", "[Completed]", "[Completed]", 
"132", "69", "69", "122", "65", "65", "89", "89", NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, "70", "70", "70", "70", "70", 
"70", "131", "189", "131", "130", "74", "74", "136", "136", "80", 
"80", "60", "60", "146", "82", "82", "82", "82", "67", "67", 
"67", "63", "63", "63", NA, "61", "61", "117", "61", "61", "117", 
NA, NA, NA, NA, NA, NA, NA, NA, NA, "136", "52", "52", "48", 
"48", NA, NA), selection_name = c(NA, "NA", "T0101tere", 
"nw8m100", "991xan", "kkw", "991xan", "nw8m100", "i11io18el", 
"Cape1000", "kkw", "99f101fen", "991xan", "i11io18el", 
"991xan", "nw8m100", "T0101l", "82eplan18", "nw8m100", 
"Gem1000", "991xan", "Xeloda", "T0101tere", "991xan", "nw8m100", 
"Xeloda", "T0101tere", "nw8m100", "991xan", "i11io18el", "nw8m100", 
"991xan", "T0101l", "nw8m100", "991xan", "991xan", "T0101tere", 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "iu18101", "iu18101", 
"111o1018", "98aj", "98aj", "111o1018", 
"fox", "114iio18el", "teo882s", "114Iio18EL", 
"teO882s", "98aj", "114Iio18EL", "Uwm", 
"teO882s", "98aj", "teo882s", "98aj", 
"114iio18el", "98aj", "teo882s", "teo882s", 
"98aj", "98aj", "114iio18el", "teo882s", 
"114iio18el", "98aj", "teo882s", NA, "991xan", 
"991xan", "T0101l", "nw8m100", "nw8m100", "T0101l", NA, 
NA, NA, NA, NA, NA, NA, NA, NA, "114iio18el", "teo882s", 
"98aj", "teo882s", "98aj", NA, NA
)), .Names = c("ID", "UHU", "days_to_selection_end", 
"days_to_selection_start", "selection_name"), row.names = c(NA, 
100L), class = "data.frame")

score 1 · Accepted Answer

Use this:

DF[with(DF, ave(ID, ID, FUN=length)==1 | !is.na(UHU)),]

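If there are still duplicates, assign the result of the above back to DF and run the call again, replacing UHU with the next column to check for NAs.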

r - How can I specify how to choose between duplicate rows in R?

1 回答 1

Related

Reference