
# Sample data: one row per message, tagged with the author who wrote it.
# Each message ends with the author's "signature" (their full name).
example.df <- data.frame(
  author = c("Mikey", "Donald", "Mikey", "Daisy", "Minnie", "Daisy"),
  message = c(
    "Hello World! Mikey Mouse",
    "Quack Quack! Donald Duck",
    "I was born in 1928. Mikey Mouse",
    "Quack Quack! Daisy Duck",
    "The quick fox jump over Minnie Mouse",
    "Quack Quack! Daisy Duck"
  )
)


我发现 bioconductor 包 RLibstree 提供了函数 getLongestCommonSubstring,看起来很有希望,但我不知道如何按作者分组,把该函数应用到同一作者的所有消息上。


3 回答 3



## Requires the stringi package; magrittr supplies the %>% pipe.
## NOTE(review): the snippet's original package list is not shown here --
## confirm that nothing beyond these two is needed.
library(magrittr)

## Split every message at whitespace that follows sentence-ending punctuation
## (., ? or !), then group the resulting pieces by the message's author.
## The result is a list with one element per author, each holding one
## character vector per message.
example.df[["message"]] %>%
    stringi::stri_split_regex(., "(?<=[.?!]{1,5})\\s+") %>%
    split(example.df[["author"]])

## $Daisy
## $Daisy[[1]]
## [1] "Quack Quack!" "Daisy Duck"  
## $Daisy[[2]]
## [1] "Quack Quack!" "Daisy Duck"  
## $Donald
## $Donald[[1]]
## [1] "Quack Quack!" "Donald Duck" 
## $Mikey
## $Mikey[[1]]
## [1] "Hello World!" "Mikey Mouse" 
## $Mikey[[2]]
## [1] "I was born in 1928." "Mikey Mouse"        
## $Minnie
## $Minnie[[1]]
## [1] "The quick fox jump over Minnie Mouse"
于 2014-10-22T13:41:20.807 回答



> tapply(as.character(example.df$message), example.df$author, function(x) x)
$Daisy
[1] "Quack Quack! Daisy Duck" "Quack Quack! Daisy Duck"

$Donald
[1] "Quack Quack! Donald Duck"

$Mikey
[1] "Hello World! Mikey Mouse"        "I was born in 1928. Mikey Mouse"

$Minnie
[1] "The quick fox jump over Minnie Mouse"

当然,您可以使用自己的函数代替 function(x) x。

于 2015-06-12T14:27:27.590 回答


# Sample data: six messages, each labelled with its author. Authors with
# more than one message (Mikey, Daisy) repeat a trailing signature.
example.df <- data.frame(
  author = c(
    "Mikey", "Donald", "Mikey",
    "Daisy", "Minnie", "Daisy"
  ),
  message = c(
    "Hello World! Mikey Mouse",
    "Quack Quack! Donald Duck",
    "I was born in 1928. Mikey Mouse",
    "Quack Quack! Daisy Duck",
    "The quick fox jump over Minnie Mouse",
    "Quack Quack! Daisy Duck"
  )
)

# Determine the signature length of one author's messages, i.e. the number
# of characters in the longest suffix shared by all of them.
#
# am: vector of messages (character or factor) written by a single author.
#
# Returns 0 when there are fewer than two messages (a signature cannot be
# inferred from a single message) or when the messages share no suffix;
# otherwise the length of the longest common suffix.
signlen <- function(am) {
  if (length(am) <= 1) return(0)  # not more than 1 message: nothing to compare

  # Turn the messages into reversed vectors of single characters so that
  # the suffixes can be walked conveniently from index 1 on.
  am <- lapply(strsplit(as.character(am), ""), rev)

  # Find the longest common suffix by comparing each message with its
  # predecessor; the running result caps how far later pairs are compared.
  longest_common <- .Machine$integer.max
  for (m in seq_along(am)[-1]) {
    i <- 1
    max_length <- min(length(am[[m]]), length(am[[m - 1]]), longest_common)
    while (i <= max_length && am[[m]][i] == am[[m - 1]][i]) {
      i <- i + 1
    }
    longest_common <- i - 1
    if (longest_common == 0) return(0)  # shortcut: need not look further
  }
  longest_common
}

# Signature length per author: run signlen over each author's messages.
signature_length <- tapply(
  X = example.df[["message"]],
  INDEX = example.df[["author"]],
  FUN = signlen
)
#> signature_length
# Daisy Donald  Mikey Minnie 
#    23      0     12      0 

# End position "to" of every message once its author's signature is cut off:
# the message's own length minus that author's signature length.
to <- nchar(as.character(example.df[["message"]])) -
  signature_length[example.df[["author"]]]
#> to
# Mikey Donald  Mikey  Daisy Minnie  Daisy 
#    12     24     19      0     36      0 

# Strip the signatures by keeping only the first "to" characters.
example.df[["message"]] <- substr(example.df[["message"]], start = 1, stop = to)
#> example.df
#  author                              message
#1  Mikey                         Hello World!
#2 Donald             Quack Quack! Donald Duck
#3  Mikey                  I was born in 1928.
#4  Daisy                                     
#5 Minnie The quick fox jump over Minnie Mouse
#6  Daisy                                     
于 2015-06-24T09:15:15.253 回答