这是一个不使用其他库的实现。
# Sample data: one row per message together with the author who wrote it.
# Some authors end every one of their messages with the same signature text.
example.df <- data.frame(
  author = c("Mikey", "Donald", "Mikey", "Daisy", "Minnie", "Daisy"),
  message = c(
    "Hello World! Mikey Mouse",
    "Quack Quack! Donald Duck",
    "I was born in 1928. Mikey Mouse",
    "Quack Quack! Daisy Duck",
    "The quick fox jump over Minnie Mouse",
    "Quack Quack! Daisy Duck"
  )
)
# Determine the signature length of one author's messages, i.e. the number of
# trailing characters that all of the messages have in common.
#
# Arguments:
#   am  Character vector (or factor) holding the messages of a single author.
#       NA entries are ignored, because a missing message carries no text to
#       compare and would otherwise make the comparison below fail.
#
# Returns:
#   A single number: the length of the longest suffix shared by every
#   non-missing message. The result is 0 when fewer than two non-missing
#   messages are given (one message alone cannot reveal a signature) or when
#   the messages share no suffix. Identical messages share their whole text.
signlen <- function(am) {
  am <- as.character(am)
  am <- am[!is.na(am)]
  if (length(am) <= 1) {
    return(0)
  }

  # Turn the messages into reversed vectors of single characters so that the
  # suffixes can be compared conveniently from index 1 on.
  chars <- lapply(strsplit(am, ""), rev)

  # Compare each message with its predecessor. The suffix shared by all
  # messages can never be longer than the one found so far, so every
  # comparison is capped by `longest_common`.
  longest_common <- .Machine$integer.max
  for (m in seq_along(chars)[-1]) {
    current <- chars[[m]]
    previous <- chars[[m - 1]]
    max_length <- min(length(current), length(previous), longest_common)

    i <- 1
    while (i <= max_length && current[i] == previous[i]) {
      i <- i + 1
    }
    longest_common <- i - 1

    # Shortcut: once nothing is shared there is no need to look further.
    if (longest_common == 0) {
      return(0)
    }
  }
  longest_common
}
# Signature length per author: signlen() is applied to each author's messages.
signature_length <- tapply(
  example.df[["message"]],
  example.df[["author"]],
  signlen
)
#> signature_length
#  Daisy Donald  Mikey Minnie
#     23      0     12      0

# Number of characters each message keeps ("to") once the signature of its
# author is cut off: full message length minus that author's signature length.
to <- nchar(as.character(example.df[["message"]])) -
  signature_length[example.df[["author"]]]
#> to
#  Mikey Donald  Mikey  Daisy Minnie  Daisy
#     12     24     19      0     36      0

# Strip the signatures by keeping only the first `to` characters of each
# message.
example.df[["message"]] <- substr(example.df[["message"]], 1, to)
#> example.df
#   author                              message
# 1  Mikey                         Hello World!
# 2 Donald             Quack Quack! Donald Duck
# 3  Mikey                  I was born in 1928.
# 4  Daisy
# 5 Minnie The quick fox jump over Minnie Mouse
# 6  Daisy