
# Example data: twelve articles, each tagged with a grouping id (`no`) and a
# class label ("p", "n" or "x").
no <- rep(c(3, 5, 24, 35, 41), times = c(2, 3, 2, 2, 3))
article <- c(
  "earnings went up.",
  "earnings went up.",
  "massive layoff.",
  "they moved their offices.",
  "Mr. X joined the company.",
  "class action filed.",
  "accident in warehouse.",
  "blabla one.",
  "blabla two.",
  "blabla three.",
  "blabla four.",
  "blabla five."
)
class <- c("p", "p", "n", "x", "x", "n", "n", "x", "p", "p", "n", "p")

# One row per article; column names are taken from the vectors.
mydf <- data.frame(no = no, article = article, class = class)

#    no                   article class
# 1   3         earnings went up.     p
# 2   3         earnings went up.     p
# 3   5           massive layoff.     n
# 4   5 they moved their offices.     x
# 5   5 Mr. X joined the company.     x
# 6  24       class action filed.     n
# 7  24    accident in warehouse.     n
# 8  35               blabla one.     x
# 9  35               blabla two.     p
# 10 41             blabla three.     p
# 11 41              blabla four.     n
# 12 41              blabla five.     p


# examples:
# "p", "x"      --> "p"
# "p", "n"      --> "x" 
# "x", "n", "x" --> "n" 
# "p", "n", "p" --> "p"  

# the resulting data.frame should look like this:

#    no                                                            article  class
# 1   3                                                   earnings went up.     p
# 2   5 massive layoff. they moved their offices. Mr. X joined the company.     n
# 3  24                          class action filed. accident in warehouse.     n
# 4  35                                             blabla one. blabla two.     p
# 5  41                             blabla three. blabla four. blabla five.     p


# NOTE(review): sentCombine(), %l% and vect2df() are not base R; they appear
# to come from the qdap package mentioned below -- confirm it is attached.

# Presumably merges the article sentences of each `no` group into one row.
df2 <- with(mydf, sentCombine(article, no))

# Attach one class label per group. The label with the highest count wins; a
# tie for the highest count yields "x". Note that this does NOT treat "x" as
# neutral, so it does not reproduce the examples listed above
# (e.g. "p", "x" is a tie here and gives "x" rather than "p").
df2$class <- df2$no %l% vect2df(c(tapply(mydf[, 3], mydf[, 1], function(x) {
  tab <- table(x)
  if (sum(tab == max(tab)) > 1) "x" else names(tab)[which.max(tab)]
})))

我试图修改这段代码，但我对如何编写函数以及 qdap 了解甚少，无法真正理解它。


1 回答 1



library(dplyr) # for aggregation

# Collapse the class labels of one group into a single label.
# "x" is neutral: only the counts of "n" and "p" are compared.
#   class: vector of labels ("p", "n" or "x") for one group
# Returns "n" if there are more n's than p's, "p" if there are more p's than
# n's, and "x" otherwise (a tie, including when neither label occurs).
getclass <- function(class) {
  n_neg <- sum(class == "n", na.rm = TRUE)
  n_pos <- sum(class == "p", na.rm = TRUE)
  if (n_neg > n_pos) {
    "n"
  } else if (n_pos > n_neg) {
    "p"
  } else {
    "x"
  }
}

# One row per `no`: distinct articles joined by a space, plus the group label.
mydf %>%
  group_by(no) %>%
  summarise(
    article = paste0(unique(article), collapse = " "),
    class = getclass(class)
  )

Source: local data frame [5 x 3]

  no                                                             article class
1  3                                                   earnings went up.     p
2  5 massive layoff. they moved their offices. Mr. X joined the company.     n
3 24                          class action filed. accident in warehouse.     n
4 35                                             blabla one. blabla two.     p
5 41                             blabla three. blabla four. blabla five.     p
于 2014-03-10T09:20:15.083 回答