2

假设我在 R 中有一个数据框:data.frame(x=1:4, y=c("a b c", "b", "a c", "c"))

  x     y
1 1 a b c
2 2     b
3 3   a c
4 4     c

现在我想基于它构建一个新的数据框,即信息检索(IR)或推荐系统中很常见的倒排索引:

y    x
a    1 3
b    1 2
c    1 3 4

我怎样才能有效地做到这一点?

4

3 回答 3

1
# Build an inverted index from a data frame of ids and token strings.
#
# x: a data frame with a column `x` (row ids) and a column `y` holding
#    space-separated tokens (character or factor).
#
# Returns a data frame with one row per distinct token, in order of first
# appearance: column `y` is the token, column `x` is the space-separated
# ids of the rows whose `y` contains that token. Row names are the tokens.
conv <- function(x) {
  # as.character() keeps this working when `y` is a factor.
  tokens <- strsplit(as.character(x$y), " ", fixed = TRUE)
  alphabet <- unique(unlist(tokens))

  index <- vapply(
    alphabet,
    function(token) {
      # Compare against whole tokens. A regex search over the raw string
      # would also hit substrings ("a" inside "ab") and would treat
      # characters such as "." as metacharacters.
      hits <- vapply(
        tokens,
        function(row_tokens) token %in% row_tokens,
        logical(1)
      )
      paste(x$x[hits], collapse = " ")
    },
    character(1)
  )

  data.frame(y = names(index), x = index)
}

# Sample input: ids in `x`, space-separated tokens in `y`.
x <- data.frame(
  x = 1:4,
  y = c("a b c", "b", "a c", "c")
)
> conv(x)
##   y     x
## a a   1 3
## b b   1 2
## c c 1 3 4
于 2013-01-24T05:23:26.307 回答
0

先将 y 转换为字符型之后的一种尝试:

# Inverted index: for every token in `y`, collect the `x` ids of the rows
# that contain it.
test <- data.frame(
  x = 1:4,
  y = c("a b c", "b", "a c", "c"),
  stringsAsFactors = FALSE
)

# One character vector of tokens per row.
result <- strsplit(test$y, " ")

# Distinct tokens in order of first appearance.
vocab <- unique(unlist(result))

# Named list: token -> ids of the rows containing it. lapply() always
# returns a list, whereas apply() over a logical matrix collapses to a
# matrix (dropping the token names) whenever every token occurs in the
# same number of rows.
result3 <- lapply(
  setNames(vocab, vocab),
  function(token) {
    in_row <- vapply(result, function(tokens) token %in% tokens, logical(1))
    test$x[in_row]
  }
)

final <- data.frame(
  x = names(result3),
  y = vapply(result3, paste, character(1), collapse = " ")
)

> final
  x     y
a a   1 3
b b   1 2
c c 1 3 4
于 2013-01-24T05:26:23.797 回答
0

一个简单粗暴(quick and dirty)的做法:

  # Quick inverted index: for every token in `y`, list the `x` ids of the
  # rows that contain it.
  original.df <- data.frame(x = 1:4, y = c("a b c", "b", "a c", "c"))

  # Guard against `y` being a factor.
  original.df$y <- as.character(original.df$y)

  # One character vector of tokens per row.
  y.split <- strsplit(original.df$y, " ")

  # Distinct tokens, in order of first appearance.
  y.unlisted <- unique(unlist(y.split))

  new.df <- vapply(
    y.unlisted,
    function(element) {
      in.row <- vapply(y.split, function(y.row) element %in% y.row, logical(1))
      # Report the ids stored in `x`, not row positions; the two only
      # coincide when `x` happens to be 1..n.
      paste(original.df$x[in.row], collapse = " ")
    },
    character(1)
  )

  as.data.frame(new.df)

  >  new.df
  a    1 3
  b    1 2
  c  1 3 4
于 2013-01-24T05:26:56.403 回答