0

我有一个存储关键字两两相似度的大型数据框,我想把它转换成包含所有关键字组合的格式:每一对关键字(不区分顺序)只出现一次,数据中没有出现的组合相似度记为 0。我当前的数据框如下所示:

> df
  kwd1 kwd2 sim
1  a  b 1
2  b  a 1
3  c  a 2
4  a  c 2

我想将其转换为以下形式的 data.frame:

> df
  kwd1 kwd2 sim
  a b 1
  a c 2
  b c 0

谢谢你的帮助!

下面是用来构造示例数据的代码:

# Example input: one row per ordered keyword pair together with its similarity
# score. Columns are named directly in the constructor.
df <- data.frame(
  kwd1 = c("a", "b", "c", "a"),
  kwd2 = c("b", "a", "a", "c"),
  sim = c(.1, .1, .2, .2)
)
> dput(df)
structure(list(kwd1 = structure(c(1L, 2L, 3L, 1L), .Label = c("a", 
"b", "c"), class = "factor"), kwd2 = structure(c(2L, 1L, 1L, 
3L), .Label = c("a", "b", "c"), class = "factor"), sim = c(1, 
1, 2, 2)), .Names = c("kwd1", "kwd2", "sim"), row.names = c(NA, 
-4L), class = "data.frame")
4

2 回答 2

2
library(plyr)
# Compare keywords as plain strings so the code works for factor or character
# columns and for arbitrary keywords (not only single lower-case letters).
k1 <- as.character(df$kwd1)
k2 <- as.character(df$kwd2)

# Write each observed pair in one canonical orientation (kwd1 sorts before
# kwd2) so that "b, a" and "a, b" count as the same pair, then keep the first
# row seen for every pair.
swap <- k1 > k2
obs <- data.frame(
  kwd1 = ifelse(swap, k2, k1),
  kwd2 = ifelse(swap, k1, k2),
  sim = df$sim,
  stringsAsFactors = FALSE
)
obs <- obs[!duplicated(obs[c("kwd1", "kwd2")]), ]

# Every unordered pair of distinct keywords, exactly once. kwd1 is the slowest
# varying column of the grid, so the pairs come out ordered by kwd1, then kwd2.
kwds <- sort(unique(c(k1, k2)))
all_pairs <- expand.grid(kwd2 = kwds, kwd1 = kwds, stringsAsFactors = FALSE)
all_pairs <- all_pairs[all_pairs$kwd1 < all_pairs$kwd2, c("kwd1", "kwd2")]

# Left join on (kwd1, kwd2): pairs that never occur in df come back with
# sim = NA.
res <- merge(all_pairs, obs, all.x = TRUE)

# Pairs without an observed similarity get 0, as requested in the question.
res1 <- res
res1$sim[is.na(res1$sim)] <- 0

# With the example df from the question this prints:
res1
#>   kwd1 kwd2 sim
#> 1    a    b 0.1
#> 2    a    c 0.2
#> 3    b    c 0.0

你可以把它变成一个函数,让它更容易调用。

#' Convert a keyword-similarity table to one row per unordered keyword pair
#'
#' Collects every keyword that occurs in `kwd1` or `kwd2`, enumerates each
#' unordered pair of distinct keywords exactly once and looks up its
#' similarity. Keywords are compared as strings, so factor and character
#' columns are both accepted and keywords may be arbitrary text.
#'
#' @param df A data.frame with columns `kwd1`, `kwd2` (character or factor)
#'   and `sim`. A pair may be listed in either or both orientations; when a
#'   pair occurs more than once the first row wins. Rows with a missing
#'   keyword and rows pairing a keyword with itself are ignored.
#' @param fill Value of `sim` for pairs that do not occur in `df`.
#'   Defaults to 0; pass `NA` to leave such pairs missing.
#' @return A data.frame with character columns `kwd1` and `kwd2` (with
#'   `kwd1` sorting before `kwd2`) and a `sim` column, ordered by `kwd1` and
#'   then `kwd2`. It has zero rows when fewer than two distinct keywords are
#'   present.
convert_df <- function(df, fill = 0) {
  stopifnot(
    is.data.frame(df),
    all(c("kwd1", "kwd2", "sim") %in% names(df))
  )

  k1 <- as.character(df$kwd1)
  k2 <- as.character(df$kwd2)
  kwds <- sort(unique(c(k1, k2)))
  n <- length(kwds)

  # Position of each keyword in the sorted keyword list; `lo`/`hi` put every
  # observed pair into one canonical orientation.
  i <- match(k1, kwds)
  j <- match(k2, kwds)
  lo <- pmin(i, j)
  hi <- pmax(i, j)

  # Keep the first row of each unordered pair; skip self-pairs and rows with
  # a missing keyword (sort() dropped NA, so match() returns NA for those).
  pair_code <- (lo - 1) * n + hi
  keep <- !is.na(lo) & !is.na(hi) & lo != hi & !duplicated(pair_code)

  # Lookup table pre-filled with `fill`; only the lower triangle is used
  # (row = later keyword, column = earlier keyword).
  sim_mat <- matrix(fill, nrow = n, ncol = n)
  sim_mat[cbind(hi[keep], lo[keep])] <- df$sim[keep]

  # Column-major traversal of the lower triangle yields the pairs ordered by
  # the earlier keyword first, then by the later one.
  idx <- which(lower.tri(sim_mat), arr.ind = TRUE)
  data.frame(
    kwd1 = kwds[idx[, "col"]],
    kwd2 = kwds[idx[, "row"]],
    sim = sim_mat[idx],
    stringsAsFactors = FALSE
  )
}
# Then simply run this to convert your actual data.frame; called at top level
# like this, the converted data.frame is printed to the console.
convert_df(df)
于 2012-09-12T22:48:32.290 回答
0

可能有一种更优雅的方式,但这是一种方法:

# Compare keywords as plain strings so factor and character columns behave
# alike.
obs <- data.frame(
  kwd1 = as.character(df$kwd1),
  kwd2 = as.character(df$kwd2),
  sim = df$sim,
  stringsAsFactors = FALSE
)
# make a data.frame with all possible combinations of the distinct keywords
# found in either column. the ones that aren't in df are NA for sim.
kw <- unique(c(obs$kwd1, obs$kwd2))
k <- merge(
  expand.grid(kwd1 = kw, kwd2 = kw, stringsAsFactors = FALSE),
  obs,
  all.x = TRUE
)
# write every pair in one canonical orientation (kwd1 sorts before kwd2) so
# that "b, a" and "a, b" become the same pair.
flip <- k$kwd1 > k$kwd2
k <- data.frame(
  kwd1 = ifelse(flip, k$kwd2, k$kwd1),
  kwd2 = ifelse(flip, k$kwd1, k$kwd2),
  sim = k$sim,
  stringsAsFactors = FALSE
)
# order by pair and, within a pair, put the NA rows last, so that rows that
# are in df have priority in the following step.
k <- k[order(k$kwd1, k$kwd2, is.na(k$sim)), ]
# remove all rows where the kwd1-kwd2 combo appears earlier in the data.frame
k <- k[!duplicated(k[c("kwd1", "kwd2")]), ]
# assuming you don't want the rows where kwd1 and kwd2 are the same, remove them.
k <- subset(k, kwd1 != kwd2)
# set the NA similarities to 0
k$sim[is.na(k$sim)] <- 0
rownames(k) <- NULL

# With the example df from the question this prints:
k
#>   kwd1 kwd2 sim
#> 1    a    b 0.1
#> 2    a    c 0.2
#> 3    b    c 0.0
于 2012-09-12T22:28:12.230 回答