3

I have the following data frame named DF in r:

   1          2         3    
1  VW       Mercedes  Audi                                    
2  Porsche  BMW       VW                                                    
3  Audi     Honda     Toyota                                             
4  Dodge    Opel      VW                                     
5  Lexus    Volvo     BMW                                                      
6  Dodge    VW        Porsche 

I want to create a new data frame (DF2) in which each distinct element of DF becomes a column name, and each cell holds the name of the DF column where that element appears in that row (0 if it does not appear in that row):

     Audi BMW Dodge Honda Lexus Mercedes Opel Porsche Toyota Volvo VW
1    3     0  0     0      0     2        0    0        0     0    1
2    0     2  0     0      0     0        0    1        0     0    3
3    1     0  0     2      0     0        0    0        3     0    0 
4    0     0  1     0      0     0        2    0        0     0    3     
5    0     3  0     0      1     0        0    0        0     2    0
6    0     0  1     0      0     0        0    3        0     0    2
4

4 回答 4

4

尝试这个:

# Build a rows-by-cars matrix `x`: each cell holds the numeric id of the column
# of `df` in which that car occurs in that row, or 0 when the car is absent.
cells <- as.matrix(df)
# Distinct cars in order of first appearance (column by column), NAs dropped.
cars <- unique(as.character(cells))
cars <- cars[!is.na(cars)]
# Column ids: the column names when all of them are numbers (e.g. "1", "2",
# "3"), otherwise the column positions, so that names such as V1/V2/V3 do not
# turn the whole result into zeros.
col_ids <- suppressWarnings(as.numeric(colnames(cells)))
if (anyNA(col_ids)) {
  col_ids <- seq_len(ncol(cells))
}
per_car <- lapply(cars, function(car) {
  apply(cells, 1, function(row_cells) {
    hit <- which(row_cells == car)
    # The first matching column wins if a car is repeated within a row.
    if (length(hit) > 0) col_ids[hit[1]] else 0
  })
})
# Assemble explicitly so the result is always a plain numeric matrix; letting
# sapply() simplify ragged results gives a list-matrix, which breaks colSums()
# and arithmetic on the result.
x <- matrix(
  as.numeric(unlist(per_car, use.names = FALSE)),
  nrow = nrow(cells),
  ncol = length(cars),
  dimnames = list(NULL, cars)
)
x
     VW Porsche Audi Dodge Lexus Mercedes BMW Honda Opel Volvo Toyota
[1,] 1  0       3    0     0     2        0   0     0    0     0     
[2,] 3  1       0    0     0     0        2   0     0    0     0     
[3,] 0  0       1    0     0     0        0   2     0    0     3     
[4,] 3  0       0    1     0     0        0   0     2    0     0     
[5,] 0  0       0    0     1     0        3   0     0    2     0     
[6,] 2  3       0    1     0     0        0   0     0    0     0  
于 2015-12-19T21:31:19.897 回答
2

这是另一个选项:使用 reshape2 包中的 acast

library(reshape2)

# Melt the matrix form of `df` to long format, then cast it back to wide:
# rows are keyed by Var1, every distinct cell value becomes a column, the
# cells are filled from Var2, and missing combinations are set to 0.
acast(
  data = melt(as.matrix(df)),
  formula = Var1 ~ value,
  value.var = "Var2",
  fill = 0
)
#  Audi BMW Dodge Honda Lexus Mercedes Opel Porsche Toyota Volvo VW
#1    3   0     0     0     0        2    0       0      0     0  1
#2    0   2     0     0     0        0    0       1      0     0  3
#3    1   0     0     2     0        0    0       0      3     0  0
#4    0   0     1     0     0        0    2       0      0     0  3
#5    0   3     0     0     1        0    0       0      0     2  0
#6    0   0     1     0     0        0    0       3      0     0  2
于 2015-12-20T04:51:44.133 回答
2

另一种选择:

library(tidyr)
library(dplyr)

# Reshape DF so that each distinct cell value becomes a column holding the
# (type-converted) name of the DF column it was found in; absent values are 0.
DF %>%
  # Use integer row ids. spread() orders the output rows by this key, and the
  # character ids produced by the deprecated add_rownames() sort as
  # "1", "10", "11", "2", ..., which scrambles the rows once DF has ten or
  # more of them (the key is dropped at the end, so the order cannot be
  # recovered afterwards).
  mutate(rowname = row_number()) %>%
  gather(key, value, -rowname, convert = TRUE) %>%
  spread(value, key, fill = 0) %>%
  select(-rowname)

结果如下:

#Source: local data frame [6 x 11]
#
#   Audi   BMW Dodge Honda Lexus Mercedes  Opel Porsche Toyota Volvo    VW
#  (dbl) (dbl) (dbl) (dbl) (dbl)    (dbl) (dbl)   (dbl)  (dbl) (dbl) (dbl)
#1     3     0     0     0     0        2     0       0      0     0     1
#2     0     2     0     0     0        0     0       1      0     0     3
#3     1     0     0     2     0        0     0       0      3     0     0
#4     0     0     1     0     0        0     2       0      0     0     3
#5     0     3     0     0     1        0     0       0      0     2     0
#6     0     0     1     0     0        0     0       3      0     0     2
于 2015-12-20T00:02:21.287 回答
1

这也有效:

# Sample data: six rows of three car makes, parsed from whitespace-separated
# text (columns are named V1, V2, V3 by read.table).
DF <- read.table(text =
"  VW       Mercedes  Audi                                    
   Porsche  BMW       VW                                                    
   Audi     Honda     Toyota                                             
   Dodge    Opel      VW                                     
   Lexus    Volvo     BMW                                                      
   Dodge    VW        Porsche ")

# Character matrix view of DF (turns factor columns into strings, if necessary).
DF1 <- as.matrix(DF)
# Every distinct car, sorted; these become the columns of the result.
cars <- sort(unique(as.vector(DF1)))
# Zero-filled result with one row per row of DF and one column per car.
DF2 <- data.frame(matrix(0, nrow = nrow(DF), ncol = length(cars)))
names(DF2) <- cars
# For each row, write the column positions 1..ncol(DF) into the columns named
# by that row's cars.
for (i in seq_len(nrow(DF))) {
  DF2[i, DF1[i, ]] <- seq_len(ncol(DF))
}

这里的 for 循环是无害的,因为循环内部没有任何对象在增长。

  Audi BMW Dodge Honda Lexus Mercedes Opel Porsche Toyota Volvo VW
1    3   0     0     0     0        2    0       0      0     0  1
2    0   2     0     0     0        0    0       1      0     0  3
3    1   0     0     2     0        0    0       0      3     0  0
4    0   0     1     0     0        0    2       0      0     0  3
5    0   3     0     0     1        0    0       0      0     2  0
6    0   0     1     0     0        0    0       3      0     0  2
> 

for-loop 更快。奇怪,不是吗?

library(microbenchmark)

# Build the rows-by-cars table for the global data frame `DF` with a plain
# for-loop.
#
# Returns a data.frame with one row per row of DF and one column per distinct
# car (sorted by name). Each cell holds the position of the DF column in which
# that car occurs in that row, or 0 when the car is absent from the row.
# A DF with zero rows yields an empty data.frame instead of an error.
mra68 <- function()
{
  # Character matrix of the cells; as.matrix() avoids apply() over a data
  # frame, and forcing character storage keeps the cells usable as column
  # names whatever the column types of DF are.
  DF1 <- as.matrix(DF)
  storage.mode(DF1) <- "character"
  cars <- sort(unique(as.character(DF1)))
  DF2 <- data.frame(matrix(0, nrow(DF), length(cars)))
  colnames(DF2) <- cars
  positions <- seq_len(ncol(DF))
  # seq_len() rather than 1:nrow(DF): the latter iterates over c(1, 0) when DF
  # has no rows and then fails on the out-of-bounds subscript.
  for (i in seq_len(nrow(DF))) {
    DF2[i, DF1[i, ]] <- positions
  }
  DF2
}

# Build the rows-by-cars matrix for the global data frame `DF` with nested
# lapply()/apply() calls (one pass over all rows per distinct car).
#
# Returns a numeric matrix with one row per row of DF and one column per
# distinct car (in order of first appearance, column by column). Each cell
# holds the numeric id of the DF column in which that car occurs in that row,
# or 0 when the car is absent from the row.
DatamineR <- function()
{
  cells <- as.matrix(DF)
  cars <- unique(as.character(cells))
  cars <- cars[!is.na(cars)]
  # Column ids: the column names when all of them are numbers (e.g. "1", "2",
  # "3"), otherwise the column positions. Converting names such as V1/V2/V3
  # with as.numeric() alone gives NA everywhere, i.e. an all-zero result.
  col_ids <- suppressWarnings(as.numeric(colnames(cells)))
  if (anyNA(col_ids)) {
    col_ids <- seq_len(ncol(cells))
  }
  per_car <- lapply(cars, function(car) {
    apply(cells, 1, function(row_cells) {
      hit <- which(row_cells == car)
      # The first matching column wins if a car is repeated within a row.
      if (length(hit) > 0) col_ids[hit[1]] else 0
    })
  })
  # Assemble explicitly so the result is always a plain numeric matrix rather
  # than the list-matrix that sapply() produces from ragged results.
  matrix(
    as.numeric(unlist(per_car, use.names = FALSE)),
    nrow = nrow(cells),
    ncol = length(cars),
    dimnames = list(NULL, cars)
  )
}

# Time both implementations against the same global DF.
microbenchmark(
  mra68(),
  DatamineR()
)

.

> microbenchmark( mra68(), DatamineR() )
Unit: milliseconds
        expr      min        lq     mean    median        uq       max neval
     mra68() 2.360912  4.618337  4.74136  4.738126  4.931509  8.496653   100
 DatamineR() 8.151552 16.083225 16.42256 16.284309 16.480636 20.860074   100
于 2015-12-19T22:29:50.237 回答