r - 如何在 r 中创建设计矩阵

Question

我有两个因素。因素 A 有 2 个水平，因素 B 有 3 个水平。

如何创建以下设计矩阵？

     factorA1 factorA2 factorB1 factorB2 factorB3
[1,]        1        0        1        0        0
[2,]        1        0        0        1        0
[3,]        1        0        0        0        1
[4,]        0        1        1        0        0
[5,]        0        1        0        1        0
[6,]        0        1        0        0        1

score 4 · Accepted Answer

你有几个选择：

使用 base 并自己拼凑：

# Indicator matrix for Species; `- 1` removes the intercept, giving one
# column per level of Species.
iris.dummy <- model.matrix(~ Species - 1, data = iris)
iris.dummy

# Original iris columns followed by the indicator columns.
IRIS <- data.frame(iris, iris.dummy)
IRIS

或者使用ade4包如下：

#' Expand the factor columns of a data frame into indicator columns
#'
#' Factor columns are passed to `ade4::acm.disjonctif()`; every other column
#' (character columns included) is kept as is and placed first.
#'
#' @param df A data frame.
#' @return A data frame holding the non-factor columns of `df` followed by
#'   the result of `ade4::acm.disjonctif()` for the factor columns. If `df`
#'   has no factor columns it is returned unchanged.
dummy <- function(df) {
  is_fact <- vapply(df, is.factor, logical(1))
  # Nothing to expand: return early instead of handing an empty data frame
  # to acm.disjonctif().
  if (!any(is_fact)) {
    return(df)
  }
  # require() merely returns FALSE when the package is missing; check
  # explicitly so the failure is reported here with a clear message.
  if (!requireNamespace("ade4", quietly = TRUE)) {
    stop("Package 'ade4' is required by dummy().", call. = FALSE)
  }
  facts <- ade4::acm.disjonctif(df[, is_fact, drop = FALSE])
  non_facts <- df[, !is_fact, drop = FALSE]
  data.frame(non_facts, facts)
}

# stringsAsFactors = TRUE is required on R >= 4.0.0, where data.frame() keeps
# strings as character vectors by default; dummy() only expands factor columns.
dat <- data.frame(
  eggs = c("foo", "foo", "bar", "bar"),
  ham = c("red", "blue", "green", "red"),
  x = rnorm(4),
  stringsAsFactors = TRUE
)
dummy(dat)



##           x eggs.bar eggs.foo ham.blue ham.green ham.red
## 1 0.3365302        0        1        0         0       1
## 2 1.1341354        0        1        1         0       0
## 3 2.0489741        1        0        0         1       0
## 4 1.1019108        1        0        0         0       1

score 1 · Accepted Answer

model.matrix 是 lm 等函数在后台用来为您完成这种转换的函数。

# Example data: two categorical columns (drawn from A-C and D-E) plus an id.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dat <- data.frame(
  f1 = sample(LETTERS[1:3], 20, replace = TRUE),
  f2 = sample(LETTERS[4:5], 20, replace = TRUE),
  id = 1:20
)
dat

# Supplying the data frame through `data` keeps "dat$" out of the column names.
model.matrix(~ f1 + f2, data = dat)

它会把截距（Intercept）创建为一列全为 1 的变量，但如果需要，您可以轻松将其删除。

# Column 1 is "(Intercept)"; the negative index removes it. Passing `data`
# keeps "dat$" out of the remaining column names.
model.matrix(~ f1 + f2, data = dat)[, -1]

编辑：现在我看到这与其他评论之一基本相同，但更简洁。

score 1 · Accepted Answer

假设您的数据位于名为 dat 的 data.frame 中，并且这两个因素如下例所示：

> # stringsAsFactors = TRUE: levels(f1) further down needs factor columns, and
> # R >= 4.0.0 no longer converts strings to factors by default.
> dat <- data.frame(f1=sample(LETTERS[1:3],20,TRUE),f2=sample(LETTERS[4:5],20,TRUE),id=1:20,stringsAsFactors=TRUE)
> dat
   f1 f2 id
1   C  D  1
2   B  E  2
3   B  E  3
4   A  D  4
5   C  E  5
6   C  E  6
7   C  D  7
8   B  E  8
9   C  D  9
10  A  D 10
11  B  E 11
12  C  E 12
13  B  D 13
14  B  E 14
15  A  D 15
16  C  E 16
17  C  D 17
18  C  D 18
19  B  D 19
20  C  D 20
> dat$f1
 [1] C B B A C C C B C A B C B B A C C C B C
Levels: A B C
> dat$f2
 [1] D E E D E E D E D D E E D E D E D D D D
Levels: D E

对于每个因素，您可以使用 outer 得到如您所展示的那种矩阵：

> # Entry [i, j] is 1 when row i of dat has the j-th level of f1, otherwise 0.
> F1 <- 1 * outer(dat$f1, levels(dat$f1), FUN = "==")
> colnames(F1) <- paste("f1", levels(dat$f1), sep = "=")
> F1
      f1=A f1=B f1=C
 [1,]    0    0    1
 [2,]    0    1    0
 [3,]    0    1    0
 [4,]    1    0    0
 [5,]    0    0    1
 [6,]    0    0    1
 [7,]    0    0    1
 [8,]    0    1    0
 [9,]    0    0    1
[10,]    1    0    0
[11,]    0    1    0
[12,]    0    0    1
[13,]    0    1    0
[14,]    0    1    0
[15,]    1    0    0
[16,]    0    0    1
[17,]    0    0    1
[18,]    0    0    1
[19,]    0    1    0
[20,]    0    0    1

现在对第二个因素做同样的事情：

> # Entry [i, j] is 1 when row i of dat has the j-th level of f2, otherwise 0.
> F2 <- 1 * outer(dat$f2, levels(dat$f2), FUN = "==")
> colnames(F2) <- paste("f2", levels(dat$f2), sep = "=")

然后用 cbind 把它们合并，得到最终结果：

> cbind(F1,F2)

score 0 · Accepted Answer

扩展和概括@Ferdinand.kraft 的答案：

# Build a 0/1 indicator (design) matrix with one column per distinct value
# of each covariate.
#
# dat:        data frame holding the covariates.
# covariates: character vector naming the columns of `dat` to expand.
#
# Returns an integer matrix with nrow(dat) rows and the row names of `dat`.
# Columns follow the order of `covariates` and, within a covariate, the
# order in which its values first appear. A column is named after its value;
# a value that occurs in several covariates is prefixed with the covariate
# name ("f1.A", "f2.A") so that no column has to be dropped.
build_design <- function(dat, covariates) {
  stopifnot(
    is.data.frame(dat),
    is.character(covariates),
    all(covariates %in% names(dat))
  )
  if (length(covariates) == 0L) {
    return(matrix(integer(0), nrow = nrow(dat), ncol = 0L,
                  dimnames = list(rownames(dat), NULL)))
  }

  value_sets <- lapply(covariates, function(covariate) {
    unique(dat[[covariate]])
  })
  blocks <- Map(function(covariate, values) {
    indicator <- outer(dat[[covariate]], values, FUN = "==")
    # Changing the storage mode keeps the matrix shape even for a one-row
    # `dat`, where apply(..., 2, as.integer) would collapse to a vector.
    storage.mode(indicator) <- "integer"
    indicator
  }, covariates, value_sets)
  design <- do.call(cbind, unname(blocks))

  col_names <- unlist(lapply(value_sets, as.character))
  owners <- rep(covariates, lengths(value_sets))
  # Disambiguate values shared between covariates instead of dropping the
  # later columns, which would silently lose information.
  clash <- col_names %in% col_names[duplicated(col_names)]
  col_names[clash] <- paste(owners[clash], col_names[clash], sep = ".")
  dimnames(design) <- list(rownames(dat), col_names)
  design
}

dat <- data.frame(
  f1 = sample(LETTERS[1:3], 20, replace = TRUE),
  f2 = sample(LETTERS[4:5], 20, replace = TRUE),
  row.names = paste0("id_", 1:20)
)

# In case you have other columns that you don't want to include in the
# design matrix.
covariates <- c("f1", "f2")
design <- build_design(dat, covariates)
design
#       C A B D E
# id_1  1 0 0 1 0
# id_2  0 1 0 1 0
# id_3  0 0 1 1 0
# id_4  1 0 0 1 0
# id_5  0 1 0 1 0
# id_6  0 1 0 0 1
# id_7  0 0 1 0 1

score 0 · Accepted Answer

如果我们添加协变量 x 以及 x 与因素的交互作用，model.matrix() 中的情况会再次变得复杂。

a <- rep(1:2, times = 3)
b <- rep(1:3, times = 2)
x <- seq_len(6)
df <- data.frame(A = a, B = b, x = x)

# Declare an extra, unobserved level "0" for each factor.
df[["A"]] <- factor(a, levels = as.character(0:2))
df[["B"]] <- factor(b, levels = as.character(0:3))

# Main effects for A and B plus their interactions with the covariate x.
mm <- model.matrix(~ A + B + A:x + B:x, data = df)

print(mm)

  (Intercept) A1 A2 B1 B2 B3 A0:x A1:x A2:x B1:x B2:x B3:x
1           1  1  0  1  0  0    0    1    0    1    0    0
2           1  0  1  0  1  0    0    0    2    0    2    0
3           1  1  0  0  0  1    0    3    0    0    0    3
4           1  0  1  1  0  0    0    0    4    4    0    0
5           1  1  0  0  1  0    0    5    0    0    5    0
6           1  0  1  0  0  1    0    0    6    0    0    6

mm 同样带有截距，但现在 A:x 交互项多出了一个不需要的水平 A0:x。如果我们把 x 作为单独的项重新引入，就可以消除这个不需要的水平。

# Same model with x also entered as a term of its own.
mm2 <- model.matrix(~ x + A + B + A:x + B:x, data = df)
print(mm2)
  (Intercept) x A1 A2 B1 B2 B3 x:A1 x:A2 x:B1 x:B2 x:B3
1           1 1  1  0  1  0  0    1    0    1    0    0
2           1 2  0  1  0  1  0    0    2    0    2    0
3           1 3  1  0  0  0  1    3    0    0    0    3
4           1 4  0  1  1  0  0    0    4    4    0    0
5           1 5  1  0  0  1  0    5    0    0    5    0
6           1 6  0  1  0  0  1    0    6    0    0    6

我们可以去掉不需要的截距和不需要的单独 x 项：

# Drop column 1 ("(Intercept)") and column 2 (the bare "x" term).
dm2 <- as.matrix(mm2[, -(1:2)])
print(dm2)

  A1 A2 B1 B2 B3 x:A1 x:A2 x:B1 x:B2 x:B3
1  1  0  1  0  0    1    0    1    0    0
2  0  1  0  1  0    0    2    0    2    0
3  1  0  0  0  1    3    0    0    0    3
4  0  1  1  0  0    0    4    4    0    0
5  1  0  0  1  0    5    0    0    5    0
6  0  1  0  0  1    0    6    0    0    6

score 0 · Accepted Answer

model.matrix() 只允许对公式中的第一个因素进行所谓的“虚拟”编码；如果存在截距，则由截距扮演该角色。为了得到预期的冗余指示矩阵（每个因子水平对应一列，属于该水平的行为 1，其余为 0），您可以对 model.matrix() “撒谎”，假装每个因素都多出一个水平，然后再去掉截距列。


> a <- rep(1:2, times = 3)
> b <- rep(1:3, times = 2)
> df <- data.frame(A = a, B = b)
> # Declare an extra, unobserved level "0" for each factor.
> df[["A"]] <- factor(a, levels = as.character(0:2))

> df[["B"]] <- factor(b, levels = as.character(0:3))

> mm <- model.matrix(~ A + B, data = df)

> mm
  (Intercept) A1 A2 B1 B2 B3
1           1  1  0  1  0  0
2           1  0  1  0  1  0
3           1  1  0  0  0  1
4           1  0  1  1  0  0
5           1  1  0  0  1  0
6           1  0  1  0  0  1
attr(,"assign")
[1] 0 1 1 2 2 2
attr(,"contrasts")
attr(,"contrasts")$A
[1] "contr.treatment"
attr(,"contrasts")$B
[1] "contr.treatment"

> # The intercept column (column 1) of mm was not requested, so remove it.
> dm <- as.matrix(mm[, -1])
> dm
  A1 A2 B1 B2 B3
1  1  0  1  0  0
2  0  1  0  1  0
3  1  0  0  0  1
4  0  1  1  0  0
5  1  0  0  1  0
6  0  1  0  0  1

> # Interactions can be included as well; again drop the intercept column.
> mm2 <- model.matrix(~ A * B, data = df)
> dm2 <- as.matrix(mm2[, -1])
> dm2
  A1 A2 B1 B2 B3 A1:B1 A2:B1 A1:B2 A2:B2 A1:B3 A2:B3
1  1  0  1  0  0     1     0     0     0     0     0
2  0  1  0  1  0     0     0     0     1     0     0
3  1  0  0  0  1     0     0     0     0     1     0
4  0  1  1  0  0     0     1     0     0     0     0
5  1  0  0  1  0     0     0     1     0     0     0
6  0  1  0  0  1     0     0     0     0     0     1

r - 如何在 r 中创建设计矩阵

6 回答 6

Related

Reference