1

我注意到通过 stackoverflow 搜索类似的问题,这些问题已经被问过好几次了,但实际上并没有得到正确的回答。也许在其他用户的帮助下,这篇文章可以成为编程多元正态分布参数的数值估计的有用指南。

我知道我知道!封闭形式的解决方案可用且易于实施。就我而言,我有兴趣为特定目的修改似然函数,并且我不希望有一个精确的分析解决方案,所以这是一个检查程序的测试用例。

所以这是我的尝试。请评论。特别是如果我错过了优化的机会。请注意,我不是统计学家,所以我会很感激任何指针。

ll_multN <- function(theta,X) {

  # theta = c(mu, diag(Sigma), Sigma[upper.tri(Sigma)])
  # X is an nxk dataset

  # MLE: L = - (nk/2)*log(2*pi) - (n/2)*log(det(Sigma)) - (1/2)*sum_i(t(X_i-mu)^2 %*% Sigma^-1 %*% (X_i-mu)^2)
  # summation over i is performed using a apply call for efficiency

  n <- nrow(X)
  k <- ncol(X)

  # def mu
  mu.vec <- theta[1:k]

  # def Sigma
  Sigma.diag <- theta[(k+1):(2*k)]
  Sigma.offd <- theta[(2*k+1):length(theta)]
  Sigma <- matrix(NA, k, k)
  Sigma[upper.tri(Sigma)] <- Sigma.offd
  Sigma <- t(Sigma)
  Sigma[upper.tri(Sigma)] <- Sigma.offd

  diag(Sigma) <- Sigma.diag

  # compute summation 
  sum_i <- sum(apply(X, 1, function(x) (matrix(x,1,k)-mu.vec)%*%solve(Sigma)%*%t(matrix(x,1,k)-mu.vec)))

  # compute log likelihood
  logl <- -.5*n*k*log(2*pi) - .5*n*log(det(Sigma)) 
  logl <- logl - .5*sum_i
  return(-logl)
}

rmvnorm()使用包“mvtnorm”中的函数生成的模拟数据集。使用附加函数生成的随机正定协方差矩阵Posdef()(取自此处:https ://stat.ethz.ch/pipermail/r-help/2008-February/153708 )

library(mvtnorm)

Posdef <- function (n, ev = runif(n, 0, 5)) { 
  # generates a random positive definite covariance matrix
  Z <- matrix(ncol=n, rnorm(n^2))
  decomp <- qr(Z)
  Q <- qr.Q(decomp) 
  R <- qr.R(decomp)
  d <- diag(R)
  ph <- d / abs(d)
  O <- Q %*% diag(ph)
  Z <- t(O) %*% diag(ev) %*% O
  return(Z)
}

set.seed(2)

n <- 1000 # number of data points
k <- 3 # number of variables
mu.tru <- sample(0:3, k, replace=T) # random mean vector
Sigma.tru <- Posdef(k) # random covariance matrix
eigen(Sigma.tru)$val # check positive def (all lambda > 0)

# Generate simulated dataset
X <- rmvnorm(n, mean=mu.tru, sigma=Sigma.tru)

# initial parameter values
pars.init <- c(mu=rep(0,k), sig_ii=rep(1,k), sig_ij=rep(0, k*(k-1)/2))

# limits for optimization algorithm
eps <- .Machine$double.eps  # get a small value for bounding the paramter space to avoid things such as log(0).
lower.bound <- c(rep(-Inf,k), # bound on mu
                 rep(eps,k), # bound on sigma_ii
                 rep(-Inf,k)) # bound on sigma_ij i=/=j
upper.bound <- c(rep(Inf,k), # bound on mu
                 rep(100,k), # bound on sigma_ii
                 rep(100,k)) # bound on sigma_ij i=/=j

system.time(
  o <- optim(pars.init,
             ll_multN, X=X, method="L-BFGS-B",
             lower = lower.bound,
             upper = upper.bound)  
)

plot(x=c(mu.tru,diag(Sigma.tru),Sigma.tru[upper.tri(Sigma.tru)]),
     y=o$par,
     xlab="Parameter",
     ylab="Estimate",
     pch=20)
abline(c(0,1), col="red", lty=2)

这目前在我的笔记本电脑上运行

   user  system elapsed 
 47.852  24.014  24.611

并给出这个图形输出: 估计均值和方差

特别是关于限制设置或算法选择的任何建议将不胜感激。

谢谢

4

0 回答 0