我有一个包含数据子集的 data.frame。这些子集由存储在名为 MASTStation 的列中的 ID 标识。这是我的数据中的一个示例。
> Dummy4
Key MASTStation Longitude Latitude z Pressure Temperature Salinity SigmaT SigmaTheta PAR ChlAFluor
1 1 CH-A01 -168.497 65.998 0 0 1.80255 32.53150 26.00931 26.00931 35.844 5.6362
2 2 CH-A01 -168.497 65.998 -1 1 1.80255 32.53150 26.00931 26.00932 35.844 5.6362
3 3 CH-A01 -168.497 65.998 -2 2 1.80255 32.53150 26.00942 26.00943 35.844 5.5869
4 4 CH-A01 -168.497 65.998 -3 3 1.80190 32.53240 26.00935 26.00936 21.383 5.8863
5 5 CH-A01 -168.497 65.998 -4 4 1.80220 32.53250 26.00929 26.00931 16.383 5.9729
6 782 CH-H02 -167.048 69.493 0 0 7.71730 31.10890 24.26078 24.26079 25.697 1.5285
7 783 CH-H02 -167.048 69.493 -1 1 7.71730 31.10890 24.26078 24.26081 25.697 1.5285
8 784 CH-H02 -167.048 69.493 -2 2 7.71730 31.10890 24.25936 24.25939 25.697 1.5127
9 785 CH-H02 -167.048 69.493 -3 3 7.72100 31.10885 24.26016 24.26021 22.246 1.4385
10 786 CH-H02 -167.048 69.493 -4 4 7.72125 31.11020 24.26036 24.26042 17.901 1.4320
我有一个要应用于每个子集的函数,所以我尝试像这样使用 by() :
by(Dummy4,list(Dummy4[,2]),calculate.mld,Dummy4[,9],Dummy4[,5])
它返回以下错误和回溯():
Error in FUN(data[x, , drop = FALSE], ...) :
sigma and z vectors must have equal length
11 stop("sigma and z vectors must have equal length")
10 FUN(data[x, , drop = FALSE], ...)
9 FUN(X[[1L]], ...)
8 lapply(X = split(X, group), FUN = FUN, ...)
7 tapply(seq_len(10L), list(structure(c(1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L), .Label = c("CH-A01", "CH-H02"), class = "factor")),
function (x)
FUN(data[x, , drop = FALSE], ...), simplify = TRUE)
6 eval(expr, envir, enclos)
5 eval(substitute(tapply(seq_len(nd), IND, FUNx, simplify = simplify)),
data)
4 structure(eval(substitute(tapply(seq_len(nd), IND, FUNx, simplify = simplify)),
data), call = match.call(), class = "by")
3 by.data.frame(Dummy4, list(Dummy4[, 2]), calculate.mld, Dummy4[,
9], Dummy4[, 5])
2 by(Dummy4, list(Dummy4[, 2]), calculate.mld, Dummy4[, 9], Dummy4[,
5])
1 as.matrix(by(Dummy4, list(Dummy4[, 2]), calculate.mld, Dummy4[,
9], Dummy4[, 5]))
所以问题出在我的功能上。但是,当我根据子集将矩阵拆分为两个单独的矩阵并在每个矩阵上运行函数时,没有问题。这是我的功能:
> calculate.mld
function(sigma, z, deltaSigma = 0.125)
{
# check that the number of sigma and z values are equal
if (length(sigma) != length(z))
{
stop('sigma and z vectors must have equal length')
}
# remove the na's
keepers <- !(is.na(z) | is.na(sigma))
z <- z[keepers]
sigma <- sigma[keepers]
# return an NA and a warning if there aren't at least two
# numeric values
if (length(sigma) < 2)
{
warning('fewer than two valid sigma and depth-value combinations entered, NA returned')
NA
} else {
# I use negative depths to be consistent with the Scripps database
if (all(z >= 0))
{
z <- z * -1
pos <- TRUE
} else {
pos <- FALSE
}
# be sure the data are sorted by depths
ord <- order(z, decreasing = TRUE)
z <- z[ord]
sigma <- sigma[ord]
#z <- sort(z, decreasing = TRUE)
#sigma <- sigma[order(z, decreasing = TRUE)]
# Manuscript uses a z of 10 m as the initial sigmaRef, but we will
# take the closest value in case the 10-m measurement is missing.
minDepth <- which(abs(z + 10) == min(abs(z + 10)))
minDepth <- ifelse(length(minDepth) > 1, minDepth[2], minDepth)
sigmaRef <- sigma[minDepth]
sigma <- sigma[minDepth:length(sigma)]
z <- z[minDepth:length(z)]
diffs <- abs(sigma - sigmaRef)
# if sigma never changes by at least deltaSigma, return the lowest depth
# Otherwise proceed
if (max(diffs, na.rm = TRUE) >= deltaSigma)
{
# the uniform region, if present, occurs where the change between
# any two points is <= deltaSigma * 1/10, and the change in sigma over
# the profile has not yet exceeded deltaSigma
uniformRegion <- (abs(sigma[2:length(sigma)] -
sigma[1:(length(sigma) - 1)]) <=
(deltaSigma / 10)) &
(diffs[2:length(diffs)] < deltaSigma)
if (any(uniformRegion))
{
sigmaRefPos <- max(which(uniformRegion))
# change sigmaRef to the base of the uniform region
sigmaRef <- sigma[sigmaRefPos]
# calculate change from the new reference
reachedDeltaSigma <- abs(sigma[(sigmaRefPos + 1):length(sigma)] - sigmaRef) >= deltaSigma
# if any deeper measurements of sigma reach or exceed deltaSigma,
# linearly interpolate between the nearest points to find
# mixed-layer depth
if (any(reachedDeltaSigma))
{
pair <- min(which(reachedDeltaSigma)) + sigmaRefPos - 1
linmod <- lm(z[c(pair, pair + 1)] ~ sigma[c(pair, pair + 1)])
mld <- as.vector(linmod$coefficients[1] +
linmod$coefficients[2] *
(sigmaRef + deltaSigma))
} else {
# otherwise, mixed-layer depth is the deepest point
mld <- min(z)
}
} else {
# if there is no uniform region, just linearly interpolate mld
pair <- min(which(diffs >= deltaSigma)) - 1
linmod <- lm(z[c(pair, pair + 1)] ~ sigma[c(pair, pair + 1)])
mld <- as.vector(linmod$coefficients[1] +
linmod$coefficients[2] *
(sigmaRef + deltaSigma))
}
} else {
mld <- min(z)
}
if (pos) mld <- abs(mld)
mld
}
}
我不明白为什么在使用 by() 运行函数时两个变量列不再具有相等的长度。我确信这是一个足够简单的解决方案,但我没有看到它。有任何想法吗?
我是 R 的初学者,所以请多多包涵。