1

我有一个包含数据子集的 data.frame。这些子集由存储在名为 MASTStation 的列中的 ID 标识。这是我的数据中的一个示例。

> Dummy4
   Key MASTStation Longitude Latitude  z Pressure Temperature Salinity   SigmaT SigmaTheta    PAR ChlAFluor
1    1      CH-A01  -168.497   65.998  0        0     1.80255 32.53150 26.00931   26.00931 35.844    5.6362
2    2      CH-A01  -168.497   65.998 -1        1     1.80255 32.53150 26.00931   26.00932 35.844    5.6362
3    3      CH-A01  -168.497   65.998 -2        2     1.80255 32.53150 26.00942   26.00943 35.844    5.5869
4    4      CH-A01  -168.497   65.998 -3        3     1.80190 32.53240 26.00935   26.00936 21.383    5.8863
5    5      CH-A01  -168.497   65.998 -4        4     1.80220 32.53250 26.00929   26.00931 16.383    5.9729
6  782      CH-H02  -167.048   69.493  0        0     7.71730 31.10890 24.26078   24.26079 25.697    1.5285
7  783      CH-H02  -167.048   69.493 -1        1     7.71730 31.10890 24.26078   24.26081 25.697    1.5285
8  784      CH-H02  -167.048   69.493 -2        2     7.71730 31.10890 24.25936   24.25939 25.697    1.5127
9  785      CH-H02  -167.048   69.493 -3        3     7.72100 31.10885 24.26016   24.26021 22.246    1.4385
10 786      CH-H02  -167.048   69.493 -4        4     7.72125 31.11020 24.26036   24.26042 17.901    1.4320

我有一个要应用于每个子集的函数,所以我尝试像这样使用 by() :

by(Dummy4,list(Dummy4[,2]),calculate.mld,Dummy4[,9],Dummy4[,5])

它返回以下错误和回溯():

Error in FUN(data[x, , drop = FALSE], ...) : 
  sigma and z vectors must have equal length 
11 stop("sigma and z vectors must have equal length") 
10 FUN(data[x, , drop = FALSE], ...) 
9 FUN(X[[1L]], ...) 
8 lapply(X = split(X, group), FUN = FUN, ...) 
7 tapply(seq_len(10L), list(structure(c(1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L), .Label = c("CH-A01", "CH-H02"), class = "factor")), 
    function (x) 
    FUN(data[x, , drop = FALSE], ...), simplify = TRUE) 
6 eval(expr, envir, enclos) 
5 eval(substitute(tapply(seq_len(nd), IND, FUNx, simplify = simplify)), 
    data) 
4 structure(eval(substitute(tapply(seq_len(nd), IND, FUNx, simplify = simplify)), 
    data), call = match.call(), class = "by") 
3 by.data.frame(Dummy4, list(Dummy4[, 2]), calculate.mld, Dummy4[, 
    9], Dummy4[, 5]) 
2 by(Dummy4, list(Dummy4[, 2]), calculate.mld, Dummy4[, 9], Dummy4[, 
    5]) 
1 as.matrix(by(Dummy4, list(Dummy4[, 2]), calculate.mld, Dummy4[, 
    9], Dummy4[, 5])) 

所以问题出在我的功能上。但是,当我根据子集将矩阵拆分为两个单独的矩阵并在每个矩阵上运行函数时,没有问题。这是我的功能:

> calculate.mld
function(sigma, z, deltaSigma = 0.125) 
    { 
        # check that the number of sigma and z values are equal 
        if (length(sigma) != length(z)) 
        { 
            stop('sigma and z vectors must have equal length') 
        }
        # remove the na's 
        keepers <- !(is.na(z) | is.na(sigma)) 
        z <- z[keepers] 
        sigma <- sigma[keepers] 
        # return an NA and a warning if there aren't at least two 
        # numeric values 
        if (length(sigma) < 2) 
        { 
            warning('fewer than two valid sigma and depth-value combinations entered, NA returned') 
            NA 
        } else { 
            # I use negative depths to be consistent with the Scripps database 
            if (all(z >= 0)) 
            { 
                z <- z * -1 
                pos <- TRUE 
            } else { 
                pos <- FALSE 
            } 
            # be sure the data are sorted by depths 
            ord <- order(z, decreasing = TRUE) 
            z <- z[ord] 
            sigma <- sigma[ord] 
            #z <- sort(z, decreasing = TRUE) 
            #sigma <- sigma[order(z, decreasing = TRUE)] 
            # Manuscript uses a z of 10 m as the initial sigmaRef, but we will 
            # take the closest value in case the 10-m measurement is missing. 
            minDepth <- which(abs(z + 10) == min(abs(z + 10))) 
            minDepth <- ifelse(length(minDepth) > 1, minDepth[2], minDepth) 
            sigmaRef <- sigma[minDepth] 
            sigma <- sigma[minDepth:length(sigma)] 
            z <- z[minDepth:length(z)] 
            diffs <- abs(sigma - sigmaRef) 
            # if sigma never changes by at least deltaSigma, return the lowest depth 
            # Otherwise proceed 
            if (max(diffs, na.rm = TRUE) >= deltaSigma) 
            { 
                # the uniform region, if present, occurs where the change between 
                # any two points is <= deltaSigma * 1/10, and the change in sigma over 
                # the profile has not yet exceeded deltaSigma 
                uniformRegion <- (abs(sigma[2:length(sigma)] - 
                                          sigma[1:(length(sigma) - 1)]) <= 
                                      (deltaSigma / 10)) & 
                    (diffs[2:length(diffs)] < deltaSigma) 
                if (any(uniformRegion)) 
                { 
                    sigmaRefPos <- max(which(uniformRegion)) 
                    # change sigmaRef to the base of the uniform region 
                    sigmaRef <- sigma[sigmaRefPos] 
                    # calculate change from the new reference 
                    reachedDeltaSigma <- abs(sigma[(sigmaRefPos + 1):length(sigma)] - sigmaRef) >= deltaSigma 
                    # if any deeper measurements of sigma reach or exceed deltaSigma, 
                    # linearly interpolate between the nearest points to find 
                    # mixed-layer depth 
                    if (any(reachedDeltaSigma)) 
                    { 
                        pair <- min(which(reachedDeltaSigma)) + sigmaRefPos - 1 
                        linmod <- lm(z[c(pair, pair + 1)] ~ sigma[c(pair, pair + 1)]) 
                        mld <- as.vector(linmod$coefficients[1] + 
                                             linmod$coefficients[2] * 
                                             (sigmaRef + deltaSigma)) 
                    } else { 
                        # otherwise, mixed-layer depth is the deepest point 
                        mld <- min(z) 
                    } 
                } else { 
                    # if there is no uniform region, just linearly interpolate mld 
                    pair <- min(which(diffs >= deltaSigma)) - 1 
                    linmod <- lm(z[c(pair, pair + 1)] ~ sigma[c(pair, pair + 1)]) 
                    mld <- as.vector(linmod$coefficients[1] + 
                                         linmod$coefficients[2] * 
                                         (sigmaRef + deltaSigma)) 
                } 
            } else { 
                mld <- min(z) 
            } 
            if (pos) mld <- abs(mld) 
            mld 
        } 
    }

我不明白为什么在使用 by() 运行函数时两个变量列不再具有相等的长度。我确信这是一个足够简单的解决方案,但我没有看到它。有任何想法吗?

我是 R 的初学者,所以请多多包涵。

4

1 回答 1

2

这就是您可以by在这里使用的方式。您需要一个接受 data.frame 作为参数的函数:

by(Dummy4, list(Dummy4$MASTStation), function(df) calculate.mld(df$SigmaT, df$z))
#: CH-A01
#[1] -4
#----------------------------------------------------------------------------------------------------------- 
#: CH-H02
#[1] -4

我更喜欢使用包 data.table:

library(data.table)
setDT(Dummy4)
Dummy4[, .(MLD = calculate.mld(SigmaT, z)), by = MASTStation]
#   MASTStation MLD
#1:      CH-A01  -4
#2:      CH-H02  -4

可重现的数据(使用 创建dput):

Dummy4 <- structure(list(Key = c(1L, 2L, 3L, 4L, 5L, 782L, 783L, 784L, 
785L, 786L), MASTStation = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L), .Label = c("CH-A01", "CH-H02"), class = "factor"), 
    Longitude = c(-168.497, -168.497, -168.497, -168.497, -168.497, 
    -167.048, -167.048, -167.048, -167.048, -167.048), Latitude = c(65.998, 
    65.998, 65.998, 65.998, 65.998, 69.493, 69.493, 69.493, 69.493, 
    69.493), z = c(0L, -1L, -2L, -3L, -4L, 0L, -1L, -2L, -3L, 
    -4L), Pressure = c(0L, 1L, 2L, 3L, 4L, 0L, 1L, 2L, 3L, 4L
    ), Temperature = c(1.80255, 1.80255, 1.80255, 1.8019, 1.8022, 
    7.7173, 7.7173, 7.7173, 7.721, 7.72125), Salinity = c(32.5315, 
    32.5315, 32.5315, 32.5324, 32.5325, 31.1089, 31.1089, 31.1089, 
    31.10885, 31.1102), SigmaT = c(26.00931, 26.00931, 26.00942, 
    26.00935, 26.00929, 24.26078, 24.26078, 24.25936, 24.26016, 
    24.26036), SigmaTheta = c(26.00931, 26.00932, 26.00943, 26.00936, 
    26.00931, 24.26079, 24.26081, 24.25939, 24.26021, 24.26042
    ), PAR = c(35.844, 35.844, 35.844, 21.383, 16.383, 25.697, 
    25.697, 25.697, 22.246, 17.901), ChlAFluor = c(5.6362, 5.6362, 
    5.5869, 5.8863, 5.9729, 1.5285, 1.5285, 1.5127, 1.4385, 1.432
    )), .Names = c("Key", "MASTStation", "Longitude", "Latitude", 
"z", "Pressure", "Temperature", "Salinity", "SigmaT", "SigmaTheta", 
"PAR", "ChlAFluor"), class = "data.frame", row.names = c("1", 
"2", "3", "4", "5", "6", "7", "8", "9", "10"))
于 2015-10-07T09:47:14.357 回答