-9

我正在使用“共享模型”来估计缺失观测值的值。使用示例数据集my.data,我将根据 1970 年的观测分布情况按比例填充三年中每年的缺失观测值(尽管我可以使用 2010 年或 1970 年和 2010 年两者来完成)。

下面我介绍了示例数据、期望的结果和代码,以两种方式获得期望的估计。第一种方法的代码非常特定于这个例子。我希望创建一个比第二种方法更通用的功能。在我看来,创建一个更通用的函数需要在列表列表上调用一个函数。我希望有人可以就如何将函数应用于列表列表提供建议。

这是示例数据集和高度具体的解决方案:

my.data <- read.table(text = '
 county  y1970  y1980  y1990  y2000  y2010
   aa      50     NA     70     NA     500
   cc      10     20     NA     NA     100
   ee     800     NA     NA    400    8000
   gg    1000   1900     NA     NA   10000
   ii     200    400    300    100    2000
   kk      20     40     30     NA     200
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

my.total <- read.table(text = '
   county  y1970  y1980  y1990  y2000  y2010
   total    2080   4000   3000   1000  20800
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

desired.result <- read.table(text = '
 county  y1970  y1980         y1990       y2000       y2010
   aa      50  96.47059         70      23.148148       500
   cc      10     20         14.36464    4.629630       100
   ee     800   1543.529   1149.17127      400         8000
   gg    1000   1900       1436.46409   462.962963    10000
   ii     200    400           300         100         2000
   kk      20     40            30       9.259259       200
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

x70 <- c(50, 800)
estimates.for.80 <- (x70 / sum(x70)) * (my.total$y1980 - sum(my.data$y1980, na.rm = TRUE))

x80 <- c(10, 800, 1000)
estimates.for.90 <- (x80 / sum(x80)) * (my.total$y1990 - sum(my.data$y1990, na.rm = TRUE))

x90 <- c(50, 10, 1000, 20)
estimates.for.00 <- (x90 / sum(x90)) * (my.total$y2000 - sum(my.data$y2000, na.rm = TRUE))

这是功能。d.counties如果我知道如何将其作为输入列表包含到函数中,我认为这可以概括。换句话说,我怎样才能包含d.countiesmy.input仍然使该功能正常工作?我认为我的困惑源于d.counties不同年份的长度。

state <- 'my.state'

my.df <- read.table(text = '
  county   y1970  y1980  y1990  y2000   y2010
      aa      50     NA     70     NA     500
      cc      10     20     NA     NA     100
      ee     800     NA     NA    400    8000
      gg    1000   1900     NA     NA   10000
      ii     200    400    300    100    2000
      kk      20     40     30     NA     200
   total    2080   4000   3000   1000   20800
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

pre.divide.up <- tail(my.df[,2:ncol(my.df)], 1) - colSums(head(my.df[,2:ncol(my.df)], -1), na.rm = TRUE)

# For each column containing NA's define the years to use as shares
# If use.years = 'pre'  then use the year in pre.year
# If use.years = 'post' then use the year in post.year
# If use.years = 'both' then use both the year in pre.year and the year in post.year
#
# Here I define pre.year = y1970 and post.year = 2010 for every year
# However, 'pre.year' and 'post.year' are variables.  They can differ among rows below.
shares <- read.table(text = '
      cyear   pre.year  post.year  use.years
      y1980     y1970     y2010        pre
      y1990     y1970     y2010        pre
      y2000     y1970     y2010        pre
', header = TRUE, na.strings = "NA")

d.counties.80 <- c( 'aa' ,
                    'ee' )

d.counties.90 <- c( 'cc' ,
                    'ee' , 
                    'gg' )

d.counties.00 <- c( 'aa' ,
                    'cc' ,
                    'gg' ,
                    'kk' )

d.counties <- list(d.counties.80, d.counties.90, d.counties.00)

my.input <- data.frame(shares)

my.function <- function(y) {

# extract years of interest from my.df and store in data.frame called year.data
if(y[[4]] != 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]]        )]

# subset counties in year.data to only include counties with NA's in current year
if(as.numeric(substr(y[1], 2, 5)) == 1980) year.data = year.data[year.data$county %in% d.counties.80,]
if(as.numeric(substr(y[1], 2, 5)) == 1990) year.data = year.data[year.data$county %in% d.counties.90,]
if(as.numeric(substr(y[1], 2, 5)) == 2000) year.data = year.data[year.data$county %in% d.counties.00,]

# reorder columns in year.data
if(y[[4]] != 'last') year.data = year.data[, c('county', y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = year.data[, c('county', y[[2]], y[[1]]        )]

# values to be divided, or distributed, among counties with NA's in the current year
divide.up <- pre.divide.up[, y[[1]]] 

# sum values from designated pre and/or post years and bind those totals to bottom of year.data
if(y[[4]] != 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:4)], na.rm=TRUE)))))
if(y[[4]] == 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:3)], na.rm=TRUE)))))
names(colsums.year) <- names(year.data)
year.data.b <- rbind(year.data, colsums.year)

# obtain percentages in designated pre and/or post years for counties with NA's in current year
year.data.c <- year.data.b
year.data.c[, -1] <- lapply( year.data.c[  , -1], function(x){ x/x[nrow(year.data.b)] } )

# estimate county values for current year by distributing total missing values in current year
# according to how values were distributed in those same counties in other years
if(y[[4]] == 'both') year.data.b[, y[[1]]] = rowMeans(data.frame(year.data.c[, y[[2]]], year.data.c[, y[[3]]])) * as.numeric(divide.up)
if(y[[4]] ==  'pre') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)
if(y[[4]] == 'post') year.data.b[, y[[1]]] = year.data.c[, y[[3]]] * as.numeric(divide.up)
if(y[[4]] == 'last') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)

# extract estimates for current year along with the county column, then remove the last row
year.data.last <- year.data.b[names(year.data.b) %in% c("county", y[[1]])]
year.data.last <- year.data.last[-nrow(year.data.last),]
colnames(year.data.last) <- c('county', 'acreage')

# create a data set for export
this.year <- rep(as.numeric(substr(y[[1]], 2, 5)), nrow(year.data.last))
revised.data <- data.frame(state, this.year, year.data.last)
return(revised.data) 
}

my.list  <- apply(shares, 1, function(y) my.function(y))
my.list2 <- do.call("rbind", my.list)
my.list2

      state this.year county     acreage
1  my.state      1980     aa   96.470588
3  my.state      1980     ee 1543.529412
2  my.state      1990     cc   14.364641
31 my.state      1990     ee 1149.171271
4  my.state      1990     gg 1436.464088
11 my.state      2000     aa   23.148148
21 my.state      2000     cc    4.629630
41 my.state      2000     gg  462.962963
6  my.state      2000     kk    9.259259

尽管此函数不像我在下面的答案中那样通用,但上面的函数确实允许明确指定哪些县具有相关的缺失值。在实际数据中,有两种类型的缺失值,下面我的答案中的函数无法区分这两种类型。上面的函数可以区分它们,因为我准确地告诉它每年要考虑哪些县。

再次感谢您的任何建议和已经提供的建议。

4

3 回答 3

4

我认为您的整个问题可以用几行来概括。这太长了。如果您的问题确实是,如标题所述,将函数应用于列表列表,那么您需要递归地应用函数。有一个结构可以做到这一点,它是rapply

w <- 1:5
x <- 1:5
y <- 6:10
z <- 6:10

ll <- list( list( w , x) , list( y , z) )
str(ll)
List of 2
 $ :List of 2
  ..$ : int [1:5] 1 2 3 4 5
  ..$ : int [1:5] 1 2 3 4 5
 $ :List of 2
  ..$ : int [1:5] 6 7 8 9 10
  ..$ : int [1:5] 6 7 8 9 10

rapply( ll , mean )
[1] 3 3 8 8

作为一个建议,基本上你可以把你的问题归结为......

我有这个列表列表,但是当我尝试使用 lapply 时它不起作用......

lapply( ll , mean )
[[1]]
[1] NA

[[2]]
[1] NA

Warning messages:
1: In mean.default(X[[1L]], ...) :
  argument is not numeric or logical: returning NA
2: In mean.default(X[[2L]], ...) :
  argument is not numeric or logical: returning NA
于 2013-08-02T23:58:23.867 回答
2

这个问题搞砸了。我将尝试先解释您的问题,然后再回答,以便更容易确定我是否理解您的问题。

你的问题的要点:

好的,在筛选完您的问题以了解所需内容后,我发现您有一个 data.frame my.data

  county y1970 y1980 y1990 y2000 y2010
1     aa    50    NA    70    NA   500
2     cc    10    20    NA    NA   100
3     ee   800    NA    NA   400  8000
4     gg  1000  1900    NA    NA 10000
5     ii   200   400   300   100  2000
6     kk    20    40    30    NA   200

还有一个my.total

  county y1970 y1980 y1990 y2000 y2010
1  total  2080  4000  3000  1000 20800

你想要的是(desired.result):

  county y1970      y1980      y1990      y2000 y2010
1     aa    50   96.47059   70.00000  23.148148   500
2     cc    10   20.00000   14.36464   4.629630   100
3     ee   800 1543.52900 1149.17127 400.000000  8000
4     gg  1000 1900.00000 1436.46409 462.962963 10000
5     ii   200  400.00000  300.00000 100.000000  2000
6     kk    20   40.00000   30.00000   9.259259   200

标准:

我理解您的标准的方式是,对于其中包含 NA 的每个数字/整数列my.data,获取与这些 NA 条目相对应的 1970 列的条目,并用公式替换这些 NA 条目:

vals <- corresponding entries in column 1970
this column's NA's <- vals/sum(vals) * (my.total of this column - 
                    sum(this.column, na.rm=TRUE))

可能的解决方案:

我认为这里不需要“列表列表”。这是使用简单 for 循环的一种方法(因为它在概念上更容易解决这个问题)。这是因为您想修改 data.frame 某些列中的某些项目。df将是desired.result.

df <- my.data
for (i in which(colSums(is.na(df), na.rm=TRUE) > 0)) {
    idx <- which(is.na(df[[i]]))
    xx <- df[["y1970"]][idx]
    df[[i]][idx] <- (xx/sum(xx)) * (my.total[[i]] - sum(df[[i]], na.rm=TRUE))
}

  county y1970      y1980      y1990      y2000 y2010
1     aa    50   96.47059   70.00000  23.148148   500
2     cc    10   20.00000   14.36464   4.629630   100
3     ee   800 1543.52941 1149.17127 400.000000  8000
4     gg  1000 1900.00000 1436.46409 462.962963 10000
5     ii   200  400.00000  300.00000 100.000000  2000
6     kk    20   40.00000   30.00000   9.259259   200
于 2013-08-03T06:11:15.460 回答
0

在我原来的帖子中,我询问了如何在d.counties不使用一系列特定if语句的情况下将列表作为输入包含到函数中。这是我想出的解决方案。

步骤 1. 创建列表d.counties以便保留名称:

d.counties.1980 <- c( 'aa' ,
                      'ee' )

d.counties.1990 <- c( 'cc' ,
                      'ee' , 
                      'gg' )

d.counties.2000 <- c( 'aa' ,
                      'cc' ,
                      'gg' ,
                      'kk' )

list.function <-  function() { 

     sapply(c("d.counties.1980",
              "d.counties.1990",
              "d.counties.2000"), get, environment(), simplify = FALSE) 
} 

d.counties <- list.function()

if步骤 2. 在函数内部,将明确指定个别年份的缺失观测值的一系列语句替换为以下通用行,该行d.counties使用其中的名称访问列表,而无需明确指定个别年份的个别名称:

year.data = year.data[year.data$county %in% d.counties[substr(names(d.counties), 12, 15) == substr(y[1], 2, 5)][[1]],]

这是此解决方案的完整代码:

state <- 'my.state'

my.df <- read.table(text = '
  county   y1970  y1980  y1990  y2000   y2010
      aa      50     NA     70     NA     500
      cc      10     20     NA     NA     100
      ee     800     NA     NA    400    8000
      gg    1000   1900     NA     NA   10000
      ii     200    400    300    100    2000
      kk      20     40     30     NA     200
   total    2080   4000   3000   1000   20800
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

pre.divide.up <- tail(my.df[,2:ncol(my.df)], 1) - colSums(head(my.df[,2:ncol(my.df)], -1), na.rm = TRUE)

# For each column containing NA's define the years to use as shares
# If use.years = 'pre'  then use the year in pre.year
# If use.years = 'post' then use the year in post.year
# If use.years = 'both' then use both the year in pre.year and the year in post.year
#
# Here I define pre.year = y1970 and post.year = 2010 for every year
# However, 'pre.year' and 'post.year' are variables.  They can differ among rows below.
shares <- read.table(text = '
      cyear   pre.year  post.year  use.years
      y1980     y1970     y2010        pre
      y1990     y1970     y2010        pre
      y2000     y1970     y2010        pre
', header = TRUE, na.strings = "NA")

d.counties.1980 <- c( 'aa' ,
                      'ee' )

d.counties.1990 <- c( 'cc' ,
                      'ee' , 
                      'gg' )

d.counties.2000 <- c( 'aa' ,
                      'cc' ,
                      'gg' ,
                      'kk' )

list.function <-  function() { 

     sapply(c("d.counties.1980",
              "d.counties.1990",
              "d.counties.2000"), get, environment(), simplify = FALSE) 
} 

d.counties <- list.function()
d.counties

my.input <- data.frame(shares)

my.function <- function(y) {

# extract years of interest from my.df and store in data.frame called year.data
if(y[[4]] != 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]]        )]

# subset counties in year.data to only include counties with NA's in current year
year.data = year.data[year.data$county %in% d.counties[substr(names(d.counties), 12, 15) == substr(y[1], 2, 5)][[1]],]

# reorder columns in year.data
if(y[[4]] != 'last') year.data = year.data[, c('county', y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = year.data[, c('county', y[[2]], y[[1]]        )]

# values to be divided, or distributed, among counties with NA's in the current year
divide.up <- pre.divide.up[, y[[1]]] 

# sum values from designated pre and/or post years and bind those totals to bottom of year.data
if(y[[4]] != 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:4)], na.rm=TRUE)))))
if(y[[4]] == 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:3)], na.rm=TRUE)))))
names(colsums.year) <- names(year.data)
year.data.b <- rbind(year.data, colsums.year)

# obtain percentages in designated pre and/or post years for counties with NA's in current year
year.data.c <- year.data.b
year.data.c[, -1] <- lapply( year.data.c[  , -1], function(x){ x/x[nrow(year.data.b)] } )

# estimate county values for current year by distributing total missing values in current year
# according to how values were distributed in those same counties in other years
if(y[[4]] == 'both') year.data.b[, y[[1]]] = rowMeans(data.frame(year.data.c[, y[[2]]], year.data.c[, y[[3]]])) * as.numeric(divide.up)
if(y[[4]] ==  'pre') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)
if(y[[4]] == 'post') year.data.b[, y[[1]]] = year.data.c[, y[[3]]] * as.numeric(divide.up)
if(y[[4]] == 'last') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)

# extract estimates for current year along with the county column, then remove the last row
year.data.last <- year.data.b[names(year.data.b) %in% c("county", y[[1]])]
year.data.last <- year.data.last[-nrow(year.data.last),]
colnames(year.data.last) <- c('county', 'estimates')

# create a data set for export
this.year <- rep(as.numeric(substr(y[[1]], 2, 5)), nrow(year.data.last))
revised.data <- data.frame(state, this.year, year.data.last)
return(revised.data) 
}

my.list  <- apply(shares, 1, function(y) my.function(y))
my.list2 <- do.call("rbind", my.list)
my.list2

      state this.year county   estimates
1  my.state      1980     aa   96.470588
3  my.state      1980     ee 1543.529412
2  my.state      1990     cc   14.364641
31 my.state      1990     ee 1149.171271
4  my.state      1990     gg 1436.464088
11 my.state      2000     aa   23.148148
21 my.state      2000     cc    4.629630
41 my.state      2000     gg  462.962963
6  my.state      2000     kk    9.259259

这是根据 Arun 的答案开发的替代功能。使用这个函数,我data.frame shares通过使用sapply来访问函数内部,以允许我将列名视为变量。但是,此功能不足以完成该任务,因为我的实际数据有两种类型的缺失观测值,并且此功能无法区分两者。上面的函数可以区分两者,因为我在列表中明确指定了相关的缺失观察d.counties。在我的示例数据集中,我假设所有缺失的观测值都是相同的,因此两个函数都返回相同的估计值。

# data set
my.data <- read.table(text = '
 county  y1970  y1980  y1990  y2000  y2010
   aa      50     NA     70     NA     550
   cc      10     20     NA     NA     100
   ee     800     NA     NA    400    9000
   gg    1000   1900     NA     NA   12000
   ii     200    400    300    100    1500
   kk      20     40     30     NA     100
total    2080   4000   3000   1000   23250
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

# extract columns with NA's
my.data2 <- my.data[(which(colSums(is.na(my.data), na.rm=TRUE) > 0))]

# For each column containing NA's define the years to use as shares
# If use.years = 'pre'  then use the year in pre.year
# If use.years = 'post' then use the year in post.year
# If use.years = 'both' then use both the year in pre.year and the year in post.year
#
# Here I define pre.year = y1970 and post.year = 2010 for every year
# However, 'pre.year' and 'post.year' are variables.  They can differ among rows below.
shares <- read.table(text = '
      cyear   pre.year   post.year   use.years
      y1980     y1970      y2010        pre
      y1990     y1970      y2010        post
      y2000     y1970      y2010        both
', header = TRUE, na.strings = "NA")

# extract last row of my.data2
my.total   <- my.data2[nrow(my.data),]

# For each column sum all but the last row of my.data2
my.colsums <- colSums(my.data2[1:(nrow(my.data2)-1),], na.rm = TRUE)

# For each column in my.data2 calculate the number to be divided among rows with NA's
divide.up  <- my.total - my.colsums

my.function <- function(x) { 
    idx <- which(is.na((my.data2)[x]))
    names.x <- as.character(colnames(my.data2)[x])

    my.pre.col  <- as.character(shares$pre.year[shares$cyear==names.x])
    my.post.col <- as.character(shares$post.year[shares$cyear==names.x])
    my.use.year <- as.character(shares$use.years[shares$cyear==names.x])

    xx.pre  <- my.data[[my.pre.col]][idx]
    xx.post <- my.data[[my.post.col]][idx]

    if(my.use.year=='pre' ) my.data2[[x]][idx] =   (xx.pre /sum(xx.pre ))                                * divide.up[[x]]
    if(my.use.year=='post') my.data2[[x]][idx] =   (xx.post/sum(xx.post))                                * divide.up[[x]]
    if(my.use.year=='both') my.data2[[x]][idx] = (((xx.pre /sum(xx.pre )) + (xx.post/sum(xx.post))) / 2) * divide.up[[x]]

    return(my.data2[x])

}

na.estimates <- sapply(1:ncol(my.data2), function(x) {my.function(x)})
revised.data  <- t(do.call("rbind", na.estimates))
revised.data

          y1980      y1990       y2000
[1,]   96.47059   70.00000   22.358388
[2,]   20.00000   12.32227    4.275599
[3,] 1543.52941 1109.00474  400.000000
[4,] 1900.00000 1478.67299  466.775599
[5,]  400.00000  300.00000  100.000000
[6,]   40.00000   30.00000    6.590414
[7,] 4000.00000 3000.00000 1000.000000
于 2013-08-05T04:48:40.217 回答