2

我有一个大型数据集,其中同一个站点的所有行都具有相同的纬度和经度。在数据集中,一些行缺少纬度和经度,而是写着“unknown”(未知)。我需要用同一站点中未缺失数据的其他行的纬度和经度来填充这些未知值。

在此示例中,我希望第 5 行为 lat 和 lon 插入 3 和 8:

> station <- c("a","b","c","c","c")
> lat <- c("1","2","3","3","unknown")
> lon <- c("6","7","8","8","unknown")
> data.frame(station,lat,lon)
  station     lat     lon
1       a       1       6
2       b       2       7
3       c       3       8
4       c       3       8
5       c unknown unknown

我的数据集中有一百万行,如果需要几分钟才能完成,那很好,因为它只在分析开始前运行一次。除非真的有必要,否则我宁愿不安装另一个软件包。

4

3 回答 3

3

像这样的东西,也许——

# Fill "unknown" lat/lon entries from another row of the same station.
# Expects a data frame `df` with columns `station`, `lat` and `lon`, where
# missing coordinates are stored as the literal string "unknown".
df$station <- as.character(df$station)

# Replace every "unknown" entry in `values` with the first known value that
# belongs to the same station. The lookup is a single vectorised match(), so
# it scales to millions of rows without a per-station loop. Entries whose
# station has no known value at all are left as "unknown" instead of raising
# an error. The result is always a character vector (factors are converted).
fill_unknown <- function(values, station) {
  values <- as.character(values)
  known <- !is.na(values) & values != "unknown"
  # For each row: position (within the known rows) of the first known row
  # of the same station, or NA when the station has none.
  donor <- match(station, station[known])
  fill <- !known & !is.na(donor) & !is.na(station)
  values[fill] <- values[known][donor[fill]]
  values
}

# lat and lon are handled independently, so a row where only one of the two
# is unknown is repaired as well.
df$lat <- fill_unknown(df$lat, df$station)
df$lon <- fill_unknown(df$lon, df$station)
于 2013-11-03T07:37:27.020 回答
2

我会使用 zoo 包中的 na.locf。首先,把 unknown 改为 NA,然后按站点应用 na.locf(用前一个非缺失值向后填充)。

> library(zoo)
> df[ df=="unknown"] <- NA
> df2 <- do.call(rbind, lapply(split(df, df$station), na.locf))
> df2[, -1]  <- sapply(df2[, -1], as.numeric)  # numeric variables should be numeric 
> df2
    station lat lon
a         a   1   6
b         b   2   7
c.3       c   3   8
c.4       c   3   8
c.5       c   3   8

如果您想恢复连续编号的行名,请用 rownames 重新赋值:

> rownames(df2) <- 1:nrow(df2)
> df2
  station lat lon
1       a   1   6
2       b   2   7
3       c   3   8
4       c   3   8
5       c   3   8
于 2013-11-03T10:27:41.880 回答
0
#' Fill "unknown" coordinates from other rows of the same station.
#'
#' @param station Vector of station identifiers.
#' @param lat,lon Vectors of latitude / longitude values (as strings), the
#'   same length as `station`. Missing entries are the literal "unknown".
#' @return A character matrix with columns `station`, `lat` and `lon`, one row
#'   per input element. Each "unknown" lat/lon is replaced by the first known
#'   value of that column recorded for the same station; it stays "unknown"
#'   when the station has no known value.
y <- function(station, lat, lon) {
  station <- as.character(station)
  lat <- as.character(lat)
  lon <- as.character(lon)
  stopifnot(
    length(lat) == length(station),
    length(lon) == length(station)
  )

  # Look up, for every row, the first row of the same station whose value is
  # known, and copy that value into the rows where it is missing.
  fill_unknown <- function(values) {
    known <- !is.na(values) & values != "unknown"
    donor <- match(station, station[known])
    fill <- !known & !is.na(donor) & !is.na(station)
    values[fill] <- values[known][donor[fill]]
    values
  }

  # lat and lon are repaired independently, so rows missing only one of the
  # two (or different rows missing different columns) are all handled.
  cbind(station, lat = fill_unknown(lat), lon = fill_unknown(lon))
}




## case1: both lat and lon are "unknown" in the last row of station "c"

station <- c("a","b","c","c","c")
lat <- c("1","2","3","3","unknown")
lon <- c("6","7","8","8","unknown")

# Printed result (expected output shown in the comments below)
y(station,lat,lon)
# station lat lon
# [1,] "a"     "1" "6"
# [2,] "b"     "2" "7"
# [3,] "c"     "3" "8"
# [4,] "c"     "3" "8"
# [5,] "c"     "3" "8"


## case2: only lon is "unknown" in the last row

station <- c("a","b","c","c","c")
lat <- c("1","2","3","3","3")
lon <- c("6","7","8","8","unknown")
y(station,lat,lon)
# station lat lon
# [1,] "a"     "1" "6"
# [2,] "b"     "2" "7"
# [3,] "c"     "3" "8"
# [4,] "c"     "3" "8"
# [5,] "c"     "3" "8"


## case3: only lat is "unknown" in the last row

station <- c("a","b","c","c","c")
lat <- c("1","2","3","3","unknown")
lon <- c("6","7","8","8","8")
y(station,lat,lon)
# station lat lon
# [1,] "a"     "1" "6"
# [2,] "b"     "2" "7"
# [3,] "c"     "3" "8"
# [4,] "c"     "3" "8"
# [5,] "c"     "3" "8"
于 2013-11-03T07:45:57.677 回答