2

我正在分析一项鱼卵调查的数据。数据来自北海的不同站点,其中一些站点在不同日期被记录了两次。整个海域应当被 0.5 x 0.5 度的方格覆盖。我有两个问题一直没有找到解决方案:

  1. 如何将具有重复位置和不同日期的点替换为平均值?我知道如何删除重复项或如何用最大值或最小值替换它们,但找不到计算平均值的方法。

  2. 如何根据相邻单元格计算缺失点的插值。只有当至少两个记录点相邻时,才应计算插值。

我尝试设置一个网格,但并没有走得很远,因为我找不到一种方法来告诉 R 什么时候插值,什么时候不插值。

样本数据:

# Sample survey records: each row holds a position (Latitude, Longitude) and
# the Eggs count recorded there. Some positions appear on more than one row.
# Column names come from the list itself; row names are the strings "1".."90".
egg_data <- structure(
  list(
    Latitude = c(
      54.25, 54.25, 54.25, 54.25, 54.25, 54.25, 54.25, 54.25, 54.25, 54.25,
      54.25, 54.25, 54.25, 54.25, 55.25, 55.25, 55.25, 55.25, 55.25, 55.25,
      55.25, 55.25, 55.25, 55.25, 55.25, 55.25, 55.25, 55.25, 55.25, 55.25,
      55.25, 55.25, 55.25, 55.25, 55.25, 55.25, 55.25, 55.25, 55.25, 55.25,
      55.25, 55.25, 55.25, 55.25, 54.25, 54.25, 54.25, 53.25, 58.25, 57.75,
      57.25, 57.25, 57.25, 57.25, 57.25, 57.25, 57.25, 57.25, 56.75, 56.75,
      56.75, 56.75, 56.75, 56.75, 56.75, 56.75, 56.75, 56.75, 56.75, 56.75,
      56.75, 56.25, 56.25, 56.25, 56.25, 56.25, 56.25, 56.25, 56.25, 56.25,
      56.25, 56.25, 56.25, 56.25, 56.25, 56.25, 56.25, 56.75, 56.75, 56.75
    ),
    Longitude = c(
      6.25, 5.25, 5.25, 4.25, 4.25, 3.25, 3.25, 2.25, 2.25, 1.25,
      1.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 1.25, 1.25,
      2.25, 2.25, 3.25, 3.25, 4.25, 4.25, 5.25, 5.25, 5.25, 5.25,
      4.25, 4.25, 3.25, 3.25, 2.25, 2.25, 1.25, 1.25, 0.25, 0.25,
      0.25, 0.25, 1.25, 1.25, 0.25, 0.25, 0.25, 0.25, 3.25, 3.25,
      3.25, 2.75, 2.25, 1.75, 1.25, 0.75, 0.25, 0.25, 0.25, 0.25,
      0.75, 1.25, 1.75, 2.25, 2.75, 3.25, 3.75, 4.25, 4.75, 5.25,
      5.75, 6.25, 5.75, 5.25, 4.75, 4.25, 3.75, 3.25, 2.25, 1.75,
      1.25, 0.75, 0.25, 0.25, 0.75, 1.25, 1.75, 1.75, 1.25, 0.75
    ),
    Eggs = c(
      9L, 6L, 4L, 20L, 57L, 14L, 35L, 18L, 4L, 1L,
      3L, 100L, 1L, 201L, 0L, 51L, 52L, 23L, 19L, 4L,
      5L, 23L, 11L, 18L, 7L, 7L, 14L, 6L, 3L, 4L,
      20L, 13L, 19L, 5L, 16L, 23L, 28L, 11L, 9L, 12L,
      19L, 62L, 6L, 3L, 15L, 110L, 57L, 0L, 14L, 3L,
      3L, 8L, 94L, 62L, 7L, 19L, 511L, 59L, 283L, 308L,
      20L, 44L, 61L, 24L, 10L, 10L, 15L, 6L, 8L, 12L,
      32L, 2L, 5L, 10L, 21L, 4L, 1L, 19L, 3L, 4L,
      4L, 17L, 51L, 108L, 1213L, 132L, 4L, 0L, 0L, 0L
    )
  ),
  class = "data.frame",
  row.names = as.character(seq_len(90))
)

非常感谢!!

4

1 回答 1

1

为每个位置添加一个分组变量(因子):

# Tag every row with a "(lat, long)" key so that rows recorded at the same
# position share one value. within() evaluates the assignment using the
# data frame's own columns and returns the modified data frame.
egg_data <- within(
  egg_data,
  Location <- paste0("(", Latitude, ", ", Longitude, ")")
)

编辑:没有必要把格式弄得这么花哨,因为我们很快就要把这个过程反过来(把位置字符串重新拆分成纬度和经度)。

# Build a plain "lat,long" key for every row. The bare comma separator keeps
# the key easy to split back into its two coordinates.
egg_data$Location <- paste(egg_data$Latitude, egg_data$Longitude, sep = ",")

然后有很多方法可以得到平均值。

# Mean egg count per location key, returned as a named array.
means_by_location <- tapply(
  X = egg_data$Eggs,
  INDEX = egg_data$Location,
  FUN = mean
)

或者

library(plyr)
# Split egg_data by Location and summarise each piece to a single row holding
# the mean egg count (column Mean.eggs).
means_by_location2 <- ddply(
  .data = egg_data,
  .variables = "Location",
  .fun = summarise,
  Mean.eggs = mean(Eggs)
)

或者

# Formula interface: mean of Eggs within each Location, one row per location.
means_by_location3 <- aggregate(
  Eggs ~ Location,
  data = egg_data,
  FUN = mean
)

或者

# Mean of Eggs for each Location, computed with by(); with() lets the column
# names be used directly.
means_by_location4 <- with(egg_data, by(Eggs, Location, mean))

编辑:接下来,您希望将结果保存在数据框中,因此请使用方法 2 或 3。

将纬度和经度重新添加到新数据集中。(有很多方法可以做到这一点。)

# Split each "lat,long" key back into its two parts and store them as numeric
# columns. Left as character strings, the coordinates would be handled as
# discrete labels rather than numbers in later numeric work or plotting.
# as.character() guards against Location being a factor, which strsplit()
# rejects; vapply() guarantees a numeric vector even for zero rows.
lat_long <- strsplit(
  as.character(means_by_location2$Location),
  ",",
  fixed = TRUE
)
means_by_location2$Latitude <- vapply(
  lat_long,
  function(x) as.numeric(x[1]),
  numeric(1)
)
means_by_location2$Longitude <- vapply(
  lat_long,
  function(x) as.numeric(x[2]),
  numeric(1)
)

这就回答了你的第一个问题。


对于第二个问题,你需要更仔细地考虑。先看一下按位置绘制的鱼卵数量图。

library(ggplot2)
# Scatter of locations coloured by log10(mean eggs + 1); the +1 keeps
# locations with a mean of zero finite on the log scale.
p <- ggplot(
  means_by_location2,
  aes(x = Longitude, y = Latitude, colour = log10(Mean.eggs + 1))
) +
  geom_point() +
  scale_colour_gradient(low = "#FFFFFF", high = "#0000FF", space = "Lab")
# Evaluating the object on its own line displays the plot, as the wrapping
# parentheses did.
p

您是沿南北方向、沿东西方向,还是利用所有相邻点进行插值?有很多不同的可能性,它们可能给出不同的答案。要判断哪种插值方法最好并不容易。

于 2011-10-26T13:18:07.923 回答