1

我想用另一个数据集(“obs”)划分出的区间(类别)来统计数据集“sim”的频率表(两者是同一类型的数据)。我尝试在 R 中使用 table() 函数,但得到的并不是“sim”在“obs”各区间内的频率。“sim”中可能有一些数据超出了“obs”定义的范围,我希望这些数据被忽略。有没有一种简单的方法来得到这样的频率表?

这是我的数据样本(向量):

  X  obs   sim
1 1 11.2  8.44
2 2 22.5 15.51
3 3 26.0 20.08
4 4 28.1 23.57
5 5 29.0 26.46
6 6 29.5 28.95
...etc...

我给你留下代码行:


# Set working directory
setwd("C:/Users/...")

# `vector` is a data frame with two sets of data, "obs" and "sim"
vector <- read.csv("vector.csv", fileEncoding = 'UTF-8-BOM')

# Divide the range of "obs" into intervals, using Sturges for number of classes.
# Passing a single number as `breaks` lets cut() choose the cut points itself,
# so the boundaries are not kept in a variable that could be reused for "sim".
factor_obs <- cut(vector$obs, breaks=nclass.Sturges(vector$obs), include.lowest = T)

# Get a frequency table using the table() function for "obs",
# then add cumulative and relative frequency columns
obs_out <- as.data.frame(table(factor_obs))
obs_out <- transform(obs_out, cumFreq = cumsum(Freq), relative = prop.table(Freq))

# Get a frequency table using the table() function for "sim", using cut from "obs"
# NOTE(review): this cross-tabulates the "obs" classes against the logical
# vector `vector$sim > 0`; "sim" itself is never binned, so the resulting
# counts are simply the "obs" counts again.
sim_out <- as.data.frame(table(factor_obs, vector$sim > 0))

这是我从“obs”频率表中得到的:

> obs_out
   factor_obs Freq cumFreq   relative
1 [11.1,25.6]    2       2 0.04166667
2 (25.6,40.1]   10      12 0.20833333
3 (40.1,54.5]   17      29 0.35416667
4   (54.5,69]    4      33 0.08333333
5   (69,83.4]    8      41 0.16666667
6 (83.4,97.9]    5      46 0.10416667
7  (97.9,112]    2      48 0.04166667

这是我从“sim”频率表中得到的:

> sim_out
   factor_obs Var2 Freq
1 [11.1,25.6] TRUE    2
2 (25.6,40.1] TRUE   10
3 (40.1,54.5] TRUE   17
4   (54.5,69] TRUE    4
5   (69,83.4] TRUE    8
6 (83.4,97.9] TRUE    5
7  (97.9,112] TRUE    2

这与“obs”表中的频率相同。思路是统计“obs”类定义的每个区间中“sim”的元素,省略“obs”范围外的极值。

如果有人可以指导我,那将很有帮助。非常感谢!!

4

1 回答 1

1

您需要自己定义断点:如果只把区间个数交给 cut,让它自行计算断点,这些断点的值不会被保存下来,也就无法再用于 sim 变量。首先用 dput(vector) 把数据转换成可以直接在 R 中重建的形式:

# Reproducible copy of the example data in dput() form: a data frame with
# 48 rows and the columns X (row index 1..48), obs and sim.
vector <- structure(list(X = 1:48, obs = c(11.2, 22.5, 26, 28.1, 29, 29.5, 
30.8, 32, 33.5, 35, 35.5, 38.9, 41, 41, 41, 43, 43.51, 44, 46, 
48.5, 50, 50, 50, 50, 50.8, 51.5, 51.5, 53, 54.4, 55, 57.5, 59.5, 
66.9, 70.6, 74.2, 75, 77, 80.2, 81.5, 82, 83, 83.6, 85, 85.1, 
93.8, 94, 106.7, 112.3), sim = c(8.44, 15.51, 20.08, 23.57, 26.46, 
28.95, 31.16, 33.17, 35.02, 36.75, 38.37, 39.92, 41.39, 42.81, 
44.19, 45.52, 46.82, 48.09, 49.34, 50.56, 51.78, 52.98, 54.18, 
55.37, 56.55, 57.75, 58.94, 60.14, 61.36, 62.59, 63.83, 65.1, 
66.4, 67.74, 69.11, 70.53, 72.01, 73.55, 75.18, 76.9, 78.75, 
80.76, 82.98, 85.46, 88.35, 91.84, 96.41, 103.48)), class = "data.frame",
row.names = c(NA, -48L))

现在我们需要类别的数量和断点:

# Build the class boundaries from "obs" explicitly, so that the very same
# breaks can be reused afterwards to bin "sim".
nbreaks <- nclass.Sturges(vector$obs) # number of classes (Sturges' rule)
minval <- min(vector$obs)
maxval <- max(vector$obs)
# Class width, kept unrounded: rounding to a fixed number of decimals turns
# the width into 0 (duplicate breaks, which cut() rejects) when the range of
# the data is small compared with the rounding precision.
int <- (maxval - minval) / nbreaks
# nbreaks + 1 equally spaced boundaries running from minval to maxval.
# seq(..., length.out = ) also avoids seq(nbreaks - 1), which counts down
# (1, 0) instead of being empty when nbreaks is 1.
brks <- seq(minval, maxval, length.out = nbreaks + 1)

数据表obs

# Bin "obs" with the shared breaks, then tabulate the absolute, cumulative
# and relative frequency of every class.
factor_obs <- cut(vector$obs, breaks = brks, include.lowest = TRUE)
obs_out <- as.data.frame(table(factor_obs))
obs_out <- transform(
  obs_out,
  cumFreq = cumsum(Freq),
  relative = prop.table(Freq)
)
print(obs_out, digits = 3)
#    factor_obs Freq cumFreq relative
# 1 [11.2,25.6]    2       2   0.0417
# 2 (25.6,40.1]   10      12   0.2083
# 3 (40.1,54.5]   17      29   0.3542
# 4   (54.5,69]    4      33   0.0833
# 5   (69,83.4]    8      41   0.1667
# 6 (83.4,97.9]    5      46   0.1042
# 7  (97.9,112]    2      48   0.0417

现在sim数据:

# Bin "sim" with the breaks derived from "obs"; values outside those breaks
# become NA and are left out of the frequency table.
factor_sim <- cut(vector$sim, breaks = brks, include.lowest = TRUE)
sim_out <- as.data.frame(table(factor_sim))
sim_out <- transform(
  sim_out,
  cumFreq = cumsum(Freq),
  relative = prop.table(Freq)
)
print(sim_out, digits = 3)
#    factor_sim Freq cumFreq relative
# 1 [11.2,25.6]    3       3   0.0638
# 2 (25.6,40.1]    8      11   0.1702
# 3 (40.1,54.5]   11      22   0.2340
# 4   (54.5,69]   11      33   0.2340
# 5   (69,83.4]    9      42   0.1915
# 6 (83.4,97.9]    4      46   0.0851
# 7  (97.9,112]    1      47   0.0213

请注意,这里只统计了 47 个观测而不是 48 个,因为 sim 中有一个值(8.44)小于 obs 的最小值(11.2),落在所有区间之外,cut 将其记为 NA。

# Cross-tabulate the "obs" classes (rows) against the "sim" classes (columns).
# useNA = "ifany" adds an <NA> level only where missing values occur, and
# addmargins() appends the row and column totals.
addmargins(
  table(factor_obs, factor_sim, useNA = "ifany")
)
#              factor_sim
# factor_obs    [11.2,25.6] (25.6,40.1] (40.1,54.5] (54.5,69] (69,83.4] (83.4,97.9] (97.9,112] <NA> Sum
  # [11.2,25.6]           1           0           0         0         0           0          0    1   2
  # (25.6,40.1]           2           8           0         0         0           0          0    0  10
  # (40.1,54.5]           0           0          11         6         0           0          0    0  17
  # (54.5,69]             0           0           0         4         0           0          0    0   4
  # (69,83.4]             0           0           0         1         7           0          0    0   8
  # (83.4,97.9]           0           0           0         0         2           3          0    0   5
  # (97.9,112]            0           0           0         0         0           1          1    0   2
  # Sum                   3           8          11        11         9           4          1    1  48

于 2020-12-18T00:13:41.757 回答