2

我有一些解决这个问题的想法,但我希望大师们能想出更好的东西。我向 Mechanical Turk 提​​交了一堆行输入。我需要表格中的一行,并且我有一个字段,我要求他们在其中键入以逗号分隔的行的值。然后在 RI 中进行了 strsplit,我现在正在比较多个 Turkers 条目的结果。

一个常见的模式是一个 Turker 将错过一个条目,将其余条目丢掉一个。因此,挑战在于知道将缺失值放在哪里。假设他们只会错过输入一个条目(我有错误检查代码来确认这一点),并且我可能从每个表行中获得了多达 3 个重复(因此可能有 1-2 个适当长度,并且1-2 太短了。条目大约是下面的大小,我只有大约 50 个,所以计算效率不是最重要的。假设最长的条目是适当的长度。

这是一个这样的行的示例(存储为列表,每个元素都是不同 Turker 的复制):

tt <- list(structure(c(4, 4, 5, 7, 9, 13, 15, 18, 20, 22, 24, 
27, 30, 32, 35, 37, 41, 43, 46, 48, 51, 54, 57, 60, 63), .Dim = c(25L, 
1L)), structure(c(4, 4, 5, 7, 9, 11, 13, 15, 18, 20, 22, 25, 
27, 30, 32, 35, 37, 40, 43, 46, 48, 51, 54, 57, 60, 63), .Dim = c(26L, 
1L)), structure(c(4, 4, 5, 7, 9, 11, 13, 15, 19, 20, 22, 25, 
27, 30, 32, 35, 37, 42, 43, 46, 48, 51, 54, 57, 61, 63), .Dim = c(26L, 
1L)))

lengths <- sapply(tt,length)
longs <- simplify2array(tt[lengths==max(lengths)],FALSE)
shorts <- simplify2array(tt[lengths==max(lengths)-1],FALSE)

我考虑过的算法是:

  • max(lengths)在每个可能的地方用 NA创建排列,并使用对总偏差的一些估计将它们同时与适当长度的 1-2 个进行比较。
  • 循环遍历每个元素并与适当长度的 1-2 个元素进行比较,直到找到不完全匹配的元素。然后决定与 NA 的所有后续差异相比差异有多大。例如,如果它们匹配到第 5 个条目,但将 NA 放在第 5 个条目中仍然使其余部分的差异超过第 5 个条目的差异,则继续向下移动向量。

好奇每个人将如何实现这一点。我很难避免循环并以优雅的方式编写它。可能类似的东西filter可能会有所帮助。

有问题的输入和期望输出的示例

有问题的输入(缺少一个值;其他值没有拼写错误)

> tt1 <- list(c(4, 4, 7, 9, 11), c(4, 4, 5, 7, 9, 11), c(4, 4, 5, 7, 9, 
11))
> tt1
[[1]]
[1]  4  4  7  9 11

[[2]]
[1]  4  4  5  7  9 11

[[3]]
[1]  4  4  5  7  9 11

期望的输出

> tt1
  [,1] [,2] [,3]
1    4    4    4
2    4    4    4
3   NA    5    5
4    7    7    7
5    9    9    9
6   11   11   11

有问题的输入(缺失值 + 另一个值的错字)

> tt2 <- list(c(4, 4, 7, 9, 11), c(4, 3, 5, 7, 9, 11), c(4, 4, 5, 7, 9, 
11))
> tt2
[[1]]
[1]  4  4  7  9 11

[[2]]
[1]  4  3  5  7  9 11

[[3]]
[1]  4  4  5  7  9 11

期望的输出

> tt2[[1]][4:6] <- tt2[[1]][3:5]
> tt2[[1]][3] <- NA
> simplify2array(tt2,FALSE)
     [,1] [,2] [,3]
[1,]    4    4    4
[2,]    4    3    4
[3,]   NA    5    5
[4,]    7    7    7
[5,]    9    9    9
[6,]   11   11   11

应该优雅地容忍其他类型的拼写错误。请注意,向量通常会增加(您可以将它们视为随着噪声单调增加)。因此,如果有人将 7 误认为 4,那可能是一个错字。另请注意,对于大多数情况,我只进行了 2 次重复,因此没有任何方法可以让一个非缺失值比任何其他非缺失值更可信。将不得不查看整个模式,或者至少利用它们普遍增加的事实。

完整的数据框

上面的每个 tt 示例都是下面 data.frame 中给定英尺图像级别的所有 TotalTime 条目。这是整个数据集。请注意,组之间的条目总数可能会发生变化image。这个值是预先知道的,或者你可以从条目的最大值中得到它。

dat <- structure(list(feet = c(1, 2, 3, 3, 1, 1, 7, 7, 8, 9, 9, 1, 1, 
2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 
6, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 6, 6, 7, 7, 8, 8, 9, 10, 10
), TotalTime = c("4,3,4,6,6,10,12,14,16,18,20,22,25,28,30,32,34,36,41,44,46,49,51,55,58", 
"4,4,5,7,9,11,13,15,18,20,22,25,27,30,32,35,37,41,43,46,48,51,54,57,60,63", 
"3,4,6,8,11,13,15,17,20,22,25,27,32,34,38,39,41,44,47,49,52,55,58,61,64,67", 
"3,4,6,8,11,13,15,17,20,22,25,27,32,34,36,39,41,44,47,49,52,55,58,61,64,67", 
"4,3,4,6,8,20,22,24,26,28,30,31,34,36,38,40,42,44,46,48,50,52,54,56,58,60", 
"4,3,4,6,8,10,12,14,16,18,20,22,25,28,30,32,34,38,41,44,46,49,51,55,58", 
"4,4,4,7,10,15,18,21,24,29,32,35,38,43,47,52,56,60,63,67,72,76,82,84", 
"4,4,4,7,10,15,18,21,24,29,32,35,38,43.47,52,56,60,63,67,72,76,82,84", 
"4,3,5,8,14,16,20,24,27,31,34,37,42,46,49,55,59,64,68,73,77,83,89,91", 
"4,4,6,9,15,18,22,25,28,32,36,40,44,49,53,59,63,68,74,80,85,93,94", 
"4,4,6,9,15,18,22,25,28,32,36,40,44,49,53,59,63,68,74,80,85,88,93,94", 
"4,3,4,6,8,10,12,14,16,18,20,22,25,28,30,32,34,36,41,44,46,49,51,55,58", 
"4,3,4,6,8,10,12,14,16,18,20,22,25,28,30,32,34,36,38,41,44,46,49,51,55,58", 
"4,4,5,7,9,11,13,15,18,20,22,25,27,31,32,35,37,41,43,46,48,51,54,57,60,63", 
"4,4,5,7,9,11,13,15,18,20,22,25,27,30,32,35,37,41,43,46,48,51,54,57,60,63", 
"3,4,6,8,11,13,15,17,20,22,25,27,32,34,38,39,41,44,47,49,52,55,58,61,64,67", 
"3,4,6,8,11,13,15,17,20,22,25,27,32,34,36,39,41,44,47,49,52,55,58,61,64,67", 
"3,5,7,9,12,14,16,19,22,24,29,31,34,36,38,41,44,47,50,53,58,61,64,67,69,72", 
"3,5,7,9,12,14,16,19,22,24,29,31,34,36,38,41,44,47,50,53,58,61,64,67,69,72", 
"4,6,8,11,13,15,19,21,25,28,30,33,36,38,41,44,49,52,55,58,61,65,68,71,75,79", 
"4,6,8,11,13,15,19,21,25,28,30,33,36,38,41,44,49,52,55,58,61,65,68,71,75,79", 
"4,6,9,11,14,17,21,24,27,30,33,35,38,42,45,49,52,55,58,63,67,70,73,78,82,85", 
"4,6,9,11,14,17,21,24,27,30,33,35,36,42,45,49,52,55,58,63,67,70,73,78,82,85", 
"2,4,6,9,11,13,16,16,20,23,24,26,28,29,31,33,35,37,39,40,42,43,45,47,52", 
"2,4,6,9,11,13,16,18,20,21,23,24,26,28,29,31,33,35,37,39,40,42,43,45,47,52", 
"2,5,7,11,12,14,17,19,21,22,24,26,28,29,31,35,36,39,41,42,44,46,48,50,52,54", 
"2,5,7,11,12,14,17,19,21,22,24,26,28,29,31,35,36,39,41,42,44,46,48,50,52,54", 
"4,6,9,11,13,16,18,20,22,24,27,29,31,32,35,37,39,41,43,45,46,49,51,53,55,57", 
"4,6,9,11,13,16,18,20,22,24,27,29,31,32,35,37,39,41,43,45,46,49,51,53,55,57", 
"6,7,10,13,15,18,20,23,24,28,30,32,34,37,39,41,43,45,47,49,54,57,59,61,63", 
"6,7,10,13,15,18,20,23,24,26,28,30,32,34,37,39,41,43,45,47,49,54,57,59,61,63", 
"6,8,10,14,16,19,21,23,25,28,30,32,36,39,41,43,45,47,49,52,54,57,59,61,63,65", 
"6,8,10,14,16,19,21,23,25,28,30,32,36,39,41,43,45,47,49,52,54,57,59,61,63,65", 
"7,9,12,14,18,20,23,24,27,31,33,35,38,40,43,45,47,49,51,55,58,60,62,65,67,69", 
"7,9,12,14,18,20,23,24,27,31,33,35,38,40,43,45,47,49,51,55,58,60,62,65,67,69", 
"4,3,5,7,10,13,17,20,23,26,29,33,36,40,43,48,51,55,60,64,67,72,75,77", 
"4,3,5,7,10,13,17,20,23,26,29,33,36,40,43,48,51,55,60,64,67,72,75,77", 
"4,4,4,7,10,15,18,21,24,29,32,35,38,43,47,52,56,60,63,67,72,76,82,84", 
"4,4,4,7,10,15,18,21,24,29,32,35,38,43,47,52,56,60,63,67,72,76,82,84", 
"4,3,5,8,14,16,20,24,27,31,34,37,42,46,49,55,59,64,68,73,77,83,89,91", 
"4,3,5,8,14,16,20,24,27,31,34,37,42,46,49,55,59,64,68,73,77,83,89,91", 
"4,4,6,9,15,18,22,25,28,32,36,40,44,49,53,59,63,68,74,80,85,88,93,94", 
"4,4,6,9,15,18,22,25,28,32,36,40,44,49,53,59,63,68,74,80,85,88,93,94", 
"0,0,0,1,1,1,3,3,3,5,5,5,6,6,7,7,8,8,9,10,11,10,11,11", "0,0,0,1,1,1,3,3,3,5,5,6,6,7,7,8,8,9,10,11,10,11,11", 
"6,4,7,10,13,16,20,22,25,27,30,32,35,38,43,45,48,52,54,57,60,62,64,67", 
"6,4,7,10,13,16,20,22,25,27,30,32,35,38,43,45,48,52,54,57,60,62,64,67", 
"6,4,7,10,14,19,21,23,26,28,33,36,39,42,45,47,50,53,56,60,62,65,69,70", 
"6,4,7,10,14,19,21,23,26,28,33,36,39,42,45,47,50,53,56,60,62,65,69,70", 
"2,5,9,12,14,20,21,24,29,32,34,37,41,44,46,50,53,59,62,65,68,72,75,76", 
"2,5,9,12,14,20,21,24,29,32,34,37,41,44,46,50,53,59,62,65,68,72,75,76", 
"2,5,9,13,17,20,24,27,30,33,37,42,45,48,52,55,58,62,65,67,72,75,78,80", 
"3,6,10,15,18,23,25,26,28,32,36,40,43,47,50,53,58,61,65,67,70,75,78,83,86", 
"3,6,10,15,18,23,25,28,32,36,40,43,47,50,53,58,61,65,67,70,75,78,83,86"
), image = c(1, 1, 1, 1, 3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4)), .Names = c("feet", 
"TotalTime", "image"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 11L, 14L, 15L, 16L, 17L, 19L, 20L, 22L, 23L, 
24L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 
38L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 49L, 50L, 51L, 53L, 
54L, 55L, 56L, 57L, 58L, 59L, 61L, 62L, 63L), class = "data.frame")
4

2 回答 2

1

我希望这个能帮上忙:

f <- function(tt) {
  len <- (sapply(tt, length))
  tar <- rowMeans(do.call("cbind", tt[len == max(len)]))
  tt[len < max(len)] <- 
    lapply(tt[len < max(len)],
      function(x) {
        r <- lapply(combn(max(len), max(len)-length(x)),
          function(i) {z <- numeric(max(len)); z[i] <- NA; z[!is.na(z)] <- x; z})
        r[[which.min(sapply(r, function(x) sum((x - tar)^2, na.rm = T)))]]
    })
  simplify2array(tt,FALSE)
}

然后,

> f(tt)
      [,1] [,2] [,3]
 [1,]    4    4    4
 [2,]    3    4    4
 [3,]    4    5    5
... snip ...
[24,]   55   57   57
[25,]   58   60   61
[26,]   NA   63   63

> f(tt1)
     [,1] [,2] [,3]
[1,]    4    4    4
[2,]    4    4    4
[3,]   NA    5    5
[4,]    7    7    7
[5,]    9    9    9
[6,]   11   11   11

> f(tt2)
     [,1] [,2] [,3]
[1,]    4    4    4
[2,]    4    3    4
[3,]   NA    5    5
[4,]    7    7    7
[5,]    9    9    9
[6,]   11   11   11

这是您完整数据的示例:

dlply(dat, .(feet, image), function(x) f(lapply(strsplit(x$TotalTime, ","), as.numeric)))

看起来工作得很好。

于 2011-11-11T14:26:44.787 回答
1

这是一个旨在可读的解决方案。毫无疑问,它可以折叠成更少的代码行:

desiredLength <- function(x){
  len <- sapply(x, length)
  max(len)
}

insertNA <- function(x, position=1){
  c(x[seq_along(x) < position], NA, x[seq_along(x) >= position]) 
}

fixLength <- function(x, position=1){
  dlen <- desiredLength(x)
  sapply(x, function(zz) if(length(zz) < dlen) insertNA(zz, position) else zz)
}

objectiveFunction <- function(x){
  sum(apply(x, 1, function(z)length(unique(z))))
}

findMinObjective <- function(x){
  pos <- NA
  obj <- Inf
  for(i in 1:desiredLength(x)){
    z <- objectiveFunction(fixLength(x, position=i))
    if(z < obj){
      obj <- z
      pos <- i
    }
  }
  fixLength(x, pos)
}

结果:

> findMinObjective(tt1)
     [,1] [,2] [,3]
[1,]    4    4    4
[2,]    4    4    4
[3,]   NA    5    5
[4,]    7    7    7
[5,]    9    9    9
[6,]   11   11   11

> findMinObjective(tt2)
     [,1] [,2] [,3]
[1,]    4    4    4
[2,]    4    3    4
[3,]   NA    5    5
[4,]    7    7    7
[5,]    9    9    9
[6,]   11   11   11
于 2011-11-11T14:33:23.050 回答