5

我有一个数据集,其中充满了不适当间隔的句子。我试图想出一种方法来删除一些空格。

我从一个转换为单词数据框的句子开始:

> word5 <- "hotter the doghou se would be bec ause the co lor was diffe rent"
> abc1 <- data.frame(filler1 = 1,words1=factor(unlist(strsplit(word5, split=" "))))
> abc1
   filler1 words1
1        1 hotter
2        1    the
3        1 doghou
4        1     se
5        1  would
6        1     be
7        1    bec
8        1   ause
9        1    the
10       1     co
11       1    lor
12       1    was
13       1  diffe
14       1   rent

接下来,我使用以下代码尝试拼写检查和组合单词,这些单词是它们之前或之后的单词的组合:

abc2 <- abc1
i <- 1
while(i < nrow(abc1)){
  print(abc2)
  if(nrow(aspell(abc1$words1[i])) == 0){
    print(paste(i,"Words OK",sep=" | "));flush.console() 
    i <- i + 1
  }
 else{
  if(nrow(aspell(abc1$words1[i])) > 0 & i != 1){
    preWord1 <- abc1$words1[i-1]
    postWord1 <- abc1$words1[i+1]
    badWord1 <- abc1$words1[i]
    newWord1 <- factor(paste(preWord1,badWord1,sep=""))
    newWord2 <- factor(paste(badWord1,postWord1,sep=""))

    if(nrow(aspell(newWord1)) == 0 & nrow(aspell(newWord2)) != 0){
      abc2[i,"words1"] <-as.character(newWord1)
      abc2 <- abc2[-c(i+1),]
      print(paste(i,"word1",sep=" | "));flush.console()
      i <- i + 1
    }

    if(nrow(aspell(newWord1)) != 0 & nrow(aspell(newWord2)) == 0){
      abc2[i ,"words1"] <-as.character(newWord2)
      abc2 <- abc2[-c(i-1),]
      print(paste(i,"word2",sep=" | "));flush.console()
      i <- i + 1
    }

  }
}
}

在玩了一段时间后,我得出的结论是我需要某种类型的迭代器,但不确定如何在 R 中实现它。有什么建议吗?

4

2 回答 2

10

注意:我想出了一个完全不同的更好的解决方案,因为它规避了之前解决方案的所有缺点。但我仍然想保留旧的解决方案。因此,我将其添加为新答案,如果我这样做错了,请纠正我。

在这种方法中,我重新格式化了数据集。基础就是我所说的 wordpair 对象。例如:

> word5
[1] "hotter the doghou se would be bec ause the col or was diffe rent"

看起来像:

> abc1_pairs
    word1  word2
1  hotter    the
2     the doghou
3  doghou     se
4      se  would
5   would     be
6      be    bec
7     bec   ause
8    ause    the
9     the    col
10    col     or
11     or    was
12    was  diffe
13  diffe   rent

接下来,我们遍历单词对并查看它们本身是否是有效单词,递归执行此操作直到找不到有效的新单词(请注意,本文底部列出了一些附加函数):

# Recursively delete wordpairs which lead to a correct word
merge_wordpairs = function(wordpairs) {
  require(plyr)
  merged_pairs = as.character(mlply(wordpairs, merge_word))
  correct_words_idxs = which(sapply(merged_pairs, word_correct))
  if(length(correct_words_idxs) == 0) {
    return(wordpairs)
  } else {
    message(sprintf("Number of words about to be merged in this pass: %s", length(correct_words_idxs)))
    for(idx in correct_words_idxs) {
      wordpairs = merge_specific_pair(wordpairs, idx, delete_pair = FALSE)
    }
    return(merge_wordpairs(wordpairs[-correct_words_idxs,])) # recursive call
  }
}

应用于示例数据集,这将导致:

> word5 <- "hotter the doghou se would be bec ause the col or was diffe rent"
> abc1 = strsplit(word5, split = " ")[[1]]
> abc1_pairs = wordlist2wordpairs(abc1)
> abc1_pairs
    word1  word2
1  hotter    the
2     the doghou
3  doghou     se
4      se  would
5   would     be
6      be    bec
7     bec   ause
8    ause    the
9     the    col
10    col     or
11     or    was
12    was  diffe
13  diffe   rent
> abc1_merged_pairs = merge_wordpairs(abc1_pairs)
Number of words about to be merged in this pass: 4
> merged_sentence = paste(wordpairs2wordlist(abc1_merged_pairs), collapse = " ")
> c(word5, merged_sentence)
[1] "hotter the doghou se would be bec ause the col or was diffe rent"
[2] "hotter the doghouse would be because the color was different"    

需要的附加功能:

# A bunch of functions
# Data transformation
wordlist2wordpairs = function(word_list) {
  require(plyr)
  wordpairs = ldply(seq_len(length(word_list) - 1), 
                    function(idx) 
                      return(c(word_list[idx], 
                               word_list[idx+1])))
  names(wordpairs) = c("word1", "word2")
  return(wordpairs)
}
wordpairs2wordlist = function(wordpairs) {
  return(c(wordpairs[[1]], wordpairs[[2]][nrow(wordpairs)]))
}

# Some checking functions
# Is the word correct?
word_correct = function(word) return(nrow(aspell(factor(word))) == 0)
# Merge two words
merge_word = function(word1, word2) return(paste(word1, word2, sep = ""))

# Merge a specific pair, option to postpone deletion of pair
merge_specific_pair = function(wordpairs, idx, delete_pair = TRUE) {
  # merge pair into word
  merged_word = do.call("merge_word", wordpairs[idx,])
  # assign the pair to the idx above
  if(!(idx == 1)) wordpairs[idx - 1, "word2"] = merged_word
  if(!(idx == nrow(wordpairs))) wordpairs[idx + 1, "word1"] = merged_word
  # assign the pair to the index below (if not last one)
  if(delete_pair) wordpairs = wordpairs[-idx,]
  return(wordpairs)
}
于 2012-08-13T19:17:29.973 回答
3

你可以做的是使用递归。下面的代码对您的示例进行了稍微修改的版本。它检查所有单词是否正确,如果正确,则返回单词列表。如果不是,它会尝试将该单词与前面的单词以及后面的单词结合起来。如果前面单词的合并是正确的,这将导致一个看起来像 的合并paste(word_before, word, word_after)。在尝试合并之后,在新单词列表上调用合并单词的函数。这种递归一直持续到没有留下错误的单词为止。

# Wrap the spell checking in a function, makes your code much more readable
word_correct = function(word) return(nrow(aspell(factor(word))) == 0)
# Merge two words
merge_word = function(word1, word2) return(paste(word1, word2, sep = ""))
# Merge two words and replace in list
merge_words_in_list = function(word_list, idx1, idx2) {
  word_list[idx1] = merge_word(word_list[idx1], word_list[idx2])
  return(word_list[-idx2])
}
# Function that recursively combines words 
combine_words = function(word_list) {
  message("Current sentence: ", paste(word_list, collapse = " "))
  words_ok = sapply(word_list, word_correct)
  if(all(words_ok)) {
    return(word_list) 
  } else {
    first_wrong_word = which(!words_ok)[1]
    combination_before = merge_word(word_list[first_wrong_word], 
                                    word_list[first_wrong_word-1])
    if(word_correct(combination_before)) {
      word_list = merge_words_in_list(word_list, first_wrong_word-1, 
                                      first_wrong_word)
    }
    combination_after = merge_word(word_list[first_wrong_word], 
                                   word_list[first_wrong_word+1])
    if(word_correct(combination_after)) {
      word_list = merge_words_in_list(word_list, first_wrong_word, 
                                      first_wrong_word+1)
    }
    return(combine_words(word_list))  # Recursive call
  }
}

将这组函数应用于(稍作修改)版本的句子:

word5 <- "hotter the doghou se would be bec ause the col or was diffe rent"
abc1 = strsplit(word5, split = " ")[[1]]
combine_words(abc1)
Current sentence: hotter the doghou se would be bec ause the col or was diffe rent
Current sentence: hotter the doghouse would be bec ause the col or was diffe rent
Current sentence: hotter the doghouse would be because the col or was diffe rent
Current sentence: hotter the doghouse would be because the col or was different

一些问题:

  • 仍然存在的问题是,如果两者combination_beforecombination_after无效,程序会陷入无限递归。程序仅在所有单词都有效时停止。
  • 如果两者都与前一个词合并,并且下一个词是有效的,我们该怎么办?
  • 代码只合并错误的单词,例如'col' 和'or' 被认为是好词,aspell而你可能想要合并。这带来了一个新的挑战:在这种情况下,合并是显而易见的,但在大型数据集中,如何组合一组本身正确的单词可能并不明显。

但尽管如此,我认为这个例子很好地说明了一种递归方法。

于 2012-08-13T10:17:27.430 回答