python - 如何验证python中的单词？

Question

我在 Python 中有一个这样的列表：

list = ['thatCreation', 'happeningso', '’', 'comebecause',]

问题：

我想要具体的词：

For e.g. -> 'thatCreation' -> 'that', 'creation'
            'happeningso' -> 'happening', 'so'
            'comebecause' -> 'come', 'because'

提前感谢您在 python 中解决它。

score 1 · Accepted Answer

看起来您是想把以驼峰式大小写（或直接）连写在一起的单词拆分开。有一个很棒的算法叫 Viterbi，可以很好地做到这一点。

我无法解释它背后的魔力，但我最近在我的程序中实现了它并且效果非常好。我的理解是它计算每个单词的概率并对其进行拆分。该算法在任何情况下都可以拆分单词。

import re
from collections import Counter


def word_prob(word):
    """Return the relative frequency of `word` in the loaded word list.

    Words absent from `dictionary` have a count of 0 (Counter semantics),
    so their probability is 0.0.
    """
    return dictionary[word] / total


def words(text):
    """Return all runs of the letters a-z found in `text` after lowercasing."""
    return re.findall('[a-z]+', text.lower())


# NOTE: `words_path` is not defined in this snippet; it must be set beforehand
# to the path of a plain-text word list or corpus.
# Use a context manager so the file handle is closed once it has been read.
with open(words_path) as corpus:
    dictionary = Counter(words(corpus.read()))
# Longest known word: bounds how far back viterbi_segment has to look.
max_word_length = max(map(len, dictionary))
# float() keeps word_prob a true division even under Python 2.
total = float(sum(dictionary.values()))

def viterbi_segment(text):
    """Split `text` into the most probable sequence of words.

    Dynamic programming over prefixes: for every prefix length the best
    probability and the start index of its last word are recorded, scoring
    candidate words with the module-level `word_prob` and looking back at
    most `max_word_length` characters.

    Returns a tuple `(words, probability)`: the list of word strings in
    order and the probability of that segmentation. Empty text gives
    `([], 1.0)`.
    """
    length = len(text)
    best_prob = [1.0]  # best_prob[n]: best probability for text[:n]
    back = [0]         # back[n]: where the last word of text[:n] starts
    for end in range(1, length + 1):
        earliest = max(0, end - max_word_length)
        # Tuples compare by probability first, then by start index.
        score, start = max(
            (best_prob[cut] * word_prob(text[cut:end]), cut)
            for cut in range(earliest, end)
        )
        best_prob.append(score)
        back.append(start)

    # Follow the back-pointers from the end; pieces come out last-word-first.
    pieces = []
    end = length
    while end > 0:
        start = back[end]
        pieces.append(text[start:end])
        end = start
    return pieces[::-1], best_prob[-1]

import re

# Segment the lowercased input and show the words separated by spaces.
sentence = ' '.join(viterbi_segment('thatCreation'.lower())[0])
print('sentence: {0}'.format(sentence))
# Rebuild a camelCase identifier: split on every non-alphanumeric character
# (the capture group keeps the separators, which isalnum() then filters out),
# capitalize each word and join them. `re.split` is required here; a bare
# `split` is not defined.
word = ''.join(a.capitalize() for a in re.split('([^a-zA-Z0-9])', sentence)
               if a.isalnum())
# Slicing instead of word[0] avoids an IndexError when no word was produced.
print('word: {0}'.format(word[:1].lower() + word[1:]))

你需要一个包含大量单词的词表；可用的词表有很多，我使用的是： https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-no-swears.txt

并用它没有的新词更新它。

score 0 · Accepted Answer

借用 Peter Norvig 的 pytudes 来进行分词。请试试..

import math
import random
import re
from collections import Counter
from functools   import lru_cache
from itertools   import permutations
from typing      import List, Tuple, Set, Dict, Callable

import matplotlib.pyplot as plt

!wget https://raw.githubusercontent.com/dwyl/english-words/master/words.txt

Word = str     # Words are represented as plain Python strings
cat = ''.join  # Concatenate an iterable of strings into a single string


def tokens(text) -> List[Word]:
    """Return every maximal run of the letters a-z in `text`, in order, after lowercasing it."""
    lowered = text.lower()
    return [match.group() for match in re.finditer('[a-z]+', lowered)]

# Read the whole corpus once; the context manager closes the file handle
# instead of leaving it open until garbage collection.
# NOTE(review): the wget cell above downloads words.txt, while this reads
# big.txt -- confirm which corpus file is intended.
with open('big.txt') as corpus_file:
    TEXT = corpus_file.read()
# Lowercased word tokens of the corpus, in order of appearance.
WORDS = tokens(TEXT)


class ProbabilityFunction:
    """Mixin that makes a mapping of counts callable as a probability function.

    It relies on the host class providing `values()` and item lookup, as
    `collections.Counter` does.
    """

    def __call__(self, outcome):
        """Return the probability of `outcome`: its count divided by the total count.

        The total is computed on the first call and cached on the instance,
        so counts added after that first call are not reflected.
        Raises ZeroDivisionError when the total count is zero.
        """
        # Cache under a private name. Since Python 3.10 `Counter` has its own
        # `total()` method, so testing for an attribute called `total` is
        # always true on a Counter subclass: the cache would never be filled
        # and the division below would be by a bound method (TypeError).
        try:
            total = self._total
        except AttributeError:
            total = self._total = sum(self.values())
        return self[outcome] / total


class Bag(Counter, ProbabilityFunction):
    """A bag of words."""
    

# Word-frequency model: Pword[w] is the count of w in WORDS, and calling
# Pword(w) gives that count divided by the total number of counted words.
Pword = Bag(WORDS)


def Pwords(words: List[Word]) -> float:
    "Probability of a word sequence, treating the words as independent: the product of each word's Pword."
    word_probabilities = map(Pword, words)
    return Π(word_probabilities)

def Π(nums) -> float:
    "Return the product of all `nums` (the multiplicative counterpart of `sum`); 1 when `nums` is empty."
    return math.prod(nums)

def splits(text, start=0, end=20) -> List[Tuple[str, str]]:
    """Return a list of all (first, rest) pairs with start <= len(first) <= end.

    `first + rest` always equals `text`. The length of `first` is also capped
    at `len(text)`, so the last pair of a short text is `(text, '')`.
    """
    longest_first = min(len(text), end)
    return [(text[:i], text[i:])
            for i in range(start, longest_first + 1)]

@lru_cache(maxsize=10000)
def _segment_cached(text) -> Tuple[Word, ...]:
    """Most probable segmentation of `text` as a tuple of words (memoized)."""
    if not text:
        return ()
    # Try every non-empty first word and pair it with the best segmentation
    # of the remainder; on equal probability max() keeps the first candidate.
    candidates = ((first,) + _segment_cached(rest)
                  for (first, rest) in splits(text, 1))
    return max(candidates, key=Pwords)


def segment(text) -> List[Word]:
    """Return a list of words that is the most probable segmentation of text.

    Candidate segmentations are scored with `Pwords`; an empty text gives [].

    The same suffixes are reached through many different prefixes, so
    results are memoized per text; without that the recursion re-solves each
    suffix and takes exponential time in len(text). The cache is keyed on
    the text only: call `_segment_cached.cache_clear()` if the word model
    behind `Pwords` changes.
    """
    # Cached results are immutable tuples; hand every caller a fresh list.
    return list(_segment_cached(text))

# Sample inputs: words glued together with or without camelCase.
strings = ['thatCreation', 'happeningso', 'comebecause']
# Lowercase each input, then segment it; the resulting list of word lists is
# the value of this expression (displayed when run as a notebook cell).
list(map(segment, map(str.lower, strings)))

--2020-08-04 18:48:06-- https://raw.githubusercontent.com/dwyl/english-words/master/words.txt 正在解析主机 raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ... 正在连接 raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... 已连接。HTTP 请求已发送，正在等待响应... 200 OK 长度：4863005 (4.6M) [text/plain] 正在保存至：'words.txt.2'

words.txt.2 100%[====================>] 4.64M 162KB/s 25s

2020-08-04 18:48:31 (192 KB/s) - 'words.txt.2' 已保存 [4863005/4863005]

[['that', 'creation'], ['happening', 'so'], ['come', 'because']]

score 0 · Accepted Answer

import re
from collections import Counter

def viterbi_segment(text):
    """Find the most probable way to split `text` into words.

    For each prefix text[:i] the function records the best achievable
    probability and the start index of the last word of that best split.
    Candidate words are scored with the module-level `word_prob`, and only
    candidates of at most `max_word_length` characters are considered.

    Returns a tuple `(words, probability)`: the list of words in order and
    the probability of that segmentation. Empty text gives `([], 1.0)`.
    """
    probs, lasts = [1.0], [0]
    for i in range(1, len(text) + 1):
        # Best (probability, start-of-last-word) pair for the prefix text[:i].
        prob_k, k = max((probs[j] * word_prob(text[j:i]), j)
                        for j in range(max(0, i - max_word_length), i))
        # These appends must run once per prefix, inside the loop: the next
        # iteration reads probs[j] for every j < i.
        probs.append(prob_k)
        lasts.append(k)
    # Walk the back-pointers from the end of the text to recover the words.
    words = []
    i = len(text)
    while 0 < i:
        words.append(text[lasts[i]:i])
        i = lasts[i]
    words.reverse()
    return words, probs[-1]
    

def word_prob(word):
    """Return the relative frequency of `word` in the corpus.

    Words that never occur have a count of 0 (Counter semantics), so their
    probability is 0.0.
    """
    return dictionary[word] / total


def words(text):
    """Return all runs of the letters a-z found in `text` after lowercasing."""
    return re.findall('[a-z]+', text.lower())


# Use a context manager so the corpus file is closed once it has been read.
with open('big.txt') as corpus:
    dictionary = Counter(words(corpus.read()))
# Longest known word: bounds how far back viterbi_segment has to look.
max_word_length = max(map(len, dictionary))
# float() keeps word_prob a true division even under Python 2.
total = float(sum(dictionary.values()))
l = ['thatCreation', 'happeningso', 'comebecause',]

# The corpus words are all lowercase, so lowercase each input before segmenting.
for w in l:
    print(viterbi_segment(w.lower()))

O/p will be - 
(['that', 'creation'], 1.63869514118246e-07)
(['happening', 'so'], 1.1607123777400279e-07)
(['come', 'because'], 4.81658105705814e-07)

我从@Darius Bacon 那里得到了我的问题的解决方案，为此，您需要将所有字符串都设为小写字符串。谢谢你们的帮助。

访问此链接以下载 big.txt： https://norvig.com/big.txt

python - 如何验证python中的单词？

3 回答 3

Related

Reference