借用 Peter Norvig 的 pytudes 来进行分词，请试试下面的代码：
import re
import math
import random
import matplotlib.pyplot as plt
from collections import Counter
from itertools import permutations
from typing import List, Tuple, Set, Dict, Callable
# IPython shell escape: download the dwyl english-words list into the working directory.
# NOTE(review): the corpus opened further down is 'big.txt', not this download — confirm which file is intended.
!wget https://raw.githubusercontent.com/dwyl/english-words/master/words.txt
Word = str # Type alias: a word is represented as a plain string
cat = ''.join # Concatenate an iterable of strings into a single string
def tokens(text) -> List[Word]:
    """Return every word token in `text`, normalized to lowercase.

    A token is a maximal run of the ASCII letters a-z (matched after the text
    has been lowercased); digits, punctuation and whitespace act as separators.
    """
    lowered = text.lower()
    return re.findall(r'[a-z]+', lowered)
# Read the whole training corpus, then tokenize it. The context manager
# closes the file handle as soon as the text has been read.
with open('big.txt') as _corpus_file:
    TEXT = _corpus_file.read()
WORDS = tokens(TEXT)
class ProbabilityFunction:
    """Mixin that makes a mapping of counts callable as a probability function.

    The host class must provide `values()` and item lookup by outcome
    (for example `collections.Counter`).
    """

    def __call__(self, outcome):
        """Return the probability of `outcome`: its count divided by the total count.

        The total is computed on the first call and cached on the instance as
        `self.total`; counts modified afterwards are not reflected in the
        cached value.
        """
        # Look in the instance dict rather than using hasattr(): since Python
        # 3.10 Counter defines a `total()` method, so hasattr(self, 'total')
        # is always true for Counter subclasses and the division below would
        # be attempted against a bound method. The cached number deliberately
        # keeps the attribute name `total` (shadowing that method on this
        # instance) so existing readers of `.total` keep working.
        if 'total' not in vars(self):
            self.total = sum(self.values())
        return self[outcome] / self.total
class Bag(Counter, ProbabilityFunction):
    """A multiset of words: counts occurrences and, when called, gives a word's probability."""
# Unigram model: Pword(w) is the count of w in WORDS divided by the total number of tokens.
Pword = Bag(WORDS)
def Pwords(words: List[Word]) -> float:
    """Return the probability of a sequence of words under an independence assumption.

    Each word is scored on its own with `Pword` and the scores are multiplied
    together, so an empty sequence has probability 1.
    """
    return Π(Pword(w) for w in words)
def Π(nums) -> float:
    """Multiply the numbers in `nums` together (like `sum`, but with multiplication).

    Accepts any iterable, including a generator. The product of an empty
    iterable is 1.
    """
    result = 1
    for num in nums:
        result *= num
    return result
def splits(text, start=0, end=20) -> List[Tuple[str, str]]:
    """Return all (first, rest) pairs that split `text` in two.

    `first + rest == text` for every pair, and the pairs are ordered by
    increasing length of `first`, where
    start <= len(first) <= min(len(text), end).
    """
    # `end` caps the length of `first`, but never beyond the text itself.
    longest_first = min(len(text), end)
    return [(text[:i], text[i:])
            for i in range(start, longest_first + 1)]
def segment(text) -> List[Word]:
    """Return a list of words that is the most probable segmentation of `text`.

    Candidate first words come from `splits(..., 1)`, so a first word is at
    least one character long and no longer than that function's default `end`.
    Segmentations are scored with `Pwords`; on ties the candidate with the
    shortest first word wins (the first maximum in generation order).

    The naive recursion re-solves the same suffix again and again, which is
    exponential in len(text). Here every suffix is solved exactly once, from
    the end of the text backwards, with the same candidate order and the same
    scoring, so the result is the same.
    """
    if not text:
        return []
    # best[i] is the most probable segmentation of the suffix text[i:].
    best = {len(text): []}
    for i in range(len(text) - 1, -1, -1):
        candidates = ([first] + best[i + len(first)]
                      for (first, _rest) in splits(text[i:], 1))
        best[i] = max(candidates, key=Pwords)
    return best[0]
# Demo: segment a few run-together strings, lowercasing each one first.
strings = ['thatCreation', 'happeningso', 'comebecause']
[segment(lowered) for lowered in map(str.lower, strings)]
--2020-08-04 18:48:06-- https://raw.githubusercontent.com/dwyl/english-words/master/words.txt
正在解析 raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
正在连接 raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... 已连接。
HTTP 请求已发送，等待响应... 200 OK
长度：4863005 (4.6M) [text/plain]
保存到：'words.txt.2'
words.txt.2 100%[====================>] 4.64M 162KB/s 25s
2020-08-04 18:48:31 (192 KB/s) - 'words.txt.2' 已保存 [4863005/4863005]
[['that', 'creation'], ['happening', 'so'], ['come', 'because']]