python - Python：创建多个字母转换字典

Question

我和我的同学们正在尝试建立一个马尔可夫模型，来找出文本文件中字母转换（即从一个字符转移到下一个字符）的概率。在文本文件中，我们有一组单词“Steam, Teams, Meets, Teems, Eat, Ate, State, Tease, Test, Mast, Mates”。在代码中，我们在每个单词的第一个字母之前和最后一个字母之后各添加了一个空格。我们遇到的问题是如何编写一个函数，把字母转换按字母分别放入各自的字典中。例如，所有与 e 有关的转换（例如：“_e”、“ea”...等，_ 表示一个空格）都放入同一个字典，t、s、a 和 m 也各自有一个字典。

这是我们到目前为止的代码：

import random
import re

# Read the whole corpus and lower-case it. The `with` block guarantees the
# file handle is closed even if reading fails.
with open("markov.txt", 'r') as inFile:
    file = inFile.read().lower()

# Collapse every run of characters other than a-z, space and apostrophe into
# a single space. The pattern is a raw string (no invalid "\ " escape) and the
# stray "[" that used to sit inside the character class is gone, so literal
# "[" characters are now stripped like any other punctuation.
file = re.sub(r"[^a-z ']+", " ", file)

# The same cleaned words in three shapes used further down: an immutable
# tuple, a list, and the raw cleaned string.
fileTuple = tuple(file.split())
fileList = list(fileTuple)
fileString = file


def addSpaces(atuple):
    """Return all words of ``atuple`` joined, each wrapped in single spaces.

    Every word contributes ``' ' + word + ' '``, so neighbouring words end up
    separated by two spaces. An empty input yields the empty string.
    """
    return ''.join(' ' + word + ' ' for word in atuple)

# Echo the cleaned words back, each one padded with a space on both sides.
print('The words in the text file:',addSpaces(fileTuple))


# Map every distinct word, wrapped in underscores that stand in for the
# surrounding spaces, to an empty-string placeholder value.
fileDict = dict.fromkeys(['_' + word + '_' for word in fileList], '')

print("This is a dictionary of the words in the text file with underscores as spaces:",fileDict)

def countTotalWords(atuple):
    """Return how many items iterating over ``atuple`` produces."""
    return sum(1 for _ in atuple)

# Report the number of words in the cleaned file; fileTuple keeps every word
# from the split, so repeated words are counted each time they occur.
print('Total amount of words:',countTotalWords(fileTuple))

def findFirstLetter(aDict):
    """Set every value of ``aDict`` to the first two characters of its key.

    The dictionary is modified in place and the same object is returned.
    For keys of the form ``'_word_'`` the stored value is ``'_w'``.
    """
    prefixes = {key: key[:2] for key in aDict}
    aDict.update(prefixes)
    return aDict

# findFirstLetter() rewrites fileDict in place, so after this call every value
# holds the first two characters of its key (the leading '_' plus the word's
# first letter). The snapshots below rely on that having happened.
print('The first letters of each word in the file:',findFirstLetter(fileDict))



# Snapshot the values and the keys as lists; valueList is what the counting
# calls further down receive.
valueList=list(fileDict.values())
keyList=list(fileDict.keys())



def countFirstLetters(alist):
    """Return a dict mapping each item of ``alist`` to its number of occurrences.

    Keys appear in the order in which the items are first seen.
    """
    tally = {}
    for item in alist:
        tally[item] = tally.get(item, 0) + 1
    return tally

# valueList holds one two-character prefix per fileDict entry (i.e. per
# distinct word), so this tallies how many distinct words share each prefix.
print('Total amount of occurences of each first letter:',countFirstLetters(valueList))

def countFirstLettersProbability(alist):
    """Return the relative frequency of each item in ``alist``.

    Each occurrence of an item contributes ``1 / len(alist)`` to that item's
    entry, so the values of the returned dict sum to 1 (up to float rounding).

    The denominator is the length of ``alist`` itself rather than a global
    word count, which keeps the result a proper probability distribution even
    when the caller passes a de-duplicated list.

    alist -- sized sequence of hashable items (e.g. ``'_s'`` prefixes)
    Returns a dict ``{item: probability}``; an empty input gives ``{}``.
    """
    if not alist:
        return {}

    # Weight of a single occurrence, computed once instead of per iteration.
    share = 1.0 / len(alist)

    d = {}
    for character in alist:
        d[character] = d.get(character, 0) + share
    return d


# Print the per-prefix probabilities that countFirstLettersProbability()
# returns for valueList.
print('Probility that each letter is the first in the word:',countFirstLettersProbability(valueList))


def countAllLetters(alist):
    """Return a dict counting every character across all words in ``alist``.

    Each word is iterated character by character; keys appear in the order
    in which characters are first encountered.
    """
    tally = {}
    for symbol in (ch for word in alist for ch in word):
        tally[symbol] = tally.get(symbol, 0) + 1
    return tally

# Count every letter across all words with countAllLetters(), the helper
# written for this purpose. The earlier call, countFirstLetters(fileString),
# walked the raw cleaned string and therefore also counted the separating
# spaces as if they were letters.
print('Total amount of occurences of each letter:',countAllLetters(fileList))

score 1 · Accepted Answer

这是一个坚实的开始；我已将您的代码重写为马尔可夫类。

from random import choice
import re
from collections import defaultdict
from itertools import chain, tee, izip

def strip_non_alpha(text, reg=re.compile('[^a-z\']+', re.IGNORECASE)):
    """Return ``text`` with each run of non-letter characters turned into one space.

    Leading and trailing whitespace is removed first; after that every run of
    characters other than ASCII letters (either case) and apostrophes is
    replaced by a single space. ``reg`` is compiled once, when the function
    is defined, and reused on every call.
    """
    trimmed = text.strip()
    return reg.sub(' ', trimmed)

def nwise(iterable, n):
    """Return an iterator over overlapping windows of ``n`` consecutive items.

    s -> (s0, s1, ... sn-1), (s1, s2, ... sn), (s2, s3, ... sn+1), ...

    The input is split into ``n`` independent iterators; the k-th one is
    advanced by k items so that zipping them yields the sliding windows.
    Inputs shorter than ``n`` produce no windows.
    """
    copies = tee(iterable, n)
    # The first copy needs no offset; every later copy skips `offset` items.
    for offset in range(1, n):
        lagging = copies[offset]
        for _ in range(offset):
            next(lagging, None)
    return izip(*copies)

class Markov(object):
    """Character-level Markov chain built from one or more strings.

    Every run of ``CHAINLEN - 1`` consecutive characters (a "stem") is mapped
    to the list of characters that followed it in the training text. Each
    training string is padded with ``PRE`` on both sides, so ``PRE`` serves as
    both the start state and the terminal state.
    """

    # Window size: stem length plus the one character that follows the stem.
    CHAINLEN = 3
    # Blank stem used to pad each training string at both ends.
    PRE = ' '*(CHAINLEN - 1)

    @classmethod
    def from_file(cls, fname):
        """Build a model from the file ``fname``, one training string per line."""
        with open(fname) as inf:
            # Instantiate cls (not Markov) so subclasses get their own type.
            return cls(inf)

    def __init__(self, text):
        """
        Create a new Markov chain model

            text
                Either a string or an iterable of strings
        """
        self.lookup = defaultdict(list)
        self.words = 0
        self.strings = 0

        # A plain string must be added as one unit. On Python 3 str has
        # __iter__, so checking hasattr() alone would feed the string to
        # add_text() one character at a time.
        if isinstance(text, str) or not hasattr(text, '__iter__'):
            self.add_text(text)
        else:
            for s in text:
                self.add_text(s)

    def add_text(self, text):
        """
        Add a string to the lookup table

            text
                string to add; it is cleaned with strip_non_alpha() and
                lower-cased before being recorded
        """
        text = strip_non_alpha(text).lower()
        self.words += len(text.split())
        self.strings += 1
        padded = chain(self.PRE, text, self.PRE)
        for chars in nwise(padded, self.CHAINLEN):
            # All but the last character form the stem; the last one is the
            # observed successor of that stem.
            stem = ''.join(chars[:-1])
            self.lookup[stem].append(chars[-1])

    def gen_text(self, upto=200):
        """
        Generate a string

            upto
                maximum length of string to be generated

        Returns the generated string without its trailing padding. A model
        that has no successors for the current stem (e.g. an empty model)
        stops early instead of raising.
        """
        pad = self.CHAINLEN - 1
        s = self.PRE
        res = []
        # upto real characters plus `pad` trailing padding characters; the
        # padding is sliced off below, so the result never exceeds upto.
        for _ in range(upto + pad):
            # .get() avoids inserting empty lists into the defaultdict.
            followers = self.lookup.get(s)
            if not followers:
                # Dead end: nothing was padded, so return what we have.
                return ''.join(res)[:upto]
            ch = choice(followers)
            res.append(ch)
            s = s[1:] + ch
            if s == self.PRE:    # terminal string
                break
        return ''.join(res[:max(len(res) - pad, 0)])

    def __str__(self):
        """Return one "'stem': [successors]" line per stem, sorted by stem."""
        return '\n'.join("'{}': {}".format(k, self.lookup[k]) for k in sorted(self.lookup))

def main():
    """Build a demo model from the sample words, print it, and sample from it."""
    # To train from a file instead: mc = Markov.from_file('markov.txt')
    mc = Markov('Steam,Teams,Meets,Teems,Eat,Ate,State,Tease,Test,Mast,Mates'.split(','))

    # print() calls with a single argument produce the same output on
    # Python 2 and Python 3, and match the gen_text() print below.
    print('{} {}'.format(mc.strings, mc.words))
    print(mc)

    for i in range(10):
        print(mc.gen_text())

if __name__=="__main__":
    main()

python - Python：创建多个字母转换字典

1 回答 1

Related

Reference