from glob import glob
# Glob pattern matching every .txt file directly inside D:\report\shakeall.
pattern = "D:\\report\\shakeall\\*.txt"
# Paths of the files that currently match the pattern (an empty list when
# nothing matches).
filelist = glob(pattern)
def countwords(fp):
    """Return the number of whitespace-separated words in the file at *fp*.

    Punctuation is not treated specially: a token such as ``shake!`` counts
    as one word.

    Args:
        fp: Path of the text file to read.

    Returns:
        The word count as an int; 0 for an empty file.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    with open(fp) as fh:
        # Count line by line so the whole file is never held in memory. A
        # word cannot span a line break, so the total equals
        # len(fh.read().split()).
        return sum(len(line.split()) for line in fh)
# Report the total word count across all matched files. Printing a single
# pre-formatted string in parentheses produces the same output under the
# Python 2 print statement and the Python 3 print function.
print("There are %d words in the files. From directory %s"
      % (sum(map(countwords, filelist)), pattern))
import os
import re
import string
def normalize_word(token):
    """Return *token* with leading and trailing ASCII punctuation removed.

    Punctuation inside the token (e.g. the apostrophe in ``don't``) and the
    letter case are left untouched. A token made only of punctuation becomes
    the empty string.
    """
    return token.strip(string.punctuation)


# Collect the distinct words of every file under D:\report\shakeall.
# NOTE(review): os.walk descends into subdirectories and yields every file
# regardless of extension, unlike the "*.txt" glob used for the total word
# count -- confirm that this difference is intended.
uniquewords = set()
for root, dirs, files in os.walk("D:\\report\\shakeall"):
    for name in files:
        # The context manager closes each file handle as soon as it is read.
        with open(os.path.join(root, name)) as fh:
            cleaned = (normalize_word(token) for token in fh.read().split())
            # Normalizing first makes 'shake', 'shake!' and 'shake?' collapse
            # into one entry; empty results (pure punctuation) are skipped.
            uniquewords.update(word for word in cleaned if word)
wordlist = list(uniquewords)
这段代码计算唯一词数和总词数。但问题是,当我写 len(uniquewords) 时,得到的数字并不合理,因为它会把例如 'shake'、'shake!'、'shake,' 和 'shake?' 识别为不同的唯一词。我尝试把 uniquewords 转换成列表并修改它,以去掉其中的标点符号,但都失败了。有谁能帮帮我吗?