我遇到了一个对我来说非常重要的问题。我有一个按行组织的文本文件,大小约为 100 MB。文件中的几行示例如下:
yüceltmek;yücelt;tiğimizin
getirtmek;getirt;tiğimizin
kemikleşmek;kemikleş;tiğimizin
kronikleşmek;kronikleş;tiğimizin
şehirleşmek;şehirleş;tiğimizin
sakinleşmek;sakinleş;tiğimizin
gevşetmek;gevşetmek;tiğimizin
sanayileşmek;sanayileş;tiğimizin
第一个词是字典的键,其他词是它的值。当我把这个文本加载到字典中时,程序占用的内存达到了 1.8 GB。我的第一个问题是:怎样才能更高效地使用内存?第二个问题是:为什么文件在磁盘上的大小与加载后占用的内存之间会有这么大的差距?
读取文本文件:
def LoadMorphemes(path="C:/Users/Ali/workspace/QTNGram/src/testmorphemes.txt"):
    """Load the morpheme file into a dictionary of ``DictMorphemes`` lists.

    Each line is lower-cased, every character outside the allowed alphabet
    is replaced by a space, and only the first whitespace-separated token of
    the result is used.  That token is split on ``;``:

    * three non-empty fields ``a;b;c`` add ``DictMorphemes(a)`` under the
      key ``b + c``;
    * a single field ``a`` adds ``DictMorphemes(a)`` under the key ``a``;
    * anything else is skipped.

    Args:
        path: File to read (decoded as UTF-8, undecodable bytes ignored).
            Defaults to the location the original code hard-coded.

    Returns:
        A ``collections.defaultdict(list)`` mapping each key to the list of
        ``DictMorphemes`` built for it.  If reading fails, the error is
        printed and whatever was loaded up to that point is returned.
    """
    dicttKokMorphemes = collections.defaultdict(list)

    # Compiled once, outside the loop, with re.UNICODE given as *flags*.
    # The fourth positional argument of re.sub() is ``count``, so passing
    # re.UNICODE there limited the clean-up to that many substitutions per
    # line instead of setting the flag.
    noise = re.compile(
        u"[^' \\;abcçdefgğhıijklmnoöpqrsştuüvwxyz0-9]", re.UNICODE
    )

    try:
        with codecs.open(path, mode="rb", encoding="utf-8",
                         errors="ignore") as testf:
            for line in testf:
                tokens = noise.sub(" ", line.lower()).split()
                if not tokens:
                    continue

                fields = tokens[0].split(";")
                if len(fields) == 3:
                    infinitive, stem, suffix = fields
                    if infinitive and stem and suffix:
                        dicttKokMorphemes[stem + suffix].append(
                            DictMorphemes(infinitive)
                        )
                elif len(fields) == 1:
                    dicttKokMorphemes[fields[0]].append(
                        DictMorphemes(fields[0])
                    )
    except Exception as ex:
        # Best-effort loader: report the problem and keep what was read.
        print(ex)

    return dicttKokMorphemes
class DictMorphemes(object):
    """Record holding a morpheme plus its optional negation and morphemes.

    Attributes:
        Morpheme: The morpheme text; always truthy.
        Negation: Negation value; empty string when not supplied.
        Morphemes: Morphemes value; empty string when not supplied.
    """

    # Fixed attribute set: instances carry no per-instance ``__dict__``,
    # which matters when very many of them are kept in memory at once.
    # Consequence: no attributes other than these three can be assigned.
    __slots__ = ("Morpheme", "Negation", "Morphemes")

    def __init__(self, __morpheme_, _negatition_=None, _morphemes_=None):
        """Create a record.

        Args:
            __morpheme_: The morpheme; must be truthy.  The name starts with
                two underscores inside a class body, so Python mangles it;
                pass this argument positionally.
            _negatition_: Optional negation; ``None`` becomes ``""``.
            _morphemes_: Optional morphemes; ``None`` becomes ``""``.

        Raises:
            ValueError: If ``__morpheme_`` is empty or otherwise falsy.
        """
        if not __morpheme_:
            # A bare ``raise`` here has no active exception to re-raise and
            # only produces an unrelated error; raise a meaningful one that
            # carries the offending value instead of printing it.
            raise ValueError(
                "morpheme must be non-empty, got %r" % (__morpheme_,)
            )
        self.Morpheme = __morpheme_
        self.Negation = "" if _negatition_ is None else _negatition_
        self.Morphemes = "" if _morphemes_ is None else _morphemes_

    def getMorpheme(self):
        """Return the stored morpheme."""
        return self.Morpheme

    def getNegation(self):
        """Return the stored negation."""
        return self.Negation

    def getMorphemes(self):
        """Return the stored morphemes value."""
        return self.Morphemes

    def setMorpheme(self, _morpheme_):
        """Replace the stored morpheme."""
        self.Morpheme = _morpheme_

    def setNegation(self, _negation_):
        """Replace the stored negation."""
        self.Negation = _negation_

    def setMorphemes(self, _morphemes_):
        """Replace the stored morphemes value."""
        self.Morphemes = _morphemes_