我在服务器上运行下面这段代码,从一些 RSS 源的内容中提取(某种意义上的)名词短语。代码运行在 WebFaction 的服务器上,并且存在内存泄漏。如果有人能指出内存泄漏的原因,或者告诉我问题是否出在我自己的代码上以便我修复,我将不胜感激。代码如下:
import MySQLdb
import nltk, re, pprint
def RgxChunk(document):
    """Chunk *document* into noun-phrase trees, one tree per sentence.

    The text is split into sentences, each sentence is word-tokenized and
    POS-tagged, and the tagged tokens are parsed with a regular-expression
    chunker whose single rule labels as ``NP`` a run made of an optional
    ``RB`` tag, an optional ``DT`` tag, any number of ``JJ*`` tags and any
    number of ``NN*`` tags.

    Returns a list with one parse result per sentence, in document order.
    """
    chunker = nltk.RegexpParser("NP: {<RB>?<DT>?<JJ.*>*<NN.*>*}")
    trees = []
    for raw_sentence in nltk.sent_tokenize(document):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(raw_sentence))
        trees.append(chunker.parse(tagged_tokens))
    return trees
# Parameterised statement: the driver quotes the values itself, so no manual
# escape_string() calls or string-built SQL are needed.
INSERT_SQL = 'INSERT INTO `nltk_terms` VALUES (%s, %s, %s)'


def _noun_phrases(parsed):
    """Yield every NP chunk in *parsed* as one lower-cased phrase.

    *parsed* is the list of per-sentence parse trees returned by RgxChunk.
    Each NP subtree is turned into its words joined by single spaces.
    Non-tree children, subtrees with another label and empty subtrees are
    skipped.
    """
    for sent in parsed:
        for chunk in sent:
            if not isinstance(chunk, nltk.tree.Tree):
                continue
            # NLTK 3 replaced the ``node`` attribute with ``label()``;
            # support both so the script runs on either version.
            label = chunk.label() if hasattr(chunk, 'label') else chunk.node
            if label != 'NP' or len(chunk) == 0:
                continue
            # Each leaf of a chunk is indexed as leaf[0] for the word.
            yield ' '.join(leaf[0] for leaf in chunk).lower()


db = MySQLdb.connect(host="HOST", user="USER", passwd="PASS", db="DB")
cursor = db.cursor()
cursor2 = db.cursor()
try:
    # NOTE(review): the default MySQLdb cursor buffers the complete result set
    # on the client, so every selected row (including its text) is held in
    # memory for the whole run. If memory use is a concern, consider selecting
    # fewer columns or streaming rows over a separate connection.
    cursor.execute("SELECT * FROM `rss_posts` WHERE length(`text`) > 100 AND `link` LIKE 'http%'")
    numrows = int(cursor.rowcount)
    for x in range(numrows):
        row = cursor.fetchone()
        # Positional columns of `rss_posts`; presumably index 2 is the link
        # and index 6 is the article text -- verify against the table schema.
        thelink = row[2]
        text = row[6]

        # One tuple per noun phrase: (link, phrase, phrase + link).
        insert_rows = [
            (thelink, thephrase, thephrase + thelink)
            for thephrase in _noun_phrases(RgxChunk(text))
        ]

        # An article without any noun phrase used to produce an INSERT with no
        # VALUES tuples (invalid SQL); skip the insert for such articles.
        if insert_rows:
            cursor2.executemany(INSERT_SQL, insert_rows)
            db.commit()

        print('%d articles processed of %d' % (x + 1, numrows))
finally:
    # Always release the cursors and the connection, even on failure.
    cursor2.close()
    cursor.close()
    db.close()