0

我在服务器上运行这段代码,用来从一些 RSS 源的内容中提取(大致上的)名词短语。它运行在 WebFaction 服务器上,并且存在内存泄漏。如果有人能指点一下内存泄漏的原因,或者指出这是否是我的代码的问题,以便我进行修复,我将不胜感激。代码如下:

import MySQLdb
import nltk, re, pprint

def RgxChunk(document):
    """Chunk *document* into per-sentence parses using a regexp NP grammar.

    The text is split into sentences, each sentence is tokenized into words
    and part-of-speech tagged, and every tagged sentence is then parsed with
    a single-rule grammar in which an ``NP`` chunk is an optional ``RB``, an
    optional ``DT``, any number of ``JJ*`` tags and any number of ``NN*``
    tags, in that order.

    Returns a list holding one parse result per sentence.
    """
    np_grammar = "NP: {<RB>?<DT>?<JJ.*>*<NN.*>*}"
    chunker = nltk.RegexpParser(np_grammar)

    raw_sentences = nltk.sent_tokenize(document)
    tokenized = [nltk.word_tokenize(raw) for raw in raw_sentences]
    tagged = [nltk.pos_tag(tokens) for tokens in tokenized]
    return [chunker.parse(tagged_sent) for tagged_sent in tagged]

# Parameterized statement: the driver quotes the values, so no manual
# escape_string() calls or hand-built VALUES lists are needed.
INSERT_TERM_SQL = 'INSERT INTO `nltk_terms` VALUES (%s, %s, %s)'


def _chunk_label(tree):
    """Return the label of an NLTK tree on both the 2.x and 3.x APIs."""
    # NLTK 3 replaced the ``node`` attribute with a ``label()`` method.
    if hasattr(tree, 'label'):
        return tree.label()
    return tree.node


def _term_rows(parsed, link):
    """Build the rows to insert for one article.

    ``parsed`` is the list of sentence parses returned by ``RgxChunk`` and
    ``link`` is the article link. Every ``NP`` chunk contributes one
    ``(link, phrase, phrase + link)`` tuple, where ``phrase`` is the chunk's
    words joined by single spaces and lower-cased. Returns a (possibly
    empty) list of tuples.
    """
    rows = []
    for sent in parsed:
        for chunk in sent:
            if not isinstance(chunk, nltk.tree.Tree):
                continue
            if _chunk_label(chunk) != 'NP':
                continue
            if not chunk:
                # A chunk with no tokens has no phrase; skipping it avoids
                # reusing the phrase of the previous chunk.
                continue
            # Each chunk child is indexed as (word, tag); keep the word.
            phrase = ' '.join(leaf[0] for leaf in chunk).lower()
            rows.append((link, phrase, phrase + link))
    return rows


db = MySQLdb.connect(host="HOST", user="USER", passwd="PASS", db="DB")
cursor = db.cursor()
cursor2 = db.cursor()
try:
    cursor.execute("SELECT * FROM `rss_posts` WHERE length(`text`) > 100 AND `link` LIKE 'http%'")
    numrows = int(cursor.rowcount)
    for x in range(numrows):
        row = cursor.fetchone()
        # NOTE(review): row[6] / row[2] are presumably the `text` and `link`
        # columns of `rss_posts` -- confirm against the table schema.
        term_rows = _term_rows(RgxChunk(row[6]), row[2])
        if term_rows:
            # Only hit the database when there is something to insert; an
            # article without NP chunks used to produce a malformed
            # "INSERT ... VALUES" statement with no value list.
            cursor2.executemany(INSERT_TERM_SQL, term_rows)
            db.commit()
        print(str(x + 1) + ' articles processed of ' + str(numrows))
finally:
    # Release the cursors and the connection even if processing fails.
    cursor2.close()
    cursor.close()
    db.close()
4

0 回答 0