-6

我有一个searchengine.py文件,我还为此创建了一个索引。

searchengine.py

import sqlite3
import urllib2
from bs4 import BeautifulSoup
from urlparse import urljoin

# Create a list of words to igonre
ignorewords=set(['the','of','to','and','a','in','is','it'])

class crawler:
    # Initialize the crawler with the name of database
    def __init__(self,dbname):
        self.con=sqlite3.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        pass

    # Auxilliary function for getting an entry id and
    # adding it if not present
    def getentryid(self, table, field, value, createnew=True):
        cur=self.con.execute("select rowid from %s where %s='%s'" % (table,field,value))
        res=cur.fetchone()
        if res==None:
            cur=self.con.execute("insert into %s (%s) values ('%s')" % (table,field,value))
            return cur.lastrowid
        else:
            return res[0]

    # Index an individual page
    def addtoindex(self,url,soup):
        if self.isindexed(url): return
        print 'Indexing %s' %url

        # Get the individual words
        text=self.gettextonly(soup)
        words=self.separatewords(text)

        # Get the URL id
        urlid=self.getentryid('urllist','url',url)

        # Link each word to this url
        for i in range(len(words)):
            word=words[i]
            if word in ignorewords: continue
            wordid=self.getentryid('wordlist','word',word)
            self.con.execute("insert into wordlocation(urlid,wordid,location) \
                values (%d,%d,%d)" % (urlid,wordid,i))


    # Extract the text from an HTML page (no tags)
    def gettextonly(self,soup):
        v=soup.string
        if v==None:
            c=soup.contents
            resulttext=''
            for t in c:
                subtext=self.gettextonly(t)
                resulttext+=subtext+'\n'
            return resulttext
        else:
            return v.strip()


    # Sepetate the words by any non-whitespace character
    def separatewords(self, text):
        splitter=re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s!='']

    # Return true if this url is already indexed
    def isindexed(self, url):
        u=self.con.execute("select rowid from urllist where url='%s'" % url).fetchone()
        if u!=None:
            # Check if it has actually been crawled
            v=self.con.execute('select * from wordlocation where urlid=%d' % u[0]).fetchone()
            if v!=None: return True
        return False

    # Add a link between two pages
    def addlinkref(self,urlFrom,urlTo,linkText):
        pass

    # Starting with a list of pages, do a breadth first search to
    # the given depth, indexing pages as we go
    def crawl(self,pages,depth=2):
        pass

    # Create the database tables
    def createindextables(self):
        pass

    def crawl(self,pages,depth=2):
        for i in range(depth):
            newpages=set()
            for page in pages:
                try:
                    c=urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                soup=BeautifulSoup(c.read())
                self.addtoindex(page,soup)

                links=soup('a')
                for link in links:
                    if ('href' in dict(link.attrs)):
                        url=urljoin(page,link['href'])
                        if url.find("'")!=-1: continue
                        url=url.split('#')[0] # remove location portion
                        if url[0:4]=='http' and not self.isindexed(url):
                            newpages.add(url)
                        linkText=self.gettextonly(link)
                        self.addlinkref(page,url,linkText)

                self.dbcommit()

            pages=newpages

    # Creating index tables
    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordid on wordlist(word)')
        self.con.execute('create index urlid on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()

创建索引 - searchindex.db 使用 python shell

>>> reload(searchengine)
>>> crawler=searchengine.crawler('searchindex.db')
>>> crawler.createindextables( )

我尝试像这样使用它,但它引发了一个错误:

>>> reload(searchengine)
>>> crawler=searchengine.crawler('searchindex.db')
>>> pages=['http://kiwitobes.co/wiki/Categorical_list_of_programming_languages.html']
>>> crawler.crawl(pages)
Indexing http://www.tartarus.org/~martin/PorterStemmer/index.html

Traceback (most recent call last):
  File "<pyshell#22>", line 1, in <module>
    crawler.crawl(pages)
  File "C:/Users/dj/Desktop\searchengine.py", line 103, in crawl
    self.addtoindex(page,soup)
  File ""C:/Users/dj/Desktop\searchengine.py", line 38, in addtoindex
    words=self.separatewords(text)
  File ""C:/Users/dj/Desktop\searchengine.py", line 68, in separatewords
    splitter=re.compile('\\W*')
NameError: global name 're' is not defined

python版本:2.7,操作系统:windows 8

4

1 回答 1

0

您在代码中使用该re模块:

def separatewords(self, text):
    splitter=re.compile('\\W*')
    # here --^
    return [s.lower() for s in splitter.split(text) if s!='']

但我没有一次看到你有:

import re

它将模块加载re到内存中。尝试使用未加载到内存中的模块会生成NameError.

因此,要解决您的问题,只需将import re所有其他导入添加到脚本的顶部。

于 2013-10-06T15:18:38.940 回答