我有一个searchengine.py
文件,我还为此创建了一个索引。
searchengine.py
:
import sqlite3
import urllib2
from bs4 import BeautifulSoup
from urlparse import urljoin
# Create a list of words to igonre
ignorewords=set(['the','of','to','and','a','in','is','it'])
class crawler:
# Initialize the crawler with the name of database
def __init__(self,dbname):
self.con=sqlite3.connect(dbname)
def __del__(self):
self.con.close()
def dbcommit(self):
pass
# Auxilliary function for getting an entry id and
# adding it if not present
def getentryid(self, table, field, value, createnew=True):
cur=self.con.execute("select rowid from %s where %s='%s'" % (table,field,value))
res=cur.fetchone()
if res==None:
cur=self.con.execute("insert into %s (%s) values ('%s')" % (table,field,value))
return cur.lastrowid
else:
return res[0]
# Index an individual page
def addtoindex(self,url,soup):
if self.isindexed(url): return
print 'Indexing %s' %url
# Get the individual words
text=self.gettextonly(soup)
words=self.separatewords(text)
# Get the URL id
urlid=self.getentryid('urllist','url',url)
# Link each word to this url
for i in range(len(words)):
word=words[i]
if word in ignorewords: continue
wordid=self.getentryid('wordlist','word',word)
self.con.execute("insert into wordlocation(urlid,wordid,location) \
values (%d,%d,%d)" % (urlid,wordid,i))
# Extract the text from an HTML page (no tags)
def gettextonly(self,soup):
v=soup.string
if v==None:
c=soup.contents
resulttext=''
for t in c:
subtext=self.gettextonly(t)
resulttext+=subtext+'\n'
return resulttext
else:
return v.strip()
# Sepetate the words by any non-whitespace character
def separatewords(self, text):
splitter=re.compile('\\W*')
return [s.lower() for s in splitter.split(text) if s!='']
# Return true if this url is already indexed
def isindexed(self, url):
u=self.con.execute("select rowid from urllist where url='%s'" % url).fetchone()
if u!=None:
# Check if it has actually been crawled
v=self.con.execute('select * from wordlocation where urlid=%d' % u[0]).fetchone()
if v!=None: return True
return False
# Add a link between two pages
def addlinkref(self,urlFrom,urlTo,linkText):
pass
# Starting with a list of pages, do a breadth first search to
# the given depth, indexing pages as we go
def crawl(self,pages,depth=2):
pass
# Create the database tables
def createindextables(self):
pass
def crawl(self,pages,depth=2):
for i in range(depth):
newpages=set()
for page in pages:
try:
c=urllib2.urlopen(page)
except:
print "Could not open %s" % page
continue
soup=BeautifulSoup(c.read())
self.addtoindex(page,soup)
links=soup('a')
for link in links:
if ('href' in dict(link.attrs)):
url=urljoin(page,link['href'])
if url.find("'")!=-1: continue
url=url.split('#')[0] # remove location portion
if url[0:4]=='http' and not self.isindexed(url):
newpages.add(url)
linkText=self.gettextonly(link)
self.addlinkref(page,url,linkText)
self.dbcommit()
pages=newpages
# Creating index tables
def createindextables(self):
self.con.execute('create table urllist(url)')
self.con.execute('create table wordlist(word)')
self.con.execute('create table wordlocation(urlid,wordid,location)')
self.con.execute('create table link(fromid integer,toid integer)')
self.con.execute('create table linkwords(wordid,linkid)')
self.con.execute('create index wordid on wordlist(word)')
self.con.execute('create index urlid on urllist(url)')
self.con.execute('create index wordurlidx on wordlocation(wordid)')
self.con.execute('create index urltoidx on link(toid)')
self.con.execute('create index urlfromidx on link(fromid)')
self.dbcommit()
创建索引 - searchindex.db 使用 python shell
>>> reload(searchengine)
>>> crawler=searchengine.crawler('searchindex.db')
>>> crawler.createindextables( )
我尝试像这样使用它,但它引发了一个错误:
>>> reload(searchengine)
>>> crawler=searchengine.crawler('searchindex.db')
>>> pages=['http://kiwitobes.co/wiki/Categorical_list_of_programming_languages.html']
>>> crawler.crawl(pages)
Indexing http://www.tartarus.org/~martin/PorterStemmer/index.html
Traceback (most recent call last):
File "<pyshell#22>", line 1, in <module>
crawler.crawl(pages)
File "C:/Users/dj/Desktop\searchengine.py", line 103, in crawl
self.addtoindex(page,soup)
File ""C:/Users/dj/Desktop\searchengine.py", line 38, in addtoindex
words=self.separatewords(text)
File ""C:/Users/dj/Desktop\searchengine.py", line 68, in separatewords
splitter=re.compile('\\W*')
NameError: global name 're' is not defined
python版本:2.7,操作系统:windows 8