import time
from multiprocessing import Process, Pool
import sys, os, inspect
import urllib
import re
index ={}
graph={}
# Common words that we don't want to be part of the index
g=['is','a','the','ga','all','to','under']
def rm_tag(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)

def get_page(url):
    try:
        f = urllib.urlopen(url)
        page = f.read()
        f.close()
        return page
    except:
        return ""

def union(a,b):
    for e in b:
        if e not in a:
            a.append(e)

def get_next_url(page):
    start_link=page.find("<a href=")
    if(start_link==-1):
        return None,0
    start_quote=page.find('"',start_link)
    end_quote=page.find('"',start_quote+1)
    url=page[start_quote+1:end_quote]
    return url,end_quote
def get_all_links(page):
    links=[]
    while True:
        url,endpos=get_next_url(page)
        page=page[endpos:]
        if url:
            links.append(url)
        else:
            break
    print "get_all_links: %i links found\n" % len(links)
    graph[page]=[links]
    return graph[page]

def add_to_index(index,url,keyword):
    if keyword in index:
        if url not in index[keyword]:
            index[keyword].append(url)
        return
    global g
    if keyword not in g:
        index[keyword]=[url]

def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, url, word)
def compute_ranks(graph):
    d=0.8          # damping factor
    numloops=20
    ranks={}
    npages=len(graph)
    for page in graph:
        ranks[page]=1.0/npages
    for i in range(0,numloops):
        newranks={}
        for page in graph:
            newrank=(1-d)/npages
            for node in graph:
                if page in graph[node]:
                    newrank=newrank + d * (ranks[node]/len(graph[node]))
            newranks[page]=newrank
        ranks=newranks
    return ranks
def Look_up(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None

def Look_up_new(index,ranks,keyword):
    pages=Look_up(index,keyword)
    if pages:
        for i in pages:
            print '\n Results with Page Ranks :\n'+i+" --> "+str(ranks[i])
    else:
        print "Keyword does not exist"

def lucky_search(index, ranks, keyword):
    try:
        pages = Look_up(index, keyword)
        if pages:
            bestpage = pages[0]
            for candidate in pages:
                if ranks[candidate] > ranks[bestpage]:
                    bestpage = candidate
            return (bestpage, ranks[bestpage], pages)
    except:
        print "Keyword does not exist",sys.exc_info()[0]
        return None
def print_profile(index, graph):
    print "*****************"
    print "Length of index", len(index)
    print "Length of graph", len(graph)
    i = 0
    for e in index:
        i = i + 1
        print i, ":", e
        if i > 20:
            break
    print "*****************"

def print_profile_top (index, ranks):
    max = 0
    for e in index:
        link_count = len(index[e])
        if link_count > max:
            max = link_count # updating the highest no of links we have found so far
            print e, link_count, lucky_search(index,ranks,e)[1]
    print "*********************"

#print "result for :", search_term, ": ", lucky_search(index,ranks,search_term)
#print lucky_search(index,ranks,'limited')

def chunks(l, n):
    for i in xrange(0,len(l),n):
        yield l[i:i+n] # instantly makes chunks a "generator function" instead of a normal function
if __name__ == '__main__':
    start = time.clock()
    c = 3          # number of pool workers / chunks per page
    seed = "http://www.python.org"
    keyword = "CGI"
    max_page = 20
    tocrawl=[seed]
    crawled =[]
    print '\nCrawling using ' + seed + " as seed_page and search_term: " + keyword
    pool = Pool(c)
    while tocrawl:
        page=tocrawl.pop()
        if page not in crawled:
            max_page -= 1
            if max_page<=0:
                break
            content=get_page(page)
            text=content
            partitioned_text= list(chunks(text, len(text) / c))
            links=pool.map(get_all_links, partitioned_text)
            #links=get_all_links(content)
            #content=rm_tag(content)
            add_page_to_index(index,page,content)
            for e in links:
                if e not in tocrawl:
                    tocrawl.append(e)
            #union(tocrawl,graph[page])
            crawled.append(page)
    #pool.terminate()
    ranks=compute_ranks(graph)
    print_profile(index, graph)
    print_profile_top(index, ranks)
    print "result for :", keyword, ": ", lucky_search(index,ranks,keyword)
    t=time.clock() - start
    print "Processing Time :",t
    #print crawled,index,graph
When I run this code, the following error shows up. Please help me fix it:
Traceback (most recent call last):
File "C:\Documents and Settings\priyanka.14790\My Documents\Dropbox\Udacity\parallel.py", line 250, in <module>
partitioned_text= list(chunks(text, len(text) / c))
File "C:\Documents and Settings\priyanka.14790\My Documents\Dropbox\Udacity\parallel.py", line 229, in chunks
for i in xrange(0,len(l),n):
ValueError: xrange() arg 3 must not be zero
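I suspect the error happens when get_page() returns an empty string (it returns "" whenever urllib.urlopen raises, e.g. for relative links such as "/search-pymodules.xml" that get_all_links extracts), because then len(text) / c is 0 and chunks() is called with a zero step. A minimal, self-contained snippet (made-up values, only to reproduce the failure) gives the same ValueError:

def chunks(l, n):
    for i in xrange(0, len(l), n):
        yield l[i:i+n]

c = 3
text = ""                                              # what get_page() returns when urlopen fails
print len(text) / c                                    # prints 0
partitioned_text = list(chunks(text, len(text) / c))   # ValueError: xrange() arg 3 must not be zero

So I think I either need to skip pages whose content comes back empty (or shorter than c characters), or force the chunk size to be at least 1, but I am not sure what the right fix is for the parallel version.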
Here is the code for the normal (serial) search engine, which runs without any error:
import sys, os, inspect
import urllib
import re
max_page=5
# Common words that we don't want to be part of the index
g=['is','a','the','ga','all','to','under']
def rm_tag(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)

def get_page(url):
    try:
        f = urllib.urlopen(url)
        page = f.read()
        f.close()
        return page
    except:
        return ""

def union(a,b):
    for e in b:
        if e not in a:
            a.append(e)

def get_next_url(page):
    start_link=page.find("<a href=")
    if(start_link==-1):
        return None,0
    start_quote=page.find('"',start_link)
    end_quote=page.find('"',start_quote+1)
    url=page[start_quote+1:end_quote]
    return url,end_quote

def get_all_links(page):
    links=[]
    while True:
        url,endpos=get_next_url(page)
        page=page[endpos:]
        if url:
            links.append(url)
        else:
            break
    return links
def add_to_index(index,url,keyword):
    if keyword in index:
        if url not in index[keyword]:
            index[keyword].append(url)
    index[keyword]=[url]

def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, url, word)

def compute_ranks(graph):
    d=0.8
    numloops=20
    ranks={}
    npages=len(graph)
    for page in graph:
        ranks[page]=1.0/npages
    for i in range(0,numloops):
        newranks={}
        for page in graph:
            newrank=(1-d)/npages
            for node in graph:
                if page in graph[node]:
                    newrank=newrank + d * (ranks[node]/len(graph[node]))
            newranks[page]=newrank
        ranks=newranks
    return ranks
def crawl_web(seed):
    tocrawl=[seed]
    crawled =[]
    index ={}
    graph={}
    global max_page
    while tocrawl:
        page=tocrawl.pop()
        if page not in crawled:
            max_page -= 1
            if max_page<=0:
                break
            c = get_page(page)
            graph[page]=get_all_links(c)
            c=rm_tag(c)
            add_page_to_index(index,page,c)
            union(tocrawl,graph[page])
            crawled.append(page)
    return crawled,index,graph

def Look_up(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None

def lucky_search(index, ranks, keyword):
    try:
        pages = Look_up(index, keyword)
        if pages:
            bestpage = pages[0]
            for candidate in pages:
                if ranks[candidate] > ranks[bestpage]:
                    bestpage = candidate
            return (bestpage, ranks[bestpage], pages)
    except:
        print "Keyword does not exist"
seed_page = "http://www.python.org"
search_term = "Take"

try:
    print "Enter the Max Limit to Search :"
    max_limit=int(raw_input())
except:
    max_page = 10
max_page = max_limit

print '\nCrawling using ' + seed_page + " as seed_page and search_term: " + search_term
crawled,index,graph=crawl_web(seed_page)
ranks=compute_ranks(graph)
def print_profile(index, graph):
    print "*****************"
    print "Length of index", len(index)
    print "Length of graph", len(graph)
    i = 0
    for e in index:
        i = i + 1
        print i, ":", e
        if i > 20:
            break
    print "*****************"

def print_profile_top (index, ranks):
    max1 = 0
    for e in index:
        link_count = len(index[e])
        if link_count > max1:
            max1= link_count
            print e, link_count, lucky_search(index,ranks,e)[1]
    print "*********************"

print_profile(index, graph)
print_profile_top(index, ranks)
print "result for :", search_term, ": ", lucky_search(index,ranks,search_term)
The output is:

Enter the Max Limit to Search :
10
Crawling using http://www.python.org as seed_page and search_term: Take
*****************
Length of index 1281
Length of graph 9
1 : Canada
2 : limited
3 : all
4 : here"-->.
5 : unclear,
6 : CGI,
7 : 08:00
8 : enabled:
9 : results
10 : href=""
11 : :/
12 : subtle
13 : Take
14 : Buildbot,
15 : pyBiblio,
16 : CD”,
17 : href="/search-pymodules.xml"/>
18 : nothing
19 : Foundation
20 : pyArkansas
21 : depend
*****************
Canada 1 0.0222222222222
*********************
result for : Take : ('http://www.timparkin.co.uk/2012/08/why-you-cant-make-digital-look-like-velvia-50/', 0.022821308980213083, ['http://www.timparkin.co.uk/2012/08/why-you-cant-make-digital-look-like-velvia-50/'])
>>>
Please run and see the difference.