import time
from multiprocessing import Process, Pool
import sys, os, inspect
import urllib
import re
index ={}
graph={}
# Common words that we don't want to be part of the index
g=['is','a','the','ga','all','to','under']


def rm_tag(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)

def get_page(url):
    try:
        f = urllib.urlopen(url)
        page = f.read()
        f.close()
        return page
    except:
        return ""

def union(a,b):
    for e in b:
        if e not in a:
            a.append(e)

def get_next_url(page):
    start_link=page.find("<a href=")
    if(start_link==-1):
        return None,0
    start_quote=page.find('"',start_link)
    end_quote=page.find('"',start_quote+1)
    url=page[start_quote+1:end_quote]
    return url,end_quote

def get_all_links(page):
    links=[]
    while True:
        url,endpos=get_next_url(page)
        page=page[endpos:]
        if url:
            links.append(url)
        else:
            break
    print "get_all_links: %i links found\n" % len(links)
    graph[page]=[links]
    return graph[page]


def add_to_index(index,url,keyword):
        if keyword in index:
                if url not in index[keyword]:
                        index[keyword].append(url)
                return

        global g
        if keyword not in g:
          index[keyword]=[url]

def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, url,word)

def compute_ranks(graph):
    d=0.8
    numloops=20
    ranks={}
    npages=len(graph)
    for page in graph:
        ranks[page]=1.0/npages
    for i in range(0,numloops):
        newranks={}
        for page in graph:
            newrank=(1-d)/npages
            for node in graph:
                if page in graph[node]:
                    newrank=newrank + d * (ranks[node]/len(graph[node]))
            newranks[page]=newrank
        ranks=newranks
    return ranks
def Look_up(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None

def Look_up_new(index,ranks,keyword):
    pages=Look_up(index,keyword)
    if pages:
        for i in pages:
            print  '\n Results with Page Ranks :\n'+i+" --> "+str(ranks[i])
    else:
        print "Keyword does not exist"

def lucky_search(index, ranks, keyword):
    try:
        pages = Look_up(index, keyword)
        if pages:
            bestpage = pages[0]
            for candidate in pages:
                if ranks[candidate] > ranks[bestpage]:
                    bestpage = candidate
            return (bestpage, ranks[bestpage], pages)
    except:
        print "Keyword does not exist",sys.exc_info()[0]
        return None


def print_profile(index, graph):
    print "*****************"
    print "Length of index", len(index)
    print "Length of graph", len(graph)

    i = 0
    for e in index:
        i = i + 1
        print i, ":", e
        if i > 20:
            break
    print "*****************"

def print_profile_top (index, ranks):
    max = 0
    for e in index:
        link_count = len(index[e])
        if link_count > max:
            max = link_count   # updating the highest no of links we have found so far
            print e, link_count, lucky_search(index,ranks,e)[1]
    print "*********************"


#print "result for :", search_term, ": ", lucky_search(index,ranks,search_term)
#print lucky_search(index,ranks,'limited')

def chunks(l, n):
    for i in xrange(0,len(l),n):
        yield l[i:i+n] # instantly makes chunks a "generator function" instead of a normal function

if __name__ == '__main__':
    start = time.clock()    
    c = 3
    seed= "http://www.python.org"
    keyword = "CGI"
    max_page = 20
    tocrawl=[seed]
    crawled =[]
    print '\nCrawling using ' + seed + " as seed_page and search_term: " + keyword
    pool = Pool (c)
    while tocrawl:
        page=tocrawl.pop()
        if page not in crawled:
            max_page -= 1 
            if max_page<=0:
                break
            content=get_page(page)
            text=content
            partitioned_text= list(chunks(text, len(text) / c))
            links=pool.map(get_all_links, partitioned_text)
            #links=get_all_links(content)
            #content=rm_tag(content)
            add_page_to_index(index,page,content)
            for e in links:
                if e not in tocrawl:
                    tocrawl.append(e)
            #union(tocrawl,graph[page])
            crawled.append(page)

    #pool.terminate()
    ranks=compute_ranks(graph)
    print_profile(index, graph)
    print_profile_top(index, ranks)
    print "result for :", keyword, ": ", lucky_search(index,ranks,keyword)
    t=time.clock() - start
    print "Processing Time :",t
#print crawled,index,graph """

While running the code, the following error shows up. Please help me fix it.

Traceback (most recent call last):
  File "C:\Documents and Settings\priyanka.14790\My Documents\Dropbox\Udacity\parallel.py", line 250, in <module>
    partitioned_text= list(chunks(text, len(text) / c))
  File "C:\Documents and Settings\priyanka.14790\My Documents\Dropbox\Udacity\parallel.py", line 229, in chunks
    for i in xrange(0,len(l),n):
ValueError: xrange() arg 3 must not be zero

Here is the code for the normal (serial) search engine, which runs without any error:

    import sys, os, inspect
    import urllib
    import re
    max_page=5

    # Common words that we don't want to be part of the index
    g=['is','a','the','ga','all','to','under']

    def rm_tag(data):
        p = re.compile(r'<.*?>')
        return p.sub('', data)

    def get_page(url):
        try:
            f = urllib.urlopen(url)
            page = f.read()
            f.close()
            return page
        except:
            return ""

    def union(a,b):
        for e in b:
            if e not in a:
                a.append(e)

    def get_next_url(page):
        start_link=page.find("<a href=")
        if(start_link==-1):
            return None,0
        start_quote=page.find('"',start_link)
        end_quote=page.find('"',start_quote+1)
        url=page[start_quote+1:end_quote]
        return url,end_quote

    def get_all_links(page):
        links=[]
        while True:
            url,endpos=get_next_url(page)
            page=page[endpos:]
            if url:
                links.append(url)
            else:
                break
        return links


    def add_to_index(index,url,keyword):
            if keyword in index:
                    if url not in index[keyword]:
                            index[keyword].append(url)
            index[keyword]=[url]



    def add_page_to_index(index, url, content):
        words = content.split()
        for word in words:
            add_to_index(index, url,word)

    def compute_ranks(graph):
        d=0.8
        numloops=20
        ranks={}
        npages=len(graph)
        for page in graph:
            ranks[page]=1.0/npages
        for i in range(0,numloops):
            newranks={}
            for page in graph:
                newrank=(1-d)/npages
                for node in graph:
                    if page in graph[node]:
                        newrank=newrank + d * (ranks[node]/len(graph[node]))
                newranks[page]=newrank
            ranks=newranks
        return ranks

    def crawl_web(seed):
        tocrawl=[seed]
        crawled =[]
        index ={}
        graph={}
        global max_page
        while tocrawl:
            page=tocrawl.pop()
            if page not in crawled:
                max_page -= 1 
                if max_page<=0:
                    break
                c = get_page(page)
                graph[page]=get_all_links(c)
                c=rm_tag(c)
                add_page_to_index(index,page,c)
                union(tocrawl,graph[page])
                crawled.append(page)
        return crawled,index,graph


    def Look_up(index, keyword):
        if keyword in index:
            return index[keyword]
        else:
            return None

    def lucky_search(index, ranks, keyword):
        try:
            pages = Look_up(index, keyword)
            if pages:
                bestpage = pages[0]
                for candidate in pages:
                    if ranks[candidate] > ranks[bestpage]:
                        bestpage = candidate
                return (bestpage, ranks[bestpage], pages)
        except:
            print "Keyword does not exist"


    seed_page = "http://www.python.org"
    search_term = "Take"

    try:
        print "Enter the Max Limit to Search :"
        max_limit=int(raw_input())
    except:
        max_page = 10

    max_page = max_limit

    print '\nCrawling using ' + seed_page + " as seed_page and search_term: " + search_term
    crawled,index,graph=crawl_web(seed_page)
    ranks=compute_ranks(graph)

    def print_profile(index, graph):
        print "*****************"
        print "Length of index", len(index)
        print "Length of graph", len(graph)
        i = 0
        for e in index:
            i = i + 1
            print i, ":", e
            if i > 20:
                break
        print "*****************"

    def print_profile_top (index, ranks):
        max1 = 0
        for e in index:
            link_count = len(index[e])
            if link_count > max1:
                max1= link_count 
                print e, link_count, lucky_search(index,ranks,e)[1]
        print "*********************"

    print_profile(index, graph)
    print_profile_top(index, ranks)

    print "result for :", search_term, ": ", lucky_search(index,ranks,search_term)

The output is:

    Enter the Max Limit to Search : 10

    Crawling using http://www.python.org as seed_page and search_term: Take
    *****************
    Length of index 1281
    Length of graph 9
    1 : Canada
    2 : limited
    3 : all
    4 : here"-->.
    5 : unclear,
    6 : CGI,
    7 : 08:00
    8 : enabled:
    9 : results
    10 : href=&quot;&quot;
    11 : :/
    12 : subtle
    13 : Take
    14 : Buildbot,
    15 : pyBiblio,
    16 : CD&#8221;,
    17 : href="/search-pymodules.xml"/>
    18 : nothing
    19 : Foundation
    20 : pyArkansas
    21 : depend
    *****************
    Canada 1 0.0222222222222
    *********************
    result for : Take :  ('http://www.timparkin.co.uk/2012/08/why-you-cant-make-digital-look-like-velvia-50/', 0.022821308980213083, ['http://www.timparkin.co.uk/2012/08/why-you-cant-make-digital-look-like-velvia-50/'])
    >>> 

Please run and see the difference.


2 Answers


Just to point you at exactly where the error comes from:

    In [6]: len(text)
    Out[6]: 0

    In [7]: c
    Out[7]: 3

    In [8]: 0 / 3
    Out[8]: 0

The length of the text, which comes back as an empty string, is 0, so len(text) / c is 0 and that 0 becomes the step argument of xrange.

One possible workaround is to catch this case and make sure the step passed to chunks is never zero, e.g. when len(text) is smaller than c.
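
For what it's worth, a minimal sketch of that workaround, assuming you keep the chunking approach. The safe_chunks name and the max(n, 1) clamp are my own additions; c and the generator body come from the question's code:

    # Sketch only: same generator as chunks() in the question, but the step
    # is clamped so xrange never receives 0 when the page text is empty or
    # shorter than the number of workers.
    def safe_chunks(l, n):
        n = max(n, 1)                   # never pass 0 as the xrange step
        for i in xrange(0, len(l), n):
            yield l[i:i + n]

    if __name__ == '__main__':
        c = 3
        for text in ["", "short", "a much longer piece of page text"]:
            parts = list(safe_chunks(text, len(text) / c))
            print "len(text) =", len(text), "->", len(parts), "chunk(s)"

With that clamp an empty page simply yields no chunks instead of crashing, although skipping empty pages before calling pool.map (as the other answer suggests) is probably the cleaner fix.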

answered 2012-10-31T11:47:52.733

It looks like your text is empty. Since you have except: return "" in get_page, anything could be happening there (possibly a dead link). Add some logging to get_page.
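
A sketch of what that logging could look like, based on the get_page from the question (catching Exception instead of a bare except so the failure can actually be printed):

    import urllib

    def get_page(url):
        # Same fetch as in the question, but report failures and empty
        # responses instead of silently returning "".
        try:
            f = urllib.urlopen(url)
            page = f.read()
            f.close()
            if not page:
                print "get_page: empty response from", url
            return page
        except Exception, e:
            print "get_page: could not fetch", url, "-", e
            return ""

Once you can see which URL came back empty, you can decide whether to skip it before chunking it for pool.map.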

answered 2012-10-31T11:46:25.680