
I'm having trouble building a basic spider program in Python. Whenever I try to run it, I get an error. The error occurs somewhere in the last seven lines of the code.

#These modules do most of the work.
import sys
import urllib2
import urlparse
import htmllib, formatter
from cStringIO import StringIO


def log_stdout(msg):
    """Print msg to the screen."""
    print msg

def get_page(url, log):
    """Retrieve URL and return contents, log errors."""
    try:
        page = urllib2.urlopen(url)
    except urllib2.URLError:
        log("Error retrieving: " + url)
        return ''
    body = page.read()
    page.close()
    return body

def find_links(html):
    """Return a list links in html."""
    # We're using the parser just to get the HREFs
    writer = formatter.DumbWriter(StringIO())
    f = formatter.AbstractFormatter(writer)
    parser = htmllib.HTMLParser(f)
    parser.feed(html)
    parser.close()
    return parser.anchorlist

class Spider:

    """
    The heart of this program, finds all links within a web site.

    run() contains the main loop.
    process_page() retrieves each page and finds the links.
    """

    def __init__(self, startURL, log=None):
        # This method sets initial values
        self.URLs = set()
        self.URLs.add(startURL)
        self.include = startURL
        self._links_to_process = [startURL]
        if log is None:
            # Use log_stdout function if no log provided
            self.log = log_stdout
        else:
            self.log = log

    def run(self):
        #Processes list of URLs one at a time
        while self._links_to_process:
            url = self._links_to_process.pop()
            self.log("Retrieving: " + url)
            self.process_page(url)

    def url_in_site(self, link):
        #Checks whether the link starts with the base URL
        return link.startswith(self.include)

    def process_page(self, url):
        #Retrieves page and finds links in it
        html = get_page(url, self.log)
        for link in find_links(html):
            #Handle relative links
            link = urlparse.urljoin(url, link)
            self.log("Checking: " + link)
            # Make sure this is a new URL within current site
            if link not in self.URLs and self.url_in_site(link):
                self.URLs.add(link)
                self._links_to_process.append(link)

The error message relates to this block of code:

if __name__ == '__main__':
    #This code runs when script is started from command line
    startURL = sys.argv[1]
    spider = Spider(startURL)
    spider.run()
    for URL in sorted(spider.URLs):
        print URL


The error message:
        startURL = sys.argv[1]
    IndexError: list index out of range

2 Answers


You're not calling your spider program with an argument. sys.argv[0] is your script file, and sys.argv[1] would be the first argument you pass it. "list index out of range" means you didn't give it any arguments.

Try calling it as python spider.py http://www.example.com (with your actual URL).
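If you want the script to fail with a readable message instead of a traceback, you can check len(sys.argv) before indexing into it. A minimal sketch (the parse_args helper and the usage string are my own additions, not part of the original script):

```python
import sys

def parse_args(argv):
    """Return the start URL from argv, or None when no argument was given."""
    # argv[0] is always the script name; argv[1] is the first real argument.
    if len(argv) < 2:
        return None
    return argv[1]

if __name__ == '__main__':
    startURL = parse_args(sys.argv)
    if startURL is None:
        # Exit with a usage message instead of an IndexError.
        sys.stderr.write("Usage: python spider.py <start-url>\n")
        sys.exit(1)
    print(startURL)
```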

Answered 2012-08-16T22:45:28.377

This doesn't directly answer your question, but:

I would do it like this:

import lxml.html

START_PAGE = 'http://some.url.tld'
ahrefs = lxml.html.parse(START_PAGE).xpath('//a/@href')

Then use the methods available on the lxml.html objects, and multiprocess the links.

This handles "semi-malformed" HTML, and you can plug in the BeautifulSoup library.
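For what it's worth, if neither lxml nor BeautifulSoup is available, the href-extraction part can also be done with the standard library's HTMLParser, which still exists in Python 3 (unlike htmllib, which was removed). A minimal sketch, with the class and function names being my own:

```python
try:
    from HTMLParser import HTMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3

class LinkParser(HTMLParser):
    """Collect the href attribute of every <a> tag."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the tag.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.links.append(value)

def find_links(html):
    """Return a list of hrefs found in the given HTML string."""
    parser = LinkParser()
    parser.feed(html)
    parser.close()
    return parser.links
```

This drops in as a replacement for the htmllib-based find_links in the question, though a real HTML library will cope better with badly broken markup.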

If you want to even attempt to follow JavaScript-generated links, you'll have some work to do, but that's life!

Answered 2012-08-16T23:15:08.390