python - Python网络爬虫，深度问题

Question

我正在按照本教程使用 Python 制作网络爬虫。我已经设法让代码运行起来，但遇到了一个上面视频中没有出现的问题：当我把代码末尾 print scraper(url,7) 中的数字增加到 8 或更大时，shell 中会出现以下错误：

Traceback (most recent call last):
File "<pyshell#30>", line 1, in <module>
  execfile("threads/mechanizex.py")
File "threads/mechanizex.py", line 85, in <module>
  print scraper(url,7)
File "threads/mechanizex.py", line 21, in scraper
  for u in step_url:
TypeError: 'NoneType' object is not iterable

我不知道问题出在哪里，因为我的代码与视频作者的完全相同：他把这个数字增加到 13 仍能得到结果链接，而我却无法把它增加到 7 以上。以下是我的完整代码：

import urllib
import re
import time
from threading import Thread
import MySQLdb
import mechanize
import readability
from bs4 import BeautifulSoup
from readability.readability import Document
import urlparse

# Start page of the crawl; passed to scraper() as the root URL.
url = "http://adbnews.com/area51"

def scraper(root, steps):
    """Crawl outward from *root* for *steps* rounds and return the URLs seen.

    Each round hands the current frontier to ``scrapeStep`` and keeps only
    the links that have not been recorded yet; those become the frontier of
    the next round. The returned list starts with *root* and lists every
    other URL in the order it was first discovered.
    """
    seen = [root]
    frontier = [root]
    depth = 0
    while depth < steps:
        discovered = scrapeStep(frontier)
        frontier = []
        for link in discovered:
            if link in seen:
                continue
            seen.append(link)
            frontier.append(link)
        depth += 1

    return seen

def scrapeStep(root):
    """Collect the absolute URLs of the links found on the pages in *root*.

    NOTE(review): the ``return`` below is indented inside the ``for`` loop,
    so only the first URL of *root* is ever processed, and when *root* is
    empty the loop body never runs and the function returns None.
    """
    result_urls = []
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Firefox')]

    for url in root:
        try:
            br.open(url)
            for link in br.links():
                # Resolve the link against the base URL of the page it was found on.
                newurl = urlparse.urljoin(link.base_url, link.url)
                result_urls.append(newurl)
        except:
            print "error"
        # Sits at loop-body level: executed at the end of the first iteration.
        return result_urls

# Shared results written by dungalo(): one entry per URL.
d = {}
# Threads recorded by getMultiHtml() so that they can be joined.
threadlist = []

def getReadableArticle(url):
    """Download *url* and return the short title extracted by readability.

    The page is fetched with a mechanize browser that ignores robots.txt and
    sends a 'Firefox' User-agent header. The readable summary is parsed with
    BeautifulSoup as well, but only the title is returned.
    """
    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    browser.addheaders = [('User-agent', 'Firefox')]

    page_source = browser.open(url).read()

    # A separate Document is built for each extraction.
    summary_html = Document(page_source).summary()
    short_title = Document(page_source).short_title()

    summary_soup = BeautifulSoup(summary_html)
    article_text = summary_soup.text
    image_tags = summary_soup.findAll('img', src=True)

    # NOTE(review): article_text and image_tags are computed but never reach
    # the caller; only the title is returned. Confirm whether the article
    # text was meant to be returned as well.
    return short_title

def dungalo(urls):
    """Store element 0 of ``getReadableArticle(urls)`` in ``d`` under *urls*.

    NOTE(review): if getReadableArticle returns a string, index 0 is only
    its first character - confirm the intended return shape.
    """
    d[urls] = getReadableArticle(urls)[0]

def getMultiHtml(urlsList):
    """Process every URL in *urlsList* on its own thread and return ``d``.

    One thread running ``dungalo`` is started per URL, and each thread that
    started is recorded in the module-level ``threadlist``. Once every
    recorded thread has been joined, the module-level dict ``d`` is returned.
    """
    for page_url in urlsList:
        try:
            t = Thread(target=dungalo, args=(page_url,))
            t.start()
        except Exception:
            # Best effort: a URL whose thread cannot be started is skipped.
            continue
        # Record only threads that actually started; join() raises
        # RuntimeError on a thread that was never started.
        threadlist.append(t)

    for g in threadlist:
        g.join()

    return d

# Crawl from the start URL for 7 steps and print the list of visited URLs.
print scraper(url,7)

有人能帮帮我吗？

score 5 · Accepted Answer

你的缩进是错误的。它必须是这样的：

def scrapeStep(root):
    """Return the absolute URLs of every link found on the pages in *root*.

    *root* is an iterable of page URLs. Each page is opened with a mechanize
    browser that ignores robots.txt and sends a 'Firefox' User-agent header.
    A page that fails to load or parse is reported with "error" and skipped.
    The result is always a list, and is empty when *root* is empty.
    """
    result_urls = []
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Firefox')]

    for url in root:
        try:
            br.open(url)
            for link in br.links():
                # Resolve the link against the base URL of the page it was found on.
                newurl = urlparse.urljoin(link.base_url, link.url)
                result_urls.append(newurl)
        except Exception:
            # Exception rather than a bare except, so that Ctrl-C
            # (KeyboardInterrupt) and SystemExit still stop the crawl.
            print "error"

    # Returned after the loop so that every URL in root is visited.
    return result_urls

否则它只会查看第一个给出的 URL，如果没有给出 URL，它也会返回 None。

python - Python网络爬虫，深度问题

1 回答 1

Related

Reference