
I wrote a Scrapy spider that has many start_urls and extracts email addresses from those urls. The script takes a long time to execute, so I want to tell Scrapy to stop crawling a particular site once it finds an email, and move on to the next site.

EDIT: added the code

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
import csv
from urlparse import urlparse

from entreprise.items import MailItem

class MailSpider(CrawlSpider):
    name = "mail"
    start_urls = []
    allowed_domains = []
    with open('scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)
        for row in reader:
            url = row[5].strip()
            if (url.strip() != ""):
                start_urls.append(url)
                fragments = urlparse(url).hostname.split(".")
                hostname = ".".join(len(fragments[-2]) < 4 and fragments[-3:] or fragments[-2:])
                allowed_domains.append(hostname)

    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item')
    ]

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            item = MailItem()
            item['url'] = response.url
            item['mail'] = mail
            items.append(item)
        return items

2 Answers


I ended up using process_links:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse

class MailItem(Item):
    url = Field()
    mail = Field()

class MailSpider(CrawlSpider):
    name = "mail"

    parsed_hostnames = set()

    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item', process_links='process_links'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item', process_links='process_links')
    ]

    start_urls = []
    allowed_domains = []
    with open('scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)
        for row in reader:
            url = row[5].strip()
            if (url.strip() != ""):
                start_urls.append(url)
                hostname = urlparse(url).hostname
                allowed_domains.append(hostname)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        mails = hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+')
        if mails:
            for mail in mails:
                item = MailItem()
                item['url'] = response.url
                item['mail'] = mail
                items.append(item)
                hostname = urlparse(response.url).hostname
                self.parsed_hostnames.add(hostname)

        return items

    def process_links(self, links):
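        # keep only links whose hostname has not produced an email yet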
        return [l for l in links if urlparse(l.url).hostname not in self.parsed_hostnames]
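
Note for anyone reading this with a recent Scrapy release: the scrapy.contrib and SgmlLinkExtractor imports above are deprecated. A minimal sketch of the same process_links idea against current Scrapy (1.x+) and Python 3 APIs could look like the code below; the CSV layout (header row, URL in column 5) is carried over from the original code, everything else is an assumption rather than a drop-in replacement.

import csv
from urllib.parse import urlparse

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class MailItem(scrapy.Item):
    url = scrapy.Field()
    mail = scrapy.Field()


class MailSpider(CrawlSpider):
    name = "mail"

    # hostnames that have already produced at least one email
    parsed_hostnames = set()

    rules = [
        # a single rule with follow=True is enough; CrawlSpider only
        # applies the first rule that matches a link anyway
        Rule(LinkExtractor(), follow=True, callback='parse_item',
             process_links='process_links'),
    ]

    start_urls = []
    allowed_domains = []
    with open('scraped_data.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)  # skip the header row
        for row in reader:
            url = row[5].strip()
            if url:
                start_urls.append(url)
                allowed_domains.append(urlparse(url).hostname)

    def parse_item(self, response):
        # response.xpath() replaces the deprecated HtmlXPathSelector
        for mail in response.xpath('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            self.parsed_hostnames.add(urlparse(response.url).hostname)
            yield MailItem(url=response.url, mail=mail)

    def process_links(self, links):
        # drop links pointing at hosts that already yielded an email
        return [l for l in links
                if urlparse(l.url).hostname not in self.parsed_hostnames]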

answered 2013-06-03T11:09:51.950

The idea is to use the start_requests method to decide which urls to crawl next. Additionally, we keep track of the hostnames for which an email has already been parsed, in the class-level set parsed_hostnames.

Also, I changed the way you get the hostname from the url; it now uses urlparse.
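For example (a tiny illustrative snippet; the URL is made up):

from urllib.parse import urlparse  # 'from urlparse import urlparse' on Python 2

# urlparse exposes the hostname directly, so no manual splitting of the
# domain fragments is needed
print(urlparse("http://www.example.com/contact").hostname)  # www.example.com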

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse


class MailItem(Item):
    url = Field()
    mail = Field()


class MailSpider(CrawlSpider):
    name = "mail"

    parsed_hostnames = set()
    allowed_domains = []

    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item')
    ]

    def start_requests(self):
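        # read seed urls from the CSV; only request hosts that have not
        # yet yielded an email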
        with open('scraped_data.csv', 'rb') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader)

            for row in reader:
                url = row[5].strip()
                if url:
                    hostname = urlparse(url).hostname
                    if hostname not in self.parsed_hostnames:
                        if hostname not in self.allowed_domains:
                            self.allowed_domains.append(hostname)
                            self.rules[0].link_extractor.allow_domains.add(hostname)
                            self.rules[1].link_extractor.allow_domains.add(hostname)

                        yield self.make_requests_from_url(url)
                    else:
                        self.allowed_domains.remove(hostname)
                        self.rules[0].link_extractor.allow_domains.remove(hostname)
                        self.rules[1].link_extractor.allow_domains.remove(hostname)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            item = MailItem()
            item['url'] = response.url
            item['mail'] = mail
            items.append(item)

        hostname = urlparse(response.url).hostname
        self.parsed_hostnames.add(hostname)

        return items

Should work in theory. Hope it helps.
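If you want to try the spider outside a scrapy crawl command, one hypothetical way to run it from a plain script with a recent Scrapy (2.1+, for the FEEDS setting) is:

from scrapy.crawler import CrawlerProcess

# run MailSpider in-process and write the scraped items to a CSV file
process = CrawlerProcess(settings={
    "FEEDS": {"mails.csv": {"format": "csv"}},
})
process.crawl(MailSpider)
process.start()  # blocks until the crawl is finished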

answered 2013-06-03T09:19:49.277