
I wrote a Scrapy spider that has many start_urls and extracts email addresses from those urls. The script takes a long time to execute, so I want to tell Scrapy to stop crawling a particular site once it finds an email, and move on to the next site.

EDIT: added the code

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
import csv
from urlparse import urlparse

from entreprise.items import MailItem

class MailSpider(CrawlSpider):
    name = "mail"
    start_urls = []
    allowed_domains = []
    with open('scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)
        for row in reader:
            url = row[5].strip()
            if (url.strip() != ""):
                start_urls.append(url)
                fragments = urlparse(url).hostname.split(".")
                hostname = ".".join(len(fragments[-2]) < 4 and fragments[-3:] or fragments[-2:])
                allowed_domains.append(hostname)

    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item')
    ]

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            item = MailItem()
            item['url'] = response.url
            item['mail'] = mail
            items.append(item)
        return items

2 Answers


I ended up using process_links:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse

class MailItem(Item):
    url = Field()
    mail = Field()

class MailSpider(CrawlSpider):
    name = "mail"

    parsed_hostnames = set()

    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item', process_links='process_links'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item', process_links='process_links')
    ]

    start_urls = []
    allowed_domains = []
    with open('scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)
        for row in reader:
            url = row[5].strip()
            if (url.strip() != ""):
                start_urls.append(url)
                hostname = urlparse(url).hostname
                allowed_domains.append(hostname)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        mails = hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+')
        if mails:
            for mail in mails:
                item = MailItem()
                item['url'] = response.url
                item['mail'] = mail
                items.append(item)
                hostname = urlparse(response.url).hostname
                self.parsed_hostnames.add(hostname)

        return items

    def process_links(self, links):
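        # keep only links whose hostname has not produced an email yet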
        return [l for l in links if urlparse(l.url).hostname not in self.parsed_hostnames]
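
Note for anyone reading this with a recent Scrapy release: the scrapy.contrib and SgmlLinkExtractor imports above are deprecated. A minimal sketch of the same process_links idea against current Scrapy (1.x+) and Python 3 APIs could look like the code below; the CSV layout (header row, URL in column 5) is carried over from the original code, everything else is an assumption rather than a drop-in replacement.

import csv
from urllib.parse import urlparse

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class MailItem(scrapy.Item):
    url = scrapy.Field()
    mail = scrapy.Field()


class MailSpider(CrawlSpider):
    name = "mail"

    # hostnames that have already produced at least one email
    parsed_hostnames = set()

    rules = [
        # a single rule with follow=True is enough; CrawlSpider only
        # applies the first rule that matches a link anyway
        Rule(LinkExtractor(), follow=True, callback='parse_item',
             process_links='process_links'),
    ]

    start_urls = []
    allowed_domains = []
    with open('scraped_data.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)  # skip the header row
        for row in reader:
            url = row[5].strip()
            if url:
                start_urls.append(url)
                allowed_domains.append(urlparse(url).hostname)

    def parse_item(self, response):
        # response.xpath() replaces the deprecated HtmlXPathSelector
        for mail in response.xpath('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            self.parsed_hostnames.add(urlparse(response.url).hostname)
            yield MailItem(url=response.url, mail=mail)

    def process_links(self, links):
        # drop links pointing at hosts that already yielded an email
        return [l for l in links
                if urlparse(l.url).hostname not in self.parsed_hostnames]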

answered 2013-06-03T11:09:51.950

The idea is to use the start_requests method to decide which urls to crawl next. Additionally, we keep track of the hostnames for which an email has already been parsed, in the class-level set parsed_hostnames.

Also, I changed the way you get the hostname from the url; it now uses urlparse.
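For example (a tiny illustrative snippet; the URL is made up):

from urllib.parse import urlparse  # 'from urlparse import urlparse' on Python 2

# urlparse exposes the hostname directly, so no manual splitting of the
# domain fragments is needed
print(urlparse("http://www.example.com/contact").hostname)  # www.example.com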

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse


class MailItem(Item):
    url = Field()
    mail = Field()


class MailSpider(CrawlSpider):
    name = "mail"

    parsed_hostnames = set()
    allowed_domains = []

    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item')
    ]

    def start_requests(self):
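        # read seed urls from the CSV; only request hosts that have not
        # yet yielded an email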
        with open('scraped_data.csv', 'rb') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader)

            for row in reader:
                url = row[5].strip()
                if url:
                    hostname = urlparse(url).hostname
                    if hostname not in self.parsed_hostnames:
                        if hostname not in self.allowed_domains:
                            self.allowed_domains.append(hostname)
                            self.rules[0].link_extractor.allow_domains.add(hostname)
                            self.rules[1].link_extractor.allow_domains.add(hostname)

                        yield self.make_requests_from_url(url)
                    else:
                        self.allowed_domains.remove(hostname)
                        self.rules[0].link_extractor.allow_domains.remove(hostname)
                        self.rules[1].link_extractor.allow_domains.remove(hostname)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            item = MailItem()
            item['url'] = response.url
            item['mail'] = mail
            items.append(item)

        hostname = urlparse(response.url).hostname
        self.parsed_hostnames.add(hostname)

        return items

Should work in theory. Hope it helps.
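If you want to try the spider outside a scrapy crawl command, one hypothetical way to run it from a plain script with a recent Scrapy (2.1+, for the FEEDS setting) is:

from scrapy.crawler import CrawlerProcess

# run MailSpider in-process and write the scraped items to a CSV file
process = CrawlerProcess(settings={
    "FEEDS": {"mails.csv": {"format": "csv"}},
})
process.crawl(MailSpider)
process.start()  # blocks until the crawl is finished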

answered 2013-06-03T09:19:49.277