我写了一个包含许多 start_urls 并从这些 url 中提取电子邮件地址的 scrapy spider。该脚本执行耗时很长,因此我想让 Scrapy 在某个站点上一旦找到电子邮件就停止抓取该站点,并继续处理下一个站点。
编辑:添加代码
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
import csv
from urlparse import urlparse
from entreprise.items import MailItem
class MailSpider(CrawlSpider):
    """Crawl each start site and scrape email-address-like strings from page text.

    Start URLs are read from column 6 of scraped_data.csv when the class is
    defined; the registrable domain of each URL is appended to
    allowed_domains so the crawl stays on-site.
    """
    name = "mail"

    start_urls = []
    allowed_domains = []

    # Populate start_urls/allowed_domains once, at class-definition time.
    # NOTE(review): 'rb' mode for csv.reader targets Python 2 (this file also
    # uses urlparse and scrapy.contrib); on Python 3 it would need 'r' with
    # newline=''.
    with open('scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)  # skip the header row
        for row in reader:
            url = row[5].strip()
            if url:  # already stripped above; re-stripping was redundant
                start_urls.append(url)
                fragments = urlparse(url).hostname.split(".")
                # Keep the last two labels, or three when the second-level
                # label is short (e.g. "co" in example.co.uk) — a heuristic
                # for the registrable domain.  Conditional expression replaces
                # the fragile `cond and a or b` idiom.
                hostname = ".".join(
                    fragments[-3:] if len(fragments[-2]) < 4 else fragments[-2:])
                allowed_domains.append(hostname)

    # Follow every link and run parse_item on each fetched page.  The original
    # listed a second, identical Rule without follow=True; CrawlSpider applies
    # only the FIRST matching rule per link, so that rule was dead code and
    # has been removed.
    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
    ]

    def parse_item(self, response):
        """Extract every email-like substring from the page's body text.

        Returns a list of MailItem with 'url' (the page URL) and 'mail'
        (the matched address), one item per regex match.
        """
        hxs = HtmlXPathSelector(response)
        items = []
        for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            item = MailItem()
            item['url'] = response.url
            item['mail'] = mail
            items.append(item)
        return items