3

我正在尝试抓取一个需要登录、且数据由 JavaScript 动态加载的页面。目前我可以使用 scrapy 成功登录,但是我的蜘蛛无法看到我需要的数据,因为这些数据是由 JavaScript 加载的。

我做了一些搜索,发现 Selenium 可能是一个可行的解决方案。我想使用 Selenium 启动浏览器并查看页面,看来我应该使用 Selenium WebDriver 工具,但我不知道该怎么做。有谁知道我应该在哪里以及如何向我的蜘蛛添加 Selenium 代码?

非常感谢。

#My spider looks like

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request, FormRequest

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

from selenium import selenium
import time

from login.items import SummaryItem

class titleSpider(BaseSpider):
    name = "titleSpider"
    allowed_domains = ["domain.com"]
    start_urls = ["https://www.domain.com/login"]

    # Authentication
    def parse(self, response):
        return [FormRequest.from_response(response,
                formdata={'session_key': 'myusername', 'session_password': 'mypassword'},
                callback=self.after_login)]

    # Request the webpage
    def after_login(self, response):
        # check login succeed before going on
        if "Error" in response.body:
            print "Login failed"
        else:
            print "Login successfully"
            return Request(url="https://www.domain.com/result1",
               callback=self.parse_page) # this page has some data loaded using javascript


    def __init__(self):
        CrawlSpider.__init__(self)
        self.verificationErrors = []
        # How can I know selenium passes authentication? 
        self.selenium = selenium("localhost", 4444, "*firefox", "https://www.domain.com/result1")
        print "Starting the Selenium Server!"
        self.selenium.start()
        print "Successfully, Started the Selenium Server!"

    def __del__(self):
        self.selenium.stop()
        print self.verificationErrors
        CrawlSpider.__del__(self)

    # Parse the page
    def parse_page(self, response):

        item = SummaryItem()
        hxs = HtmlXPathSelector(response)
        item['name']=hxs.select('//span[@class="name"]/text()').extract() # my spider cannot see the name.

        # Should I add selenium codes here? Can it load the page that requires authentication?
        sel= self.selenium
        sel.open(response.url)
        time.sleep(4)
        item['name']=sel.select('//span[@class="name"]/text()').extract() # 

        return item  
4

1 回答 1

0

你可以试试这样的

def __init__(self):
    """Create the spider and start a Firefox WebDriver session."""
    BaseSpider.__init__(self)
    # Bug fix: __del__ in this answer prints self.verificationErrors, but
    # the attribute was never initialised, so garbage collection of the
    # spider raised AttributeError. Initialise it here.
    self.verificationErrors = []
    self.selenium = webdriver.Firefox()

def __del__(self):
    self.selenium.quit()
    print self.verificationErrors

def parse(self, response):
    """Open the requested URL in the Selenium-driven browser.

    Loading via WebDriver lets the page's JavaScript execute before any
    scraping happens; the fixed sleep is a crude wait for rendering.
    """
    driver = self.selenium
    driver.get(response.url)
    sleep(3)
于 2013-01-25T05:32:52.523 回答