0

我正在使用此代码登录该站点。但是成功登录后,我被重定向到主页。

但我想在登录后转到所需的网址,然后从那里开始抓取。

   class myPySpider(InitSpider):
        """Spider that logs in through ``login_page`` before crawling ``start_urls``.

        Flow: ``init_request`` fetches the login page, ``login`` submits the
        login form, ``check_login_response`` verifies the result and, on
        success, calls ``self.initialized()`` so crawling can begin.
        ``parse`` then extracts one item per entry of the result list.
        """

        name = 'MyPy'
        allowed_domains = ['example.com']
        login_page = 'https://www.example.com/login'
        start_urls = ["http://www.example.com/list.php"]

        def init_request(self):
            """Return the request for the login page; called before crawling starts."""
            return Request(url=self.login_page, callback=self.login)

        def login(self, response):
            """Generate a login request from the form found in ``response``."""
            # NOTE(review): credentials are hard-coded in the source; consider
            # loading them from settings or the environment.
            return FormRequest.from_response(
                response,
                formdata={'session_key': 'user@email.com', 'session_password': 'somepassword'},
                callback=self.check_login_response)

        def check_login_response(self, response):
            """Check the login response to see whether we are logged in.

            Returns the result of ``self.initialized()`` when the page contains
            the "Sign Out" marker, otherwise ``None`` (nothing further is
            scheduled from this callback).
            """
            # ``response.body`` is raw bytes. A bytes literal keeps the check
            # identical on Python 2 and avoids the TypeError that a
            # ``str in bytes`` membership test raises on Python 3.
            if b"Sign Out" not in response.body:
                self.log("\n\n\nFailed, Bad times :(\n\n\n")
                # Something went wrong, we couldn't log in, so nothing happens.
                return None

            self.log("\n\n\nSuccessfully logged in. Let's start crawling!\n\n\n")
            # Now the crawling can begin.
            return self.initialized()

        def parse(self, response):
            """Extract a ``LinkedPyItem`` (title, link) for every result entry.

            Returns a list of items, one per ``<li>`` under
            ``<ol id="result-set">``. ``title`` and ``link`` hold the lists
            produced by ``extract()``.
            """
            self.log("\n\n\n We got data! \n\n\n")
            hxs = HtmlXPathSelector(response)
            sites = hxs.select("//ol[@id='result-set']/li")
            items = []
            for site in sites:
                item = LinkedPyItem()
                item['title'] = site.select('h2/a/text()').extract()
                item['link'] = site.select('h2/a/@href').extract()
                items.append(item)
            return items
4

0 回答 0