0

我需要弄清楚如何抓取网站并从经过身份验证的网站下载文件。

一个脚本需要

  1. 使用用户名/密码登录本网站
  2. 浏览页面以到达下载页面
  3. 在表单中设置一些字段并点击下载按钮
  4. 保存下载的文件

我一直在研究 Jsoup(因为 Java 是我的首选),但也可以尝试 scrapy 等工具。不过我需要了解这类需求通常是否就是这样实现的,以及是否有其他技术可以做到这一点。我可以用 Selenium 之类的工具来实现,但我不想要一个以浏览器作为 UA 的工具,因为那会带来巨大的额外开销。我已经取得了一些进展,但整个 cookie 管理变得非常混乱。

谢谢,维维克

4

2 回答 2

1

如果您需要像您描述的那样与网页进行大量交互,那就绕不开使用真正的浏览器——至少根据我的经验是这样。不过,Selenium webdriver 与 phantomjs 配合得很好,因此开销不会太大。

正如下面评论中所指出的,您也可以使用 mechanize 之类的东西,但是当页面上存在会更改 DOM 的 javascript 时,此类解决方案往往无用。(见 http://wwwsearch.sourceforge.net/mechanize/faq.html#script)

于 2013-05-07T19:14:04.407 回答
0

我建议您使用 Fiddler2 并正常浏览该站点。

一旦你完成了这一步,你应该能够轻松地复现所需的页面请求以及 Javascript 可能已经做过的任何事情,而无需太多麻烦和代码。

我倾向于使用以下内容一次下载多种形式的页面,并为登录站点等保存cookie:

/**
 * Fetches a page over the shared cURL handle and returns the body, the
 * transfer info, the cURL error string and an XPath object for the body.
 *
 * Excerpt of a class method: relies on $this->ch and on the COOKIE_FILE,
 * CURL_TIMEOUT and WEBBOT_NAME constants being defined by the owning class.
 *
 * @param string $href URL to download; also sent as the Referer header.
 * @return array Keys: 'FILE' (string body, or false when the transfer failed),
 *               'STATUS' (curl_getinfo() array), 'ERRORS' (curl_error() string),
 *               'DOM' (DOMXPath over the parsed body; over an empty document
 *               when there is no body to parse).
 */
function Download($href)
   {

        curl_setopt($this->ch, CURLOPT_COOKIEJAR, COOKIE_FILE);   // Cookie management.
        curl_setopt($this->ch, CURLOPT_COOKIEFILE, COOKIE_FILE);
        curl_setopt($this->ch, CURLOPT_TIMEOUT, CURL_TIMEOUT);    // Timeout
        curl_setopt($this->ch, CURLOPT_USERAGENT, WEBBOT_NAME);   // Webbot name
        curl_setopt($this->ch, CURLOPT_VERBOSE, FALSE);           // Minimize logs
        // NOTE(review): certificate verification is disabled, which exposes the
        // login traffic to man-in-the-middle attacks. Enable it where possible.
        curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);    // No certificate
        curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);     // Follow redirects
        curl_setopt($this->ch, CURLOPT_MAXREDIRS, 4);             // Limit redirections to four
        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, TRUE);     // Return in string
        curl_setopt($this->ch, CURLOPT_URL, $href);               // Target site
        curl_setopt($this->ch, CURLOPT_REFERER, $href);           // Referer value

        # Create return arrays
        $return_array = array();
        $return_array['FILE']   = curl_exec($this->ch);
        $return_array['STATUS'] = curl_getinfo($this->ch);
        $return_array['ERRORS'] = curl_error($this->ch);

        // curl_exec() returns false on failure and the body may be empty;
        // loadHTML() rejects empty input, so only parse a real body.
        $dom_document = new DOMDocument();
        if (is_string($return_array['FILE']) && $return_array['FILE'] !== '') {
            // Collect libxml parse warnings internally instead of using "@".
            $previous = libxml_use_internal_errors(true);
            $dom_document->loadHTML($return_array['FILE']);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
        $return_array['DOM'] = new DOMXpath($dom_document);

        return $return_array;
   }

这是我的 HttpHelper 类。易于使用,它只是 Html:

<?php
/**
 * Small cURL-based helper for scripted browsing: downloading pages,
 * posting forms with a shared cookie file, and rewriting fetched HTML.
 *
 * Instantiating the class defines the global constants WEBBOT_NAME,
 * CURL_TIMEOUT, COOKIE_FILE, HEAD, GET, POST, EXCL_HEAD and INCL_HEAD
 * (only if they are not defined yet).
 */
class HttpHelper {

    /** @var mixed cURL handle reused by Download(); created in the constructor. */
    public $ch;

    /**
     * Creates the shared cURL handle, defines the configuration constants
     * and installs the browser-like request headers on the shared handle.
     */
    function __construct() {
        $this->ch = curl_init();

        // Guarded so that creating a second HttpHelper does not try to
        // redefine constants that already exist.
        $constants = array(
            "WEBBOT_NAME"  => "Test Webbot",
            # Length of time cURL will wait for a response (seconds)
            "CURL_TIMEOUT" => 25,
            # Location of the cookie file. NOTE(review): this relative path is
            # resolved against the current working directory.
            "COOKIE_FILE"  => "cookie.txt",
            # Method constants
            "HEAD"         => "HEAD",
            "GET"          => "GET",
            "POST"         => "POST",
            # Header inclusion flags
            "EXCL_HEAD"    => FALSE,
            "INCL_HEAD"    => TRUE,
        );
        foreach ($constants as $name => $value) {
            if (!defined($name)) {
                define($name, $value);
            }
        }

        $header = array();
        $header[] = "Accept: text/xml,application/xml,application/xhtml+xml,"
                  . "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
        $header[] = "Cache-Control: max-age=0";
        $header[] = "Connection: keep-alive";
        $header[] = "Keep-Alive: 300";
        $header[] = "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7";
        $header[] = "Accept-Language: en-us,en;q=0.5";
        $header[] = "Pragma: "; // browsers keep this blank.

        curl_setopt($this->ch, CURLOPT_HTTPHEADER, $header);        // Set Header Information
    }

    /**
     * Parses HTML into a DOMDocument without emitting libxml warnings.
     *
     * @param mixed $html Markup to parse; anything other than a non-empty
     *                    string yields an empty document.
     * @return DOMDocument
     */
    private function parseHtml($html)
    {
        $document = new DOMDocument();
        // loadHTML() rejects empty input, so skip parsing in that case.
        if (is_string($html) && $html !== '') {
            $previous = libxml_use_internal_errors(true);
            $document->loadHTML($html);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
        return $document;
    }

    /**
     * Fetches a page over the shared handle and collects the HTML, status,
     * errors and a DOM query object.
     *
     * @param string $href URL to download; also sent as the Referer header.
     * @return array Keys: 'FILE' (string body, or false when the transfer
     *               failed), 'STATUS' (curl_getinfo() array), 'ERRORS'
     *               (curl_error() string; note http() uses 'ERROR' instead),
     *               'DOM' (DOMXPath over the parsed body).
     */
    function Download($href)
    {
        curl_setopt($this->ch, CURLOPT_COOKIEJAR, COOKIE_FILE);   // Cookie management.
        curl_setopt($this->ch, CURLOPT_COOKIEFILE, COOKIE_FILE);
        curl_setopt($this->ch, CURLOPT_TIMEOUT, CURL_TIMEOUT);    // Timeout
        curl_setopt($this->ch, CURLOPT_USERAGENT, WEBBOT_NAME);   // Webbot name
        curl_setopt($this->ch, CURLOPT_VERBOSE, FALSE);           // Minimize logs
        // NOTE(review): certificate verification is disabled, which exposes the
        // traffic to man-in-the-middle attacks. Enable it where possible.
        curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);    // No certificate
        curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);     // Follow redirects
        curl_setopt($this->ch, CURLOPT_MAXREDIRS, 4);             // Limit redirections to four
        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, TRUE);     // Return in string
        curl_setopt($this->ch, CURLOPT_URL, $href);               // Target site
        curl_setopt($this->ch, CURLOPT_REFERER, $href);           // Referer value

        # Create return arrays
        $return_array = array();
        $return_array['FILE']   = curl_exec($this->ch);
        $return_array['STATUS'] = curl_getinfo($this->ch);
        $return_array['ERRORS'] = curl_error($this->ch);
        // A failed transfer (FILE === false) still gets a usable, empty DOM.
        $return_array['DOM']    = new DOMXpath($this->parseHtml($return_array['FILE']));

        return $return_array;
    }

    /**
     * POSTs form data and returns the response without HTTP headers.
     *
     * @param string       $target     URL to post to.
     * @param string       $ref        Referer value.
     * @param array|string $data_array Form fields (key => value) or a raw body.
     * @return array See http().
     */
    function http_post_form($target, $ref, $data_array)
    {
        return $this->http($target, $ref, POST, $data_array, EXCL_HEAD);
    }

    /**
     * POSTs form data and returns the response including HTTP headers.
     *
     * @param string       $target     URL to post to.
     * @param string       $ref        Referer value.
     * @param array|string $data_array Form fields (key => value) or a raw body.
     * @return array See http().
     */
    function http_post_withheader($target, $ref, $data_array)
    {
        return $this->http($target, $ref, POST, $data_array, INCL_HEAD);
    }

    /**
     * Performs a HEAD, GET or POST request on a fresh cURL handle.
     *
     * The handle is separate from the one used by Download(), so the headers
     * installed in the constructor are not sent here; cookies are shared
     * through COOKIE_FILE.
     *
     * @param string            $target     URL to request.
     * @param string            $ref        Referer value.
     * @param string            $method     One of HEAD, GET or POST.
     * @param array|string|null $data_array Fields (key => value) to encode as a
     *                                      query string, or a ready-made string.
     * @param bool              $incl_head  Whether to include response headers
     *                                      in 'FILE' (ignored for HEAD).
     * @return array Keys: 'FILE' (response or false), 'STATUS'
     *               (curl_getinfo() array), 'ERROR' (curl_error() string).
     */
    function http($target, $ref, $method, $data_array, $incl_head)
    {
        # Initialize PHP/CURL handle
        $ch = curl_init();

        # Process data, if presented
        if (is_array($data_array)) {
            # Convert data array into a query string (ie animal=dog&sport=baseball)
            $pairs = array(); // stays an empty list for an empty data array
            foreach ($data_array as $key => $value) {
                $value = (string) $value;
                if (strlen(trim($value)) > 0) {
                    $pairs[] = $key . "=" . urlencode($value);
                } else {
                    // Blank values are sent as a bare key.
                    $pairs[] = $key;
                }
            }
            $query_string = join('&', $pairs);
        } else {
            $query_string = $data_array;
        }

        if ($method == HEAD) {
            # HEAD method configuration
            curl_setopt($ch, CURLOPT_HEADER, TRUE);                // Return the http head
            curl_setopt($ch, CURLOPT_NOBODY, TRUE);                // Do not return the body
        } else {
            # GET method configuration
            if ($method == GET) {
                if ($query_string !== null && $query_string !== '') {
                    // Extend an existing query string rather than adding a second "?".
                    $separator = (strpos($target, '?') === false) ? "?" : "&";
                    $target = $target . $separator . $query_string;
                }
                curl_setopt($ch, CURLOPT_HTTPGET, TRUE);
                curl_setopt($ch, CURLOPT_POST, FALSE);
            }
            # POST method configuration
            if ($method == POST) {
                if ($query_string !== null) {
                    curl_setopt($ch, CURLOPT_POSTFIELDS, $query_string);
                }
                curl_setopt($ch, CURLOPT_POST, TRUE);
                curl_setopt($ch, CURLOPT_HTTPGET, FALSE);
            }
            curl_setopt($ch, CURLOPT_HEADER, $incl_head);   // Include head as needed
            curl_setopt($ch, CURLOPT_NOBODY, FALSE);        // Return body
        }

        curl_setopt($ch, CURLOPT_COOKIEJAR, COOKIE_FILE);   // Cookie management.
        curl_setopt($ch, CURLOPT_COOKIEFILE, COOKIE_FILE);
        curl_setopt($ch, CURLOPT_TIMEOUT, CURL_TIMEOUT);    // Timeout
        curl_setopt($ch, CURLOPT_USERAGENT, WEBBOT_NAME);   // Webbot name
        curl_setopt($ch, CURLOPT_URL, $target);             // Target site
        curl_setopt($ch, CURLOPT_REFERER, $ref);            // Referer value
        curl_setopt($ch, CURLOPT_VERBOSE, FALSE);           // Minimize logs
        // NOTE(review): certificate verification is disabled; see Download().
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);    // No certificate
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);     // Follow redirects
        curl_setopt($ch, CURLOPT_MAXREDIRS, 4);             // Limit redirections to four
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);     // Return in string

        # Create return array
        $return_array = array();
        $return_array['FILE']   = curl_exec($ch);
        $return_array['STATUS'] = curl_getinfo($ch);
        $return_array['ERROR']  = curl_error($ch);

        # Close PHP/CURL handle
        curl_close($ch);

        # Return results
        return $return_array;
    }

    /**
     * Serializes the children of a DOM node back to an HTML string.
     *
     * @param DOMNode|null $element Node whose content is wanted.
     * @return string Concatenated, trimmed HTML of each child; "" when the
     *                argument is not a node or has no children.
     */
    function InnerHtml($element)
    {
        $innerHTML = "";
        if ($element instanceof DOMNode && $element->hasChildNodes()) {
            foreach ($element->childNodes as $child) {
                // Each child is imported into its own document so saveHTML()
                // emits just that subtree.
                $tmp_dom = new DOMDocument();
                $tmp_dom->appendChild($tmp_dom->importNode($child, true));
                $innerHTML .= trim($tmp_dom->saveHTML());
            }
        }
        return $innerHTML;
    }

    /**
     * Splits a string on a delimiter (thin wrapper around explode()).
     *
     * @param string $data  String to split.
     * @param string $split Delimiter.
     * @return array
     */
    function Split($data, $split)
    {
        return explode($split, $data);
    }

    /**
     * Prefixes every <img> src that does not already start with $url.
     *
     * NOTE(review): absolute URLs pointing at other hosts are prefixed too.
     *
     * @param string $html Markup to rewrite.
     * @param string $url  Prefix to prepend.
     * @return string Re-serialized HTML document.
     */
    function correctImgUrls($html, $url)
    {
        $DOM = $this->parseHtml($html);

        foreach ($DOM->getElementsByTagName('img') as $img) {
            $src = $img->getAttribute('src');
            if (strpos($src, $url) !== 0) {
                $img->setAttribute('src', $url . $src);
            }
        }

        return $DOM->saveHTML();
    }

    /**
     * Prefixes every <a> href that does not already start with $url.
     *
     * NOTE(review): absolute URLs pointing at other hosts are prefixed too.
     *
     * @param string $html Markup to rewrite.
     * @param string $url  Prefix to prepend.
     * @return string Re-serialized HTML document.
     */
    function correctUrls($html, $url)
    {
        $DOM = $this->parseHtml($html);

        foreach ($DOM->getElementsByTagName('a') as $anchor) {
            $href = $anchor->getAttribute('href');
            if (strpos($href, $url) !== 0) {
                // Write back to href, the attribute that was read.
                $anchor->setAttribute('href', $url . $href);
            }
        }

        return $DOM->saveHTML();
    }

    /**
     * Replaces the href of every <a> element with "#".
     *
     * @param string $html Markup to rewrite.
     * @return string Re-serialized HTML document.
     */
    function removeHref($html)
    {
        $DOM = $this->parseHtml($html);

        foreach ($DOM->getElementsByTagName('a') as $anchor) {
            $anchor->setAttribute('href', "#");
        }

        return $DOM->saveHTML();
    }

    /**
     * Runs an XPath expression against a query object such as the 'DOM'
     * entry returned by Download().
     *
     * @param DOMXPath $dom   Object exposing query().
     * @param string   $xPath XPath expression.
     * @return mixed Whatever $dom->query() returns.
     */
    function QuerySelector($dom, $xPath)
    {
        return $dom->query($xPath);
    }

    // No explicit destructor: the shared handle in $this->ch is left for PHP
    // to release when the object is destroyed.

}
?>

模拟您的登录并做您需要做的事情:这是我用来登录我的 oDesk 帐户并抓取职位发布然后通过电子邮件发送给自己的示例:P

include "Business/Http/HttpHelper.php";
    // Log in by posting the credentials; the login page doubles as the referer.
    $bot = new HttpHelper();
    //$download = $bot->Download("https://www.odesk.com/login");
    $data = array(
        'username' => "myusername",
        'password' => "myPassword",
    );
    $bot->http_post_form("https://www.odesk.com/login", "https://www.odesk.com/login", $data);

你欠我!

于 2013-05-08T15:16:07.583 回答