我建议您使用 Fiddler2 并正常浏览该站点。
完成之后，您应该就能轻松复现所需的页面请求，以及 JavaScript 可能执行过的任何操作，而无需大费周章或编写大量代码。
我通常使用下面的函数一次性下载多种形式的页面，并为需要登录的站点等保存 Cookie：
/**
 * Fetches $href with the shared cURL handle and returns the body, transfer
 * info, error text and an XPath object over the parsed HTML.
 *
 * This is an excerpt of a class method: it relies on $this->ch (a cURL
 * handle) and on the COOKIE_FILE, CURL_TIMEOUT and WEBBOT_NAME constants
 * being defined by the surrounding code.
 *
 * @param string $href URL to download; also sent as the Referer header.
 * @return array Keys: 'FILE' (response body, or false when the transfer
 *               failed), 'STATUS' (curl_getinfo() array), 'ERRORS'
 *               (curl_error() text), 'DOM' (DOMXPath; wraps an empty
 *               document when there was no body to parse).
 */
function Download($href)
{
    curl_setopt($this->ch, CURLOPT_COOKIEJAR, COOKIE_FILE);   // Cookie management.
    curl_setopt($this->ch, CURLOPT_COOKIEFILE, COOKIE_FILE);
    curl_setopt($this->ch, CURLOPT_TIMEOUT, CURL_TIMEOUT);    // Timeout
    curl_setopt($this->ch, CURLOPT_USERAGENT, WEBBOT_NAME);   // Webbot name
    curl_setopt($this->ch, CURLOPT_VERBOSE, FALSE);           // Minimize logs
    // NOTE(review): certificate verification is disabled, so HTTPS peers are
    // not authenticated. Kept as in the original; enable it for real use.
    curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);    // No certificate
    curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);     // Follow redirects
    curl_setopt($this->ch, CURLOPT_MAXREDIRS, 4);             // Limit redirections to four
    curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, TRUE);     // Return in string
    curl_setopt($this->ch, CURLOPT_URL, $href);               // Target site
    curl_setopt($this->ch, CURLOPT_REFERER, $href);           // Referer value

    # Create return arrays
    $return_array = array();
    $return_array['FILE'] = curl_exec($this->ch);
    $return_array['STATUS'] = curl_getinfo($this->ch);
    $return_array['ERRORS'] = curl_error($this->ch);

    $dom_document = new DOMDocument();
    // curl_exec() returns false on failure and the body may be empty;
    // loadHTML() rejects empty input (ValueError on PHP 8), so only parse
    // when there is something to parse.
    if (is_string($return_array['FILE']) && $return_array['FILE'] !== '') {
        // Collect libxml parse warnings internally instead of emitting them;
        // real-world HTML is rarely well-formed.
        $previous = libxml_use_internal_errors(true);
        $dom_document->loadHTML($return_array['FILE']);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }
    $return_array['DOM'] = new DOMXpath($dom_document);

    return $return_array;
}
这是我的 HttpHelper 类，用起来很简单，处理的只是 HTML：
<?php
/**
 * Small cURL/DOM helper for scripted browsing: downloads pages, posts forms
 * and offers a few HTML rewriting utilities.
 *
 * Constructing an instance defines several global constants (WEBBOT_NAME,
 * CURL_TIMEOUT, COOKIE_FILE, HEAD, GET, POST, EXCL_HEAD, INCL_HEAD) that the
 * methods below read.
 *
 * Note that Download() reports the cURL error text under the key 'ERRORS'
 * while http() uses 'ERROR'; both keys are kept as they were for callers.
 */
class HttpHelper {
    /** @var mixed cURL handle created in the constructor and used by Download(). */
    public $ch;

    /**
     * Creates the shared cURL handle, defines the configuration constants and
     * installs browser-like request headers on the shared handle.
     */
    public function __construct() {
        //setcookie("UserPostcode","2065",time() + 3600);
        $this->ch = curl_init();

        // The constants are global, so guard each define(): creating a second
        // HttpHelper must not trigger "constant already defined" errors.
        $constants = array(
            "WEBBOT_NAME"  => "Test Webbot",
            # Length of time cURL will wait for a response (seconds)
            "CURL_TIMEOUT" => 25,
            # Location of your cookie file. (Must be fully resolved local address)
            "COOKIE_FILE"  => "cookie.txt",
            # DEFINE METHOD CONSTANTS
            "HEAD"         => "HEAD",
            "GET"          => "GET",
            "POST"         => "POST",
            # DEFINE HEADER INCLUSION
            "EXCL_HEAD"    => FALSE,
            "INCL_HEAD"    => TRUE,
        );
        foreach ($constants as $name => $value) {
            if (!defined($name)) {
                define($name, $value);
            }
        }

        $header = array();
        $header[0] = "Accept: text/xml,application/xml,application/xhtml+xml,";
        $header[0] .= "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
        $header[] = "Cache-Control: max-age=0";
        $header[] = "Connection: keep-alive";
        $header[] = "Keep-Alive: 300";
        $header[] = "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7";
        $header[] = "Accept-Language: en-us,en;q=0.5";
        $header[] = "Pragma: "; // browsers keep this blank.
        curl_setopt($this->ch, CURLOPT_HTTPHEADER, $header); // Set Header Information
    }

    /**
     * Collects the HTML, Status, Errors and a DOM for $href using the shared
     * cURL handle.
     *
     * @param string $href URL to download; also sent as the Referer header.
     * @return array Keys: 'FILE' (response body, or false when the transfer
     *               failed), 'STATUS' (curl_getinfo() array), 'ERRORS'
     *               (curl_error() text), 'DOM' (DOMXPath; wraps an empty
     *               document when there was no body to parse).
     */
    public function Download($href)
    {
        curl_setopt($this->ch, CURLOPT_COOKIEJAR, COOKIE_FILE);   // Cookie management.
        curl_setopt($this->ch, CURLOPT_COOKIEFILE, COOKIE_FILE);
        curl_setopt($this->ch, CURLOPT_TIMEOUT, CURL_TIMEOUT);    // Timeout
        curl_setopt($this->ch, CURLOPT_USERAGENT, WEBBOT_NAME);   // Webbot name
        curl_setopt($this->ch, CURLOPT_VERBOSE, FALSE);           // Minimize logs
        // NOTE(review): certificate verification is disabled, so HTTPS peers
        // are not authenticated. Kept as in the original; enable for real use.
        curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);    // No certificate
        curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);     // Follow redirects
        curl_setopt($this->ch, CURLOPT_MAXREDIRS, 4);             // Limit redirections to four
        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, TRUE);     // Return in string
        curl_setopt($this->ch, CURLOPT_URL, $href);               // Target site
        curl_setopt($this->ch, CURLOPT_REFERER, $href);           // Referer value

        # Create return arrays
        $return_array = array();
        $return_array['FILE'] = curl_exec($this->ch);
        $return_array['STATUS'] = curl_getinfo($this->ch);
        $return_array['ERRORS'] = curl_error($this->ch);
        // loadHtmlDocument() copes with a failed transfer (false) or an empty
        // body, so 'DOM' is always a usable DOMXPath.
        $return_array['DOM'] = new DOMXpath($this->loadHtmlDocument($return_array['FILE']));

        return $return_array;
    }

    /**
     * POSTs form data and returns the response without the HTTP header.
     *
     * @param string       $target     URL to post to.
     * @param string       $ref        Referer value to send.
     * @param array|string $data_array Form fields (key => value) or a ready-made query string.
     * @return array See http().
     */
    public function http_post_form($target, $ref, $data_array)
    {
        return $this->http($target, $ref, POST, $data_array, EXCL_HEAD);
    }

    /**
     * POSTs form data and returns the response including the HTTP header.
     *
     * @param string       $target     URL to post to.
     * @param string       $ref        Referer value to send.
     * @param array|string $data_array Form fields (key => value) or a ready-made query string.
     * @return array See http().
     */
    public function http_post_withheader($target, $ref, $data_array)
    {
        // Must be dispatched on $this: there is no global http() function.
        return $this->http($target, $ref, POST, $data_array, INCL_HEAD);
    }

    /**
     * Performs a HEAD, GET or POST request on a fresh cURL handle.
     *
     * The browser-like headers installed by the constructor live on the
     * shared handle and are therefore not sent by this method.
     *
     * @param string       $target     URL to request.
     * @param string       $ref        Referer value to send.
     * @param string       $method     One of the HEAD, GET or POST constants.
     * @param array|string $data_array Fields (key => value) or a ready-made query string.
     * @param bool         $incl_head  Whether to include the response header in 'FILE'
     *                                 (ignored for HEAD, which always returns the header only).
     * @return array Keys: 'FILE' (response, or false on failure), 'STATUS'
     *               (curl_getinfo() array), 'ERROR' (curl_error() text).
     */
    public function http($target, $ref, $method, $data_array, $incl_head)
    {
        # Initialize PHP/CURL handle
        $ch = curl_init();

        # Process data, if presented
        if (is_array($data_array)) {
            # Convert data array into a query string (ie animal=dog&sport=baseball)
            $pairs = array();
            foreach ($data_array as $key => $value) {
                if (strlen(trim($value)) > 0) {
                    $pairs[] = $key . "=" . urlencode($value);
                } else {
                    // Blank values are sent as a bare key.
                    $pairs[] = $key;
                }
            }
            // An empty array means "no data": leave the query string unset so
            // no dangling "?" or empty POST body is produced.
            $query_string = count($pairs) > 0 ? join('&', $pairs) : null;
        } else {
            $query_string = $data_array;
        }

        # HEAD method configuration
        if ($method == HEAD) {
            curl_setopt($ch, CURLOPT_HEADER, TRUE);   // Include http head
            curl_setopt($ch, CURLOPT_NOBODY, TRUE);   // Do not return body
        } else {
            # GET method configuration
            if ($method == GET) {
                if (isset($query_string)) {
                    $target = $target . "?" . $query_string;
                }
                curl_setopt($ch, CURLOPT_HTTPGET, TRUE);
                curl_setopt($ch, CURLOPT_POST, FALSE);
            }
            # POST method configuration
            if ($method == POST) {
                if (isset($query_string)) {
                    curl_setopt($ch, CURLOPT_POSTFIELDS, $query_string);
                }
                curl_setopt($ch, CURLOPT_POST, TRUE);
                curl_setopt($ch, CURLOPT_HTTPGET, FALSE);
            }
            curl_setopt($ch, CURLOPT_HEADER, $incl_head);   // Include head as needed
            curl_setopt($ch, CURLOPT_NOBODY, FALSE);        // Return body
        }

        curl_setopt($ch, CURLOPT_COOKIEJAR, COOKIE_FILE);   // Cookie management.
        curl_setopt($ch, CURLOPT_COOKIEFILE, COOKIE_FILE);
        curl_setopt($ch, CURLOPT_TIMEOUT, CURL_TIMEOUT);    // Timeout
        curl_setopt($ch, CURLOPT_USERAGENT, WEBBOT_NAME);   // Webbot name
        curl_setopt($ch, CURLOPT_URL, $target);             // Target site
        curl_setopt($ch, CURLOPT_REFERER, $ref);            // Referer value
        curl_setopt($ch, CURLOPT_VERBOSE, FALSE);           // Minimize logs
        // NOTE(review): certificate verification is disabled, so HTTPS peers
        // are not authenticated. Kept as in the original; enable for real use.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);    // No certificate
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);     // Follow redirects
        curl_setopt($ch, CURLOPT_MAXREDIRS, 4);             // Limit redirections to four
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);     // Return in string

        # Create return array
        $return_array = array();
        $return_array['FILE'] = curl_exec($ch);
        $return_array['STATUS'] = curl_getinfo($ch);
        $return_array['ERROR'] = curl_error($ch);

        # Close PHP/CURL handle
        curl_close($ch);

        # Return results
        return $return_array;
    }

    /**
     * Serialises the children of a DOM node to an HTML string.
     *
     * @param DOMNode|null $element Node whose children are serialised.
     * @return string Concatenated, individually trimmed HTML of each child;
     *                empty string for null or childless nodes.
     */
    public function InnerHtml($element)
    {
        $innerHTML = "";
        if ($element !== null && $element->hasChildNodes()) {
            foreach ($element->childNodes as $child) {
                // Each child is imported into a scratch document so saveHTML()
                // emits just that subtree.
                $tmp_dom = new DOMDocument();
                $tmp_dom->appendChild($tmp_dom->importNode($child, true));
                $innerHTML .= trim($tmp_dom->saveHTML());
            }
        }
        return $innerHTML;
    }

    /**
     * Splits $data on the delimiter $split (thin wrapper around explode()).
     *
     * @param string $data  Text to split.
     * @param string $split Delimiter.
     * @return array Pieces of $data.
     */
    public function Split($data, $split)
    {
        return explode($split, $data);
    }

    /**
     * Prefixes $url to the src of every <img> that does not already start with it.
     *
     * Any src not starting with $url is prefixed, including absolute URLs
     * pointing at other hosts.
     *
     * @param string $html HTML to rewrite.
     * @param string $url  Base URL to prepend.
     * @return string Re-serialised HTML document.
     */
    public function correctImgUrls($html, $url)
    {
        $DOM = $this->loadHtmlDocument($html);
        foreach ($DOM->getElementsByTagName('img') as $img) {
            $src = $img->getAttribute('src');
            if (strpos($src, $url) !== 0) {
                $img->setAttribute('src', $url . $src);
            }
        }
        return $DOM->saveHTML();
    }

    /**
     * Prefixes $url to the href of every <a> that does not already start with it.
     *
     * Any href not starting with $url is prefixed, including absolute URLs
     * pointing at other hosts.
     *
     * @param string $html HTML to rewrite.
     * @param string $url  Base URL to prepend.
     * @return string Re-serialised HTML document.
     */
    public function correctUrls($html, $url)
    {
        $DOM = $this->loadHtmlDocument($html);
        foreach ($DOM->getElementsByTagName('a') as $anchor) {
            $href = $anchor->getAttribute('href');
            if (strpos($href, $url) !== 0) {
                // Write back to 'href'; the attribute 'a' would leave the
                // link itself untouched.
                $anchor->setAttribute('href', $url . $href);
            }
        }
        return $DOM->saveHTML();
    }

    /**
     * Replaces the href of every <a> with "#".
     *
     * @param string $html HTML to rewrite.
     * @return string Re-serialised HTML document.
     */
    public function removeHref($html)
    {
        $DOM = $this->loadHtmlDocument($html);
        foreach ($DOM->getElementsByTagName('a') as $anchor) {
            $anchor->setAttribute('href', "#");
        }
        return $DOM->saveHTML();
    }

    /**
     * Runs an XPath query.
     *
     * @param DOMXPath $dom   XPath object, e.g. the 'DOM' entry returned by Download().
     * @param string   $xPath XPath expression.
     * @return mixed Whatever $dom->query() returns.
     */
    public function QuerySelector($dom, $xPath)
    {
        return $dom->query($xPath);
    }

    /**
     * Parses HTML into a DOMDocument without emitting libxml parse warnings.
     *
     * @param mixed $html HTML text; non-strings (e.g. false from a failed
     *                    transfer) and empty strings yield an empty document.
     * @return DOMDocument
     */
    private function loadHtmlDocument($html)
    {
        $document = new DOMDocument();
        // loadHTML() rejects empty input (ValueError on PHP 8), so skip it.
        if (is_string($html) && $html !== '') {
            $previous = libxml_use_internal_errors(true);
            $document->loadHTML($html);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
        return $document;
    }

    // No destructor is defined: the original draft was commented out and
    // referenced an undefined local $ch instead of $this->ch.
}
?>
模拟登录，然后做您需要做的事情。下面是我用来登录自己的 oDesk 帐户、抓取职位发布并通过电子邮件发送给自己的示例 :P
// Load the helper class. require_once stops right here if the file is
// missing, instead of warning and then failing on "class not found" below.
require_once("Business/Http/HttpHelper.php");

$bot = new HttpHelper;
//$download = $bot->Download("https://www.odesk.com/login");

// Login form fields (placeholder credentials).
// NOTE(review): presumably the site's form uses these field names - confirm.
$data = array(
    'username' => "myusername",
    'password' => "myPassword",
);

// The login URL is used both as the target and as the Referer. Keep the
// result so 'FILE', 'STATUS' and 'ERROR' can be inspected afterwards.
$login = $bot->http_post_form("https://www.odesk.com/login", "https://www.odesk.com/login", $data);
你欠我一个人情！