我正在尝试构建一个脚本,将信息发布到RoyalMail 跟踪系统并提取输出。
我目前从他们的服务器收到错误 - 请参阅链接,它以某种方式检测到我没有正常使用他们的网站并给我一个错误。
我认为我已经考虑到的事情:
- 通过事先解析使用其形式的精确副本(post 参数)
- 在每个请求之间保存 cookie
- 接受重定向标头
- 提供一个实际有效的refer header(之前访问过的页面)
有谁知道我需要检查的其他内容或可以弄清楚我做错了什么?
源的完整副本在编辑:请在下面查看我的答案
我正在尝试构建一个脚本,将信息发布到RoyalMail 跟踪系统并提取输出。
我目前从他们的服务器收到错误 - 请参阅链接,它以某种方式检测到我没有正常使用他们的网站并给我一个错误。
我认为我已经考虑到的事情:
有谁知道我需要检查的其他内容或可以弄清楚我做错了什么?
源的完整副本在编辑:请在下面查看我的答案
网站通常使用 2 种方法来检测您是人类还是机器人:HTTP REFERER 和 USER AGENT。我建议您使用 Curl 它指定的用户代理和引用者(将“http://something/”替换为您通常会访问的页面的真实 URL,然后再导航到您要使用 PHP 下载的 URL):
<?php
$url = 'http://track2.royalmail.com/portal/rm/track';
$html = file_get_contents2($url, '');
$post['_dyncharset'] = 'ISO-8859-1';
$post['trackConsigniaPage'] = 'track';
$post['/rmg/track/RMTrackFormHandler.value.searchCompleteUrl'] = '/portal/rm/trackresults?catId=22700601&pageId=trt_rmresultspage';
$post['_D:/rmg/track/RMTrackFormHandler.value.searchCompleteUrl'] = '';
$post['/rmg/track/RMTrackFormHandler.value.invalidInputUrl'] = '/portal/rm/trackresults?catId=22700601&pageId=trt_rmresultspage&keyname=track_blank';
$post['_D:/rmg/track/RMTrackFormHandler.value.invalidInputUrl'] = '';
$post['/rmg/track/RMTrackFormHandler.value.searchBusyUrl'] = '/portal/rm/trackresults?catId=22700601&pageId=trt_busypage&keyname=3E_track';
$post['_D:/rmg/track/RMTrackFormHandler.value.searchBusyUrl'] = '';
$post['/rmg/track/RMTrackFormHandler.value.searchWaitUrl'] = '/portal/rm/trackresults?catId=22700601&timeout=true&pageId=trt_timeoutpage&keyname=3E_track';
$post['_D:/rmg/track/RMTrackFormHandler.value.searchWaitUrl'] = '';
$post['/rmg/track/RMTrackFormHandler.value.keyname'] = '3E_track';
$post['_D:/rmg/track/RMTrackFormHandler.value.keyname'] = '';
$post['/rmg/track/RMTrackFormHandler.value.previousTrackingNumber'] = '';
$post['_D:/rmg/track/RMTrackFormHandler.value.previousTrackingNumber'] = '';
$post['/rmg/track/RMTrackFormHandler.value.trackingNumber'] = 'ZW791944749GB';
$post['_D:/rmg/track/RMTrackFormHandler.value.trackingNumber'] = '';
$post['/rmg/track/RMTrackFormHandler.track.x'] = '50';
$post['/rmg/track/RMTrackFormHandler.track.y'] = '14';
$post['_D:/rmg/track/RMTrackFormHandler.track'] = '';
$post['/rmg/track/RMTrackFormHandler.value.day'] = '19';
$post['_D:/rmg/track/RMTrackFormHandler.value.day'] = '';
$post['/rmg/track/RMTrackFormHandler.value.month'] = '5';
$post['_D:/rmg/track/RMTrackFormHandler.value.month'] = '';
$post['/rmg/track/RMTrackFormHandler.value.year'] = '2012';
$post['_D:/rmg/track/RMTrackFormHandler.value.year'] = '';
$post['_DARGS'] = '/portal/rmgroup/apps/templates/html/rm/rmTrackResultPage.jsp';
$url2 = 'http://track2.royalmail.com/portal/rm?_DARGS=/portal/rmgroup/apps/templates/html/rm/rmTrackAndTraceForm.jsp';
$html2 = file_get_contents2($url2, $url, $post);
echo $html2;
function file_get_contents2($address, $referer, $post = false)
{
$useragent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1";
$c = curl_init();
curl_setopt($c, CURLOPT_URL, $address);
curl_setopt($c, CURLOPT_USERAGENT, $useragent);
curl_setopt($c, CURLOPT_HEADER, 0);
curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
if ($post)
{
$postF = http_build_query($post);
curl_setopt($c, CURLOPT_POST, true);
curl_setopt($c, CURLOPT_POSTFIELDS, $postF);
}
curl_setopt($c, CURLOPT_COOKIEJAR, 'cookie.txt');
//curl_setopt($c, CURLOPT_FRESH_CONNECT, 1);
curl_setopt($c, CURLOPT_REFERER, $referer);
curl_setopt($c, CURLOPT_FOLLOWLOCATION, 1);
if (!$data = curl_exec($c))
{
return false;
}
return $data;
}
上面更新的代码返回给我:
Item ZW791944749GB was posted at 1 High Street RG17 9TJ on 19/05/12 and is being progressed through our network for delivery.
所以它似乎有效。
我现在已经修复它,问题在于 PHP curl 和跟随重定向,它似乎并不总是发布请求数据并在跟随时发送 GET 请求。
为了解决这个问题,我禁用了 curl 跟随位置,curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
然后自己构建了一个递归工作的跟随位置系统。本质上,它从响应中提取位置标头,检查 301 或 302,然后根据需要再次运行该方法。
这意味着信息肯定会再次发布。
我还改进了用户代理字符串,简单地复制我当前的字符串,因为它不会被阻止很长一段时间,因为它在 2012 年正在使用中!
这是 curl 类的最终副本(以防链接失效 - 在过去被否决),它正在工作:
/**
* Make a curl request respecting redirects
* Also supports posts
*/
class pegCurlRequest {
private $url, $postFields = array(), $referer = NULL, $timeout = 3;
private $debug = false, $postString = "";
private $curlInfo = array();
private $content = "";
private $response_meta_info = array();
static $cookie;
function __construct($url, $postFields = array(), $referer = NULL, $timeout = 3) {
$this->setUrl($url);
$this->setPost($postFields);
$this->setReferer($referer);
$this->setTimeout($timeout);
if(empty(self::$cookie)) self::$cookie = tempnam("/tmp", "pegCurlRequest"); //one time cookie
}
function setUrl($url) {
$this->url = $url;
}
function setTimeout($timeout) {
$this->timeout = $timeout;
}
function setPost($postFields) {
if(is_array($postFields)) {
$this->postFields = $postFields;
}
$this->updatePostString();
}
function updatePostString() {
//Cope with posting
$this->postString = "";
if(!empty($this->postFields)) {
foreach($this->postFields as $key=>$value) { $this->postString .= $key.'='.$value.'&'; }
$this->postString= rtrim($this->postString,'&'); //Trim off the waste
}
}
function setReferer($referer) {
//Set a referee either specified or based on the url
$this->referer = $referer;
}
function debugInfo() {
//Debug
if($this->debug) {
echo "<table><tr><td colspan='2'><b><u>Pre Curl Request</b><u></td></tr>";
echo "<tr><td><b>URL: </b></td><td>{$this->url}</td></tr>";
if(!empty(self::$cookie)) echo "<tr><td><b>Cookie String: </b></td><td>".self::$cookie."</td></tr>";
if(!empty($this->referer)) echo "<tr><td><b>Referer: </b></td><td>".$this->referer."</td></tr>";
if(!empty($this->postString)) echo "<tr><td><b>Post String: </b></td><td>".$this->postString."</td></tr>";
if(!empty($this->postFields)) {
echo "<tr><td><b>Post Values:</b></td><td><table>";
foreach($this->postFields as $key=>$value)
echo "<tr><td>$key</td><td>$value</td></tr>";
echo "</table>";
}
echo "</td></tr></table><br />\n";
}
}
function debugFurtherInfo() {
//Debug
if($this->debug) {
echo "<table><tr><td colspan='2'><b><u>Post Curl Request</b><u></td></tr>";
echo "<tr><td><b>URL: </b></td><td>{$this->url}</td></tr>";
if(!empty($this->referer)) echo "<tr><td><b>Referer: </b></td><td>".$this->referer."</td></tr>";
if(!empty($this->curlInfo)) {
echo "<tr><td><b>Curl Info:</b></td><td><table>";
foreach($this->curlInfo as $key=>$value)
echo "<tr><td>$key</td><td>$value</td></tr>";
echo "</table>";
}
echo "</td></tr></table><br />\n";
}
}
/**
* Make the actual request
*/
function makeRequest($url=NULL) {
//Shorthand request
if(!is_null($url))
$this->setUrl($url);
//Output debug info
$this->debugInfo();
//Using a shared cookie
$cookie = self::$cookie;
//Setting up the starting information
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 Safari/536.11" );
curl_setopt($ch, CURLOPT_URL, $this->url);
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_ENCODING, "gzip");
//register a callback function which will process the headers
//this assumes your code is into a class method, and uses $this->readHeader as the callback //function
curl_setopt($ch, CURLOPT_HEADERFUNCTION, array(&$this,'readHeader'));
//Some servers (like Lighttpd) will not process the curl request without this header and will return error code 417 instead.
curl_setopt($ch, CURLOPT_HTTPHEADER, array("Expect:"));
//Referer
if(empty($this->referer)) {
curl_setopt($ch, CURLOPT_REFERER, dirname($this->url));
} else {
curl_setopt($ch, CURLOPT_REFERER, $this->referer);
}
//Posts
if(!empty($this->postFields)) {
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_POSTFIELDS, $this->postString);
}
//Redirects, transfers and timeouts
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, false);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
//Debug
if($this->debug) {
curl_setopt($ch, CURLOPT_VERBOSE, true); // logging stuffs
curl_setopt($ch, CURLINFO_HEADER_OUT, true); // enable tracking
}
//Get the content and the header info
$content = curl_exec($ch);
$response = curl_getinfo($ch);
//get the default response headers
$headers = curl_getinfo($ch);
//add the headers from the custom headers callback function
$this->response_meta_info = array_merge($headers, $this->response_meta_info);
curl_close($ch); //be nice
//Curl info
$this->curlInfo = $response;
//Output debug info
$this->debugFurtherInfo();
//Are we being redirected?
if ($response['http_code'] == 301 || $response['http_code'] == 302) {
$location = $this->getHeaderLocation();
if(!empty($location)) { //the location exists
$this->setReferer($this->getTrueUrl()); //update referer
return $this->makeRequest($location); //recurse to location
}
}
//Is there a javascript redirect on the page?
elseif (preg_match("/window\.location\.replace\('(.*)'\)/i", $content, $value) ||
preg_match("/window\.location\=\"(.*)\"/i", $content, $value)) {
$this->setReferer($this->getTrueUrl()); //update referer
return $this->makeRequest($value[1]); //recursion
} else {
$this->content = $content; //set the content - final page
}
}
/**
* Get the url after any redirection
*/
function getTrueUrl() {
return $this->curlInfo['url'];
}
function __toString() {
return $this->content;
}
/**
* CURL callback function for reading and processing headers
* Override this for your needs
*
* @param object $ch
* @param string $header
* @return integer
*/
private function readHeader($ch, $header) {
//This is run for every header, use ifs to grab and add
$location = $this->extractCustomHeader('Location: ', '\n', $header);
if ($location) {
$this->response_meta_info['location'] = trim($location);
}
return strlen($header);
}
private function extractCustomHeader($start,$end,$header) {
$pattern = '/'. $start .'(.*?)'. $end .'/';
if (preg_match($pattern, $header, $result)) {
return $result[1];
} else {
return false;
}
}
function getHeaders() {
return $this->response_meta_info;
}
function getHeaderLocation() {
return $this->response_meta_info['location'];
}
}
首先,您说的是皇家邮政。所以我不确定这个简单的小技巧是否会绊倒他们......
但是您可以尝试快速欺骗您的用户代理ini_set()
-
ini_set('user_agent', 'Mozilla/5.0 (X11; CrOS i686 1660.57.0) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.46 Safari/535.19'
那是一个 Ubuntu chrome 用户代理字符串。
cURL
用户代理字符串看起来完全不同。例如:
curl/7.15.5 (i686-redhat-linux-gnu) libcurl/7.15.5 OpenSSL/0.9.8b zlib/1.2.3 libidn/0.6.5
这是一个很长的机会 - 但他们可能会拒绝不是来自已识别浏览器的请求。