0

我正在尝试编写一个脚本,向 Royal Mail(皇家邮政)的包裹跟踪系统提交(POST)查询信息,并提取返回的结果。

我目前从他们的服务器收到错误 - 请参阅链接,它以某种方式检测到我没有正常使用他们的网站并给我一个错误。

我认为我已经考虑到的事情:

  • 事先解析页面,使用其表单的精确副本(POST 参数)
  • 在每个请求之间保存 cookie
  • 接受重定向标头
  • 提供一个实际有效的 Referer 标头(之前访问过的页面)

有谁知道我需要检查的其他内容或可以弄清楚我做错了什么?

完整的源代码——编辑:请查看我在下面的回答

4

3 回答 3

2

网站通常使用 2 种方法来检测您是人类还是机器人:HTTP Referer 和 User-Agent。我建议您使用 cURL,并为它指定用户代理和引用页(将“http://something/”替换为您在导航到要用 PHP 下载的 URL 之前通常会访问的页面的真实 URL):

<?php

// Tracking landing page; also passed as the Referer of the form submission below.
$url = 'http://track2.royalmail.com/portal/rm/track';

// Request the landing page before submitting the form; the returned markup is not used.
$html = file_get_contents2($url, '');

// Form fields in the order they are submitted. Each '_D:'-prefixed key is sent
// empty alongside the field it is named after.
$post = array(
    '_dyncharset' => 'ISO-8859-1',

    'trackConsigniaPage' => 'track',

    '/rmg/track/RMTrackFormHandler.value.searchCompleteUrl' => '/portal/rm/trackresults?catId=22700601&pageId=trt_rmresultspage',
    '_D:/rmg/track/RMTrackFormHandler.value.searchCompleteUrl' => '',
    '/rmg/track/RMTrackFormHandler.value.invalidInputUrl' => '/portal/rm/trackresults?catId=22700601&pageId=trt_rmresultspage&keyname=track_blank',
    '_D:/rmg/track/RMTrackFormHandler.value.invalidInputUrl' => '',
    '/rmg/track/RMTrackFormHandler.value.searchBusyUrl' => '/portal/rm/trackresults?catId=22700601&pageId=trt_busypage&keyname=3E_track',
    '_D:/rmg/track/RMTrackFormHandler.value.searchBusyUrl' => '',
    '/rmg/track/RMTrackFormHandler.value.searchWaitUrl' => '/portal/rm/trackresults?catId=22700601&timeout=true&pageId=trt_timeoutpage&keyname=3E_track',
    '_D:/rmg/track/RMTrackFormHandler.value.searchWaitUrl' => '',
    '/rmg/track/RMTrackFormHandler.value.keyname' => '3E_track',
    '_D:/rmg/track/RMTrackFormHandler.value.keyname' => '',
    '/rmg/track/RMTrackFormHandler.value.previousTrackingNumber' => '',
    '_D:/rmg/track/RMTrackFormHandler.value.previousTrackingNumber' => '',
    '/rmg/track/RMTrackFormHandler.value.trackingNumber' => 'ZW791944749GB',
    '_D:/rmg/track/RMTrackFormHandler.value.trackingNumber' => '',
    '/rmg/track/RMTrackFormHandler.track.x' => '50',
    '/rmg/track/RMTrackFormHandler.track.y' => '14',
    '_D:/rmg/track/RMTrackFormHandler.track' => '',
    '/rmg/track/RMTrackFormHandler.value.day' => '19',
    '_D:/rmg/track/RMTrackFormHandler.value.day' => '',
    '/rmg/track/RMTrackFormHandler.value.month' => '5',
    '_D:/rmg/track/RMTrackFormHandler.value.month' => '',
    '/rmg/track/RMTrackFormHandler.value.year' => '2012',
    '_D:/rmg/track/RMTrackFormHandler.value.year' => '',
    '_DARGS' => '/portal/rmgroup/apps/templates/html/rm/rmTrackResultPage.jsp',
);

// Submit the form to the handler URL, presenting the landing page as the Referer.
$url2 = 'http://track2.royalmail.com/portal/rm?_DARGS=/portal/rmgroup/apps/templates/html/rm/rmTrackAndTraceForm.jsp';
$html2 = file_get_contents2($url2, $url, $post);

// Print whatever the submission returned.
echo $html2;

/**
 * Fetch a URL with cURL, sending a browser-like User-Agent and the given Referer.
 *
 * Cookies are loaded from and saved to 'cookie.txt' in the current working
 * directory, so cookies set by one call are sent on the next one. Redirects
 * are followed by cURL itself.
 *
 * @param string            $address URL to request.
 * @param string            $referer Value for the Referer header.
 * @param array|string|bool $post    Form fields to POST (an array is URL-encoded,
 *                                   a string is sent as-is); any falsy value
 *                                   results in a GET request.
 *
 * @return string|bool The response body (possibly an empty string), or false
 *                     when the transfer itself failed.
 */
function file_get_contents2($address, $referer, $post = false)
{
    $useragent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1";

    $c = curl_init();
    curl_setopt($c, CURLOPT_URL, $address);
    curl_setopt($c, CURLOPT_USERAGENT, $useragent);
    curl_setopt($c, CURLOPT_HEADER, 0);
    curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);

    if ($post)
    {
        // Arrays are encoded as application/x-www-form-urlencoded; a string is
        // assumed to be an already-encoded body.
        $postF = is_array($post) ? http_build_query($post) : $post;
        curl_setopt($c, CURLOPT_POST, true);
        curl_setopt($c, CURLOPT_POSTFIELDS, $postF);
    }

    // COOKIEJAR alone only *writes* cookies when the handle is released;
    // COOKIEFILE is what makes the next call send them back.
    curl_setopt($c, CURLOPT_COOKIEFILE, 'cookie.txt');
    curl_setopt($c, CURLOPT_COOKIEJAR, 'cookie.txt');
    curl_setopt($c, CURLOPT_REFERER, $referer);
    curl_setopt($c, CURLOPT_FOLLOWLOCATION, 1);

    // With RETURNTRANSFER set, curl_exec() yields the body or exactly false.
    // Returning it unchanged keeps an empty or "0" body from being reported
    // as a failure.
    return curl_exec($c);
}

上面更新的代码返回给我:

Item ZW791944749GB was posted at 1 High Street RG17 9TJ on 19/05/12 and is being progressed through our network for delivery. 

所以它似乎有效。

于 2012-05-19T20:28:40.370 回答
2

我现在已经修复了它。问题出在 PHP 的 cURL 跟随重定向的方式上:它在跟随重定向时似乎并不总是重新提交 POST 数据,而是改为发送 GET 请求。

为了解决这个问题,我禁用了 curl 跟随位置,curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);然后自己构建了一个递归工作的跟随位置系统。本质上,它从响应中提取位置标头,检查 301 或 302,然后根据需要再次运行该方法。

这意味着信息肯定会再次发布。

我还改进了用户代理字符串,直接复制了我当前浏览器的字符串;它在 2012 年仍在使用中,因此应该在很长一段时间内都不会被屏蔽!

这是 curl 类的最终版本(直接贴在这里,以防链接失效——过去我曾因此被踩),它可以正常工作:

/**
 * Make a curl request respecting redirects
 * Also supports posts
 *
 * Redirects are followed by this class rather than by cURL, so that the POST
 * body is submitted again on every hop. Both HTTP redirects (301, 302, 307,
 * 308) and simple JavaScript "window.location" redirects are followed, up to
 * MAX_REDIRECTS hops per makeRequest() call.
 *
 * Usage: construct, call makeRequest(), then cast the object to string to get
 * the body of the final page.
 */
class pegCurlRequest {
  /** Maximum number of redirects followed by a single makeRequest() call. */
  const MAX_REDIRECTS = 10;

  private $url, $postFields = array(), $referer = NULL, $timeout = 3;
  private $debug = false, $postString = "";
  private $curlInfo = array();
  private $content = "";
  private $response_meta_info = array();

  /** Path of the cookie file shared by all instances in this process. */
  public static $cookie;

  /**
   * @param string      $url        URL to request.
   * @param array       $postFields Raw (not URL-encoded) form fields; empty means GET.
   * @param string|null $referer    Referer to send; NULL derives one from the URL.
   * @param int         $timeout    Connect and total timeout, in seconds.
   */
  public function __construct($url, $postFields = array(), $referer = NULL, $timeout = 3) {
    $this->setUrl($url);
    $this->setPost($postFields);
    $this->setReferer($referer);
    $this->setTimeout($timeout);
    // One cookie file per process, created lazily in the system temp directory
    if(empty(self::$cookie)) self::$cookie = tempnam(sys_get_temp_dir(), "pegCurlRequest");
  }

  public function setUrl($url) {
    $this->url = $url;
  }

  public function setTimeout($timeout) {
    $this->timeout = $timeout;
  }

  /**
   * Enable or disable the HTML debug output printed around each request.
   */
  public function setDebug($debug) {
    $this->debug = (bool) $debug;
  }

  /**
   * Replace the form fields; anything that is not an array is ignored.
   */
  public function setPost($postFields) {
    if(is_array($postFields)) {
      $this->postFields = $postFields;
    }
    $this->updatePostString();
  }

  /**
   * Rebuild the URL-encoded POST body from the current form fields.
   */
  public function updatePostString() {
    // http_build_query() percent-encodes keys and values, so a value that
    // itself contains '&' or '=' can no longer corrupt the request body.
    $this->postString = empty($this->postFields) ? "" : http_build_query($this->postFields, '', '&');
  }

  public function setReferer($referer) {
    //Set a referer; when empty, makeRequest() derives one from the URL
    $this->referer = $referer;
  }

  /**
   * Print the request about to be sent (only when debugging is enabled).
   */
  public function debugInfo() {
    if(!$this->debug) return;

    $rows = array('URL' => $this->url);
    if(!empty(self::$cookie)) $rows['Cookie String'] = self::$cookie;
    if(!empty($this->referer)) $rows['Referer'] = $this->referer;
    if(!empty($this->postString)) $rows['Post String'] = $this->postString;

    echo $this->renderDebugTable('Pre Curl Request', $rows, 'Post Values', $this->postFields);
  }

  /**
   * Print what cURL reported about the finished request (only when debugging is enabled).
   */
  public function debugFurtherInfo() {
    if(!$this->debug) return;

    $rows = array('URL' => $this->url);
    if(!empty($this->referer)) $rows['Referer'] = $this->referer;

    echo $this->renderDebugTable('Post Curl Request', $rows, 'Curl Info', $this->curlInfo);
  }

  /**
   * Build one well-formed debug table: a title row, label/value rows and an
   * optional nested key/value table. Every value is HTML-escaped.
   */
  private function renderDebugTable($title, $rows, $nestedLabel, $nestedRows) {
    $html = "<table><tr><td colspan='2'><b><u>".$this->escapeDebug($title)."</u></b></td></tr>";
    foreach($rows as $label=>$value) {
      $html .= "<tr><td><b>".$this->escapeDebug($label).": </b></td><td>".$this->escapeDebug($value)."</td></tr>";
    }
    if(!empty($nestedRows)) {
      $html .= "<tr><td><b>".$this->escapeDebug($nestedLabel).":</b></td><td><table>";
      foreach($nestedRows as $key=>$value) {
        $html .= "<tr><td>".$this->escapeDebug($key)."</td><td>".$this->escapeDebug($value)."</td></tr>";
      }
      $html .= "</table></td></tr>";
    }
    return $html."</table><br />\n";
  }

  /**
   * HTML-escape a debug value; non-scalars (some curl info entries are arrays)
   * are dumped with print_r() first.
   */
  private function escapeDebug($value) {
    if(!is_scalar($value)) $value = print_r($value, true);
    return htmlspecialchars((string) $value, ENT_QUOTES, 'UTF-8');
  }

  /**
   * Make the actual request
   *
   * Follows redirects until a final page is reached or MAX_REDIRECTS hops
   * have been made; the body of the last response is then available through
   * __toString().
   *
   * @param string|null $url Optional URL overriding the one set earlier.
   */
  public function makeRequest($url=NULL) {
    //Shorthand request
    if(!is_null($url))
      $this->setUrl($url);

    $hops = 0;
    while(true) {
      $content = $this->executeOnce();
      $target = $this->findRedirectTarget($content);

      // Stop on the final page, or when a redirect loop exhausts the budget
      if(is_null($target) || $hops >= self::MAX_REDIRECTS) break;
      $hops++;

      $current = $this->getTrueUrl();
      $this->setReferer($current); //update referer
      $this->setUrl($this->resolveUrl($target, $current));
    }

    $this->content = $content; //set the content - final page
  }

  /**
   * Perform a single transfer for the current URL and record its metadata.
   *
   * @return string Response body; an empty string when the transfer failed
   *                (the cURL error text is then stored under 'curl_error'
   *                in getHeaders()).
   */
  private function executeOnce() {
    //Output debug info
    $this->debugInfo();

    // Forget the previous hop, otherwise its Location header and curl info
    // would shadow the values of this response.
    $this->response_meta_info = array();

    //Setting up the starting information
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 Safari/536.11" );
    curl_setopt($ch, CURLOPT_URL, $this->url);
    if(!empty(self::$cookie)) {
      // COOKIEFILE reads the cookies saved by earlier requests, COOKIEJAR
      // writes them back when the handle is closed; both are needed.
      curl_setopt($ch, CURLOPT_COOKIEFILE, self::$cookie);
      curl_setopt($ch, CURLOPT_COOKIEJAR, self::$cookie);
    }
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false); //redirects are handled in makeRequest()
    curl_setopt($ch, CURLOPT_ENCODING, "gzip");

    //register a callback function which will process the headers
    curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'readHeader'));

    //Some servers (like Lighttpd) will not process the curl request without this header and will return error code 417 instead.
    curl_setopt($ch, CURLOPT_HTTPHEADER, array("Expect:"));

    //Referer
    if(empty($this->referer)) {
      curl_setopt($ch, CURLOPT_REFERER, dirname($this->url));
    } else {
      curl_setopt($ch, CURLOPT_REFERER, $this->referer);
    }

    //Posts
    if(!empty($this->postFields)) {
      curl_setopt($ch, CURLOPT_POST, true);
      curl_setopt($ch, CURLOPT_POSTFIELDS, $this->postString);
    }

    //Transfers and timeouts
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_AUTOREFERER, false);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout);
    curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);

    //Debug
    if($this->debug) {
      curl_setopt($ch, CURLOPT_VERBOSE, true); // logging stuffs
      curl_setopt($ch, CURLINFO_HEADER_OUT, true); // enable tracking
    }

    //Get the content and the transfer info
    $content = curl_exec($ch);
    $info = curl_getinfo($ch);
    if(!is_array($info)) $info = array();

    if($content === false) {
      // Keep the failure visible to callers without breaking __toString()
      $this->response_meta_info['curl_error'] = curl_error($ch);
      $content = "";
    }

    curl_close($ch); //be nice - this also flushes the cookie jar

    //curl info plus whatever the header callback collected
    $this->response_meta_info = array_merge($info, $this->response_meta_info);
    $this->curlInfo = $info;

    //Output debug info
    $this->debugFurtherInfo();

    return $content;
  }

  /**
   * Decide whether the last response redirects somewhere.
   *
   * @param string $content Body of the last response.
   * @return string|null Redirect target (possibly relative), or NULL for a final page.
   */
  private function findRedirectTarget($content) {
    $code = isset($this->curlInfo['http_code']) ? (int) $this->curlInfo['http_code'] : 0;

    //Are we being redirected? (303 is left alone: it would require switching to GET)
    if(in_array($code, array(301, 302, 307, 308), true)) {
      $location = $this->getHeaderLocation();
      return empty($location) ? NULL : $location;
    }

    //Is there a javascript redirect on the page? The captures stop at the
    //closing quote so that later quotes on the same line are not swallowed.
    if(preg_match("/window\.location\.replace\('([^']*)'\)/i", $content, $value) ||
      preg_match('/window\.location\="([^"]*)"/i', $content, $value)) {
      return $value[1] === "" ? NULL : $value[1];
    }

    return NULL;
  }

  /**
   * Turn a redirect target into an absolute URL relative to $base.
   * Targets that already carry a scheme, and bases that cannot be parsed,
   * are passed through unchanged. Credentials embedded in $base are not
   * carried over.
   */
  private function resolveUrl($location, $base) {
    $location = trim($location);
    if($location === "" || preg_match('#^[a-z][a-z0-9+.\-]*:#i', $location)) {
      return $location;
    }

    $parts = parse_url($base);
    if(!is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) {
      return $location;
    }

    //Protocol-relative: //host/path
    if(substr($location, 0, 2) === '//') {
      return $parts['scheme'].':'.$location;
    }

    $authority = $parts['scheme'].'://'.$parts['host'].(isset($parts['port']) ? ':'.$parts['port'] : '');
    $path = isset($parts['path']) ? $parts['path'] : '/';

    //Root-relative: /path
    if($location[0] === '/') {
      return $authority.$location;
    }
    //Query-only: ?a=b keeps the current path
    if($location[0] === '?') {
      return $authority.$path.$location;
    }
    //Path-relative: resolved against the directory of the current path
    return $authority.substr($path, 0, strrpos($path, '/') + 1).$location;
  }

  /**
   * Get the url after any redirection
   */
  public function getTrueUrl() {
    return isset($this->curlInfo['url']) ? $this->curlInfo['url'] : $this->url;
  }

  /**
   * Body of the final page fetched by makeRequest().
   */
  public function __toString() {
    return (string) $this->content;
  }

  /**
   * CURL callback function for reading and processing headers
   * Override this for your needs
   *
   * @param mixed  $ch     The cURL handle the header belongs to
   * @param string $header One raw header line, including its line ending
   * @return integer Number of bytes handled; cURL aborts the transfer if this
   *                 differs from the length of $header
   */
  protected function readHeader($ch, $header) {
      //This is run for every header, use ifs to grab and add
      $location = $this->extractCustomHeader('Location:', '\r?\n', $header);
      if ($location !== false && trim($location) !== '') {
          $this->response_meta_info['location'] = trim($location);
      }
      return strlen($header);
  }

  /**
   * Extract the text between a literal header name and a terminator.
   *
   * @param string $start  Literal prefix, matched case-insensitively at the start of the line
   * @param string $end    Regular-expression fragment that terminates the value
   * @param string $header Raw header line
   * @return string|bool The captured text, or false when the line does not match
   */
  private function extractCustomHeader($start,$end,$header) {
      $pattern = '/^'. preg_quote($start, '/') .'(.*?)'. $end .'/i';
      if (preg_match($pattern, $header, $result)) {
          return $result[1];
      } else {
          return false;
      }
  }

  /**
   * Metadata of the last response: the curl_getinfo() values plus 'location'
   * (when a Location header was sent) and 'curl_error' (when the transfer failed).
   */
  public function getHeaders() {
      return $this->response_meta_info;
  }

  /**
   * Location header of the last response, or NULL when there was none.
   */
  public function getHeaderLocation() {
      return isset($this->response_meta_info['location']) ? $this->response_meta_info['location'] : NULL;
  }
}
于 2012-05-20T13:24:20.117 回答
1

首先,您面对的是皇家邮政,所以我不确定这个简单的小技巧能否骗过他们……

不过您可以尝试用 ini_set() 快速伪造您的用户代理:

// Set the User-Agent used by PHP's HTTP stream wrapper (e.g. file_get_contents() on a URL).
// NOTE(review): this ini setting is not read by the cURL extension; there, use CURLOPT_USERAGENT.
ini_set('user_agent', 'Mozilla/5.0 (X11; CrOS i686 1660.57.0) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.46 Safari/535.19');

那是一个 Ubuntu chrome 用户代理字符串。

cURL用户代理字符串看起来完全不同。例如:

curl/7.15.5 (i686-redhat-linux-gnu) libcurl/7.15.5 OpenSSL/0.9.8b zlib/1.2.3 libidn/0.6.5

这只是碰碰运气——但他们可能会拒绝并非来自已识别浏览器的请求。

于 2012-05-19T19:24:16.750 回答