0

我有一个使用多个重定向的 URL 列表,如下所示:

url1=>url1redirect1=>url1redirect2=>url1redirect3=>url1final
url2=>url2redirect1=>url2redirect2=>url2final
...

该列表采用以下格式:

url1

url2

url3

我不拥有重定向链中的所有网站。其中一些是第三方跟踪软件。

有没有办法捕获所有中间 url 和最终 url 并将它们导出到一个整洁的 csv 文件中,如下所示:

url1,url2,url3,

url1redirect1,url2redirect1,url3redirect1,

url1redirect2,url2redirect2,url3redirect2,

url1redirect3,url2final,url3redirect3,

url1final,,url3redirect4,

...

4

1 回答 1

0

我发现下面这个名为 get_all_redirects 的函数可以完成这项工作:

/**
 * Sends a single HEAD request to $url and returns the target of its
 * "Location" response header as an absolute URL.
 *
 * Only one hop is followed; call repeatedly to walk a redirect chain.
 *
 * @param string $url Absolute http:// or https:// URL (a missing scheme is
 *                    treated as http).
 * @return string|false The redirect target, or false when the URL cannot be
 *                      parsed, has no host, uses a scheme other than
 *                      http/https, the connection fails, or the response
 *                      carries no Location header.
 */
function get_redirect_url($url){
    // Trim first: a stray "\r" from a CRLF-terminated list would otherwise
    // end up inside the parsed URL.
    $url_parts = parse_url(trim((string)$url));
    if (!$url_parts) return false;
    if (!isset($url_parts['host'])) return false; // can't process relative URLs

    // A HEAD request only makes sense over HTTP(S); refuse anything else
    // instead of speaking HTTP to port 80 of an arbitrary service.
    $scheme = isset($url_parts['scheme']) ? strtolower($url_parts['scheme']) : 'http';
    if ($scheme !== 'http' && $scheme !== 'https') return false;

    $is_https     = ($scheme === 'https');
    $default_port = $is_https ? 443 : 80;
    $port         = isset($url_parts['port']) ? (int)$url_parts['port'] : $default_port;
    $host         = $url_parts['host'];
    $path         = isset($url_parts['path']) ? $url_parts['path'] : '/';
    $query        = isset($url_parts['query']) ? '?' . $url_parts['query'] : '';

    // host[:port] as used both in the Host header and in rebuilt URLs;
    // the port is only spelled out when it is not the scheme's default.
    $authority = $host . ($port !== $default_port ? ':' . $port : '');

    // https needs the ssl:// transport so the request is sent over TLS.
    $sock = fsockopen(($is_https ? 'ssl://' : '') . $host, $port, $errno, $errstr, 30);
    if (!$sock) return false;
    stream_set_timeout($sock, 30);

    $request  = "HEAD " . $path . $query . " HTTP/1.1\r\n";
    $request .= 'Host: ' . $authority . "\r\n";
    $request .= "Connection: Close\r\n\r\n";
    fwrite($sock, $request);

    $response = '';
    while (!feof($sock)) {
        $chunk = fread($sock, 8192);
        // An empty/false read means EOF, error or timeout; stop rather than
        // spin on a socket whose feof() never becomes true.
        if ($chunk === false || $chunk === '') break;
        $response .= $chunk;
        // Only the header section is needed, so stop once it is complete.
        if (strpos($response, "\r\n\r\n") !== false) break;
    }
    fclose($sock);

    $header_end = strpos($response, "\r\n\r\n");
    $headers    = ($header_end === false) ? $response : substr($response, 0, $header_end);

    // Header names are case-insensitive, hence the "i" modifier.
    if (!preg_match('/^Location:[ \t]*(.*)$/mi', $headers, $matches)) {
        return false;
    }
    $location = trim($matches[1]);
    if ($location === '') return false;

    // Protocol-relative reference: keep the current scheme.
    if (substr($location, 0, 2) === '//') {
        return $scheme . ':' . $location;
    }
    // Absolute path on the same host (and port).
    if ($location[0] === '/') {
        return $scheme . '://' . $authority . $location;
    }
    // Already an absolute URL (has its own scheme).
    if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $location)) {
        return $location;
    }
    // Query-only reference: same path, new query string.
    if ($location[0] === '?') {
        return $scheme . '://' . $authority . $path . $location;
    }
    // Path-relative reference: resolve against the directory of the
    // requested path (dot segments are left as-is).
    $slash    = strrpos($path, '/');
    $base_dir = ($slash === false) ? '/' : substr($path, 0, $slash + 1);
    return $scheme . '://' . $authority . $base_dir . $location;
}

/**
 * Follows the redirect chain that starts at $url and collects every hop.
 *
 * Each hop is obtained by calling get_redirect_url() on the previous URL.
 * The walk stops when there is no further redirect, when a hop repeats one
 * already collected (redirect loop), or when $max_redirects hops have been
 * collected.
 *
 * @param string $url           Starting URL (not included in the result).
 * @param int    $max_redirects Upper bound on collected hops; guards against
 *                              chains that never repeat a URL and would
 *                              otherwise never terminate.
 * @return array List of redirect targets in the order they were encountered;
 *               empty when $url does not redirect.
 */
function get_all_redirects($url, $max_redirects = 20){
    $redirects = array();
    while (count($redirects) < $max_redirects) {
        $newurl = get_redirect_url($url);
        if (!$newurl) {
            break; // end of chain (or the request failed)
        }
        // Strict comparison: URLs are strings and must match exactly.
        if (in_array($newurl, $redirects, true)) {
            break; // loop detected
        }
        $redirects[] = $newurl;
        $url = $newurl;
    }
    return $redirects;
}

你可以这样使用它:

// Read the URL list, one URL per line. file() strips the line endings and
// skips empty lines, so a trailing newline does not produce a bogus row.
$url_list = file("urls.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($url_list === false) {
    throw new RuntimeException("Unable to read urls.txt");
}

$out = fopen("output.csv", "w");
if ($out === false) {
    throw new RuntimeException("Unable to open output.csv for writing");
}

// One CSV row per input URL: the URL itself followed by every hop of its
// redirect chain.
foreach ($url_list as $url){
    // Drop surrounding whitespace, including the "\r" of CRLF files.
    $url = trim($url);
    if ($url === '') {
        continue;
    }
    $row = array_merge(array($url), get_all_redirects($url));
    // fputcsv quotes fields as needed, so URLs containing commas or quotes
    // cannot break the column layout. All arguments are passed explicitly
    // so the call does not depend on version-specific defaults.
    fputcsv($out, $row, ',', '"', '\\');
}

fclose($out);

urls.txt是一个包含您的网址的文本文件(每行一个网址):

http://url1.com
http://url2.com
http://url3.com
...
http://urlN.com
于 2013-08-05T07:07:50.627 回答