我正在开发一个网络爬虫,但我需要在请求之间保留 cookie,就像我在 PHP 中使用 curl 所做的那样。但是,如果我尝试CookieContainer
在 C# 中使用对象,它似乎不会从响应中获取所有 cookie 并将它们发送到下一个请求。
这是我的 C# 类:
public class Scraper
{
public string Username { get; set; }
public string Password { get; set; }
public string UserAgent { get; set; }
public string ContentType { get; set; }
public CookieCollection Cookies { get; set; }
public CookieContainer Container { get; set; }
public Scraper()
{
UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0";
ContentType = "application/x-www-form-urlencoded";
Cookies = new CookieCollection();
Container = new CookieContainer();
}
public string Load(string uri, string postData = "", NetworkCredential creds = null, int timeout = 60000, string host = "", string referer = "", string requestedwith = "")
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
request.CookieContainer = Container;
request.CookieContainer.Add(Cookies);
request.UserAgent = UserAgent;
request.AllowWriteStreamBuffering = true;
request.ProtocolVersion = HttpVersion.Version11;
request.AllowAutoRedirect = true;
request.ContentType = ContentType;
request.PreAuthenticate = true;
if (requestedwith.Length > 0)
request.Headers["X-Requested-With"] = requestedwith;
if (host.Length > 0)
request.Host = host;
if (referer.Length > 0)
request.Referer = referer;
if (timeout > 0)
request.Timeout = timeout;
if (creds != null)
request.Credentials = creds;
if (postData.Length > 0)
{
request.Method = "POST";
ASCIIEncoding encoding = new ASCIIEncoding();
byte[] data = encoding.GetBytes(postData);
request.ContentLength = data.Length;
Stream newStream = request.GetRequestStream(); //open connection
newStream.Write(data, 0, data.Length); // Send the data.
newStream.Close();
}
else
request.Method = "GET";
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
Cookies = response.Cookies;
StringBuilder page;
using (StreamReader sr = new StreamReader(response.GetResponseStream()))
{
page = new StringBuilder(sr.ReadToEnd());
page = page.Replace("\r\n", ""); // strip all new lines and tabs
page = page.Replace("\r", ""); // strip all new lines and tabs
page = page.Replace("\n", ""); // strip all new lines and tabs
page = page.Replace("\t", ""); // strip all new lines and tabs
}
string str = page.ToString();
str = Regex.Replace(str, @">\s+<", "><");
return str;
}
}
这是我在 cookie jar 中加载和维护 cookie 的 PHP 代码:
private function load($url = 'http://www.google.com/', $postData = array(), $headers = FALSE)
{
$useragent = "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; " . $this->locale . "; rv:1.9.2.10) Gecko/20100914 BRI/1 Firefox/3.6.10 ( .NET CLR 3.5.30729)";
$curl = curl_init();
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);
curl_setopt($curl, CURLOPT_HEADER, FALSE);
if($headers) curl_setopt($curl, CURLOPT_HTTPHEADER, array('X-Requested-With: XMLHttpRequest'));
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, FALSE);
curl_setopt($curl, CURLOPT_ENCODING, 'UTF-8');
curl_setopt($curl, CURLOPT_USERAGENT, $useragent);
curl_setopt($curl, CURLOPT_POST, !empty($postData));
if(!empty($postData)) curl_setopt($curl, CURLOPT_POSTFIELDS, $postData);
curl_setopt($curl, CURLOPT_COOKIEFILE, $this->cookieFile);
curl_setopt($curl, CURLOPT_COOKIEJAR, $this->cookieFile);
$page = curl_exec ($curl);
$page = str_replace(array("\r\n", "\r", "\n", "\t"), "", $page); // strip all new lines and tabs
$page = preg_replace('~>\s+<~', '><', $page);// strip all whitespace between tags
curl_close ($curl);
return $page;
}
如何在请求之间成功维护 cookie?