2

我正在开发一个网络爬虫,但我需要在请求之间保留 cookie,就像我在 PHP 中使用 curl 所做的那样。但是,当我尝试在 C# 中使用 CookieContainer 对象时,它似乎不会从响应中获取所有 cookie 并将它们发送到下一个请求。

这是我的 C# 类:

    /// <summary>
    /// Minimal HTTP scraper built on <see cref="HttpWebRequest"/> that shares one
    /// <see cref="CookieContainer"/> across requests so cookies set by one response
    /// are sent with the next request.
    /// </summary>
    public class Scraper
    {
        public string Username { get; set; }
        public string Password { get; set; }
        public string UserAgent { get; set; }
        public string ContentType { get; set; }

        /// <summary>Cookies returned by the most recent response (also merged into <see cref="Container"/>).</summary>
        public CookieCollection Cookies { get; set; }

        /// <summary>Cookie jar attached to every request issued by <see cref="Load"/>.</summary>
        public CookieContainer Container { get; set; }

        public Scraper()
        {
            UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0";
            ContentType = "application/x-www-form-urlencoded";
            Cookies = new CookieCollection();
            Container = new CookieContainer();
        }

        /// <summary>
        /// Requests <paramref name="uri"/> and returns the response body with all CR, LF and
        /// tab characters removed and whitespace between adjacent tags collapsed.
        /// </summary>
        /// <param name="uri">Absolute URI to request.</param>
        /// <param name="postData">Request body; when non-empty the request is sent as POST, otherwise as GET.</param>
        /// <param name="creds">Optional credentials assigned to the request.</param>
        /// <param name="timeout">Request timeout in milliseconds; values &lt;= 0 keep the framework default.</param>
        /// <param name="host">Optional Host header override.</param>
        /// <param name="referer">Optional Referer header.</param>
        /// <param name="requestedwith">Optional X-Requested-With header value.</param>
        /// <returns>The whitespace-stripped response body.</returns>
        /// <exception cref="WebException">Propagated from the underlying request/response calls.</exception>
        public string Load(string uri, string postData = "", NetworkCredential creds = null, int timeout = 60000, string host = "", string referer = "", string requestedwith = "")
        {
            // A caller may have cleared the public properties; fall back to empty instances
            // instead of failing with a NullReferenceException below.
            if (Container == null)
            {
                Container = new CookieContainer();
            }

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
            request.CookieContainer = Container;

            // Merge cookies the caller seeded through the Cookies property into the shared jar.
            if (Cookies != null && Cookies.Count > 0)
            {
                request.CookieContainer.Add(Cookies);
            }

            request.UserAgent = UserAgent;
            request.AllowWriteStreamBuffering = true;
            request.ProtocolVersion = HttpVersion.Version11;
            request.AllowAutoRedirect = true;
            request.ContentType = ContentType;
            request.PreAuthenticate = true;

            if (!string.IsNullOrEmpty(requestedwith))
            {
                request.Headers["X-Requested-With"] = requestedwith;
            }

            if (!string.IsNullOrEmpty(host))
            {
                request.Host = host;
            }

            if (!string.IsNullOrEmpty(referer))
            {
                request.Referer = referer;
            }

            if (timeout > 0)
            {
                request.Timeout = timeout;
            }

            if (creds != null)
            {
                request.Credentials = creds;
            }

            if (!string.IsNullOrEmpty(postData))
            {
                request.Method = "POST";

                // UTF-8 yields the same bytes as ASCII for ASCII input, but does not
                // silently replace non-ASCII characters with '?'.
                byte[] data = Encoding.UTF8.GetBytes(postData);
                request.ContentLength = data.Length;

                // Dispose the request stream even if the write fails.
                using (Stream requestStream = request.GetRequestStream())
                {
                    requestStream.Write(data, 0, data.Length);
                }
            }
            else
            {
                request.Method = "GET";
            }

            StringBuilder page;

            // Dispose the response so the underlying connection is released.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                Cookies = response.Cookies;

                using (StreamReader sr = new StreamReader(response.GetResponseStream()))
                {
                    page = new StringBuilder(sr.ReadToEnd());
                }
            }

            // Strip all new lines and tabs.
            page.Replace("\r\n", "");
            page.Replace("\r", "");
            page.Replace("\n", "");
            page.Replace("\t", "");

            // Strip all whitespace between tags.
            return Regex.Replace(page.ToString(), @">\s+<", "><");
        }
    }

这是我在 cookie jar 中加载和维护 cookie 的 PHP 代码:

    /**
     * Fetches a URL with cURL, persisting cookies in $this->cookieFile between calls,
     * and returns the body with newlines, tabs and inter-tag whitespace removed.
     *
     * @param string       $url      URL to request.
     * @param array|string $postData POST fields; when non-empty the request is sent as POST.
     * @param bool         $headers  When truthy, sends "X-Requested-With: XMLHttpRequest".
     * @return string The stripped response body, or an empty string when the transfer fails.
     */
    private function load($url = 'http://www.google.com/', $postData = array(), $headers = FALSE)
    {
        // CURLOPT_USERAGENT takes only the header VALUE; a "User-Agent: " prefix here
        // would be sent as part of the value ("User-Agent: User-Agent: Mozilla/...").
        $useragent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; " . $this->locale . "; rv:1.9.2.10) Gecko/20100914 BRI/1 Firefox/3.6.10 ( .NET CLR 3.5.30729)";

        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);
        curl_setopt($curl, CURLOPT_HEADER, FALSE);
        if($headers) curl_setopt($curl, CURLOPT_HTTPHEADER, array('X-Requested-With: XMLHttpRequest'));
        // NOTE(review): certificate and host verification are disabled, which exposes the
        // scraper to man-in-the-middle attacks; kept as in the original, confirm this is intended.
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, FALSE);
        // CURLOPT_ENCODING controls Accept-Encoding (content codings such as gzip), not the
        // character set; an empty string advertises every coding cURL supports.
        curl_setopt($curl, CURLOPT_ENCODING, '');
        curl_setopt($curl, CURLOPT_USERAGENT, $useragent);
        curl_setopt($curl, CURLOPT_POST, !empty($postData));
        if(!empty($postData)) curl_setopt($curl, CURLOPT_POSTFIELDS, $postData);
        // Read cookies from, and write them back to, the same jar file.
        curl_setopt($curl, CURLOPT_COOKIEFILE, $this->cookieFile);
        curl_setopt($curl, CURLOPT_COOKIEJAR, $this->cookieFile);
        $page = curl_exec ($curl);
        curl_close ($curl);

        // curl_exec() returns FALSE on failure; return an empty body explicitly instead of
        // pushing a boolean through the string functions below.
        if ($page === FALSE) {
            return '';
        }

        $page = str_replace(array("\r\n", "\r", "\n", "\t"), "", $page); // strip all new lines and tabs
        $page = preg_replace('~>\s+<~', '><', $page);// strip all whitespace between tags

        return $page;
    }

如何在请求之间成功维护 cookie?

4

1 回答 1

2

我找到了一个名为 LibCurl.NET 的 libcurl .NET 包装器,并且能够在 C# 中以与 PHP 使用 cURL 相同的方式处理 cookie!这是我给感兴趣的人的代码:

using SeasideResearch.LibCurlNet;
using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace Scraping
{
    /// <summary>
    /// Scraper built on LibCurl.NET that persists cookies in <see cref="CookieFile"/>
    /// (used as both CURLOPT_COOKIEFILE and CURLOPT_COOKIEJAR) between requests.
    /// Each call buffers the response in an instance field, so one instance must not
    /// run two requests at the same time.
    /// </summary>
    public class LibCurlScraper
    {
        private const int TimeoutSeconds = 10;
        private const int ConnectTimeoutSeconds = 3;

        // libcurl's documentation asks for global initialisation once per process,
        // not once per scraper instance.
        private static readonly object GlobalInitGate = new object();
        private static bool globalInitDone;

        // Kept in a field so the delegate handed to libcurl stays referenced while
        // transfers run.
        private readonly Easy.WriteFunction writeCallback;

        // Raw bytes of the response currently being received.
        private MemoryStream ms = new MemoryStream();

        /// <summary>Path of the cookie jar file; must be set to a writable location before loading.</summary>
        public string CookieFile { get; set; }

        /// <summary>Value of the Location header captured by <see cref="Load"/> on a 302 response.</summary>
        public string RedirectUrl { get; set; }

        public string UserAgent { get; set; }
        public string ContentType { get; set; }

        /// <summary>When true, <see cref="Load"/> includes the response headers in the returned text.</summary>
        public bool DisplayHeaders { get; set; }

        /// <summary>When true, <see cref="Load"/> lets libcurl follow redirects itself.</summary>
        public bool FollowRedirects { get; set; }

        /// <summary>
        /// Encoding used by <see cref="Load"/> to decode the response bytes. Defaults to UTF-8;
        /// set it to ISO-8859-1 to get one character per byte.
        /// </summary>
        public Encoding ResponseEncoding { get; set; }

        public LibCurlScraper()
        {
            UserAgent = "useragent";
            ContentType = "application/x-www-form-urlencoded";
            DisplayHeaders = false;
            ResponseEncoding = Encoding.UTF8;
            writeCallback = OnBodyData;

            lock (GlobalInitGate)
            {
                if (!globalInitDone)
                {
                    Curl.GlobalInit((int)CURLinitFlag.CURL_GLOBAL_ALL);
                    globalInitDone = true;
                }
            }
        }

        /// <summary>Write callback: appends the received chunk to the response buffer.</summary>
        /// <returns>The number of bytes consumed (the whole chunk).</returns>
        private int OnBodyData(byte[] buf, int size, int nmemb, Object extraData)
        {
            ms.Write(buf, 0, buf.Length);
            return buf.Length;
        }

        /// <summary>
        /// Returns the trimmed first capture group of <paramref name="pattern"/> in
        /// <paramref name="input"/> (case-insensitive), or an empty string when nothing matches.
        /// </summary>
        private static string FindString(string input, string pattern)
        {
            Match match = Regex.Match(input, pattern, RegexOptions.IgnoreCase);
            return match.Success ? match.Groups[1].Value.Trim() : string.Empty;
        }

        /// <summary>
        /// Configures a fresh easy handle, performs the transfer into a new response buffer
        /// and returns the HTTP response code. The handle is cleaned up even on failure.
        /// </summary>
        private int Execute(string uri, string method, string postData, List<string> headers, bool includeHeaders, bool followRedirects)
        {
            ms = new MemoryStream();
            Easy easy = new Easy();
            try
            {
                easy.SetOpt(CURLoption.CURLOPT_URL, uri);
                easy.SetOpt(CURLoption.CURLOPT_HEADER, includeHeaders);
                easy.SetOpt(CURLoption.CURLOPT_FOLLOWLOCATION, followRedirects);

                Slist headerSlist = new Slist();
                if (headers != null)
                {
                    foreach (string header in headers)
                    {
                        headerSlist.Append(header);
                    }
                }

                easy.SetOpt(CURLoption.CURLOPT_HTTPHEADER, headerSlist);

                // NOTE(review): certificate and host verification are disabled, which allows
                // man-in-the-middle attacks; kept as in the original, confirm this is intended.
                easy.SetOpt(CURLoption.CURLOPT_SSL_VERIFYPEER, false);
                easy.SetOpt(CURLoption.CURLOPT_SSL_VERIFYHOST, false);
                easy.SetOpt(CURLoption.CURLOPT_USERAGENT, UserAgent);
                easy.SetOpt(CURLoption.CURLOPT_TIMEOUT, TimeoutSeconds);
                easy.SetOpt(CURLoption.CURLOPT_CONNECTTIMEOUT, ConnectTimeoutSeconds);

                bool isPost = !string.IsNullOrEmpty(postData)
                    || string.Equals(method, "POST", StringComparison.OrdinalIgnoreCase);
                if (isPost)
                {
                    easy.SetOpt(CURLoption.CURLOPT_POST, true);
                    // Always provide a body, even an empty one: libcurl documents an empty
                    // POSTFIELDS string as the way to send a zero-byte POST.
                    easy.SetOpt(CURLoption.CURLOPT_POSTFIELDS, postData ?? string.Empty);
                }

                easy.SetOpt(CURLoption.CURLOPT_COOKIEFILE, CookieFile);
                easy.SetOpt(CURLoption.CURLOPT_COOKIEJAR, CookieFile);
                easy.SetOpt(CURLoption.CURLOPT_WRITEFUNCTION, writeCallback);
                easy.Perform();

                int code = 0;
                easy.GetInfo(CURLINFO.CURLINFO_RESPONSE_CODE, ref code);

                // Keep the header list reachable until the transfer has finished.
                GC.KeepAlive(headerSlist);
                return code;
            }
            finally
            {
                easy.Cleanup();
            }
        }

        /// <summary>
        /// Requests <paramref name="uri"/> and returns the raw response body as a stream.
        /// Response headers are never included and redirects are always followed.
        /// </summary>
        /// <param name="uri">URL to request.</param>
        /// <param name="method">"POST" forces a POST even when <paramref name="postData"/> is empty.</param>
        /// <param name="postData">Request body; a non-empty value makes the request a POST.</param>
        /// <param name="headers">Optional extra request header lines.</param>
        /// <returns>A new <see cref="MemoryStream"/> holding the body, positioned at the start.</returns>
        public MemoryStream LoadBinary(string uri, string method = "GET", string postData = "", List<string> headers = null)
        {
            Execute(uri, method, postData, headers, false, true);

            // Rewind so callers can read the content immediately.
            ms.Position = 0;
            return ms;
        }

        /// <summary>
        /// Requests <paramref name="uri"/> and returns the response text with all CR, LF and tab
        /// characters removed and whitespace between adjacent tags collapsed. On a 302 response
        /// <see cref="RedirectUrl"/> is set from the Location header, which is only present in
        /// the text when <see cref="DisplayHeaders"/> is true.
        /// </summary>
        /// <param name="uri">URL to request.</param>
        /// <param name="method">"POST" forces a POST even when <paramref name="postData"/> is empty.</param>
        /// <param name="postData">Request body; a non-empty value makes the request a POST.</param>
        /// <param name="headers">Optional extra request header lines.</param>
        /// <returns>The whitespace-stripped response text.</returns>
        public string Load(string uri, string method = "GET", string postData = "", List<string> headers = null)
        {
            int code = Execute(uri, method, postData, headers, DisplayHeaders, FollowRedirects);

            // Decode the complete buffer once, so multi-byte characters split across
            // write callbacks are not corrupted.
            string page = (ResponseEncoding ?? Encoding.UTF8).GetString(ms.ToArray());

            if (code == 302)
            {
                RedirectUrl = FindString(page, "Location:(.*?)\n");
            }

            // Strip all new lines and tabs.
            page = page.Replace("\r", "").Replace("\n", "").Replace("\t", "");

            // Strip all whitespace between tags.
            return Regex.Replace(page, @">\s+<", "><");
        }

        /// <summary>Debug callback: echoes the message to the console and appends it to the verbose log file.</summary>
        public static void OnDebug(CURLINFOTYPE infoType, String msg, Object extraData)
        {
            Console.WriteLine(msg);

            // Dispose the writer even when the write fails so the file handle is released.
            using (TextWriter tw = new StreamWriter(@"C:\cookies\verbose.txt", true))
            {
                tw.WriteLine(msg);
            }
        }
    }
}

我有两种方法,一种用于返回字符串,一种用于返回 MemoryStream。在尝试写入文件之前,您需要初始化 CookieFile 属性并确保目录/文件是可写的。

我注意到如果您的 cookie 文件包含以前运行的旧会话数据,则会出现问题。这可以通过在实例化 LibCurlScraper 的新实例并填充 cookie 文件之前删除您的 cookie 文件来解决。

理想情况下,我们可以为所有 HTTP cookie 使用内置托管类,但在找到更好的解决方案之前,这种方法一直有效。

编辑:
我找到了一些能够正确解析“Set-Cookie”标头的代码。它处理以逗号分隔的 cookie 并提取每个 cookie 的名称、到期时间、路径、值和域。与使用 LibCurl.NET 相比,这应该是发出 HTTP 请求的首选方式。您也可以将此方法应用于异步请求。

这段代码比微软自己的 cookie 解析器工作得更好,这确实是官方 cookie 解析器应该做的。我不知道为什么微软还没有解决这个问题,因为这是一个非常常见的问题。

这是原始代码:http://snipplr.com/view/4427/

我在这里发布它以防链接在某些时候断开:

/// <summary>
/// Parses a raw "Set-Cookie" header value that may contain several comma-separated cookies.
/// </summary>
/// <param name="strHeader">The Set-Cookie header value; null or empty yields an empty collection.</param>
/// <param name="strHost">Host used as the domain for cookies that do not specify one.</param>
/// <returns>The parsed cookies; never null.</returns>
public static CookieCollection GetAllCookiesFromHeader(string strHeader, string strHost)
{
    // Null must be treated like an empty header; otherwise the helper fails on strHeader.Replace.
    if (string.IsNullOrEmpty(strHeader))
    {
        return new CookieCollection();
    }

    ArrayList al = ConvertCookieHeaderToArrayList(strHeader);
    return ConvertCookieArraysToCookieCollection(al, strHost);
}


/// <summary>
/// Splits a combined Set-Cookie header value into one string per cookie.
/// Cookies are separated by commas, but a date such as "expires=Thu, 01-Jan-2030 ..."
/// also contains a comma, so a segment that ends inside such a date is re-joined
/// with the segment that follows it.
/// </summary>
/// <param name="strCookHeader">The header value; CR and LF characters are removed first.</param>
/// <returns>An <see cref="ArrayList"/> of cookie strings; empty when the header is null or empty.</returns>
private static ArrayList ConvertCookieHeaderToArrayList(string strCookHeader)
{
    ArrayList al = new ArrayList();
    if (string.IsNullOrEmpty(strCookHeader))
    {
        return al;
    }

    const string expiresKey = "expires=";
    char[] digits = "0123456789".ToCharArray();
    string[] segments = strCookHeader.Replace("\r", "").Replace("\n", "").Split(',');

    int i = 0;
    while (i < segments.Length)
    {
        string segment = segments[i];
        bool dateWasSplit = false;

        int expiresAt = segment.IndexOf(expiresKey, StringComparison.OrdinalIgnoreCase);
        if (expiresAt > 0 && i + 1 < segments.Length)
        {
            // The comma fell inside the date only when what follows "expires=" is a bare
            // weekday name: no digits yet and no ';' ending the attribute. A complete date
            // (e.g. "expires=2031-01-01") means the comma separated two cookies.
            string expiresValue = segment.Substring(expiresAt + expiresKey.Length);
            dateWasSplit = expiresValue.IndexOfAny(digits) < 0 && expiresValue.IndexOf(';') < 0;
        }

        if (dateWasSplit)
        {
            al.Add(segment + "," + segments[i + 1]);
            i += 2;
        }
        else
        {
            al.Add(segment);
            i += 1;
        }
    }

    return al;
}


/// <summary>
/// Converts per-cookie strings ("name=value; attr=value; ...") into <see cref="Cookie"/> objects.
/// Only the name, value, path and domain are read; every other attribute is ignored.
/// Segments without a "name=" part are skipped.
/// </summary>
/// <param name="al">Cookie strings, one cookie per element.</param>
/// <param name="strHost">Domain assigned to cookies that carry no domain attribute (or an empty one).</param>
/// <returns>The parsed cookies; never null.</returns>
private static CookieCollection ConvertCookieArraysToCookieCollection(ArrayList al, string strHost)
{
    CookieCollection cc = new CookieCollection();
    if (al == null)
    {
        return cc;
    }

    foreach (object entry in al)
    {
        if (entry == null)
        {
            continue;
        }

        string[] parts = entry.ToString().Split(';');

        // The first part is "name=value"; split on the FIRST '=' only so that values
        // containing '=' (e.g. base64 padding) stay intact.
        string nameAndValue = parts[0];
        int firstEqual = nameAndValue.IndexOf('=');
        if (firstEqual < 0)
        {
            continue;
        }

        // Trim because segments after a separator typically start with a space and
        // Cookie.Name rejects whitespace.
        string name = nameAndValue.Substring(0, firstEqual).Trim();
        if (name.Length == 0)
        {
            continue;
        }

        Cookie cookie = new Cookie();
        cookie.Name = name;
        cookie.Value = nameAndValue.Substring(firstEqual + 1).Trim();

        for (int j = 1; j < parts.Length; j++)
        {
            string attribute = parts[j];
            int eq = attribute.IndexOf('=');
            string attrName = (eq < 0 ? attribute : attribute.Substring(0, eq)).Trim();
            string attrValue = eq < 0 ? string.Empty : attribute.Substring(eq + 1).Trim();

            // Compare the attribute NAME exactly; a substring search would treat
            // "domain=mypath.com" as a path attribute.
            if (attrName.Equals("path", StringComparison.OrdinalIgnoreCase))
            {
                cookie.Path = attrValue.Length > 0 ? attrValue : "/";
            }
            else if (attrName.Equals("domain", StringComparison.OrdinalIgnoreCase))
            {
                cookie.Domain = attrValue.Length > 0 ? attrValue : strHost;
            }
        }

        if (string.IsNullOrEmpty(cookie.Path))
        {
            cookie.Path = "/";
        }

        if (string.IsNullOrEmpty(cookie.Domain))
        {
            cookie.Domain = strHost;
        }

        cc.Add(cookie);
    }

    return cc;
}
于 2014-02-03T14:06:43.680 回答