1

一般来说,当我抓取 www.cnn.com 或 www.foxnews.com 之类的网站时,代码可以正常工作。但现在我尝试了这个网站:https://github.com/jasonwupilly/Obsidian/tree/master/Obsidian ,我想要下载其中的文件,例如抓取时会遇到这样的链接:

https://github.com/jasonwupilly/Obsidian/blob/master/Obsidian/Obsidian.sln 所以我硬盘上的文件应该是 Obsidian.sln

这是我的爬取方法:

/// <summary>
/// Recursively crawls <paramref name="mainUrl"/>: fetches the page, reports progress,
/// collects its links and then calls itself for every http/https link with
/// <paramref name="levels"/> reduced by one.
/// </summary>
/// <param name="mainUrl">Address of the page to fetch and scan for links.</param>
/// <param name="levels">
/// Remaining depth. When it is 0 the page itself is processed but none of its links are followed.
/// </param>
/// <returns>
/// Two diagnostic strings for this page followed by everything returned by the recursive calls.
/// An empty list is returned instead when <c>wccfg.toCancel</c> is set while the links are iterated.
/// </returns>
/// <remarks>
/// A <see cref="WebException"/> is counted in <c>wccfg.failedUrls</c> and swallowed; any other
/// exception is counted as well and then rethrown to the caller.
/// </remarks>
public List<string> webCrawler(string mainUrl, int levels)
{
    List<string> csFiles = new List<string>();

    // NOTE(review): the wc field is overwritten with a fresh WebClient on every (recursive)
    // call and nothing in this method disposes the previous instance; confirm who owns it.
    wc = new System.Net.WebClient();

    csFiles.Add("temp string to know that something is happening in level = " + levels.ToString());
    csFiles.Add("current site name in this level is : " + mainUrl);

    try
    {
        HtmlAgilityPack.HtmlDocument doc = TimeOut.getHtmlDocumentWebClient(mainUrl, false, "", 0, "", "");
        if (doc == null)
        {
            // NOTE(review): the flag is cleared again right away and no progress event is
            // raised on this path, so only the failedUrls counter records the failure;
            // confirm that this is intended.
            failed = true;
            wccfg.failedUrls++;
            failed = false;
            return csFiles;
        }

        done = true;

        // Snapshot of the crawl state handed to the progress event; the slot order is
        // kept exactly as before because the subscribers are not visible from here.
        object[] progressState = new object[8];
        progressState[0] = csFiles;
        progressState[1] = mainUrl;
        progressState[2] = levels;
        progressState[3] = currentCrawlingSite;
        progressState[4] = sitesToCrawl;
        progressState[5] = done;
        progressState[6] = wccfg.failedUrls;
        progressState[7] = failed;
        OnProgressEvent(progressState);

        currentCrawlingSite.Add(mainUrl);

        List<string> webSites = getLinks(doc);
        removeDupes(webSites);
        removeDuplicates(webSites, currentCrawlingSite);
        removeDuplicates(webSites, sitesToCrawl);

        if (wccfg.removeext == true)
        {
            // NOTE(review): the list shrinks while i grows, so this loop performs fewer
            // passes than the list originally had entries. Whether that removes every
            // external link depends on what removeExternals returns; verify against it.
            for (int i = 0; i < webSites.Count; i++)
            {
                webSites.Remove(removeExternals(webSites, mainUrl, wccfg.localy));
            }
        }

        if (wccfg.downloadcontent == true)
        {
            retwebcontent.retrieveImages(mainUrl);
        }

        if (wccfg.downloadallfiles == true)
        {
            retwebcontent.retrieveFiles(mainUrl);
        }

        // The unfiltered links are queued first; only the local list is filtered afterwards.
        if (levels > 0)
        {
            sitesToCrawl.AddRange(webSites);
        }

        webSites = FilterJunkLinks(webSites);

        if (levels == 0)
        {
            return csFiles;
        }

        for (int i = 0; i < webSites.Count; i++)
        {
            if (wccfg.toCancel == true)
            {
                return new List<string>();
            }

            if (pause == true)
            {
                // NOTE(review): while paused the current link is skipped, not retried,
                // because the loop simply advances to the next index; confirm intended.
                _busy.Reset();
                continue;
            }

            _busy.Set();

            string link = webSites[i];
            bool isHttpLink = link.StartsWith("http://", StringComparison.Ordinal)
                || link.StartsWith("https://", StringComparison.Ordinal);
            if (isHttpLink)
            {
                csFiles.AddRange(webCrawler(link, levels - 1));
            }
        }

        return csFiles;
    }
    catch (WebException)
    {
        failed = true;
        wccfg.failedUrls++;
        return csFiles;
    }
    catch (Exception)
    {
        failed = true;
        wccfg.failedUrls++;
        throw;
    }
}

在这个方法的开始,我有这一行:

HtmlAgilityPack.HtmlDocument doc = TimeOut.getHtmlDocumentWebClient(mainUrl, false, "", 0, "", "");

我在这一行设置了断点,并单步进入了下面这个类:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using HtmlAgilityPack;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using System.Net;
using System.Web;
using System.Threading;
using DannyGeneral;

namespace GatherLinks
{
    /// <summary>
    /// Fetches a URL and, when the server reports it as HTML, loads it into an
    /// <see cref="HtmlAgilityPack.HtmlDocument"/>.
    /// </summary>
    class TimeOut
    {
        public TimeOut()
        {
        }

        /// <summary>
        /// <see cref="WebClient"/> that turns its GET requests into HEAD requests while
        /// <see cref="HeadOnly"/> is set, so that only the response headers are fetched.
        /// </summary>
        class MyClient : WebClient
        {
            /// <summary>When true, GET requests created by this client are sent as HEAD.</summary>
            public bool HeadOnly { get; set; }

            protected override WebRequest GetWebRequest(Uri address)
            {
                WebRequest req = base.GetWebRequest(address);
                if (HeadOnly && req.Method == "GET")
                {
                    req.Method = "HEAD";
                }
                return req;
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and parses it as HTML.
        /// </summary>
        /// <param name="url">Address of the page to load.</param>
        /// <param name="useProxy">When true and <paramref name="proxyIp"/> is not empty, requests go through that proxy.</param>
        /// <param name="proxyIp">Host name or address of the proxy.</param>
        /// <param name="proxyPort">Port of the proxy.</param>
        /// <param name="usename">Optional proxy user name; no proxy credentials are set when it is empty.</param>
        /// <param name="password">Proxy password; null is treated as an empty string.</param>
        /// <returns>
        /// The parsed document, or null when the content type is missing, is not text/html,
        /// or the request fails for any reason.
        /// </returns>
        public static HtmlAgilityPack.HtmlDocument getHtmlDocumentWebClient(string url, bool useProxy, string proxyIp, int proxyPort, string usename, string password)
        {
            try
            {
                using (MyClient client = new MyClient())
                {
                    //client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
                    client.Credentials = CredentialCache.DefaultCredentials;
                    client.Proxy = BuildProxy(useProxy, proxyIp, proxyPort, usename, password);

                    // Probe with HEAD first so that non-HTML resources are never downloaded.
                    client.HeadOnly = true;
                    client.DownloadData(url);
                    string type = client.ResponseHeaders["content-type"];
                    client.HeadOnly = false;

                    // Media type names are case-insensitive, hence the ordinal ignore-case check.
                    if (type == null || !type.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
                    {
                        return null;
                    }

                    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                    using (Stream body = client.OpenRead(url))
                    {
                        doc.Load(body);
                    }
                    return doc;
                }
            }
            catch (Exception err)
            {
                // Still best effort (the caller treats null as a failed URL), but the
                // reason is now traced instead of vanishing in an empty catch block.
                // NOTE(review): for HTTPS hosts that fail only here, check the traced message;
                // TLS negotiation or a rejected HEAD request are plausible causes; confirm.
                System.Diagnostics.Trace.TraceWarning("getHtmlDocumentWebClient failed for '{0}': {1}", url, err);
                return null;
            }
        }

        /// <summary>
        /// Returns the proxy to use for a request: the explicitly configured one when
        /// requested, otherwise <see cref="WebRequest.DefaultWebProxy"/>.
        /// </summary>
        private static IWebProxy BuildProxy(bool useProxy, string proxyIp, int proxyPort, string usename, string password)
        {
            if (!useProxy || string.IsNullOrEmpty(proxyIp))
            {
                return WebRequest.DefaultWebProxy;
            }

            WebProxy proxy = new WebProxy(proxyIp, proxyPort);
            if (!string.IsNullOrEmpty(usename))
            {
                proxy.Credentials = new NetworkCredential(usename, password ?? string.Empty);
            }
            return proxy;
        }

        /// <summary>
        /// Extracts the text between the markers "Url: " and " ---" from <paramref name="url"/>.
        /// </summary>
        /// <remarks>
        /// Neither marker is validated: when one of them is missing, IndexOf yields -1 and the
        /// method either throws from Substring or returns an unrelated slice of the input.
        /// The search for the end marker begins one character after the start of the value.
        /// </remarks>
        private static string GetUrl(string url)
        {
            string startTag = "Url: ";
            string endTag = " ---";
            int startTagWidth = startTag.Length;
            int index = url.IndexOf(startTag, 0);
            int start = index + startTagWidth;
            index = url.IndexOf(endTag, start + 1);
            string g = url.Substring(start, index - start);
            return g;
        }
    }
}

它在 getHtmlDocumentWebClient 方法中执行第一行,并且在到达该行时:

byte[] body = clients.DownloadData(url);

它跳转到 GetWebRequest 方法;执行完 return req; 这一行之后,它什么也没做就返回到程序,什么也没有发生。

只有当链接是 https://github.com/jasonwupilly/Obsidian/tree/master/Obsidian 时才会出现这种情况。

例如,如果链接是 www.cnn.com,下载图片和抓取都没有问题。

代码有些长,但各部分相互关联,所以我必须全部贴出来。

这里可能是什么问题?

4

0 回答 0