0

我从网站下载图像。这是代码:

/// <summary>
/// A <see cref="WebClient"/> that can turn its GET requests into HEAD requests,
/// so response headers can be inspected without downloading the body.
/// </summary>
class MyClient : WebClient
        {
            /// <summary>
            /// When true, requests that would be issued as GET are issued as HEAD instead.
            /// </summary>
            public bool HeadOnly { get; set; }

            /// <summary>
            /// Builds the request through the base class, then swaps the verb to HEAD
            /// when <see cref="HeadOnly"/> is set and the verb is GET.
            /// </summary>
            /// <param name="address">The address the request targets.</param>
            /// <returns>The request created by the base class, possibly with its method changed.</returns>
            protected override WebRequest GetWebRequest(Uri address)
            {
                WebRequest request = base.GetWebRequest(address);

                // Leave the request alone unless a HEAD probe was asked for and the verb is GET.
                if (!HeadOnly || request.Method != "GET")
                {
                    return request;
                }

                request.Method = "HEAD";
                return request;
            }
        }

然后方法:

/// <summary>
/// Fetches <paramref name="url"/> and parses it into an HtmlAgilityPack document, but only
/// when a preliminary HEAD request reports a content type starting with <c>text/html</c>.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="useProxy">When true and <paramref name="proxyIp"/> is non-empty, the page is fetched through that proxy.</param>
/// <param name="proxyIp">Proxy host; ignored when null or empty.</param>
/// <param name="proxyPort">Proxy port.</param>
/// <param name="usename">Proxy user name; proxy credentials are only set when this is non-empty.</param>
/// <param name="password">Proxy password; null is treated as an empty string.</param>
/// <returns>
/// The loaded document. Returns null when the response has no content-type header, when the
/// content type is not <c>text/html</c> (a message box is shown in that case), or when the
/// HEAD probe fails. If the page download itself fails, the returned document is the freshly
/// created (unloaded) instance, as before.
/// </returns>
public static HtmlAgilityPack.HtmlDocument getHtmlDocumentWebClient(string url, bool useProxy, string proxyIp, int proxyPort, string usename, string password)
        {
            HtmlAgilityPack.HtmlDocument doc = null;
            try
            {
                // Probe with HEAD first so binary resources (images, archives, ...) are never downloaded.
                // NOTE(review): the probe does not use the proxy settings below - confirm whether it should.
                string type;
                using (MyClient probe = new MyClient())
                {
                    probe.HeadOnly = true;
                    probe.DownloadData(url); // body is expected to be empty for a HEAD request
                    type = probe.ResponseHeaders["content-type"];
                }

                if (type == null)
                {
                    return null;
                }

                // Media types are case-insensitive and not linguistic text, so compare ordinally.
                if (type.StartsWith("text/html", System.StringComparison.OrdinalIgnoreCase))
                {
                    try
                    {
                        doc = new HtmlAgilityPack.HtmlDocument();
                        using (WebClient client = new WebClient())
                        {
                            client.Credentials = CredentialCache.DefaultCredentials;
                            client.Proxy = WebRequest.DefaultWebProxy;

                            if (useProxy && !string.IsNullOrEmpty(proxyIp))
                            {
                                WebProxy p = new WebProxy(proxyIp, proxyPort);
                                if (!string.IsNullOrEmpty(usename))
                                {
                                    p.Credentials = new NetworkCredential(usename, password ?? string.Empty);
                                }

                                // Previously the proxy was built but never attached to the client.
                                client.Proxy = p;
                            }

                            // Single download: stream the page straight into the parser and
                            // release the connection as soon as parsing is done.
                            using (System.IO.Stream pageStream = client.OpenRead(url))
                            {
                                doc.Load(pageStream);
                            }
                        }
                    }
                    catch (System.Exception ex)
                    {
                        // Best effort, as before: the caller still gets the (unloaded) document,
                        // but the failure is now traceable instead of silently dropped.
                        System.Diagnostics.Trace.TraceWarning("getHtmlDocumentWebClient: failed to load {0}: {1}", url, ex);
                    }
                }

                if (doc == null)
                {
                    // Reached for any content type other than text/html (e.g. image/jpeg).
                    MessageBox.Show("Doc is null   " + doc + " The link that did it was    " + url);
                }

            }
            catch (System.Exception ex)
            {
                // Probe failures (bad URL, network error, ...) still yield the current doc value,
                // as before, but are now traced.
                System.Diagnostics.Trace.TraceWarning("getHtmlDocumentWebClient: request for {0} failed: {1}", url, ex);
            }
            return doc;
        }

我尝试给该方法加上通用的 try-catch,但它仍然执行到 MessageBox,而不是进入 catch 块。

无论如何,我试图下载的图像的链接是:

http://members.tripod.com/~DannyWest/bundy.jpg

然后我设置了断点,当执行到下面这一行时:

if (type.StartsWith(@"text/html"))

它跳到 MessageBox.Show...

现在我看到 type 的值是 image/jpeg。我想知道这是否就是问题所在:因为它不是 text/html,也不是其他可能包含链接的内容?

编辑**

我尝试修改该方法,把判断条件改成这样:

if (type.StartsWith(@"text/html")|| type.StartsWith(@"image/jpeg"))

也就是添加了 image/jpeg 这一部分,但随后程序在另一个类中调用了下面这个方法:

/// <summary>
/// Collects the <c>href</c> values of all anchor elements in <paramref name="document"/>
/// that start with <c>http://</c>, <c>https://</c> or <c>www</c>.
/// </summary>
/// <param name="document">The parsed document to scan; null is treated as a document without links.</param>
/// <returns>The matching link targets in document order; empty when there are none.</returns>
private List<string> getLinks(HtmlAgilityPack.HtmlDocument document)
        {
            List<string> mainLinks = new List<string>();

            // A null document (e.g. the fetch was skipped for a non-HTML resource) has no links.
            if (document == null)
            {
                return mainLinks;
            }

            // SelectNodes can yield null instead of an empty collection, so guard before iterating.
            var linkNodes = document.DocumentNode.SelectNodes("//a[@href]");
            if (linkNodes == null)
            {
                return mainLinks;
            }

            foreach (HtmlNode link in linkNodes)
            {
                var href = link.Attributes["href"].Value;

                // URL prefixes are protocol text, not linguistic text: compare ordinally
                // so the result does not depend on the current culture.
                bool isAbsolute = href.StartsWith("http://", System.StringComparison.Ordinal)
                    || href.StartsWith("https://", System.StringComparison.Ordinal)
                    || href.StartsWith("www", System.StringComparison.Ordinal);
                if (isAbsolute)
                {
                    mainLinks.Add(href);
                }
            }

            return mainLinks;
        }

结果 linkNodes 一直为 null。当 type 以 image/jpeg 开头时,linkNodes 始终为 null;当 type 以 text/html 开头时,linkNodes 不为 null。

如果需要,我可以将我的项目上传到我的 Skydrive。

4

0 回答 0