private List<string> retrieveImages(string address)
{
System.Net.WebClient wc = new System.Net.WebClient();
List<string> imgList = new List<string>();
doc.Load(wc.OpenRead(address)); //or whatever HTML file you have
HtmlNodeCollection imgs = doc.DocumentNode.SelectNodes("//img[@src]");
if (imgs == null) return new List<string>();
foreach (HtmlNode img in imgs)
{
if (img.Attributes["src"] == null)
continue;
HtmlAttribute src = img.Attributes["src"];
imgList.Add(src.Value);
//Image imgDownload = GetImage(src.Value);
//imgDownload.Save(@"d:\myImages");
}
return imgList;
}
在某些情况下,列表 imgList 包含 33 个项目,它看起来像这样:
首先[0]
我看到:/images/experiments/nav_logo78.png
作为我看不到的图像的链接,http 开头或 www 只是开头/images
然后在[1]
我看到的地方://maps.gstatic.com/mapfiles/transparent.png
然后在一些项目到位后 [10] 我看到: http: //mt1.google.com/vt/lyrs=m@186000000&hl=iw&src=app&x=75&y=51&z=7&s=Gali
我不确定我看不到的加利是什么,.bmp .gif
或者.png
只是加利的图像。
我想要的是从每个链接下载所有这些图像并将其保存到我的硬盘上。所以我有这个下载功能:
private Image GetImage(string url)
{
System.Net.WebRequest request = System.Net.WebRequest.Create(url);
System.Net.WebResponse response = request.GetResponse();
System.IO.Stream responseStream = response.GetResponseStream();
Bitmap bmp = new Bitmap(responseStream);
responseStream.Dispose();
return bmp;
}
当我在retrieveImages() 函数中使用这个GetImage 函数时,它不会对程序做任何事情,甚至不会对我的意思是List imgList 是空的。如果我将这两行标记为现在:
//Image imgDownload = GetImage(src.Value);
//imgDownload.Save(@"d:\myImages");
如果我用//不使用它们标记它们,那么一切都在工作,但如果我使用它们,则没有任何工作,并且它不会将任何内容保存到我的硬盘上。
我应该怎么办?
编辑:
我刚刚将我的retrieveImages 函数更改为:
private List<string> retrieveImages(string address)
{
System.Net.WebClient wc = new System.Net.WebClient();
List<string> imgList = new List<string>();
doc.Load(wc.OpenRead(address));
HtmlNodeCollection imgs = doc.DocumentNode.SelectNodes("//img[@src]");
if (imgs == null) return new List<string>();
foreach (HtmlNode img in imgs)
{
if (img.Attributes["src"] == null)
continue;
HtmlAttribute src = img.Attributes["src"];
imgList.Add(src.Value);
wc.DownloadFile(src.Value , @"d:\MyImages\my.gif");
}
return imgList;
}
我在 wc.DownloadFile 行上使用了一个断点,它抛出了一个异常:Webexception Was Caught
找不到路径“D:\textinputassistant\tia.png”的一部分。
在 src.Value 中,在这种情况下它是:/textinputassistant/tia.png 所以你告诉我要避免一开始没有 http 或 https 或 www 的链接,我会修复它。问题是是否例外是因为这条线以 / 开头并且它没有任何 http/s/www ?
完整的例外:
System.Net.WebException was caught
Message=Could not find a part of the path 'D:\textinputassistant\tia.png'.
Source=System
StackTrace:
at System.Net.WebClient.DownloadFile(Uri address, String fileName)
at System.Net.WebClient.DownloadFile(String address, String fileName)
at GatherLinks.Form1.retrieveImages(String address) in D:\C-Sharp\GatherLinks\GatherLinks\GatherLinks\Form1.cs:line 328
at GatherLinks.Form1.webCrawler(String url, Int32 levels, DoWorkEventArgs eve) in D:\C-Sharp\GatherLinks\GatherLinks\GatherLinks\Form1.cs:line 97
InnerException: System.Net.WebException
Message=Could not find a part of the path 'D:\textinputassistant\tia.png'.
Source=System
StackTrace:
at System.Net.FileWebResponse..ctor(FileWebRequest request, Uri uri, FileAccess access, Boolean asyncHint)
at System.Net.FileWebRequest.GetResponseCallback(Object state)
InnerException: System.IO.DirectoryNotFoundException
Message=Could not find a part of the path 'D:\textinputassistant\tia.png'.
Source=mscorlib
StackTrace:
at System.IO.__Error.WinIOError(Int32 errorCode, String maybeFullPath)
at System.IO.FileStream.Init(String path, FileMode mode, FileAccess access, Int32 rights, Boolean useRights, FileShare share, Int32 bufferSize, FileOptions options, SECURITY_ATTRIBUTES secAttrs, String msgPath, Boolean bFromProxy, Boolean useLongPath)
at System.IO.FileStream..ctor(String path, FileMode mode, FileAccess access, FileShare share, Int32 bufferSize, FileOptions options, String msgPath, Boolean bFromProxy)
at System.Net.FileWebStream..ctor(FileWebRequest request, String path, FileMode mode, FileAccess access, FileShare sharing, Int32 length, Boolean async)
at System.Net.FileWebResponse..ctor(FileWebRequest request, Uri uri, FileAccess access, Boolean asyncHint)
内部异常:
刚刚添加了一个过滤器,因此它将仅保存以 http 开头的链接:
private List<string> retrieveImages(string address)
{
System.Net.WebClient wc = new System.Net.WebClient();
List<string> imgList = new List<string>();
doc.Load(wc.OpenRead(address));
HtmlNodeCollection imgs = doc.DocumentNode.SelectNodes("//img[@src]");
if (imgs == null) return new List<string>();
foreach (HtmlNode img in imgs)
{
if (img.Attributes["src"] == null)
continue;
HtmlAttribute src = img.Attributes["src"];
imgList.Add(src.Value);
if (src.Value.Contains("http"))
{
wc.DownloadFile(src.Value, @"d:\MyImages\my.gif");
}
}
return imgList;
}
现在 src.Value 包含:http://mt1.google.com/vt/lyrs=m@186000000&hl=iw&src=app&x=75&y=51&z=7&s=Gali
然后在它试图下载我得到异常之后: WebException Was Caught
远程服务器返回错误:(403) Forbidden。
System.Net.WebException was caught
Message=The remote server returned an error: (403) Forbidden.
Source=System
StackTrace:
at System.Net.WebClient.DownloadFile(Uri address, String fileName)
at System.Net.WebClient.DownloadFile(String address, String fileName)
at GatherLinks.Form1.retrieveImages(String address) in D:\C-Sharp\GatherLinks\GatherLinks\GatherLinks\Form1.cs:line 330
at GatherLinks.Form1.webCrawler(String url, Int32 levels, DoWorkEventArgs eve) in D:\C-Sharp\GatherLinks\GatherLinks\GatherLinks\Form1.cs:line 97
InnerException:
问题是异常是否抛出,因为在这种情况下,谷歌网站阻止下载,或者因为链接以 Gali 结尾,我不确定它是什么类型的文件?