我是网络爬虫(web crawling)领域的新手。请问有人在 Web 应用程序中实现过网页抓取吗?我需要的是基于 ASP.NET 和 C# 的做法,而不是 VB.NET Windows 窗体的做法,希望有经验的人能帮帮我。
我有一个默认的 Web 窗体,上面有 3 个文本框和一个按钮,下面是它的后台代码(code-behind):
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Net;
using System.IO;
/// <summary>
/// Code-behind for the default web form: downloads the page whose address is
/// typed into <c>TextBox1</c>, shows the raw response text in <c>TextBox2</c>
/// and the list of hyperlink targets found in it in <c>TextBox3</c>.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    // Raw text of the most recently downloaded response; assigned in Button1_Click.
    String Rstring;

    /// <summary>
    /// Page load handler; intentionally empty.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Fetches the URL entered in <c>TextBox1</c> and fills <c>TextBox2</c> with
    /// the response text and <c>TextBox3</c> with the extracted links.
    /// </summary>
    /// <param name="sender">The control that raised the event.</param>
    /// <param name="e">Event data (unused).</param>
    /// <remarks>
    /// Errors raised while contacting the remote server (for example by
    /// <c>GetResponse</c>) are not caught here and propagate to the page.
    /// </remarks>
    protected void Button1_Click(object sender, EventArgs e)
    {
        String URL = TextBox1.Text;

        // The address comes straight from the user. WebRequest.Create also
        // understands schemes such as file:// and ftp://, so only absolute
        // http/https addresses are accepted before any request is made.
        Uri target;
        bool isHttpUrl = Uri.TryCreate(URL, UriKind.Absolute, out target)
            && (target.Scheme == Uri.UriSchemeHttp || target.Scheme == Uri.UriSchemeHttps);
        if (!isHttpUrl)
        {
            TextBox2.Text = "";
            TextBox3.Text = "Please enter an absolute http:// or https:// URL.";
            return;
        }

        WebRequest myWebRequest = WebRequest.Create(target);

        // 'using' guarantees the response, its stream and the reader are
        // released even when reading fails part-way through.
        using (WebResponse myWebResponse = myWebRequest.GetResponse())
        using (Stream streamResponse = myWebResponse.GetResponseStream())
        using (StreamReader sreader = new StreamReader(streamResponse))
        {
            Rstring = sreader.ReadToEnd();
        }

        String Links = GetContent(Rstring); // link targets only
        TextBox2.Text = Rstring;
        TextBox3.Text = Links;
    }

    // Disabled regex-based alternative for collecting links; kept for reference.
    //public ISet<string> GetNewLinks(string content)
    //{
    // Regex regexLink = new Regex("(?<=<a\\s*?href=(?:'|\"))[^'\"]*?(?=(?:'|\"))");
    // ISet<string> newLinks = new HashSet<string>();
    // foreach (var match in regexLink.Matches(content))
    // {
    // if (!newLinks.Contains(match.ToString()))
    // newLinks.Add(match.ToString());
    // }
    // return newLinks;
    //}

    /// <summary>
    /// Loads the given markup into an HTML document object and returns the
    /// <c>href</c> attribute of every entry in the document's links collection.
    /// </summary>
    /// <param name="Rstring">The HTML text to scan.</param>
    /// <returns>
    /// The link targets, each followed by a newline character; an empty string
    /// when the document exposes no links.
    /// </returns>
    private String GetContent(String Rstring)
    {
        // NOTE(review): HTMLDocument / IHTMLDocument2 / IHTMLElementCollection /
        // IHTMLElement are not brought into scope by any using directive visible
        // in this file; presumably they come from the MSHTML interop assembly,
        // which must be referenced and imported - confirm.
        System.Text.StringBuilder sString = new System.Text.StringBuilder();
        HTMLDocument d = new HTMLDocument();
        IHTMLDocument2 doc = (IHTMLDocument2)d;
        doc.write(Rstring);
        IHTMLElementCollection L = doc.links;
        foreach (IHTMLElement link in L)
        {
            sString.Append(link.getAttribute("href", 0));
            // Real newline separator; the previous "/n" was emitted literally.
            sString.Append("\n");
        }
        return sString.ToString();
    }
}