我是 C# 和 WinForms 的新手,我想创建一个网络爬虫(解析器)——它可以解析网页并分层显示它们。+ 我不知道如何让机器人以特定的超链接深度爬行。
所以我想我有两个问题:
- 如何使机器人以指定的链接深度爬行?
- 如何分层显示所有超链接?
PS如果它是代码示例,我会很棒。
PPS 有 1 个按钮 = button1;和 1 个富文本框 = 富文本框 1;
这是我的代码:我知道它非常难看....(所有代码都在一个按钮中):
public partial class Form1 : Form
{
public Form1()
{
InitializeComponent();
}
private void button1_Click(object sender, EventArgs e)
{
//Declaration
HttpWebRequest request = (HttpWebRequest) WebRequest.Create(url);
HttpWebResponse response = (HttpWebResponse) request.GetResponse();
StreamReader sr = new StreamReader(response.GetResponseStream());
Match m;
string anotherTest = @"(((ht){1}tp[s]?://)[-a-zA-Z0-9@:%_\+.~#?&\\]+)";
List<string> savedUrls = new List<string>();
List<string> titles = new List<string>();
//Go to this URL:
string url = UrlTextBox.Text = "http://www.yahoo.com";
if (!(url.StartsWith("http://") || url.StartsWith("https://")))
url = "http://" + url;
//Scrape Whole Html code:
string s = sr.ReadToEnd();
try
{
// Get Urls:
m = Regex.Match(s, anotherTest,
RegexOptions.IgnoreCase | RegexOptions.Compiled,
TimeSpan.FromSeconds(1));
while (m.Success)
{
savedUrls.Add(m.Groups[1].ToString());
m = m.NextMatch();
}
// Get TITLES:
Match m2 = Regex.Match(s, @"<title>\s*(.+?)\s*</title>");
if (m2.Success)
{
titles.Add(m2.Groups[1].Value);
}
//Show Title:
richTextBox1.Text += titles[0] + "\n";
//Show Urls:
TrimUrls(ref savedUrls);
}
catch (RegexMatchTimeoutException)
{
Console.WriteLine("The matching operation timed out.");
}
sr.Close();
}
private void TrimUrls(ref List<string> urls)
{
List<string> d = urls.Distinct().ToList();
foreach (var v in d)
{
if (v.IndexOf('.') != -1 && v != "http://www.w3.org")
{
richTextBox1.Text += v + "\n";
}
}
}
}
}
还有一个问题:有人知道如何像树一样将它保存在 XML 中吗?