我是 C# 和 WinForms 的新手,想创建一个网络爬虫(解析器)——它可以解析网页,并按层级显示其中的超链接。另外,我不知道如何让爬虫按指定的超链接深度进行抓取。
所以我想我有两个问题:
- 如何让爬虫按指定的链接深度进行抓取?
- 如何分层显示所有超链接?
PS:如果能给出代码示例就太好了。
PPS:窗体上有 1 个按钮(button1)和 1 个富文本框(richTextBox1)。
这是我的代码,我知道它非常难看……(所有代码都写在同一个按钮的点击事件里):
/// <summary>
/// Main form. Clicking <c>button1</c> downloads one web page and appends the page
/// title, followed by the distinct URLs found in its HTML, to <c>richTextBox1</c>.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>URL fetched when the URL text box is left empty.</summary>
    private const string DefaultUrl = "http://www.yahoo.com";

    /// <summary>Upper bound for every regular-expression match performed by this form.</summary>
    private static readonly TimeSpan MatchTimeout = TimeSpan.FromSeconds(1);

    // NOTE(review): the character class contains a backslash but no '/', so a match
    // stops at the first '/' after the scheme (i.e. no path is captured). TrimUrls
    // compares against the exact string "http://www.w3.org", which relies on this;
    // confirm before widening the pattern.
    private static readonly Regex UrlRegex = new Regex(
        @"(((ht){1}tp[s]?://)[-a-zA-Z0-9@:%_\+.~#?&\\]+)",
        RegexOptions.IgnoreCase | RegexOptions.Compiled,
        MatchTimeout);

    // IgnoreCase accepts <TITLE>; Singleline lets '.' cross line breaks so a title
    // that is wrapped over several lines is still found.
    private static readonly Regex TitleRegex = new Regex(
        @"<title>\s*(.+?)\s*</title>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline,
        MatchTimeout);

    /// <summary>Creates the form and its designer-generated controls.</summary>
    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Click handler for <c>button1</c>: reads the URL from <c>UrlTextBox</c>
    /// (falling back to <see cref="DefaultUrl"/> when it is blank), downloads the page
    /// and shows its title and URLs in <c>richTextBox1</c>.
    /// </summary>
    /// <param name="sender">Control that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void button1_Click(object sender, EventArgs e)
    {
        // The URL must be resolved and normalised BEFORE the request is created.
        string url = (UrlTextBox.Text ?? string.Empty).Trim();
        if (url.Length == 0)
        {
            url = DefaultUrl;
        }
        if (!(url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
              || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase)))
        {
            url = "http://" + url;
        }
        // Show the user the address that is actually being fetched.
        UrlTextBox.Text = url;

        try
        {
            string html = DownloadHtml(url);

            // Extract everything first so a regex timeout leaves the output untouched.
            List<string> savedUrls = ExtractUrls(html);
            string title = ExtractTitle(html);

            // A page without a <title> element is not an error; just skip the line.
            if (title != null)
            {
                richTextBox1.AppendText(title + "\n");
            }
            TrimUrls(ref savedUrls);
        }
        catch (RegexMatchTimeoutException)
        {
            Console.WriteLine("The matching operation timed out.");
        }
        catch (WebException ex)
        {
            // Network/HTTP failures must not take down the UI event handler.
            richTextBox1.AppendText("Could not download " + url + ": " + ex.Message + "\n");
        }
        catch (UriFormatException ex)
        {
            richTextBox1.AppendText("Invalid URL " + url + ": " + ex.Message + "\n");
        }
    }

    /// <summary>Downloads the resource at <paramref name="url"/> and returns its body as text.</summary>
    /// <param name="url">Absolute URL to fetch.</param>
    /// <returns>The complete response body.</returns>
    /// <exception cref="WebException">The request failed.</exception>
    /// <exception cref="UriFormatException"><paramref name="url"/> is not a valid URI.</exception>
    private static string DownloadHtml(string url)
    {
        WebRequest request = WebRequest.Create(url);
        // using-blocks release the connection and the stream even when reading throws.
        using (WebResponse response = request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream()))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>Returns every URL matched by <see cref="UrlRegex"/> in <paramref name="html"/>, in document order.</summary>
    /// <param name="html">HTML text to scan.</param>
    /// <returns>All matches, duplicates included.</returns>
    /// <exception cref="RegexMatchTimeoutException">Matching exceeded <see cref="MatchTimeout"/>.</exception>
    private static List<string> ExtractUrls(string html)
    {
        List<string> found = new List<string>();
        for (Match match = UrlRegex.Match(html); match.Success; match = match.NextMatch())
        {
            found.Add(match.Groups[1].Value);
        }
        return found;
    }

    /// <summary>Returns the text of the first <c>&lt;title&gt;</c> element, or <c>null</c> when there is none.</summary>
    /// <param name="html">HTML text to scan.</param>
    /// <returns>The title without surrounding whitespace, or <c>null</c>.</returns>
    /// <exception cref="RegexMatchTimeoutException">Matching exceeded <see cref="MatchTimeout"/>.</exception>
    private static string ExtractTitle(string html)
    {
        Match match = TitleRegex.Match(html);
        return match.Success ? match.Groups[1].Value : null;
    }

    /// <summary>
    /// Appends the distinct entries of <paramref name="urls"/> to <c>richTextBox1</c>,
    /// one per line, skipping entries that contain no '.' and the literal
    /// <c>http://www.w3.org</c>.
    /// </summary>
    /// <param name="urls">URLs to display; the list itself is not modified.</param>
    private void TrimUrls(ref List<string> urls)
    {
        foreach (string candidate in urls.Distinct())
        {
            if (candidate.Contains(".") && candidate != "http://www.w3.org")
            {
                // AppendText avoids rebuilding the whole Text property on every iteration.
                richTextBox1.AppendText(candidate + "\n");
            }
        }
    }
}
}
还有一个问题:有人知道如何把这些链接以树形结构保存到 XML 中吗?