In its current state it doesn't work properly: it keeps adding the same links to the list over and over.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using HtmlAgilityPack;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using System.Net;
using System.Web;
namespace GatherLinks
{
    public partial class Form1 : Form
    {
        int sites = 0;
        int y = 0;
        string url = @"http://www.google.co.il";
        string guys = "http://www.google.com";

        public Form1()
        {
            InitializeComponent();
            List<string> a = webCrawler(guys, 2);
        }

        private void Form1_Load(object sender, EventArgs e)
        {
        }

        private int factorial(int n)
        {
            if (n == 0) return 1;
            else y = n * factorial(n - 1);
            richTextBox1.Text = y.ToString();
            return y;
        }

        private List<string> getLinks(HtmlAgilityPack.HtmlDocument document)
        {
            List<string> mainLinks = new List<string>();
            var linkNodes = document.DocumentNode.SelectNodes("//a[@href]");
            if (linkNodes != null)
            {
                foreach (HtmlNode link in linkNodes)
                {
                    var href = link.Attributes["href"].Value;
                    mainLinks.Add(href);
                }
            }
            return mainLinks;
        }
        private List<string> webCrawler(string url, int levels)
        {
            HtmlAgilityPack.HtmlDocument doc;
            HtmlWeb hw = new HtmlWeb();
            List<string> webSites;
            List<string> csFiles = new List<string>();
            csFiles.Add("temp string to know that something is happening in level = " + levels.ToString());
            csFiles.Add("current site name in this level is : " + url);
            try
            {
                doc = hw.Load(url);
                webSites = getLinks(doc);
                if (levels == 0)
                {
                    return csFiles;
                }
                else
                {
                    int actual_sites = 0;
                    for (int i = 0; i < webSites.Count && i < 20; i++)
                    {
                        string t = webSites[i];
                        if (t.StartsWith("http://") || t.StartsWith("https://")) // replace this with a future FilterJunkLinks function
                        {
                            if (!csFiles.Contains(t))
                            {
                                actual_sites++;
                                csFiles.AddRange(webCrawler(t, levels - 1));
                                Texts(richTextBox1, "Level Number " + levels + " " + t + Environment.NewLine, Color.Red);
                            }
                        }
                    }
                    return csFiles;
                }
            }
            catch
            {
                return csFiles;
            }
        }
And the Texts helper:
public void Texts(RichTextBox box, string text, Color color)
{
    box.SelectionStart = box.TextLength;
    box.SelectionLength = 0;
    box.SelectionColor = color;
    box.AppendText(text);
    box.SelectionColor = box.ForeColor;
}
I need two things in the webCrawler function:

1. If the url variable cannot be resolved, the try/catch should handle it.
2. If the csFiles list already contains an item, it should not be added again. For example, if http://www.google.com is already in csFiles, it should not be added a second time, so the final csFiles list contains http://www.google.com only once.
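For reference, here is a minimal sketch of one possible approach, not a drop-in answer. It assumes the Form1 class and usings from the question; the visited set and the TryNormalizeUrl helper are names introduced here for illustration, and the Texts/UI reporting is omitted for brevity. The idea is to track already-crawled URLs in a separate HashSet<string> passed through the recursion, and to treat a URL that fails Uri.TryCreate or HtmlWeb.Load as skippable:

// A minimal sketch, assuming the Form1 class and usings from the question.
// "visited" and "TryNormalizeUrl" are hypothetical helpers added here.
private List<string> webCrawler(string url, int levels, HashSet<string> visited)
{
    List<string> csFiles = new List<string>();

    // Normalize first so different spellings of the same page compare equal,
    // and skip anything that does not parse as an absolute URL.
    string normalized = TryNormalizeUrl(url);
    if (normalized == null || !visited.Add(normalized))
        return csFiles; // unparseable, or already crawled once

    csFiles.Add("current site name in this level is : " + normalized);

    HtmlAgilityPack.HtmlDocument doc;
    try
    {
        doc = new HtmlWeb().Load(normalized);
    }
    catch
    {
        return csFiles; // the URL could not be loaded; keep what we have
    }

    if (levels == 0)
        return csFiles;

    List<string> webSites = getLinks(doc);
    for (int i = 0; i < webSites.Count && i < 20; i++)
    {
        string t = webSites[i];
        if (t.StartsWith("http://") || t.StartsWith("https://"))
            csFiles.AddRange(webCrawler(t, levels - 1, visited)); // visited blocks repeats
    }
    return csFiles;
}

// Returns a canonical absolute URL, or null when the input cannot be parsed.
private string TryNormalizeUrl(string url)
{
    Uri uri;
    return Uri.TryCreate(url, UriKind.Absolute, out uri)
        ? uri.GetLeftPart(UriPartial.Query) // drops the #fragment part
        : null;
}

The call site would then become webCrawler(guys, 2, new HashSet<string>()). Keeping the visited set separate from csFiles matters because csFiles holds prefixed status strings rather than bare URLs, which is why the existing csFiles.Contains(t) check never matches and the same links keep being added.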