0

当我执行此 C# 代码时...

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.IO;
using System.Net;
using System.Dynamic;
using HtmlAgilityPack;

namespace entropedizer
{
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        public String postRequest(string url, string eventTarget)
        {
            // A "pre-request," sent to gather SessionID and POST data parameters for the main request
            HttpWebRequest prequest = (HttpWebRequest)WebRequest.Create("http://www.entropedia.info/Chart.aspx?chart=Chart");
            HttpWebResponse presponse = (HttpWebResponse)prequest.GetResponse();
            Stream pstream = presponse.GetResponseStream();
            StreamReader psr = new StreamReader(pstream);
            string phtml = psr.ReadToEnd();

            Match viewstate = Regex.Match(phtml, "id=\"__VIEWSTATE\".+/>");
            Match eventvalidation = Regex.Match(phtml, "id=\"__EVENTVALIDATION\".+/>");
            ASCIIEncoding encoding = new ASCIIEncoding();
            string postData = "__EVENTTARGET=" + eventTarget + "&__VIEWSTATE=" + Uri.EscapeDataString(viewstate.ToString().Substring(24, viewstate.Length - 28)) + "&__EVENTVALIDATION=" + Uri.EscapeDataString(eventvalidation.ToString().Substring(30, eventvalidation.Length - 34));
            byte[] data = encoding.GetBytes(postData);

            // The main request, intended to retreive the desired HTML
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.entropedia.info/Chart.aspx?chart=Chart");
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";

            request.CookieContainer = new CookieContainer();
            Cookie sessionId = new Cookie("ASP.NET_SessionId", Regex.Match(presponse.Headers.ToString(), "ASP.NET_SessionId=.+ d").ToString().Substring(18, Regex.Match(presponse.Headers.ToString(), "ASP.NET_SessionId=.+ d").Length - 21), "/", ".entropedia.info");
            request.CookieContainer.Add(new Uri("http://www.entropedia.info/Chart.aspx?chart=Chart"), sessionId);

            Stream stream = request.GetRequestStream();
            stream.Write(data, 0, data.Length);
            stream.Close();

            HttpWebResponse response = (HttpWebResponse)request.GetResponse();
            stream = response.GetResponseStream();

            StreamReader sr = new StreamReader(stream);

            return sr.ReadToEnd();
        }

        private void Form1_Load(object sender, EventArgs e)
        {
            System.Net.ServicePointManager.Expect100Continue = false;
            HtmlAgilityPack.HtmlDocument hChart = new HtmlAgilityPack.HtmlDocument();
            hChart.LoadHtml(postRequest("http://www.entropedia.info/Chart.aspx?chart=Chart", "ctl00%24ContentPlaceHolder1%24DG1%24ctl19%24ctl05"));

            HtmlNodeCollection chartStrings = hChart.DocumentNode.SelectNodes("/");

            if (chartStrings != null)
            {
                foreach (HtmlNode i in chartStrings)
                {
                    System.IO.File.WriteAllText("C:/Users/Admin/Desktop/WholeDocument.txt", i.OuterHtml);
                }
            }
            else
            {
                MessageBox.Show("Error:  Null item list.");
            }
        }
    }
}

...以下 HTML 被写入文本文件。

http://pastebin.com/FALerBWR

当我将 C# 代码中的行更改为 HtmlNodeCollection 时,正文chartStrings = hChart.DocumentNode.SelectNodes("/html/body");中的 400 多行 HTML 将写入文本文件。

当我将该行更改为HtmlNodeCollection chartStrings = hChart.DocumentNode.SelectNodes("/html/body/form");仅一行代码(带有其属性的表单开始标记)时,将写入文本文件。它应该写很多行(大部分文档)。我相信HtmlAgilityPack由于格式错误的 HTML 标签而感到困惑。有没有办法以编程方式解决这个问题?我不想在每次运行程序时手动更正 HTML!

4

2 回答 2

1

这是“设计使然”的行为。FORM默认情况下,被视为emptyHTML 元素。原因在此处进行了解释(检查我的答案):HtmlAgilityPack -- <form> 是否出于某种原因关闭了自身?

但这也是可配置的,您只需要指示解析器以不同的方式运行,如下所示:

HtmlAgilityPack.HtmlDocument hChart = new HtmlAgilityPack.HtmlDocument();

// remove all specific behaviors for the `FORM` element
HtmlAgilityPack.HtmlNode.ElementsFlags.Remove("form");

hChart.LoadHtml(postRequest("http://www.entropedia.info/Chart.aspx?chart=Chart", "ctl00%24ContentPlaceHolder1%24DG1%24ctl19%24ctl05"));
HtmlNodeCollection chartStrings = hChart.DocumentNode.SelectNodes("/");
于 2013-06-02T06:38:46.430 回答
0

如果您认为这是由错误的 html 引起的,请先整理 html。这是您可以使用的东西... https://github.com/markbeaton/TidyManaged

于 2013-06-02T05:37:43.390 回答