使用 HTMLAgility、C#、XML
我正在使用 HTMLAgility 来抓取网页,然后填充一个类结构,然后将其序列化为 XML 文档。
我正在处理的数据是吉他和弦,因此我需要管理一些特殊字符。
我遇到问题的特殊字符是字符串“Aº7”中间的那个字符“º”(在音乐中表示减和弦,即 diminished)。
当我从网页中获取字符串时,我在监视窗口中看到的是一个带问号的黑色菱形(即 Unicode 替换字符 U+FFFD),而这个字符随后又被原样写入了 XML。
我的选择是 a) 适当地处理字符,以便它在 XML 中呈现为字符。b) 将字符串中此字符的每个实例转换为单词“dim”
哪种解决方法最好?目前在替换语句中(使用 char(代码) 的方式)确实能够找到该字符。
我不确定我“应该”如何解决这个问题。
下面是我用来抓取数据的代码(说明一下:这是一个一次性的功能,只是为了生成可进行 XML 序列化的对象结构而编写的;一旦拿到可用格式的数据,就不会再使用它)。
/// <summary>
/// One-off scraper. For every chord key it downloads the chord list page from
/// scales-chords.com, adds a <c>chordKey</c> (with one <c>Chord</c> per table row) to
/// <c>_keys</c>, then loads each chord's own page to fill in its notes, structure,
/// "belongs to" text and fingering variations. When all keys are processed it calls
/// <c>SaveToDisk("C:\\chords.xml")</c>.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Thrown when an expected table (or its rows) cannot be found in a downloaded page.
/// </exception>
public void BuildDBFromWebSite()
{
    string[] chordKeys = { "A", "A#", "Ab", "B", "Bb", "C", "C#", "D", "D#", "Db", "E", "Eb", "F", "F#", "G", "G#", "Gb" };
    HtmlWeb web = new HtmlWeb();

    // NOTE(review): the diminished sign shows up as U+FFFD in the scraped text, which looks
    // like the page being decoded with the wrong character encoding. If so, forcing the
    // site's real encoding on the HtmlWeb instance should fix it at the source - TODO confirm
    // the encoding the site actually serves before changing this.

    foreach (string chordKeyName in chordKeys)
    {
        //LOOP THROUGH THE CHORD KEYS
        chordKey theChordKey = new chordKey() { KeyName = chordKeyName };
        _keys.Add(theChordKey);

        //grab the tone page
        // The key name must be escaped: an unescaped '#' (as in "A#") starts the URL fragment,
        // so the request would silently be sent as key=A.
        string keyPageUrl = "http://www.scales-chords.com/showchbykey.php?key=" + Uri.EscapeDataString(theChordKey.KeyName);
        HtmlAgilityPack.HtmlDocument keyPage = web.Load(keyPageUrl);
        HtmlNode chordListTable = keyPage.DocumentNode.SelectSingleNode("/html/body/div[@id='wrapper']/div[@id='body']/div[@id='left']/div[@id='visit']/table/tbody");
        if (chordListTable == null)
        {
            throw new InvalidOperationException("Chord list table not found in " + keyPageUrl);
        }

        // CHORDS
        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection chordRows = chordListTable.SelectNodes("tr");
        if (chordRows == null)
        {
            throw new InvalidOperationException("Chord list table has no rows in " + keyPageUrl);
        }

        // The first two rows are skipped - presumably header rows; verify against the page.
        for (int i = 2; i < chordRows.Count; i++)
        {
            //LOOP THROUGH THE CHORDS
            Chord theChord = new Chord();
            HtmlNodeCollection chordInfoCells = chordRows[i].SelectNodes("td");
            HtmlNode chordLink = chordInfoCells[0].SelectSingleNode("a[@href]");

            //each of the next 3 cells can contain a bad glyph for diminished chords
            theChord.ChordName = chordInfoCells[0].InnerText;
            theChord.ChordNameText = chordInfoCells[1].InnerText;
            theChord.Family = chordInfoCells[2].InnerText;
            theChord.Importance = chordInfoCells[3].InnerText;

            //HTMLAgility tries to encode the bad glyph but uses the wrong escape and breaks the href, work around is to manually strip the href myself
            string theURL = chordLink.OuterHtml;
            // Drop the first 9 characters (the length of `<a href="`) ...
            theURL = theURL.Remove(0, 9);
            // ... then keep everything up to the character just before the first '>'.
            int startPos = theURL.IndexOf(">") - 1;
            theURL = theURL.Substring(0, startPos);

            theChord.ChordNameURL = HTMLEncodeSpecialChars(theURL);
            theChordKey.Chords.Add(theChord);
        }

        //VARIATIONS ETC
        foreach (Chord theChord in theChordKey.Chords)
        {
            //grab the chord page
            string chordPageUrl = "http://www.scales-chords.com/" + theChord.ChordNameURL;
            HtmlAgilityPack.HtmlDocument chordPage = web.Load(chordPageUrl);
            HtmlNode chordMoreInfoTable = chordPage.DocumentNode.SelectSingleNode("/html/body/div[@id='wrapper']/div[@id='body']/div[@id='left']/div[@id='visit']/center/table[1]/tbody");
            if (chordMoreInfoTable == null)
            {
                throw new InvalidOperationException("Chord info table not found in " + chordPageUrl);
            }

            HtmlNodeCollection chordMoreInfoRows = chordMoreInfoTable.SelectNodes("tr");
            if (chordMoreInfoRows == null)
            {
                throw new InvalidOperationException("Chord info table has no rows in " + chordPageUrl);
            }

            theChord.Notes = chordMoreInfoRows[3].SelectNodes("td")[1].InnerText;
            theChord.Structure = chordMoreInfoRows[4].SelectNodes("td")[1].InnerText;
            theChord.BelongsTo = chordMoreInfoRows[6].SelectNodes("td")[1].InnerText;

            HtmlNodeCollection variationHTML = chordPage.DocumentNode.SelectNodes("/html/body/div[@id='wrapper']/div[@id='body']/div[@id='left']/div[@id='visit']/center/b");
            if (variationHTML == null)
            {
                // No <b> nodes matched, so this chord page lists no variations.
                continue;
            }

            // The <b> nodes are consumed in pairs starting at index 1: the node after the first
            // <b> of a pair holds the notation, the node after the second holds the difficulty.
            // Requiring iVariation + 1 to be in range avoids indexing past the end on a
            // dangling, unpaired node.
            for (int iVariation = 1; iVariation + 1 < variationHTML.Count; iVariation += 2)
            {
                Variation theVariation = new Variation();
                theVariation.Notation = variationHTML[iVariation].NextSibling.InnerHtml;
                theVariation.Difficuty = variationHTML[iVariation + 1].NextSibling.InnerText;

                // Element 0 of the split is not used; elements 1..6 map to strings 1..6.
                // A notation with fewer tokens fills only the strings it has, which is what
                // the former empty catch block achieved by swallowing the index error.
                string[] theStrings = theVariation.Notation.Split(' ');
                if (theStrings.Length > 1) { theVariation.String1 = theStrings[1]; }
                if (theStrings.Length > 2) { theVariation.String2 = theStrings[2]; }
                if (theStrings.Length > 3) { theVariation.String3 = theStrings[3]; }
                if (theStrings.Length > 4) { theVariation.String4 = theStrings[4]; }
                if (theStrings.Length > 5) { theVariation.String5 = theStrings[5]; }
                if (theStrings.Length > 6) { theVariation.String6 = theStrings[6]; }

                theChord.Variations.Add(theVariation);
                Console.WriteLine(theChord.ChordNameText + " : " + theVariation.Notation);
            }
        }
    }

    this.SaveToDisk("C:\\chords.xml");
}
谢谢
担