我正在使用敏捷包从网站上抓取数据以获取航班信息。我可以获取包含详细信息的航班列表,但我无法获取航班日期,因为数据位于单个表中并以不同的日期分隔。
下面是我试图抓取的表格示例
<table width="100%" border="0" cellspacing="0" cellpadding="0">
<tr>
<td class="schedulehead" width="14%">AIRLINE</td>
<td class="schedulehead">FLIGHT</td>
<td class="schedulehead">Origin</td>
<td class="schedulehead">TIME</td>
<td class="schedulehead">ESTIMATED</td>
<td class="schedulehead" width="14%">STATUS</td>
</tr>
<tr>
<td colspan="6" class="sumheadtop"> Thursday 11 April 2013</td>
</tr>
<tr>
<td colspan="6" class="sumheadbot"> PASSENGER ARRIVALS | DOMESTIC & INTERNATIONAL | All Airlines | ALL OriginS</td>
</tr>
<tr class="schedulerow" style="height:2px"><td colspan="6"></td></tr>
<tr class="schedulerow" valign="top">
<td class="airline"><img src="/webfids/images/ul.gif" width="100" height="24" vspace="0" alt="Air Lanka"/></td>
<td class="flight" nowrap>UL 103</td>
<td class="city">Colombo</td>
<td class="time">19:25</td>
<td class="estimated">19:13</td>
<td class="status"><div class="statusone">LANDED</div></td>
</tr>
<tr class="schedulerow" valign="top">
<td class="airline"><img src="/webfids/images/mj.gif" width="100" height="24" vspace="0" alt="Other"/></td>
<td class="flight" nowrap>MJ 1104</td>
<td class="city"></td>
<td class="time"></td>
<td class="estimated"> </td>
<td class="status"> </td>
</tr>
<tr class="schedulerow" valign="top">
<td class="airline"><img src="/webfids/images/ey.gif" width="100" height="24" vspace="0" alt="Other"/></td>
<td class="flight" nowrap>EY 7400</td>
<td class="city"></td>
<td class="time"></td>
<td class="estimated"> </td>
<td class="status"> </td>
</tr>
<tr class="schedulerow" style="height:2px"><td colspan="6"></td></tr>
<tr>
<td colspan="6" class="sumheadtop"> Friday 12 April 2013</td>
</tr>
<tr>
<td colspan="6" class="sumheadbot"> PASSENGER ARRIVALS | DOMESTIC & INTERNATIONAL | All Airlines | ALL OriginS</td>
</tr>
<tr class="schedulerowtwo" style="height:2px"><td colspan="6"></td></tr>
<tr class="schedulerowtwo" valign="top">
<td class="airline"><img src="/webfids/images/fz.gif" width="100" height="24" vspace="0" alt="Other"/></td>
<td class="flight" nowrap>FZ 561</td>
<td class="city">Dubai</td>
<td class="time">24:45</td>
<td class="estimated"> </td>
<td class="status"> </td>
</tr>
<tr class="schedulerowtwo" style="height:2px"><td colspan="6"></td></tr>
<tr class="schedulerow" style="height:2px"><td colspan="6"></td></tr>
<tr class="schedulerow" valign="top">
<td class="airline"><img src="/webfids/images/ul.gif" width="100" height="24" vspace="0" alt="Air Lanka"/></td>
<td class="flight" nowrap>UL 105</td>
<td class="city">Colombo</td>
<td class="time">01:55</td>
<td class="estimated"> </td>
<td class="status"> </td>
</tr>
<tr class="schedulerow" valign="top">
<td class="airline"><img src="/webfids/images/mh.gif" width="100" height="24" vspace="0" alt="Malaysia A/l"/></td>
<td class="flight" nowrap>MH 9706</td>
<td class="city"></td>
<td class="time"></td>
<td class="estimated"> </td>
<td class="status"> </td>
</tr>
<tr class="schedulerowtwo" style="height:2px"><td colspan="6"></td></tr>
<tr>
<td colspan="6" class="sumheadtop"> Saturday 13 April 2013</td>
</tr>
<tr>
<td colspan="6" class="sumheadbot"> PASSENGER ARRIVALS | DOMESTIC & INTERNATIONAL | All Airlines | ALL OriginS</td>
</tr>
<tr class="schedulerow" style="height:2px"><td colspan="6"></td></tr>
<tr class="schedulerow" valign="top">
<td class="airline"><img src="/webfids/images/ul.gif" width="100" height="24" vspace="0" alt="Air Lanka"/></td>
<td class="flight" nowrap>UL 107</td>
<td class="city">Colombo</td>
<td class="time">24:55</td>
<td class="estimated"> </td>
<td class="status"> </td>
</tr>
<tr class="schedulerowtwo" style="height:2px"><td colspan="6"></td></tr>
</table>
目前,我将其用作命令行代码,并遵循我当前使用的代码。
public void get_schedule()
{
string local_fname = "nhl-2010-2011.htm";
var schedule_doc = new HtmlAgilityPack.HtmlDocument();
schedule_doc.Load(local_fname);
// identify all the td nodes that directly contain the text "Date"
var ruw_nodes = schedule_doc.DocumentNode.Descendants()
.Where(n => n.Name == "tr")
.Where(n => n.GetAttributeValue("valign", null) == "top");
foreach (var row_node in ruw_nodes)
{
var div_nodes = row_node.Elements("td").ToList();
var airline_node = div_nodes.Where(n => n.GetAttributeValue("class", null) == "airline").FirstOrDefault();
var flight_node = div_nodes.Where(n => n.GetAttributeValue("class", null) == "flight").FirstOrDefault();
var city_nodes = div_nodes.Where(n => n.GetAttributeValue("class", null) == "city").FirstOrDefault();
var time_node = div_nodes.Where(n => n.GetAttributeValue("class", null) == "time").FirstOrDefault();
var est_node = div_nodes.Where(n => n.GetAttributeValue("class", null) == "estimated").FirstOrDefault();
var status_node = div_nodes.Where(n => n.GetAttributeValue("class", null) == "status").FirstOrDefault();
string logo = airline_node.Element("img").Attributes["src"].Value;
string airline = airline_node.Element("img").Attributes["alt"].Value;
string flight = flight_node != null ? flight_node.InnerText : "";
flight = clean_text(flight);
string city = city_nodes != null ? city_nodes.InnerText : "";
city = clean_text(city);
string time = time_node != null ? time_node.InnerText : "";
time = clean_text(time);
string est = est_node != null ? est_node.InnerText : "";
est = clean_text(est);
string status = status_node != null ? status_node.InnerText : "";
status = clean_text(status);
System.Console.WriteLine(" {0} | {1} | {2} | {3}", airline, flight, city, time);
}
System.Console.ReadKey();
}
public string strip_excessive_whitespace(string s)
{
char[] seps = { '\n', '\t', ' ' };
string[] pieces = Enumerable.ToArray<string>(s.Split().Where(x => x.Length > 0));
string r = System.String.Join(" ", pieces);
return r;
}
public string clean_text(string s)
{
string r = s;
r = s.Replace(" ", " ");
r = strip_excessive_whitespace(r);
r = r.Trim();
return r;
}
我想像这样获取数据
- 日期 1
- 飞行1次
- 飞行2次
- 日期 2
- 飞行1次
- 飞行2次
或者像这样
- 日期 1 航班 1 时间
- 日期 1 航班 2 时间
- 日期 2 航班 1 时间
- 日期 2 航班 2 时间