1

我正在使用敏捷包从网站上抓取数据以获取航班信息。我可以获取包含详细信息的航班列表,但我无法获取航班日期,因为数据位于单个表中并以不同的日期分隔。

下面是我试图抓取的表格示例

<table width="100%" border="0" cellspacing="0" cellpadding="0">
<tr>
    <td class="schedulehead" width="14%">AIRLINE</td>
    <td class="schedulehead">FLIGHT</td>
    <td class="schedulehead">Origin</td>
    <td class="schedulehead">TIME</td>
    <td class="schedulehead">ESTIMATED</td>
    <td class="schedulehead" width="14%">STATUS</td>
</tr>

<tr>
    <td colspan="6" class="sumheadtop"> Thursday 11 April 2013</td>
</tr>
<tr>
    <td colspan="6" class="sumheadbot">&nbsp;PASSENGER ARRIVALS | DOMESTIC &amp; INTERNATIONAL | All Airlines | ALL OriginS</td>
</tr>
<tr class="schedulerow" style="height:2px"><td colspan="6"></td></tr>
<tr class="schedulerow" valign="top">
    <td class="airline"><img src="/webfids/images/ul.gif" width="100" height="24" vspace="0" alt="Air Lanka"/></td>
    <td class="flight" nowrap>UL 103</td>
    <td class="city">Colombo</td>
    <td class="time">19:25</td>
    <td class="estimated">19:13</td>
    <td class="status"><div class="statusone">LANDED</div></td>
</tr>
<tr class="schedulerow" valign="top">
    <td class="airline"><img src="/webfids/images/mj.gif" width="100" height="24" vspace="0" alt="Other"/></td>
    <td class="flight" nowrap>MJ 1104</td>
    <td class="city"></td>
    <td class="time"></td>
    <td class="estimated">&nbsp;</td>
    <td class="status">&nbsp;</td>
</tr>
<tr class="schedulerow" valign="top">
    <td class="airline"><img src="/webfids/images/ey.gif" width="100" height="24" vspace="0" alt="Other"/></td>
    <td class="flight" nowrap>EY 7400</td>
    <td class="city"></td>
    <td class="time"></td>
    <td class="estimated">&nbsp;</td>
    <td class="status">&nbsp;</td>
</tr>

<tr class="schedulerow" style="height:2px"><td colspan="6"></td></tr>
<tr>
    <td colspan="6" class="sumheadtop"> Friday 12 April 2013</td>
</tr>
<tr>
    <td colspan="6" class="sumheadbot">&nbsp;PASSENGER ARRIVALS | DOMESTIC &amp; INTERNATIONAL | All Airlines | ALL OriginS</td>
</tr>
<tr class="schedulerowtwo" style="height:2px"><td colspan="6"></td></tr>
<tr class="schedulerowtwo" valign="top">
    <td class="airline"><img src="/webfids/images/fz.gif" width="100" height="24" vspace="0" alt="Other"/></td>
    <td class="flight" nowrap>FZ 561</td>
    <td class="city">Dubai</td>
    <td class="time">24:45</td>
    <td class="estimated">&nbsp;</td>
    <td class="status">&nbsp;</td>
</tr>
<tr class="schedulerowtwo" style="height:2px"><td colspan="6"></td></tr>
<tr class="schedulerow" style="height:2px"><td colspan="6"></td></tr>
<tr class="schedulerow" valign="top">
    <td class="airline"><img src="/webfids/images/ul.gif" width="100" height="24" vspace="0" alt="Air Lanka"/></td>
    <td class="flight" nowrap>UL 105</td>
    <td class="city">Colombo</td>
    <td class="time">01:55</td>
    <td class="estimated">&nbsp;</td>
    <td class="status">&nbsp;</td>
</tr>
<tr class="schedulerow" valign="top">
    <td class="airline"><img src="/webfids/images/mh.gif" width="100" height="24" vspace="0" alt="Malaysia A/l"/></td>
    <td class="flight" nowrap>MH 9706</td>
    <td class="city"></td>
    <td class="time"></td>
    <td class="estimated">&nbsp;</td>
    <td class="status">&nbsp;</td>
</tr>
<tr class="schedulerowtwo" style="height:2px"><td colspan="6"></td></tr>
<tr>
    <td colspan="6" class="sumheadtop"> Saturday 13 April 2013</td>
</tr>
<tr>
    <td colspan="6" class="sumheadbot">&nbsp;PASSENGER ARRIVALS | DOMESTIC &amp; INTERNATIONAL | All Airlines | ALL OriginS</td>
</tr>
<tr class="schedulerow" style="height:2px"><td colspan="6"></td></tr>
<tr class="schedulerow" valign="top">
    <td class="airline"><img src="/webfids/images/ul.gif" width="100" height="24" vspace="0" alt="Air Lanka"/></td>
    <td class="flight" nowrap>UL 107</td>
    <td class="city">Colombo</td>
    <td class="time">24:55</td>
    <td class="estimated">&nbsp;</td>
    <td class="status">&nbsp;</td>
</tr>
<tr class="schedulerowtwo" style="height:2px"><td colspan="6"></td></tr>
</table>

目前,我将其用作命令行代码,并遵循我当前使用的代码。

public void get_schedule()
    {
        string local_fname = "nhl-2010-2011.htm";

        var schedule_doc = new HtmlAgilityPack.HtmlDocument();
        schedule_doc.Load(local_fname);

        // identify all the td nodes that directly contain the text "Date"
        var ruw_nodes = schedule_doc.DocumentNode.Descendants()
                     .Where(n => n.Name == "tr")
                     .Where(n => n.GetAttributeValue("valign", null) == "top");

        foreach (var row_node in ruw_nodes)
        {


            var div_nodes = row_node.Elements("td").ToList();

            var airline_node = div_nodes.Where(n => n.GetAttributeValue("class", null) == "airline").FirstOrDefault();
            var flight_node = div_nodes.Where(n => n.GetAttributeValue("class", null) == "flight").FirstOrDefault();
            var city_nodes = div_nodes.Where(n => n.GetAttributeValue("class", null) == "city").FirstOrDefault();
            var time_node = div_nodes.Where(n => n.GetAttributeValue("class", null) == "time").FirstOrDefault();
            var est_node = div_nodes.Where(n => n.GetAttributeValue("class", null) == "estimated").FirstOrDefault();
            var status_node = div_nodes.Where(n => n.GetAttributeValue("class", null) == "status").FirstOrDefault();

            string logo = airline_node.Element("img").Attributes["src"].Value;

            string airline = airline_node.Element("img").Attributes["alt"].Value;

            string flight = flight_node != null ? flight_node.InnerText : "";
            flight = clean_text(flight);

            string city = city_nodes != null ? city_nodes.InnerText : "";
            city = clean_text(city);

            string time = time_node != null ? time_node.InnerText : "";
            time = clean_text(time);

            string est = est_node != null ? est_node.InnerText : "";
            est = clean_text(est);

            string status = status_node != null ? status_node.InnerText : "";
            status = clean_text(status);

            System.Console.WriteLine(" {0} | {1} | {2} | {3}", airline, flight, city, time);
        }



        System.Console.ReadKey();
    }
    public string strip_excessive_whitespace(string s)
    {
        char[] seps = { '\n', '\t', ' ' };
        string[] pieces = Enumerable.ToArray<string>(s.Split().Where(x => x.Length > 0));
        string r = System.String.Join(" ", pieces);
        return r;
    }
    public string clean_text(string s)
    {
        string r = s;
        r = s.Replace("&nbsp;", " ");
        r = strip_excessive_whitespace(r);
        r = r.Trim();
        return r;

    }

我想像这样获取数据

  • 日期 1
  • 飞行1次
  • 飞行2次
  • 日期 2
  • 飞行1次
  • 飞行2次

或者像这样

  • 日期 1 航班 1 时间
  • 日期 1 航班 2 时间
  • 日期 2 航班 1 时间
  • 日期 2 航班 2 时间
4

0 回答 0