0

我正在尝试解析包含站点地图列表的站点地图索引。

我成功解析了 sitemapindex.xml 并获得了 .gz 链接列表;但我想知道将它们打开为 xml 的最佳方式是什么?

        // Load the sitemap index straight from its URL.
        String sitemap = "http://www.site.com/siteindex.xml";
        XmlDocument xml = new XmlDocument();
        xml.Load(sitemap);
        // Bind the prefix "s" to the root element's namespace so the XPath query below can match namespaced elements.
        XmlNamespaceManager manager = new XmlNamespaceManager(xml.NameTable);
        manager.AddNamespace("s", xml.DocumentElement.NamespaceURI); // Use the document's own namespace URI instead of a hard-coded one
        XmlNodeList xnList = xml.SelectNodes("/s:sitemapindex/s:sitemap", manager);

        var parallelLoop1 = xnList.Count;
        // Process every <sitemap> entry in parallel; parOptions is declared outside this snippet.
        Parallel.For(0, parallelLoop1, parOptions, index =>
        {
            // URL of the child sitemap, taken from the entry's <loc> child element.
            String NAME = xnList[index]["loc"].InnerText;
            System.Net.HttpWebRequest req = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(NAME);
            req.Timeout = 1000 * 60 * 60; // one hour, expressed in milliseconds
            System.Net.WebResponse res = req.GetResponse();
            Stream responseStream = res.GetResponseStream();
            XmlDocument xml2 = new XmlDocument();
            // The raw response stream is handed to the XML parser without any decompression step.
            xml2.Load(responseStream); // this is where it fails: the file is .gz, but XML is expected
            responseStream.Close();
    ......... more code
        }
4

1 回答 1

0

这就是我解决它的方法:

            // Declared outside the using block so the parsed document stays available afterwards.
            XmlDocument xml2 = new XmlDocument();
            // The sitemap file is a .gz archive, so decompress it on the fly before XML parsing.
            // Disposing the GZipStream also closes the underlying response stream.
            using (GZipStream zip = new GZipStream(responseStream, CompressionMode.Decompress))
            {
                xml2.Load(zip);
            }

这是我的最终代码:

    // Load the sitemap index, then download and parse every gzip-compressed child sitemap it lists.
    String sitemap = "http://www.site.com/siteindex.xml";
    XmlDocument xml = new XmlDocument();
    xml.Load(sitemap);
    // Bind the prefix "s" to the root element's namespace so the XPath query can match namespaced elements.
    XmlNamespaceManager manager = new XmlNamespaceManager(xml.NameTable);
    manager.AddNamespace("s", xml.DocumentElement.NamespaceURI); // Use the document's own namespace URI instead of a hard-coded one
    XmlNodeList xnList = xml.SelectNodes("/s:sitemapindex/s:sitemap", manager);

    var parallelLoop1 = xnList.Count;
    // parOptions is declared outside this snippet.
    Parallel.For(0, parallelLoop1, parOptions, index =>
    {
        // URL of the child sitemap, taken from the entry's <loc> child element.
        String NAME = xnList[index]["loc"].InnerText;
        System.Net.HttpWebRequest req = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(NAME);
        req.Timeout = 1000 * 60 * 60; // one hour, expressed in milliseconds

        // Declared outside the using blocks so the parsed document stays available to the code that follows.
        XmlDocument xml2 = new XmlDocument();

        // Dispose the response and both streams even when the download or parse throws;
        // otherwise a failed sitemap would leak its connection.
        using (System.Net.WebResponse res = req.GetResponse())
        using (Stream responseStream = res.GetResponseStream())
        using (GZipStream zip = new GZipStream(responseStream, CompressionMode.Decompress))
        {
            // The child sitemap is a .gz file, so decompress it on the fly before XML parsing.
            xml2.Load(zip);
        }
......... more code
    });
于 2012-07-25T17:09:43.433 回答