1

我希望有人能指出我(可能是愚蠢的)这个脚本的问题。我正在尝试爬取网站以获取该网站上的帖子并将其加载到 XML 文档中。我尝试结合几个示例脚本 - 爬虫和 nytimes 示例。

该脚本运行没有错误,但只<edublogs date="02.10.2015"></edublogs>导出标签。

在此先感谢您的帮助。

<?xml version="1.0" encoding="UTF-8"?>

<config charset="UTF-8">

<!-- set initial page -->
<var-def name="home"><<SNIPPED>></var-def>

<!-- define script functions and variables -->
<script><![CDATA[
    /* checks if specified URL is valid for download */
    boolean isValidUrl(String url) {
        String urlSmall = url.toLowerCase();
        return urlSmall.startsWith("http://<<SNIPPED>>/") || urlSmall.startsWith("https://<<SNIPPED>>/");
    }

    /* set of unvisited URLs */
    Set unvisited = new HashSet();
    unvisited.add(home);

    /* pushes to web-harvest context initial set of unvisited pages */
    SetContextVar("unvisitedVar", unvisited);

    /* set of visited URLs */
    Set visited = new HashSet();
]]></script>

<file action="write" path="posts${sys.date()}.xml" charset="UTF-8">
    <template>
        <![CDATA[ <allposts date="${sys.datetime("dd.MM.yyyy")}"> ]]>
    </template>

    <!-- loop while there are any unvisited links -->
    <while condition="${unvisitedVar.toList().size() != 0}">
        <loop item="currUrl">
            <list>
                <var name="unvisitedVar"/>
            </list>
            <body>
                <empty>
                    <!-- Get page content -->
                    <var-def name="content">
                        <html-to-xml>
                            <http url="${currUrl}"/>
                        </html-to-xml>
                    </var-def>
                    <!-- Get variables -->
                    <xquery>
                    <xq-param name="doc">
                            <var name="content"/>
                    </xq-param>
                    <xq-expression><![CDATA[
                        declare variable $doc as node() external;

                        let $title := data($doc//h1)
                        let $text := data($doc//div[@class="post-entry"])
                        let $categories := data($doc//div[@class="post-data"])
                            return 
                            <post>
                                <title>{data($title)}</title>
                                <url>$(currUrl)</url>
                                <text>{data($text)}</text>
                                <categories>{data($categories)}</categories>
                            </post>
                        ]]></xq-expression>
                    </xquery>

                    <!-- adds current URL to the list of visited -->
                    <script><![CDATA[
                        visited.add(sys.fullUrl(home, currUrl));
                        Set newLinks = new HashSet();
                    ]]></script>

                    <!-- loop through all collected links on the downloaded page -->
                    <loop item="currLink">
                        <list>
                            <xpath expression="//a/@href">
                                <var name="content"/>
                            </xpath>
                        </list>
                        <body>
                            <script><![CDATA[
                                String fullLink = sys.fullUrl(home, currLink);
                                fullLink = fullLink.replaceAll("#.*","");
                                if ( isValidUrl(fullLink.toString()) && !visited.contains(fullLink) && !unvisitedVar.toList().contains(fullLink) && !fullLink.endsWith(".png") ) {
                                    newLinks.add(fullLink);
                                }
                            ]]></script>
                        </body>
                    </loop>
                </empty>
            </body>
        </loop>

        <!-- unvisited link are now all the collected new links from downloaded pages  -->
        <script><![CDATA[
             SetContextVar("unvisitedVar", newLinks);
        ]]></script>
    </while>
    <![CDATA[ </posts> ]]>
</file>

4

1 回答 1

0

这是因为你while不返回任何东西。很可能是因为你已经包围了body-empty这将强制不返回任何结果(参见手册)。它设置变量等,但不会将任何内容返回到“控制台”以file进行打印。

于 2015-11-13T16:48:32.637 回答