1

我正在使用 Web-Harvest 工具从几个网站上抓取 Web 数据。我已经浏览了官方示例,但没有找到先在网站上完成身份验证、再从中抓取数据的方法。

有人能提供一个通过身份验证后再抓取网页数据的示例配置吗?应该如何发送登录参数,然后获取主页内容?感谢您的帮助。

4

1 回答 1

0

我刚刚修改了 Web-Harvest 的一个示例(http://web-harvest.sourceforge.net/samples.php?num=4),它使用登录凭据可以正常运行。您可以参考下面修改后的配置并尝试:

<?xml version="1.0" encoding="UTF-8"?>

<!--
    Web-Harvest configuration: posts a login form to nytimes.com, then
    fetches the "today's paper" index page and writes every story block
    found on it into a dated XML file.
-->
<config charset="ISO-8859-1">

    <!--
        Step 1: send the POST request carrying the login form fields.
        NOTE(review): the later <http> call is presumed to reuse the session
        cookies set by this response; confirm against the Web-Harvest manual.
        NOTE(review): USERID/PASSWORD are the sample placeholders and the form
        is posted over plain http; substitute real credentials and prefer an
        https login URL if the site offers one.
    -->
    <http method="post" url="http://www.nytimes.com/auth/login">
        <http-param name="is_continue">true</http-param>
        <http-param name="URI">http://</http-param>
        <http-param name="OQ"/>
        <http-param name="OP"/>
        <http-param name="USERID">web-harvest</http-param>
        <http-param name="PASSWORD">web-harvest</http-param>
    </http>

    <!-- Step 2: the page that is scraped once the login request has been sent. -->
    <var-def name="startUrl">http://www.nytimes.com/pages/todayspaper/index.html</var-def>

    <!--
        Step 3: write the result to a file whose name contains the current date.
        The opening root tag sits inside <template> so that its ${...}
        expression is expanded; the closing tag has no expression and is
        plain text. The root element name is spelled exactly as in the
        original sample so that both tags match.
        NOTE(review): the path is an absolute Windows path; adjust it for
        the machine the scraper runs on.
    -->
    <file action="write" path="D:/nytimes/nytimes${sys.date()}.xml" charset="UTF-8">
        <template>
            <![CDATA[ <newyourk_times date="${sys.datetime("dd.MM.yyyy")}"> ]]>
        </template>

        <loop item="storyNode">
            <!-- selects every div with class "story" on the start page -->
            <list>
                <xpath expression="//div[@class='story']">
                    <html-to-xml>
                        <http url="${startUrl}"/>
                    </html-to-xml>
                </xpath>
            </list>

            <!--
                emits each selected story element unchanged; no per-article
                page is downloaded here, the XQuery only returns its input
            -->
            <body>
                <xquery>
                    <xq-param name="doc">
                        <var name="storyNode"/>
                    </xq-param>
                    <xq-expression><![CDATA[
                        declare variable $doc as node() external;
                        $doc
                    ]]></xq-expression>
                </xquery>
            </body>
        </loop>

        <![CDATA[ </newyourk_times> ]]>
    </file>

</config>
于 2014-04-15T09:44:26.490 回答