3

我正在尝试使用 cheerio 和 request 从页面中抓取数据,做法如下:

我正在努力弄清楚该如何做到这一点(注意,我只熟悉用 Node.js 和 cheerio/request 来完成这类任务,即使这种做法可能并不优雅;所以我并不是在寻找其他替代的库或语言,抱歉)。我想我漏掉了什么,因为我甚至想不出这怎么可能实现。


编辑

让我换一种方式试试。这是代码的第一部分:

    var request = require('request'),
    cheerio = require('cheerio');

    // Fetch the first search-result page (n=1, s=0) and read, from the XML
    // response, the id of the returned work and the total number of matches.
    request('http://api.trove.nla.gov.au/result?key=6k6oagt6ott4ohno&zone=book&l-advformat=Thesis&sortby=dateDesc&q=+date%3A[2000+TO+2014]&l-availability=y&l-australian=y&n=1&s=0', function(error, response, html) {

        if (error || response.statusCode !== 200) {
            // Report the failure instead of silently doing nothing.
            console.error('Search request failed:', error || response.statusCode);
            return;
        }

        var $ = cheerio.load(html, {
          xmlMode: true
        });

        var id = $('work').attr('id');
        // `total` is an attribute of the <records> element (plural); selecting
        // 'record' matches nothing and yields undefined. Attribute values are
        // strings, so convert to a number before using it as a loop bound.
        var total = parseInt($('records').attr('total'), 10);
    });

返回的第一个页面如下所示

<response>
  <query>date:[2000 TO 2014]</query>
  <zone name="book">
    <records s="0" n="1" total="69977" next="/result?l-advformat=Thesis&sortby=dateDesc&q=+date%3A%5B2000+TO+2014%5D&l-availability=y&l-australian=y&n=1&zone=book&s=1">
      <work id="189231549" url="/work/189231549">
        <troveUrl>http://trove.nla.gov.au/work/189231549</troveUrl>
        <title>
        Design of physiological control and magnetic levitation systems for a total artificial heart
        </title>
        <contributor>Greatrex, Nicholas Anthony</contributor>
        <issued>2014</issued>
        <type>Thesis</type>
        <holdingsCount>1</holdingsCount>
        <versionCount>1</versionCount>
        <relevance score="0.001961126">vaguely relevant</relevance>
        <identifier type="url" linktype="fulltext">http://eprints.qut.edu.au/65642/</identifier>
      </work>
    </records>
  </zone>
</response>

上面的 URL 需要把 s 依次递增(s=0、s=1,依此类推),一共 total 次。第一个请求得到的 id 需要填入下面第二个请求的 URL 中:

// Fetch the full record of the work whose id was read from the search response.
// Every URL segment uses single quotes so that `id` is really concatenated in;
// mixing ' and " here would leave `+(id)+` as literal text inside one string.
request('http://api.trove.nla.gov.au/work/' + id + '?key=6k6oagt6ott4ohno&reclevel=full', function(error, response, html) {

    if (error || response.statusCode !== 200) {
        // Report the failure instead of silently doing nothing.
        console.error('Work request failed:', error || response.statusCode);
        return;
    }

    var $ = cheerio.load(html, {
      xmlMode: true
    });

    //extract data here etc.

});

例如,当使用第一个请求返回的 id="189231549" 时,第二个返回的页面如下所示

<work id="189231549" url="/work/189231549">
  <troveUrl>http://trove.nla.gov.au/work/189231549</troveUrl>
  <title>
    Design of physiological control and magnetic levitation systems for a total artificial heart
  </title>
  <contributor>Greatrex, Nicholas Anthony</contributor>
  <issued>2014</issued>
  <type>Thesis</type>
  <subject>Total Artificial Heart</subject>
  <subject>Magnetic Levitation</subject>
  <subject>Physiological Control</subject>
  <abstract>
    Total Artificial Hearts are mechanical pumps which can be used to replace the failing natural heart. This novel study developed a means of controlling a new design of pump to reproduce physiological flow bringing closer the realisation of a practical artificial heart. Using a mathematical model of the device, an optimisation algorithm was used to determine the best configuration for the magnetic levitation system of the pump. The prototype device was constructed and tested in a mock circulation loop. A physiological controller was designed to replicate the Frank-Starling like balancing behaviour of the natural heart. The device and controller provided sufficient support for a human patient while also demonstrating good response to various physiological conditions and events. This novel work brings the design of a practical artificial heart closer to realisation.
  </abstract>
  <language>English</language>
  <holdingsCount>1</holdingsCount>
  <versionCount>1</versionCount>
  <tagCount>0</tagCount>
  <commentCount>0</commentCount>
  <listCount>0</listCount>
  <identifier type="url" linktype="fulltext">http://eprints.qut.edu.au/65642/</identifier>
</work>

所以我现在的问题是如何将这两个部分(循环)联系在一起以实现结果(下载并解析大约 70000 页)?

我不知道如何在 JavaScript 中为 Node.js 编写代码。我是 JavaScript 新手

4

2 回答 2

1

您可以通过研究现有的著名网站复制器(闭源或开源)来了解如何做到这一点

例如:先使用 http://www.tenmax.com/teleport/pro/home.htm 的试用版来抓取您的页面,然后再用 http://www.httrack.com 尝试同样的操作,这样您应该就能比较清楚地了解它们是怎么做的(以及该如何实现)。

关键的编程概念是查找缓存(lookup cache)和任务队列(task queue)。

如果您的解决方案需要很好地扩展到多个 Node.js 工作进程和大量页面,那么递归在这里并不是一个好的方案。

编辑:澄清评论后

在您开始把抓取引擎改造成更具可扩展性的架构之前,作为一名 Node.js 新手开发者,您可以先简单地使用由 @lucio-m-tato 创建的 wait.for 包,它为 Node.js 的回调地狱提供了一种同步风格的替代方案。

下面的代码配合您提供的链接,在我这里可以正常运行:

var request = require('request');
var cheerio = require('cheerio');
var wait = require("wait.for");

/**
 * Adapts `request` to a plain (err, result) callback contract.
 *
 * - transport error: callback(error, response)
 * - status 200:      callback(null, body)
 * - any other status: callback(Error("Status not 200 OK"), response)
 *
 * @param {string} url - address handed to `request` unchanged
 * @param {function} callback - receives (err, bodyOrResponse)
 */
function requestWaitForWrapper(url, callback) {
  request(url, function(err, res, body) {
    // Transport-level failure: forward it together with whatever response exists.
    if (err) {
      callback(err, res);
      return;
    }

    // Anything other than 200 is reported as an error, with the response attached.
    if (res.statusCode != 200) {
      callback(new Error("Status not 200 OK"), res);
      return;
    }

    callback(null, body);
  });
}

/**
 * Fetches one search-result page and extracts its paging data.
 *
 * The request goes through wait.for, so the call returns the response body
 * directly; it presumably has to run inside a fiber started with
 * wait.launchFiber (confirm against the wait.for documentation).
 *
 * @param {string} baseUrl - search URL without the `s` parameter
 * @param {number} s - value appended to the URL as `&s=<s>`
 * @returns {{s: number, id: (string|undefined), total: number}} the `s` that
 *   was passed in, the `id` attribute of the <work> element, and the `total`
 *   attribute of the <records> element as a number (NaN when it is absent)
 */
function readBookInfo(baseUrl, s) {
  var html = wait.for(requestWaitForWrapper, baseUrl + '&s=' + s);
  var $ = cheerio.load(html, {
    xmlMode: true
  });

  return {
    s: s,
    id: $('work').attr('id'),
    // Always pass the radix so the attribute is read as a decimal count and
    // never reinterpreted from a prefix such as "0x".
    total: parseInt($('records').attr('total'), 10)
  };
}

/**
 * Fetches the full record of a single work and extracts its title and
 * contributor.
 *
 * The request goes through wait.for, so the call returns the response body
 * directly; it presumably has to run inside a fiber started with
 * wait.launchFiber (confirm against the wait.for documentation).
 *
 * @param {string|number} id - work id placed into the /work/<id> URL
 * @returns {{title: string, contributor: string}} text of the <title> and
 *   <contributor> elements with surrounding whitespace removed
 * @throws {TypeError} when `id` is null or undefined
 */
function readWorkInfo(id) {
  if (id === undefined || id === null) {
    throw new TypeError('readWorkInfo: a work id is required');
  }

  var html = wait.for(requestWaitForWrapper, 'http://api.trove.nla.gov.au/work/' + String(id) + '?key=6k6oagt6ott4ohno&reclevel=full');
  var $ = cheerio.load(html, {
    xmlMode: true
  });

  return {
    // The element text arrives wrapped in newlines and indentation; trim it so
    // each value fits on a single delimiter-separated output line.
    title: $('title').text().trim(),
    contributor: $('contributor').text().trim()
  };
}

/**
 * Walks every search result one record at a time and prints
 * "<id>;<contributor>;<title>" for each work.
 *
 * The page at offset 0 provides the total record count and is also used as the
 * first record, so it is requested only once. Offsets whose response carries
 * no work id are reported on stderr and skipped, so one gap does not abort the
 * whole run.
 */
function main() {
  var baseBookUrl = 'http://api.trove.nla.gov.au/result?key=6k6oagt6ott4ohno&zone=book&l-advformat=Thesis&sortby=dateDesc&q=+date%3A[2000+TO+2014]&l-availability=y&l-australian=y&n=1';
  var firstPage = readBookInfo(baseBookUrl, 0);

  for (var s = 0; s < firstPage.total; s++) {
    // Reuse the page already fetched for offset 0 instead of requesting it again.
    var bookInfo = s === 0 ? firstPage : readBookInfo(baseBookUrl, s);

    if (bookInfo.id === undefined || bookInfo.id === null) {
      console.error('No work id found at offset ' + s + '; skipping');
      continue;
    }

    var workInfo = readWorkInfo(bookInfo.id);
    console.log(bookInfo.id + ";" + workInfo.contributor + ";" + workInfo.title);
  }
}

// Start main() through wait.launchFiber rather than calling it directly; the
// wait.for calls made by its helpers presumably need to run inside a fiber
// (confirm against the wait.for documentation).
wait.launchFiber(main);
于 2014-08-05T04:13:43.203 回答
0

您可以使用附加的异步模块来处理多个请求并通过多个页面进行迭代。在此处阅读有关异步的更多信息 https://github.com/caolan/async

于 2015-08-15T15:02:54.977 回答