1

我正在尝试使用名为“web-scraper”(https://apify.com/apify/web-scraper)的 Apify actor,从 https://en.wikipedia.org/wiki/List_of_hedge_funds 抓取 URL。

具体来说,我正在尝试使用以下 Apify pageFunction 来抓取该目标页面,并从 HTML 中存在的锚标记返回 URL 列表。

页面函数
/**
 * Page function for the scraper: reads the page title and runs the anchor
 * selector against the loaded page.
 *
 * @param {object} context - Object handed to the page function; this code
 *   reads `context.jQuery` and `context.request.url` from it.
 * @returns {Promise<{url: string, pageTitle: string, anchorTag: object}>}
 *   The request URL, the text of the first <title> element, and whatever the
 *   selector query returned (passed through unchanged).
 */
async function pageFunction( context ) {
    // Not read below; the returned `url` comes from context.request instead.
    const targetUrl = 'https://en.wikipedia.org/wiki/List_of_hedge_funds';
    const linkSelector = 'tr > td > a';

    const { jQuery: query, request } = context;

    // Title first, then the anchors nested in table cells.
    const pageTitle = query('title').first().text();
    const anchorTag = query(linkSelector);

    const result = { url: request.url, pageTitle, anchorTag };
    return result;
}

在我的控制台中,我希望在名为 anchorTag 的属性中看到各个链接的 href 值。我还希望在名为 pageTitle 的属性中看到页面标题,并在 url 属性中看到页面 URL。如下:

我期望看到的:
{
  "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
  "pageTitle": "List of hedge funds - Wikipedia",
  "anchorTag": {
    "0": "http://example0.com", // each instance of "http://example.com" represents a unique url on the target page to be scraped
    "1": "http://example1.com",
    "2": "http://example2.com",
    "3": "http://example3.com",
    ...
    "39": "http://example39.com",
}}

但是,actor 返回的不是 URL 列表,而是以下数据集:

我实际看到的:
[{
  "url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
  "pageTitle": "List of hedge funds - Wikipedia",
  "anchorTag": {
    "0": {},
    "1": {},
    "2": {},
    "3": {},
    "4": {},
    "5": {},
    "6": {},
    "7": {},
    "8": {},
    "9": {},
    "10": {},
    "11": {},
    "12": {},
    "13": {},
    "14": {},
    "15": {},
    "16": {},
    "17": {},
    "18": {},
    "19": {},
    "20": {},
    "21": {},
    "22": {},
    "23": {},
    "24": {},
    "25": {},
    "26": {},
    "27": {},
    "28": {},
    "29": {},
    "30": {},
    "31": {},
    "32": {},
    "33": {},
    "34": {},
    "35": {},
    "36": {},
    "37": {},
    "38": {},
    "39": {},
    "length": 40,
    "prevObject": {
      "0": {
        "location": {
          "href": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
          "ancestorOrigins": {},
          "origin": "https://en.wikipedia.org",
          "protocol": "https:",
          "host": "en.wikipedia.org",
          "hostname": "en.wikipedia.org",
          "port": "",
          "pathname": "/wiki/List_of_hedge_funds",
          "search": "",
          "hash": "",
          "assign": {},
          "reload": {},
          "toString": {},
          "replace": {}
        },
        "write": {},
        "writeln": {},
        "jQuery3410461525655351679551": {
          "events": {
            "mmv-setup-overlay": [
              {
                "type": "mmv-setup-overlay",
                "origType": "mmv-setup-overlay",
                "handler": {
                  "guid": 21
                },
                "guid": 21,
                "namespace": ""
              }
            ],
            "mmv-cleanup-overlay": [
              {
                "type": "mmv-cleanup-overlay",
                "origType": "mmv-cleanup-overlay",
                "handler": {
                  "guid": 22
                },
                "guid": 22,
                "namespace": ""
              }
            ],
            "keyup": [
              {
                "type": "keyup",
                "origType": "keyup",
                "handler": {
                  "guid": 24
                },
                "guid": 24,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "mouseover": [
              {
                "type": "mouseover",
                "origType": "mouseover",
                "handler": {
                  "guid": 24
                },
                "guid": 24,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "focusout": [
              {
                "type": "focusout",
                "origType": "blur",
                "handler": {
                  "guid": 25
                },
                "guid": 25,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "mouseout": [
              {
                "type": "mouseout",
                "origType": "mouseout",
                "handler": {
                  "guid": 25
                },
                "guid": 25,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ],
            "click": [
              {
                "type": "click",
                "origType": "click",
                "handler": {
                  "guid": 26
                },
                "guid": 26,
                "selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
                "needsContext": false,
                "namespace": ""
              }
            ]
          },
          "handle": {},
          "focusin": 1,
          "focusout": 1
        }
      },
      "length": 1
    }
  }
}]

我究竟做错了什么?

4

1 回答 1

2

您必须访问 a 标签的 href 属性才能获取 URL。此外,您需要遍历所有 a 标签,将它们的链接放入一个数组中。

// ...
const anchorTag = $( cssSelector );
const links = [];

// anchorTag in a JQuery handle, not a normal JavaScript value so it has special JQuery methods
anchorTag.each((index, el) => {
    const link = $(el).attr('href');
    if (link) {
         links.push(link);
    }
})

return {
   url: context.request.url,
   pageTitle,
   links,
};

于 2020-02-28T14:30:52.267 回答