1

我为一个网站写了一个勉强能用的抓取脚本:

/**
 * Page function for an Apify scraper run: expands the listing by clicking the
 * "Show more" button up to five times, then collects every loaded card.
 *
 * After each click it waits until the number of cards has grown, so the
 * results are not read before the freshly requested items are rendered.
 *
 * @param {object} context - Scraper context; only `log`, `jQuery` and
 *     `waitFor` are used.
 * @returns {Promise<Array<{title: string, date: string}>>} One entry per
 *     `.thing-card`; `date` is the date text with all whitespace removed.
 */
async function pageFunction(context) {
    const {
        log,
        jQuery: $,
        waitFor
    } = context;

    const buttonSelector = 'div.pagination-view-more';
    const itemSelector = '.thing-card';
    const maxClicks = 5;
    // Upper bound for new cards to show up after a click.
    const loadTimeoutMillis = 5000;

    log.info('Pagination');
    let timeoutMillis; // undefined => default timeout for the first wait

    // Click the "Show more" button at most `maxClicks` times.
    for (let step = 0; step < maxClicks; step++) {
        log.info('Waiting for the "Show more" button.');
        try {
            await waitFor(buttonSelector, {
                timeoutMillis
            });
            timeoutMillis = 5000; // 5 s timeout after the first wait.
        } catch (err) {
            // A timeout here means there is no further page to load.
            log.info('Could not find the "Show more button", we\'ve reached the end.');
            break;
        }

        const itemCountBeforeClick = $(itemSelector).length;
        log.info('Clicking the "Show more" button.');
        $(buttonSelector).click();

        // The click only triggers loading. Without this wait the final batch
        // would still be missing when the results are collected below.
        // NOTE(review): relies on waitFor accepting a predicate function and
        // rejecting on timeout, as documented for Apify Web Scraper.
        try {
            await waitFor(() => $(itemSelector).length > itemCountBeforeClick, {
                timeoutMillis: loadTimeoutMillis
            });
        } catch (err) {
            log.info('No new items appeared after the click, stopping pagination.');
            break;
        }
    }

    // Export the results.
    const result = [];
    $(itemSelector).each(function() {
        result.push({
            title: $(this).attr('title'),
            // Source format is e.g. "Dec 15, 2019"; whitespace is stripped.
            date: $(this).find('.item-header .item-date').text().replace(/\s/g, ''),
        });
    });
    return result;
}

在上面的示例中,我点击了 5 次“显示更多”按钮,然后尝试把标题和日期作为结果导出。问题是我没有拿到全部结果,我认为脚本结束得比预期要早。

在最终脚本中,我想去掉固定次数的 for 循环,改为一直循环,直到结果的日期早于今天往前 7 天(即一周)为止。用 Apify 能做到吗?

4

1 回答 1

0

我觉得你已经差不多做到了。Apify 本身没有任何限制,因为你可以编写任何想要的代码 :) 所以这更像是一个通用的 JS 问题,而不是 Apify 特有的问题。

您可以检查最后一项的日期,而不是固定循环(我假设这些项目是从最近的一项开始排序的)。

下面这样的代码应该可以做到,当然你还可以再对它做些微调。

/**
 * Page function for an Apify scraper run: keeps clicking the "Show more"
 * button until the last loaded card is older than one week, the button is
 * gone, or a click yields no new cards. Then it collects every loaded card.
 *
 * Cards are assumed to be sorted newest first, so only the last card's date
 * is checked.
 *
 * @param {object} context - Scraper context; only `log`, `jQuery` and
 *     `waitFor` are used.
 * @returns {Promise<Array<{title: string, date: string}>>} One entry per
 *     `.thing-card`; `date` is the date text with all whitespace removed.
 */
async function pageFunction(context) {
    const {
        log,
        jQuery: $,
        waitFor
    } = context;

    const buttonSelector = 'div.pagination-view-more';
    const itemSelector = '.thing-card';
    const dateSelector = '.item-header .item-date';
    // Upper bound for new cards to show up after a click.
    const loadTimeoutMillis = 5000;
    const monthAbbreviations = [
        'jan', 'feb', 'mar', 'apr', 'may', 'jun',
        'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    ];

    // Helper is nested so that pageFunction stays a single self-contained
    // function (no dependency on anything defined outside of it).
    // Parses "Dec 15, 2019"-style text explicitly instead of relying on the
    // engine-specific parsing of a whitespace-stripped string. Any other
    // format falls back to the built-in Date parser on the normalized text.
    const parseItemDate = (rawText) => {
        const text = rawText.replace(/\s+/g, ' ').trim();
        const match = /^([A-Za-z]{3})[A-Za-z]*\.? ?(\d{1,2})[, ] ?(\d{4})$/.exec(text);
        if (match) {
            const monthIndex = monthAbbreviations.indexOf(match[1].toLowerCase());
            if (monthIndex !== -1) {
                // Local midnight of that day.
                return new Date(Number(match[3]), monthIndex, Number(match[2]));
            }
        }
        return new Date(text);
    };

    log.info('Pagination');
    const weekAgo = new Date(Date.now() - 1000 * 3600 * 24 * 7);

    // Every exit from the loop is logged, so the reason for stopping is visible.
    while (true) {
        const lastDateText = $(itemSelector).last().find(dateSelector).text();
        const lastItemDate = parseItemDate(lastDateText);

        // Comparisons against an Invalid Date are always false, so handle it
        // explicitly instead of letting it silently decide the control flow.
        if (Number.isNaN(lastItemDate.getTime())) {
            log.info(`Could not parse the last item date ("${lastDateText.trim()}"), stopping pagination.`);
            break;
        }
        if (lastItemDate < weekAgo) {
            log.info(`Last item date is older than a week, exiting the loop: ${lastItemDate}`);
            break;
        }

        log.info('Waiting for the "Show more" button.');
        try {
            await waitFor(buttonSelector);
        } catch (err) {
            // A timeout here means there is no further page to load.
            log.info('Could not find the "Show more button", we\'ve reached the end.');
            break;
        }

        const itemCountBeforeClick = $(itemSelector).length;
        log.info('Clicking the "Show more" button.');
        $(buttonSelector).click();

        // Wait until new cards are rendered rather than sleeping a fixed time.
        // A timeout means the click produced nothing new, which also guards
        // against an endless loop.
        // NOTE(review): relies on waitFor accepting a predicate function and
        // rejecting on timeout, as documented for Apify Web Scraper.
        try {
            await waitFor(() => $(itemSelector).length > itemCountBeforeClick, {
                timeoutMillis: loadTimeoutMillis
            });
        } catch (err) {
            log.info('No new items, exiting the loop...');
            break;
        }
    }

    // Export the results.
    const result = [];
    $(itemSelector).each(function() {
        result.push({
            title: $(this).attr('title'),
            // Source format is e.g. "Dec 15, 2019"; whitespace is stripped.
            date: $(this).find(dateSelector).text().replace(/\s/g, ''),
        });
    });
    return result;
}
于 2020-01-02T07:57:57.137 回答