0

如何从 PhantomJS 获取最新的页面数据(HTML 和 Javascript 变量)

例如 page.refresh() 什么的?

我设置了一个定时器(interval),每 200 毫秒检查一次(页面上的)某个变量。但是,这个变量和页面内容看起来都没有随时间变化(尽管我知道它们实际上已经变了)。

所以我需要一种有效的方法来每隔 200 毫秒左右检查一次 JS 变量的值,

然后,一旦我发现该变量已更改值,我想请求最新的页面 HTML。

我怎样才能做到这一点?

/**
 * Minimal error record carried inside a DTO.
 *
 * Note: this binding shadows the built-in `Error` for the rest of the script,
 * so instances only have a `description` property (no `message` or `stack`).
 *
 * @param {string} description - Human-readable explanation of the failure.
 */
var Error = function (description) {
    var instance = this;
    instance.description = description;
    return instance;
};

/**
 * Result envelope that is serialized to JSON and printed by the script.
 *
 * @param {boolean} status - Outcome flag supplied by the caller.
 * @param {*} content - Payload supplied by the caller.
 * @param {*} error - Error information supplied by the caller.
 */
var DTO = function (status, content, error) {
    var dto = this;
    dto.status = status;
    dto.content = content;
    dto.error = error;
    return dto;
};

/**
 * Prints the given DTO as a single JSON line and then asks PhantomJS to exit.
 *
 * @param {Object} dto - Value to serialize with JSON.stringify.
 */
function outputAndExit(dto) {
    var serialized = JSON.stringify(dto);
    console.log(serialized);
    phantom.exit();
}

// For any uncaught exception, report it as a failed DTO so the .NET host can capture it.
// NOTE(review): PhantomJS documents phantom.onError for errors in the controller
// script — confirm that window.onerror actually fires in this context.
window.onerror = function (errorMsg, url, lineNumber) {
    // Separate the fields with ", " so the message does not run together.
    var description = 'window.onerror caught an error: ' +
        'errorMsg: ' + errorMsg +
        ', url: ' + url +
        ', lineNumber: ' + lineNumber;
    outputAndExit(new DTO(false, null, new Error(description)));
};

/**
 * Factory: builds a GetDynamicPageResult, runs its initialize() step and
 * hands the initialized instance back.
 *
 * @returns {GetDynamicPageResult} The initialized instance.
 */
var GetDynamicPageResult__ = function () {
    var instance = new GetDynamicPageResult();
    instance.initialize();
    return instance;
};

/**
 * Opens the URL given on the command line in a PhantomJS page, polls the page
 * until it sets window.isContentReadyForCrawler to a truthy value, and then
 * reports the page HTML through outputAndExit(). If the flag is not set within
 * tickerTimeout milliseconds, a failure DTO is reported instead.
 */
var GetDynamicPageResult = function () {
    var self = this;

    /**
     * Resets state, loads PhantomJS modules, reads the URL argument and
     * starts loading the page.
     */
    this.initialize = function () {

        this.error = null;
        this.isContentReadyForCrawler = false;

        this.ticker = null;
        this.tickerInterval = 150;  // polling period passed to setInterval (ms)
        this.tickerElapsed = 0;     // accumulated polling time (ms)
        this.tickerTimeout = 7000;  // give up once tickerElapsed exceeds this (ms)

        this.url = '';

        this.loadDependencies();
        // Do not try to open a page when no URL was supplied.
        if (!this.processArgs()) {
            return;
        }

        this.openPage();

    };

    /** Loads the PhantomJS modules used by this object and creates the page. */
    this.loadDependencies = function () {
        this.system = require('system');
        this.page = require('webpage').create();
        // NOTE(review): this runs before page.open(), so the script is injected
        // into the initial blank document; it presumably does not survive the
        // navigation — confirm whether it should be injected after load instead.
        this.page.injectJs('jquery-1.10.2.min');
        this.fs = require('fs');
    };

    /**
     * Reads the target URL from the command-line arguments.
     *
     * @returns {boolean} true when a URL argument was present, false when a
     *     failure DTO was reported instead.
     */
    this.processArgs = function () {
        // system.args[0] is the name of this script, so a URL requires length >= 2.
        if (this.system.args.length < 2) {
            outputAndExit(new DTO(false, null, new Error('No arguments given')));
            return false;
        }
        this.url = this.system.args[1];
        return true;
    };

    /**
     * Copies window.isContentReadyForCrawler from the page into this object.
     */
    this.updateIsContentReadyForCrawler = function () {
        // The callback runs inside the page, where `self` is the page's window,
        // not this object. The value therefore has to be *returned* from
        // evaluate() and assigned out here.
        var readyInPage = self.page.evaluate(function () {
            return window.isContentReadyForCrawler;
        });
        self.isContentReadyForCrawler = Boolean(readyInPage);
    };

    /** Opens this.url and starts polling once the page has loaded. */
    this.openPage = function () {
        self.page.open(this.url, function (status) { //NB: status = 'success' || 'fail'
            if (status !== 'success') {
                outputAndExit(new DTO(false, null, new Error('page.open received a non-success status')));
                // outputAndExit() is an ordinary call; stop here so the ticker
                // is not started for a page that failed to load.
                return;
            }
            self.initTicker();
        });
    };

    /** Starts the polling timer. */
    this.initTicker = function () {
        this.ticker = setInterval(self.handleTick, self.tickerInterval);
    };

    /**
     * One polling step: refreshes the readiness flag, then either finishes
     * with the page content, finishes with a timeout error, or keeps waiting.
     */
    this.handleTick = function () {
        self.tickerElapsed += self.tickerInterval;
        self.updateIsContentReadyForCrawler();

        if (self.isContentReadyForCrawler) {
            clearInterval(self.ticker);
            self.finish(true, self.page.content, null);
            return;
        }

        if (self.tickerElapsed > self.tickerTimeout) {
            clearInterval(self.ticker);
            self.finish(false, null, new Error('Too much time elapsed'));
        }
    };

    /**
     * Reports the final result.
     *
     * @param {boolean} status - Outcome flag for the DTO.
     * @param {?string} content - Page HTML; falsy values become ''.
     * @param {?Object} error - Error record; falsy values become {}.
     */
    this.finish = function (status, content, error) {
        content = content || '';
        error = error || {};
        outputAndExit(new DTO(status, content, error));
    };
};

/**********************************************************************************/
/***************************** Helpers *****************************/
/**********************************************************************************/

/**
 * Factory: builds a Utility, runs its initialize() step and returns it.
 *
 * @returns {Utility} The initialized instance.
 */
var Utility__ = function () {
    var instance = new Utility();
    instance.initialize();
    return instance;
};

/**
 * Small collection of emptiness checks.
 */
var Utility = function () {
    var self = this;

    /** No setup is required; kept so the factory can call it. */
    this.initialize = function () {
    };

    /**
     * @param {*} obj - Any value.
     * @returns {boolean} true when obj is null or undefined.
     */
    this.isEmpty = function (obj) {
        return obj === undefined || obj === null;
    };

    /**
     * @param {*} str - Value to test; non-strings are converted with String().
     * @returns {boolean} true when str is null, undefined, or contains only
     *     whitespace after conversion to a string.
     */
    this.isStringEmpty = function (str) {
        // Call the method through `self`; a local boolean with the same name
        // must not shadow it.
        if (self.isEmpty(str)) {
            return true;
        }
        // Trim with a plain regex so this check does not depend on jQuery
        // being available in the scope where the script runs.
        var trimmed = String(str).replace(/^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g, '');
        return trimmed === '';
    };
};

// Entry point: build and initialize the crawler helper. The factory returns
// the instance it constructs, so it is invoked here as a plain call.
var getDynamicPageResult = GetDynamicPageResult__();
4

1 回答 1

2

我想你已经很接近了:你需要使用 page.evaluate(),但目前你只用它来获取 window.isContentReadyForCrawler。你还需要用 page.evaluate() 来获取最新的 HTML。

我将无耻地粘贴另一个答案的代码(https://stackoverflow.com/a/12044474/841830):

// Ask the page itself for its current markup: the callback runs inside the
// page and its return value is handed back to the calling script.
var html = page.evaluate(function () {
    var htmlElements = document.getElementsByTagName("html");
    var rootElement = htmlElements[0];
    if (rootElement) {
        return rootElement.outerHTML;
    }
    // No <html> element found: fall back to the body's inner markup.
    return document.body.innerHTML;
});
于 2013-11-11T02:47:09.840 回答