0

我有一个用于抓取提要的通用 PhantomJS 设置(我们在所有者的许可下为客户执行此操作)——你给它一个 URL、用于翻页的 jQuery/javascript 代码和一个从提要中选择链接的选择器。

除了翻页的按钮 [参见图片] 之外,此提要似乎可以正常加载所有内容。

由 PhantomJS 渲染:

PhantomJS 照片缺少“翻页”按钮

在我的电脑上由 Chrome 渲染:

在此处输入图像描述

我被难住了一天多。

非常感谢任何帮助。

我的代码:

var page = new WebPage({
                  settings: {
                    loadPlugins: true,
                    userAgent : "Mozilla/5.0 (X11; Linux i686) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 Safari/536.5",
                    XSSAuditingEnabled: false,
                    webSecurityEnabled: false
                  },
                  viewportSize: { width: 1366, height: 768 }
               }),
    output_phantom = {errors: [], results: null};

var turn_page_jquery    = "$('#yui-pg0-0-next-link').click_link()",
    url_selector_jquery = "$('h3 a').multiAttr('href')",
    url                 = "http://www.springcjd.com/new/search?dpt=2#QryString=%3FlogSearch%3Dfalse%26sortCol%3Dnull%26sortType%3Dnull%26VIN%3Dnull%26dealerStockID%3Dnull%26stockType%3D2%26year%3Dnull%26make%3Dnull%26model%3Dnull%26subModel%3Dnull%26body%3Dnull%26minMileage%3Dnull%26maxMileage%3Dnull%26numOfDoors%3Dnull%26certified%3Dnull%26minDFList%3Dnull%26maxDFList%3Dnull%26onLotAfter%3Dnull%26onLotBefore%3Dnull%26pageNum%3D1%26carsPerPage%3D30%26fullText%3Dnull%26%26mpghmn%3Dnull%26%26lotID%3Dnull%26daysOnLotMin%3Dnull%26%26output%3Djson%22";

// Start things going here
get_links_from_pages(url, url_selector_jquery, turn_page_jquery);

// Allows you to pass args to function in page
function evaluate(page, func) {
    var args = [].slice.call(arguments, 2);
    var fn   = "function() { return (" + func.toString() + ").apply(this, " + JSON.stringify(args) + ");}";
    return page.evaluate(fn);
}

page.onError = function (msg, trace) {
    console.log(msg);
    trace.forEach(function(item) {
        console.log('  ', item.file, ':', item.line);
    });
};

// Communicator between phantomJS and the page.
page.onConsoleMessage = function(msg) 
{
    var msg_json = JSON.parse(msg);

    if(msg_json && msg_json.type)
    {
        var type    = msg_json.type;
        var message = msg_json.message;

        switch(type)
        {
            case 'return_value':
                output_phantom.results = message;
                console.log(JSON.stringify(output_phantom)); 
                phantom.exit(); 
                break;
            case 'message':
                output_phantom.errors.push(message);
                break;
            case 'exit': 
                phantom.exit();
                break;
            case 'render': 
                var photo_name = 'phantom_test.png';
                if(message != '') 
                    photo_name = message;
                page.render(photo_name);
                break;
        }
    }
    else
       output_phantom.errors.push(msg);
};

function inject_scripts()
{
    // Inject jquery and our additional script if they don't exist
    // Would like to be able to overwrite an older version of jQuery
    if(page.evaluate(function () { return typeof jQuery;}) == 'undefined')
    {
        if(page.evaluate(function () { return typeof $;}) == 'function') {
            console.log("'$' symbol already used.");
        }
        else
        {
            if (!page.injectJs("/../../jquery.js")) {
                console.log("jQuery not loaded...");
                phantom.exit();                
            }
        }
    } 

    // Inject scripts that allow communication using fn onConsoleMessage
    if (!page.injectJs("/../lf_additional.js")) {
        console.log("Additional scripts not loaded...");
        phantom.exit();                
    }
}

function run_phantom(url, fn/* args here, just not seen */)
{
    var extra_args = [].slice.call(arguments, 2);

    page.open(url, function (status) {
        // if (status !== 'success') {
        //     console.log('Unable to load the url! (URL: '+url+')');
        //     phantom.exit();
        // }
        // else 
        // {
            inject_scripts();
            output_phantom = {errors: [], results: null}; 

            // run our js code inside the headless browser.
            extra_args.unshift(page, fn);
            evaluate.apply(this, extra_args);
        // }
    });
}

function get_links_from_pages(url, url_selector_jquery, turn_page_jquery)
{
    var fn = function(url_selector_jquery, turn_page_jquery) 
             {
                var results = [];
                var i = 0; 

                var interval = setInterval(function() 
                    {
                        // Photograph page right before selecting values
                        phantom_render('ph_scjd_'+i+'.png');

                        var selected_data = eval(url_selector_jquery);

                        //
                        results.push(selected_data);

                        // Try to turn the page
                        eval(turn_page_jquery);

                        // Get the first 4 pages
                        if(i >= 3) {
                            phantom_return(results);
                            clearInterval(interval);
                        }
                        i++;

                    }, 3000);
             };
    return run_phantom(url, fn, url_selector_jquery, turn_page_jquery);
}

//////////////////////////////////////////////////////////
// Other stuff

// Improved json parsing
(function() {

    var parse = JSON.parse;

    JSON = {

        stringify: JSON.stringify,

        validate: function(str) {

            try {
                parse(str);
                return true;
            } catch(err){
                return err;
            }
        },

        parse: function(str) {

            try {
                return parse(str);
            } catch(err){
                return undefined;
            }
        }
    }
})();

注入的文件“additional.js”:

/* 
 *  These are additional scripts intended to make selection of multiple elements
 *  much more concise.
 */

$.fn.click_link = function() {
     simulateMouseClick(this.selector);
};

$.fn.collect = function(fn) {
    var values = [];

    if (typeof fn == 'string') {
        var prop = fn;
        fn = function() { return this.attr(prop); };
    }

    $(this).each(function() {
        var val = fn.call($(this));
        values.push(val);
    });
    return values;
};

$.fn.multiAttr = function(attrName) {
    return this.collect(attrName);
};

// .text() should be pretty close, except concatenated?
$.fn.multiHtml = function() {
    var val_array = this.collect(function() { return this.html(); });
    return val_array;
};

$.fn.multiVal = function() {
    return this.multiAttr('value');
};

// The commented out code is much more concise, but probably less efficient
// $(arr1).not(arr2).length == 0 && $(arr2).not(arr1).length == 0
jQuery.extend({
    compareArray: function (arrayA, arrayB) {
        if (arrayA.length != arrayB.length) { return false; }
        // sort modifies original array
        // (which are passed by reference to our method!)
        // so clone the arrays before sorting
        var a = jQuery.extend(true, [], arrayA);
        var b = jQuery.extend(true, [], arrayB);
        a.sort(); 
        b.sort();
        for (var i = 0, l = a.length; i < l; i++) {
            if (a[i] !== b[i]) { 
                return false;
            }
        }
        return true;
    }
});

function simulateMouseClick(selector) { 
    var targets = document.querySelectorAll(selector), 
        evt = document.createEvent('MouseEvents'), 
        i, len; 
    evt.initMouseEvent("click", true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null); 

    for ( i = 0, len = targets.length; i < len; ++i ) { 
        targets[i].dispatchEvent(evt);     
    }
}

function send_console_command(type, message)
{
    var msg = {};

    if(!message) message = '';

    msg.message    = message;
    msg.type       = type;
    msg.validation = 'phantom_js_communicator';

    console.log(JSON.stringify(msg));
}

function phantom_exit() {
    send_console_command('exit');
}

function phantom_message(msg) {
    send_console_command('message', msg);
}

function phantom_return(return_val) {
    send_console_command('return_value', return_val);
}

function phantom_render(photo_name) {
    send_console_command('render', photo_name);
}
4

1 回答 1

0

是不是

var page = require('webpage').create();
page.viewportSize = { width: 1366, height: 768 };

未经测试,只是阅读文档。

于 2012-06-13T14:54:47.450 回答