我有一套通用的 PhantomJS 脚本,用于抓取信息流(feed)页面(我们是在网站所有者许可的情况下为客户执行此操作的)——只需给它一个 URL、一段用于翻页的 jQuery/JavaScript 代码,以及一个用于从页面中选取链接的选择器。
除了翻页按钮(参见图片)之外,这个页面的其他内容似乎都能正常加载。
由 PhantomJS 渲染:
在我的电脑上由 Chrome 渲染:
这个问题已经困扰我一天多了。
非常感谢任何帮助。
我的代码:
// The single headless page used for the whole run. It is configured with a
// desktop Chrome user agent string, plugin loading switched on, the
// XSSAuditingEnabled and webSecurityEnabled settings switched off, and a
// 1366x768 viewport.
var page = new WebPage({
    settings: {
        loadPlugins: true,
        userAgent : "Mozilla/5.0 (X11; Linux i686) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 Safari/536.5",
        XSSAuditingEnabled: false,
        webSecurityEnabled: false
    },
    viewportSize: { width: 1366, height: 768 }
});

// Result envelope printed at the end of a run: `errors` collects messages
// reported while scraping, `results` holds the value returned from the page.
var output_phantom = {errors: [], results: null};
// Expression evaluated inside the page to move to the next page of results.
var turn_page_jquery = "$('#yui-pg0-0-next-link').click_link()";

// Expression evaluated inside the page to read the links of the current page.
var url_selector_jquery = "$('h3 a').multiAttr('href')";

// First page of the listing to scrape.
// NOTE(review): the fragment ends in "%22" (an encoded double quote), which
// looks like a copy/paste artifact - confirm it is intended.
var url = "http://www.springcjd.com/new/search?dpt=2#QryString=%3FlogSearch%3Dfalse%26sortCol%3Dnull%26sortType%3Dnull%26VIN%3Dnull%26dealerStockID%3Dnull%26stockType%3D2%26year%3Dnull%26make%3Dnull%26model%3Dnull%26subModel%3Dnull%26body%3Dnull%26minMileage%3Dnull%26maxMileage%3Dnull%26numOfDoors%3Dnull%26certified%3Dnull%26minDFList%3Dnull%26maxDFList%3Dnull%26onLotAfter%3Dnull%26onLotBefore%3Dnull%26pageNum%3D1%26carsPerPage%3D30%26fullText%3Dnull%26%26mpghmn%3Dnull%26%26lotID%3Dnull%26daysOnLotMin%3Dnull%26%26output%3Djson%22";

// Start things going here
get_links_from_pages(url, url_selector_jquery, turn_page_jquery);
/**
 * Allows you to pass args to a function that runs in the page.
 *
 * Builds the source text of a wrapper function that applies `func` to the
 * JSON-serialized extra arguments, and hands that text to `page.evaluate`.
 * Because the arguments travel as JSON, only JSON-representable values
 * survive the trip.
 *
 * @param {Object} page - object exposing `evaluate`.
 * @param {Function} func - function whose source is embedded in the wrapper.
 * @param {...*} [args] - extra arguments applied to `func` inside the page.
 * @returns {*} whatever `page.evaluate` returns.
 */
function evaluate(page, func) {
    var forwarded = Array.prototype.slice.call(arguments, 2);
    var wrapper_source = [
        "function() { return (",
        func.toString(),
        ").apply(this, ",
        JSON.stringify(forwarded),
        ");}"
    ].join("");
    return page.evaluate(wrapper_source);
}
// Error hook for the page: prints the error message, then one line per stack
// frame in `trace` with that frame's file and line number.
page.onError = function (msg, trace) {
    console.log(msg);
    for (var frame_index = 0; frame_index < trace.length; frame_index++) {
        var frame = trace[frame_index];
        console.log(' ', frame.file, ':', frame.line);
    }
};
// Communicator between phantomJS and the page.
//
// The page talks to this script by console.log-ing a JSON object with a
// `type` and a `message` field. Recognised types:
//   'return_value' - store `message` as the result, print the whole
//                    output_phantom envelope as JSON, then exit
//   'message'      - append `message` to output_phantom.errors
//   'exit'         - exit
//   'render'       - save a screenshot named `message`
//                    (or 'phantom_test.png' when `message` is empty)
// Anything that is not JSON, or is JSON without a truthy `type`, is ordinary
// console output from the page and is recorded verbatim in
// output_phantom.errors. Objects with an unrecognised `type` are ignored.
//
// Parsing is guarded locally so that plain-text console output cannot throw
// out of this handler, whether or not JSON.parse has been replaced by a
// non-throwing variant elsewhere.
// NOTE(review): the `validation` marker that the page-side sender attaches is
// not checked here, so any page log that happens to be JSON with a `type`
// field is treated as a command.
page.onConsoleMessage = function(msg)
{
    var msg_json;
    try {
        msg_json = JSON.parse(msg);
    } catch (err) {
        msg_json = undefined;
    }

    if (!msg_json || !msg_json.type) {
        output_phantom.errors.push(msg);
        return;
    }

    var message = msg_json.message;
    switch (msg_json.type)
    {
        case 'return_value':
            output_phantom.results = message;
            console.log(JSON.stringify(output_phantom));
            phantom.exit();
            break;
        case 'message':
            output_phantom.errors.push(message);
            break;
        case 'exit':
            phantom.exit();
            break;
        case 'render':
            var photo_name = 'phantom_test.png';
            if (message != '') {
                photo_name = message;
            }
            page.render(photo_name);
            break;
    }
};
/**
 * Injects jQuery (when the page has none) and the helper script that lets the
 * page communicate with this script through onConsoleMessage.
 *
 * - If `jQuery` is undefined in the page but `$` is already a function, jQuery
 *   is NOT injected and a notice is logged (would like to be able to overwrite
 *   an older version of jQuery).
 * - If an injection fails, the failure is logged and phantom.exit() is
 *   requested. The function then returns immediately, so that no further
 *   injection is attempted into a run that is already shutting down
 *   (code following phantom.exit() can otherwise still execute).
 */
function inject_scripts()
{
    var jquery_type = page.evaluate(function () { return typeof jQuery; });
    if (jquery_type == 'undefined')
    {
        var dollar_type = page.evaluate(function () { return typeof $; });
        if (dollar_type == 'function') {
            console.log("'$' symbol already used.");
        }
        else if (!page.injectJs("/../../jquery.js")) {
            console.log("jQuery not loaded...");
            phantom.exit();
            return;
        }
    }

    // Inject scripts that allow communication using fn onConsoleMessage
    if (!page.injectJs("/../lf_additional.js")) {
        console.log("Additional scripts not loaded...");
        phantom.exit();
    }
}
/**
 * Opens `url` and, once the load callback fires, injects the helper scripts,
 * resets the output envelope and runs `fn` inside the page.
 *
 * @param {string} url - address to open.
 * @param {Function} fn - function executed inside the page (via `evaluate`).
 * @param {...*} [args] - extra arguments forwarded to `fn`; not named in the
 *   signature, they are read from `arguments`.
 */
function run_phantom(url, fn/* args here, just not seen */)
{
    var extra_args = [].slice.call(arguments, 2);
    page.open(url, function (status) {
        // NOTE: `status` is deliberately not inspected; the original
        // "Unable to load the url!" check was disabled by the author.
        inject_scripts();
        output_phantom = {errors: [], results: null};
        // run our js code inside the headless browser.
        // A fresh argument list is built on every invocation: mutating the
        // captured `extra_args` in place would prepend `page, fn` again each
        // time this callback runs.
        evaluate.apply(null, [page, fn].concat(extra_args));
    });
}
/**
 * Scrapes link data from the first four pages of a paginated listing.
 *
 * @param {string} url - address of the first page.
 * @param {string} url_selector_jquery - JavaScript source, eval'd in the page,
 *   whose value is collected once per page.
 * @param {string} turn_page_jquery - JavaScript source, eval'd in the page,
 *   that advances to the next page.
 * @returns whatever run_phantom returns (it has no return statement, so
 *   undefined). The collected data is delivered through phantom_return.
 */
function get_links_from_pages(url, url_selector_jquery, turn_page_jquery)
{
// `fn` is not called here: it is handed to run_phantom, which passes it on to
// be executed inside the page. It therefore relies on phantom_render and
// phantom_return being available in the page (they come from the injected
// helper script).
// NOTE: both strings are run with direct eval, so the evaluated code can see
// this function's local names (results, i, interval, selected_data).
var fn = function(url_selector_jquery, turn_page_jquery)
{
var results = [];
var i = 0;
// One tick every 3000 ms; each tick handles one page.
var interval = setInterval(function()
{
// Photograph page right before selecting values
phantom_render('ph_scjd_'+i+'.png');
var selected_data = eval(url_selector_jquery);
// Keep one entry per page, in visiting order
results.push(selected_data);
// Try to turn the page
// (this also runs on the final tick, just before the results are returned)
eval(turn_page_jquery);
// Get the first 4 pages
if(i >= 3) {
phantom_return(results);
clearInterval(interval);
}
i++;
}, 3000);
};
return run_phantom(url, fn, url_selector_jquery, turn_page_jquery);
}
//////////////////////////////////////////////////////////
// Other stuff
// Improved json parsing
//
// Replaces the global JSON object with a non-throwing variant:
//   JSON.stringify(...)       - the native implementation, unchanged
//   JSON.parse(str, reviver)  - the parsed value, or undefined when parsing
//                               (or the reviver) throws; the optional
//                               standard `reviver` is forwarded to the
//                               native parser
//   JSON.validate(str)        - true when `str` parses; otherwise the caught
//                               error object. NOTE: that error object is
//                               truthy, so compare the result with `=== true`
//                               rather than testing its truthiness.
(function() {
    var native_parse = JSON.parse;
    var native_stringify = JSON.stringify;

    JSON = {
        stringify: native_stringify,
        validate: function(str) {
            try {
                native_parse(str);
                return true;
            } catch(err){
                return err;
            }
        },
        parse: function(str, reviver) {
            try {
                return native_parse(str, reviver);
            } catch(err){
                return undefined;
            }
        }
    };
})();
注入的文件“additional.js”(即上面代码中通过 injectJs 注入的 lf_additional.js):
/*
 * These are additional scripts intended to make selection of multiple elements
 * much more concise.
 */

/**
 * Dispatches a synthetic, bubbling and cancelable mouse "click" event on every
 * element of the matched set.
 *
 * The event goes to the matched elements themselves instead of re-querying
 * the document with the jQuery `.selector` property: that property is absent
 * from newer jQuery versions and, where present, may not be a selector that
 * document.querySelectorAll accepts. Each element receives its own event
 * object.
 *
 * @returns {jQuery} the same jQuery object, so calls can be chained.
 */
$.fn.click_link = function() {
    return this.each(function() {
        var click_event = document.createEvent('MouseEvents');
        click_event.initMouseEvent("click", true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
        this.dispatchEvent(click_event);
    });
};
/**
 * Builds an array with one value per matched element.
 *
 * @param {Function|string} fn - either a function, called with `this` set to
 *   a jQuery object wrapping the single element, whose return value is
 *   collected; or an attribute name, in which case the element's `.attr()`
 *   value for that name is collected.
 * @returns {Array} the collected values, in matched-set order.
 */
$.fn.collect = function(fn) {
    var getter = fn;
    if (typeof fn == 'string') {
        getter = function() { return this.attr(fn); };
    }

    var values = [];
    for (var idx = 0; idx < this.length; idx++) {
        values.push(getter.call($(this[idx])));
    }
    return values;
};
/**
 * Reads one attribute from every matched element.
 *
 * @param {string} attrName - attribute name handed to `collect`.
 * @returns {Array} the result of `collect` for that attribute name.
 */
$.fn.multiAttr = function(attrName) {
    var attr_values = this.collect(attrName);
    return attr_values;
};
/**
 * Reads `.html()` from every matched element, one array entry per element.
 * (.text() should be pretty close, except concatenated?)
 *
 * @returns {Array} the result of `collect` with an `.html()` getter.
 */
$.fn.multiHtml = function() {
    return this.collect(function() {
        return this.html();
    });
};
/**
 * Reads the `value` attribute of every matched element via `multiAttr`.
 * NOTE(review): this goes through `.attr('value')`, not `.val()`; depending on
 * the jQuery version that may be the markup value rather than the current
 * field value - confirm which one callers expect.
 *
 * @returns {Array} the result of `multiAttr('value')`.
 */
$.fn.multiVal = function() {
    var value_attrs = this.multiAttr('value');
    return value_attrs;
};
// The commented out code is much more concise, but probably less efficient
// $(arr1).not(arr2).length == 0 && $(arr2).not(arr1).length == 0
jQuery.extend({
    /**
     * Order-insensitive comparison of two arrays.
     *
     * Both inputs are copied, the copies are sorted with the default sort,
     * and the results are compared position by position with `!==`.
     *
     * The copies are shallow on purpose: a deep clone would replace nested
     * objects and arrays with new instances, so `!==` would report a
     * difference even when both arrays hold the very same objects.
     *
     * Reliable for arrays of primitives of a single type; the default sort
     * orders elements by their string form.
     *
     * @param {Array} arrayA - first array (not modified).
     * @param {Array} arrayB - second array (not modified).
     * @returns {boolean} true when both hold the same elements.
     */
    compareArray: function (arrayA, arrayB) {
        if (arrayA.length != arrayB.length) { return false; }
        // sort modifies original array
        // (which are passed by reference to our method!)
        // so copy the arrays before sorting
        var a = Array.prototype.slice.call(arrayA);
        var b = Array.prototype.slice.call(arrayB);
        a.sort();
        b.sort();
        for (var i = 0, l = a.length; i < l; i++) {
            if (a[i] !== b[i]) {
                return false;
            }
        }
        return true;
    }
});
/**
 * Dispatches one synthetic, bubbling and cancelable mouse "click" event on
 * every element in the document that matches `selector`. The same event
 * object is dispatched to each match in turn.
 *
 * @param {string} selector - selector passed to document.querySelectorAll.
 */
function simulateMouseClick(selector) {
    var matches = document.querySelectorAll(selector);
    var click_event = document.createEvent('MouseEvents');
    click_event.initMouseEvent("click", true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
    Array.prototype.forEach.call(matches, function (node) {
        node.dispatchEvent(click_event);
    });
}
/**
 * Sends a command to the controlling script by console.log-ing a JSON
 * envelope of the form
 *   {"message": ..., "type": ..., "validation": "phantom_js_communicator"}
 *
 * @param {string} type - command name.
 * @param {*} [message] - payload. Only a missing payload (undefined or null)
 *   is replaced by the empty string; other falsy values such as 0 or false
 *   are sent unchanged, so they are not lost on the way back.
 */
function send_console_command(type, message)
{
    var payload = (message === undefined || message === null) ? '' : message;
    var msg = {
        message: payload,
        type: type,
        validation: 'phantom_js_communicator'
    };
    console.log(JSON.stringify(msg));
}
/** Sends the 'exit' command (no payload). */
function phantom_exit()
{
    send_console_command('exit');
}

/**
 * Sends a 'message' command.
 * @param {*} msg - payload of the command.
 */
function phantom_message(msg)
{
    send_console_command('message', msg);
}

/**
 * Sends a 'return_value' command.
 * @param {*} return_val - payload of the command.
 */
function phantom_return(return_val)
{
    send_console_command('return_value', return_val);
}

/**
 * Sends a 'render' command.
 * @param {string} photo_name - payload of the command.
 */
function phantom_render(photo_name)
{
    send_console_command('render', photo_name);
}