我想在 Node.js 中制作一个简单的网络爬虫,它将从已登录成员访问的 URL 中提取标题和描述。成员首先登录,然后如果他想爬取……这将通过 “CreatedBy” 字段来完成。下面是我的代码,dal/url.js:
var mysql = require('mysql-libmysqlclient');
var cheerio = require('cheerio');
var async = require('async');
var db = require('mysql');
var generator = require('../bal/urlgenerator');
var crawler = require('../bal/url');
var cutil = require('../util/crawler_utils');
var request = require('request');
var fs = require("fs");
var path = require("path");
// Base address that the short code is appended to when building a ShortURL.
var domain = "http://www.pin.tl/" ;

// Load and parse ../config.json (resolved relative to this file) once at
// module load. readFileSync throws on a missing/unreadable file, so the only
// case that needs an explicit guard is a file with no content, which would
// otherwise surface as an opaque JSON.parse syntax error.
var configs = fs.readFileSync(path.resolve(__dirname, "../config.json"), "utf8");
if (!configs.trim()) {
    throw new Error("config.json is empty");
}
configs = JSON.parse(configs);
exports.insert = function (req,res) {
var surl = generator.generateURL().toString();
db.db_connect(function (err, con) {
if (err) {
throw err;
}
else {
var custom = req.body.curl;
if(!custom=="")
{
var insertQry = "INSERT INTO URL(idURL, URL, Name, ShortURL, CreatedBy, CreatedDate, Code, idCategoryURL, IP) values(" +
"uuid(),'" + req.body.url + "', '' , '"+ domain+custom +"', '"+req.params.userid+"', Now(), '"+custom+"', '15e90e46-ed13-11e2-8bca-74867a028220', '"+req.connection.remoteAddress+"')";
console.log(insertQry)
con.query(insertQry, function (err, msg) {
if (err) {
throw err;
} else {
console.log("Data successfully inserted");
}
});
}
else {
var insertQry = "INSERT INTO URL(idURL, URL, Name, ShortURL, CreatedBy, CreatedDate, Code, idCategoryURL, IP) values(" +
"uuid(),'" + req.body.url + "', '' , '"+ domain+surl +"', '"+req.params.userid+"', Now(), '"+surl+"', '15e90e46-ed13-11e2-8bca-74867a028220', '"+req.connection.remoteAddress+"')";
console.log(insertQry)
con.query(insertQry, function (err, msg) {
if (err) {
throw err;
} else {
console.log("Data successfully inserted");
res.send(domain+surl);
res.send(req.body.FullName);
}
});
}
}
});
}
exports.selectAll = function (req, res) {
db.db_connect(function (err, con) {
if (err) {
throw err;
}
else {
var select ="select * from URL where CreatedBy = '"+req.params.id+"'";
con.query(select, function (err, msg) {
if (err) {
throw err;
}
else {
console.log("Select All from URL");
msg.fetchAll(function(err, rows){
if(err){
throw err;
}
else{
res.json(rows);
}
Crawler = function(){
var self = this;
this.conn = db.createConnection(config.get('db'));
this.indexed = 0;
this._url = select;
console.log(select);
this.url = select;
this.crawl = function(cb){
this.conn.query('SELECT * FROM `queue` LIMIT 0,1',function(e,result){
self.url = result.length > 0 ? result[0].url : select;
request(self.url,function(e,res,body){
if(result.length > 0){
self.conn.query('DELETE FROM `queue` WHERE `id` = ?',[result[0].id],function(){
cb();
});
}
else {
cb();
}
if(!e && res.statusCode === 200){
self.getInfo(body,result.length > 0 ? result[0].from : '');
}
else {
console.log('Error requesting page %s',self.url);
}
self._url = self.url;
});
});
};
this.getInfo = function(html,from){
var $ = cheerio.load(html);
var title = $('head title').text();
var keywords = $('head meta[name=keywords]').attr('content');
var desc = $('head meta[name=description]').attr('content');
var links = $('a');
console.log('Crawling "%s" | %s',title,this.url);
async.map(links.map(function(){
var href = $(this).attr('href');
if(href && href != self._url && !(/^#(\w)+/.test(href)) && !cutil.imageRegexp.test(href)){
if(cutil.isExternal(href)){
return 'INSERT INTO `queue` SET `id` = ``\''+cutil.id()+'\', `url` = '+self.conn.escape(href)+', `from` = '+self.conn.escape(from);
}
else {
return 'INSERT INTO `queue` SET `id` = \''+cutil.id()+'\', `url` = '+self.conn.escape(cutil.resolveRelativeURL(href,self._url))+', `from` = '+self.conn.escape(from);
}
}
return false;
}).filter(function(el){
return !!el;
})
,this.conn.query.bind(this.conn),function(e,result){
if(e){
console.log('Error writing queue.');
console.log(e);
}
});
this.conn.query('INSERT INTO `URL` SET ?',{
id:cutil.id(),
url:this.url,
from:from,
title:title,
keywords:keywords || '',
desc:desc || ''
},function(e){
if(e){
console.log('Error indexing page %s',self.url);
console.log(e);
}
else {
console.log('Successfully indexed page %s',self.url);
self.indexed++;
}
});
};
};
});
}
});
}
});
}