0

我正打算买一辆新车。与其反复浏览各家经销商的网站,我觉得这是一个学习 Node.js 和 MongoDB 的有趣机会,所以我正在爬取本地经销商的网站,以获取我感兴趣的品牌和型号的信息。

我遇到的问题是:在最终回调执行完之后,Node 进程不会终止。

var cheerio = require('cheerio');
var request = require('request');
var db = require('mongodb');
var S = require('string');
var log = require('console').log;
var async = require('async');

// URLs of detail pages collected by start(); iterated by getDetailInfo().
var links = [];
// Page the crawl begins from.
var website = 'http://www.yahoo.com'; 

// Intended flow: connect to MongoDB, crawl the site, then close the
// connection in the final callback.
async.series(
    [
        // NOTE(review): this task declares no callback parameter, so it has
        // no way to signal async.series that the connection is ready.
        function(){
            log('starting');
            db.connect('mongodb://127.0.0.1:27017/test',
                function(err, base){
                    if(err) throw err;
                    // Rebinds the module-level `db` (initially the mongodb
                    // module) to the value handed to the connect callback.
                    db = base;
                });
        },
        // NOTE(review): this is a call expression, so the request is issued
        // while the array literal is being built, and the array element is
        // request()'s return value rather than a task function.
        request(website, start)
    ],
        // Final callback: meant to run once every task above has finished.
        function(){
            log('closing DB');
            db.close();
    });

/**
 * Callback for the initial page request. Collects detail links from the first
 * page, requests every additional listing page, collects links from those as
 * well, and calls getDetailInfo() once all listing pages have been handled.
 *
 * @param {?Error} err  - error reported by request(), if any
 * @param {Object} resp - response object (unused)
 * @param {string} body - HTML of the first listing page
 */
function start(err, resp, body){
    if (err) {
        // Without a body there is nothing to parse; report and stop here.
        log(err);
        return;
    }
    var $ = cheerio.load(body);
    var numPages = 2;
    $('.gbps').each(function() {
        links.push('http://www.yahoo.com');
    });

    var pageURLS = [];
    for (var i = 2; i <= numPages; i++){
        //create URLs for additional pages
        pageURLS.push(website);
    }
    // The first page has already been processed above.
    var pages = 1;
    log('getting page URLs');
    if (pageURLS.length === 0) {
        // No additional pages: the links gathered so far are the full set.
        getDetailInfo();
        return;
    }
    pageURLS.forEach(function(url){
        request(url, function(error, response, bodies) {
            pages++;
            if (error) {
                // A failed page contributes no links but still counts as
                // handled, so the crawl can move on to the detail pages.
                log(error);
            } else {
                var $page = cheerio.load(bodies);
                $page('.tab').each(function() {
                    links.push('http://www.yahoo.com');
                });
            }
            if (pages === numPages){
                getDetailInfo();
            }
        });
    });
}

/**
 * Logs how many detail links were collected, then issues one request per
 * link, handing each response to doStuff.
 */
function getDetailInfo(){
    var total = links.length;
    log(total);
    for (var n = 0; n < total; n++) {
        request(links[n], doStuff);
    }
}

/**
 * Callback for a detail-page request: parses the page and stores the result.
 *
 * @param {?Error} err      - error reported by request(), if any
 * @param {Object} response - response object, forwarded to the parser
 * @param {string} body     - HTML of the detail page
 */
function doStuff(err, response, body){
    if(err){
        log(err);
        // A failed request has no body to parse; skip it instead of parsing
        // and storing a record built from nothing.
        return;
    }
    parseDetailResponse(err, response, body, addToDB);
}

/**
 * Parses a detail page and passes the extracted record to `callback` as a
 * five-element array: [picture, description, price, specs, makeAndModel].
 * Apart from `specs`, every element is a placeholder value.
 *
 * @param {?Error} err        - unused
 * @param {Object} resp       - unused
 * @param {string} body       - HTML of the detail page
 * @param {Function} callback - receives the record array
 */
function parseDetailResponse(err,resp,body,callback){
    log('parsing');
    var $ = cheerio.load(body);
    var specNodes = $('.specifications').children();
    // One placeholder name/value pair per child of `.specifications`.
    var specs = specNodes.map(function(){
        return { name: 'key', value: 'value' };
    });
    callback(['picture url', 'vehicle description', 100, specs, 'makeAndModel']);
}

/**
 * Builds name/value entries from the `.gbps` elements of a page, labelling
 * them by position: 0 -> 'year', 1 -> 'make', 2 -> 'model'. The element at
 * position 3 keeps its text but is labelled 'ignore'; any later element
 * becomes { name: 'ignore', value: 'ignore' }.
 *
 * @param {string} stuff - HTML to parse
 * @returns {*} the result of mapping the `.gbps` selection to
 *              { name, value } objects
 */
function getMakeAndModel(stuff){
    // Position of the element -> field name it is stored under.
    var fieldNames = ['year', 'make', 'model', 'ignore'];
    var $ = cheerio.load(stuff);
    // Returned directly; the original assigned to an undeclared `temp`, which
    // leaked a global (and is a ReferenceError in strict mode).
    return $('.gbps').map(function(i, elem){
        if (i < fieldNames.length) {
            return { name: fieldNames[i], value: $(this).text() };
        }
        return { name: 'ignore', value: 'ignore' };
    });
}

/**
 * Upserts one vehicle record into the `carsTest` collection, keyed by VIN.
 *
 * @param {Array} arr - [picture, description, price, specs, makeAndModel],
 *        where `specs` and `makeAndModel` are indexable collections of
 *        { name, value } entries that are flattened into a lookup object
 */
function addToDB(arr){
    log('adding to DB');
    // Declared locally: these were implicit globals, shared between
    // concurrent calls and a ReferenceError in strict mode.
    var pic = arr[0];
    var description = arr[1];
    var price = arr[2];
    var specs = arr[3];
    var makeAndModel = arr[4];

    // Flatten the name/value entries into a single lookup object.
    var obj = {};
    var i;
    for (i = specs.length - 1; i >= 0; i--) {
        obj[specs[i].name] = specs[i].value;
    }
    for (i = makeAndModel.length - 1; i >= 0; i--) {
        obj[makeAndModel[i].name] = makeAndModel[i].value;
    }
    db.collection('carsTest').update(
        {VIN: obj.VIN},
        {
            $set: {
                VIN: obj.VIN,
                make: obj.make,
                model: obj.model,
                year: obj.year,
                price: price,
                engine: obj.Engine,
                interior: obj.Interior,
                exterior: obj.Exterior,
                'model code': obj['Model Code'],
                'stock number': S(obj['Stock Number']).toInt(),
                transmission: obj.Transmission,
                mileage: obj.Mileage ? obj.Mileage : 0,
                description: description,
                picture: pic,
            }
        },
        {upsert: true, safe: true},
        function(err,result){
            if(err){
                throw err;
            }
        });
    // Logged right after the update is issued, not when it completes.
    log('finished with this one!');
}

为了做概念验证,我在这里省略并修改了相当多的内容,也没有做太多错误检查之类的处理,但即便是这样的代码,也会把文档写入数据库却不退出。Node 只是停在那里,等待某些事情发生;它从不调用最终回调来关闭数据库并退出。

> db.carsTest.find().pretty()
{
    "_id" : ObjectId("52139aa7c9b7a39e0f1eb61d"),
    "VIN" : null,
    "description" : "vehicle description",
    "engine" : null,
    "exterior" : null,
    "interior" : null,
    "make" : null,
    "mileage" : 0,
    "model" : null,
    "model code" : null,
    "picture" : "picture url",
    "price" : 100,
    "stock number" : NaN,
    "transmission" : null,
    "year" : null
}
4

1 回答 1

2

我认为您误解了 async.series 的工作方式。

传给 async.series 的每个函数都需要接收一个 callback 参数,并在完成时调用它,而您的函数并没有调用它。另外,request(...) 那一项很可能根本不是一个函数。这很可能就是异步流程中断的原因。试试这个:

// Corrected series: every task accepts `callback` and calls it when its own
// asynchronous work has finished, which lets the next step run.
async.series(
    [
        function(callback) { // <--- missing callback
            log('starting');
            db.connect('mongodb://127.0.0.1:27017/test',
                function(err, base){
                    if(err) throw err;
                    db = base;
                    callback(); // <--- missing callback
                });
        },
        function(callback) { // <--- missing function with callback
            // The request is now issued only when this task is invoked, and
            // `callback` is handed on to start() so it can report completion.
            request(website, function(err,resp,body) {
                start(err, resp, body, callback);
            })
        }
    ],
    // Final callback: runs after the tasks above have called their callbacks.
    function(){
        log('closing DB');
        db.close();
    }
);

请注意,我在调用 start 时传入了 callback。因此,您需要大幅重构代码,让每个函数都接受一个 callback,并在确定所有工作都已完成时在最后调用它。例如,您可以在 start 内部使用 async.parallel,这个函数可能如下所示:

// Sketch of start() reworked to take a completion `callback`: one job is
// queued per page URL, and `callback` fires once async.parallel reports that
// every job has called its own `clb`.
function start(err, resp, body, callback) {
    // some stuff happens here
    // (presumably `pageURLS` is built in this elided part)
    var jobs = []
    pageURLS.forEach(function(url, index, array){
        jobs.push(function(clb) {
            request(url, function(error,response,bodies) {
                // some stuff
                clb(); // <--- this refers to the local callback for the job
            });
        });
    });
    async.parallel(jobs, function() {
        // all jobs are done, let's finalize everything
        callback();
    });
};
于 2013-08-20T17:17:40.757 回答