7

以下脚本在我的 Node.js 服务器上运行良好,但当我抓取某些西里尔文网站时,它有时会返回如下这种编码错误(乱码)的响应。

脚本

// Scrape the page at `url` with `x` (presumably an x-ray instance — confirm
// against the surrounding file). Each value below is a selector string; the
// `@content` / `@src` suffix appears to pick an attribute of the matched
// element rather than its text.
x(url, {
    name: 'title',
    ogDescription: 'meta[property="og:description"]@content',
    metaDescription: 'meta[name="description"]@content',
        ogImage: 'meta[property="og:image"]@content',
        // NOTE(review): this selector repeats `name="` inside the attribute
        // value (`name="name="twitter:image:src""`), which looks malformed;
        // the intended selector is probably meta[name="twitter:image:src"].
        twitterImage: 'meta[name="name="twitter:image:src""]@content',
    metaImage: 'meta[name="image"]@content',
    headImage: 'head img@src',
    contentImage_1: '.content img@src',
    contentImage_2: '.image img@src'
  })
// The value returned by x(...) is called immediately with a Node-style
// (err, obj) callback. NOTE(review): `err` is never checked here, so `obj`
// is dereferenced even if the scrape failed.
(function (err, obj) {
    // Group the scraped fields into candidate lists, one list per kind of data.
    var firstData = {
        name: [
            obj.name
        ],
        description: [
            obj.metaDescription, 
            obj.ogDescription,
        ],
        image: [
            obj.ogImage,
            obj.twitterImage,
            obj.metaImage,
            obj.headImage,
            obj.contentImage_1,
            obj.contentImage_2
        ]
    }
    // NOTE: the snippet is cut off here — the callback body and the call are
    // not closed in this excerpt.

编码不正确的响应示例

firstData { name: [ '(Rock, Pop) [15LP] [24/96] Queen - Studio Collection - 2015, 
                     FLAC (tracks) :: RuTracker.org' ],
  description:
   [ 'RuTracker.org » ���������� ��� (����������� ���������) » 
                      ������� ������� (Rock, Pop) [15LP] [24/96] Queen - 
                      Studio Collection - 2015, FLAC (tracks)',
                      undefined ],
  image: [ undefined, undefined, undefined, undefined, undefined, undefined ] }

我该如何解决?

4

1 回答 1

0

您可以使用 request 作为 x-ray 的驱动程序,并用 iconv 对响应主体进行转码,如下所示:

// Custom x-ray driver: fetch each page as raw bytes with `request`, then
// transcode the body from Windows-1251 to UTF-8 before x-ray parses it.
const options = {
    // 'binary' (latin1) maps every response byte to exactly one character,
    // so conv() can recover the original bytes losslessly.
    encoding: 'binary'
};

// Declared explicitly: the original bare assignment created an implicit
// global. `new` is applied to the Iconv constructor itself, not to
// `require(...)`. 'UTF-8' is the canonical libiconv encoding name.
// NOTE(review): the source encoding is hard-coded; pages that are not
// Windows-1251 will be garbled by this driver — confirm the target sites.
const Iconv = require('iconv').Iconv;
const iconv = new Iconv('Windows-1251', 'UTF-8');

/**
 * Re-decode a response body that was read with `encoding: 'binary'`.
 *
 * @param {string} body - Binary (latin1) string holding the raw response bytes.
 * @returns {string} The body decoded as UTF-8 text; falsy input is returned unchanged.
 * @throws {Error} If iconv cannot convert the byte sequence.
 */
function conv(body) {
    if (!body) return body;
    // Buffer.from is a factory function, not a constructor — no `new`.
    const rawBytes = Buffer.from(body, 'binary');
    return iconv.convert(rawBytes).toString();
}

const request = require('request').defaults(options);

/**
 * x-ray driver that downloads `context.url` and hands the transcoded HTML
 * to x-ray.
 *
 * @param {{url: string}} context - Request context supplied by x-ray.
 * @param {function(?Error, string=): void} callback - Receives the error or the page body.
 */
const driver = function driver(context, callback) {
    request(context.url, function (err, response, body) {
        if (err) return callback(err);

        let html;
        try {
            html = conv(body);
        } catch (convertError) {
            // Report conversion failures through the callback instead of
            // throwing inside the request callback, where nothing catches it.
            return callback(convertError);
        }
        return callback(null, html);
    });
};
x.driver(driver);


// Scrape the page at `url` and group the extracted fields into candidate
// lists. Each selector's `@attr` suffix picks an attribute of the matched
// element instead of its text.
x(url, {
    name: 'title',
    ogDescription: 'meta[property="og:description"]@content',
    metaDescription: 'meta[name="description"]@content',
    ogImage: 'meta[property="og:image"]@content',
    // Fixed: the original selector was `meta[name="name="twitter:image:src""]`,
    // which is malformed and can never match.
    twitterImage: 'meta[name="twitter:image:src"]@content',
    metaImage: 'meta[name="image"]@content',
    headImage: 'head img@src',
    contentImage_1: '.content img@src',
    contentImage_2: '.image img@src'
})(function (err, obj) {
    // Without this guard, a failed request would crash on `obj.name` with a
    // TypeError and hide the real error.
    if (err) {
        console.error('x-ray scrape failed:', err);
        return;
    }

    const firstData = {
        name: [
            obj.name
        ],
        description: [
            obj.metaDescription,
            obj.ogDescription
        ],
        image: [
            obj.ogImage,
            obj.twitterImage,
            obj.metaImage,
            obj.headImage,
            obj.contentImage_1,
            obj.contentImage_2
        ]
    };
    console.log(firstData);
});
于 2018-06-25T11:22:00.833 回答