0

我正在学习 JS/Puppeteer,出于学习目的,想通过构建一个简单的网络爬虫来抓取书籍信息。我希望爬虫能把 CSV 文件中的 UPC 编号填入图书网站的搜索栏。如果只使用单个 UPC 编号,我已经能让爬虫成功抓取该网站。

但是我有一个包含 UPC 列表的 CSV 文件,希望网络爬虫能够:

  1. 读取 CSV 文件,
  2. 从第一行获取 UPC,
  3. 在网站上搜索 UPC,
  4. 抓取信息,
  5. 从第 2 行获取 UPC,
  6. 重复 3、4

CSV 样本:

DATE,QUANTITY,NAME,CODECONTENT,CODETYPE
2021-10-13 20:16:44 +1100,1,"Book 1","9781250035288",9
2021-10-13 20:16:40 +1100,1,"Book 2","9781847245601",9
2021-10-13 20:16:35 +1100,1,"Book 3","9780007149247",9
2021-10-13 20:16:30 +1100,1,"Book 4","9780749958084",9
2021-10-13 20:16:26 +1100,1,"Book 5","9781405920384",9

到目前为止,这是我的代码。我卡在了 CSV 解析器的异步函数上,也就是我执行下面这条语句的地方:

console.log(allupcs);

另外,我不知道如何让

await page.type('#book-search-form > div.el-wrap.header-search-el-wrap > input.text-input','9781509847556');

接受 UPC

请参见下面的代码:

const puppeteer = require('puppeteer');
const parse = require('csv-parser');
const fs = require('fs');

/**
 * Streams 'Book_Bulk.csv' through the CSV parser, collects the CODECONTENT
 * column of every row (coerced to a number with unary plus) and prints the
 * collected array once the stream has ended.
 *
 * The stream is only set up here: nothing is awaited and nothing is returned,
 * so the collected values are visible solely inside the 'end' handler.
 */
async function getupcs() {
  const collected = [];
  const csvStream = fs
    .createReadStream('Book_Bulk.csv')
    .pipe(parse({delimiter: ':'}));

  // One 'data' event per parsed row.
  csvStream.on('data', (record) => {
    collected.push(+record.CODECONTENT);
  });

  // Fired after the last row; the only place where `collected` is complete.
  csvStream.on('end', () => {
    console.log(collected);
  });
}

/**
 * Opens a visible, maximized browser window, searches bookdepository.com for
 * one hard-coded UPC and prints the scraped title, author, genre, format,
 * publisher, publication year and price to the console.
 */
async function main() {

  // const allupcs = await upcData();

  // console.log(allupcs);

  const launchOptions = { headless: false, defaultViewport: null, args: ['--start-maximized'] };
  const browser = await puppeteer.launch(launchOptions);
  const page = await browser.newPage();

  // Waits for `selector` to appear, then reads the innerText of its first match.
  const innerTextOf = async (selector, ...waitArgs) => {
    await page.waitForSelector(selector, ...waitArgs);
    return page.$eval(selector, (node) => node.innerText);
  };

  // Run the search from the home page.
  await page.goto('https://www.bookdepository.com/');
  await page.type('#book-search-form > div.el-wrap.header-search-el-wrap > input.text-input', '9781509847556');
  await page.click('#book-search-form > div.el-wrap.header-search-el-wrap > button');

  // Title is the only field read through textContent.
  await page.waitForSelector('.item-info h1');
  const title = await page.$eval('.item-info h1', (heading) => heading.textContent);

  const author = await innerTextOf('div.author-info.hidden-md > span > a > span');
  const genre = await innerTextOf('.active a');
  const format = await innerTextOf('.item-info li');
  const publisher = await innerTextOf('div.biblio-wrap > div > ul > li:nth-child(4) > span > a > span');

  // Only the last four characters of the date text are kept.
  const year = await innerTextOf('div.biblio-wrap > div > ul > li:nth-child(3) > span');
  const newyear = year.slice(-4);

  // Price: only the last six characters of the text are kept.
  let newprice;
  try {
    const shown = await innerTextOf('div.price.item-price-wrap.hidden-xs.hidden-sm > span', { timeout: 1000 });
    newprice = shown.slice(-6);
  } catch {
    const listed = await innerTextOf('p.list-price');
    newprice = listed.slice(-6);
  } finally {
    // NOTE: this branch runs on every path, so the sale price always replaces
    // the value found above, and a missing sale price makes main() reject.
    const sale = await innerTextOf('div.price.item-price-wrap.hidden-xs.hidden-sm > span.sale-price');
    newprice = sale.slice(-6);
  }

  for (const field of [title, author, genre, format, publisher, newyear, newprice]) {
    console.log(field);
  }

  // return {
  //     title: title,
  //     author: author,
  //     genre: genre,
  //     format: format,
  //     publisher: publisher,
  //     year: newyear,
  //     price: newprice
  // }

}

main();

更新:使用来自答案的代码

const puppeteer = require('puppeteer');
const parse = require('csv-parser');
const fs = require('fs');


/**
 * Searches bookdepository.com for a single UPC and scrapes the details of the
 * page the search lands on.
 *
 * @param {object} page - Puppeteer page; its `goto`, `type`, `click`,
 *   `waitForSelector` and `$eval` methods are used.
 * @param {string|number} upc - Code to type into the search box.
 * @returns {Promise<{title: string, author: string, genre: string,
 *   format: string, publisher: string, year: string, price: string}>}
 *   `year` is the last 4 characters of the publication-date text and `price`
 *   the last 6 characters of the price text.
 * @throws Rejects with whatever `page` rejects with, e.g. when a required
 *   selector never appears.
 */
async function getpageData(page, upc) {
    // Waits for `selector` to appear, then reads the innerText of its first match.
    const innerTextOf = async (selector, options) => {
        await page.waitForSelector(selector, options);
        return page.$eval(selector, (el) => el.innerText);
    };

    await page.goto('https://www.bookdepository.com/');
    // page.type() needs a string; codes parsed from the CSV may arrive as numbers.
    await page.type('#book-search-form > div.el-wrap.header-search-el-wrap > input.text-input', String(upc));
    await page.click('#book-search-form > div.el-wrap.header-search-el-wrap > button');

    //Title
    await page.waitForSelector('.item-info h1');
    const title = await page.$eval('.item-info h1', (h1) => h1.textContent);

    const author = await innerTextOf('div.author-info.hidden-md > span > a > span');
    const genre = await innerTextOf('.active a');
    const format = await innerTextOf('.item-info li');
    const publisher = await innerTextOf('div.biblio-wrap > div > ul > li:nth-child(4) > span > a > span');

    //Year
    const year = await innerTextOf('div.biblio-wrap > div > ul > li:nth-child(3) > span');
    const newyear = year.slice(-4);

    // Price: prefer the sale price, then the regular price, then the list
    // price. The previous try/catch/finally ran the sale-price lookup on every
    // path, so it discarded both fallbacks and threw after the default timeout
    // whenever the book was not on sale.
    // The two optional lookups use a short timeout: the page has already
    // rendered by the time the fields above have been read.
    const quickCheck = { timeout: 1000 };
    let price;
    try {
        price = await innerTextOf('div.price.item-price-wrap.hidden-xs.hidden-sm > span.sale-price', quickCheck);
    } catch {
        try {
            price = await innerTextOf('div.price.item-price-wrap.hidden-xs.hidden-sm > span', quickCheck);
        } catch {
            // Last resort: if this one is missing too, the error propagates.
            price = await innerTextOf('p.list-price');
        }
    }
    const newprice = price.slice(-6);

    return {
        title: title,
        author: author,
        genre: genre,
        format: format,
        publisher: publisher,
        year: newyear,
        price: newprice
    };
}


/**
 * Reads a CSV file and resolves with the CODECONTENT column of every row.
 *
 * Codes are kept as strings: that is what page.type() expects, and numeric
 * coercion would drop leading zeros or turn a code ending in 'X' into NaN.
 * Rows without a CODECONTENT value are skipped.
 *
 * @param {string} filename - Path of the CSV file.
 * @param {string} [delimiter=','] - Column separator used in the file.
 * @param {string} [encoding='utf-8'] - Encoding used to read the file.
 * @returns {Promise<string[]>} The codes, in file order.
 */
function readCsvAsync(filename, delimiter=',', encoding='utf-8') {
    // A synchronous throw inside the executor rejects the promise by itself,
    // so no extra try/catch is needed here.
    return new Promise((resolve, reject) => {
        const rows = [];
        fs.createReadStream(filename, {encoding: encoding})
            // pipe() does not forward errors: without this handler a missing
            // or unreadable file would crash the process instead of rejecting.
            .on('error', reject)
            // csv-parser names this option `separator`; `delimiter` is not a
            // recognised option and would be ignored.
            .pipe(parse({separator: delimiter}))
            .on('data', (row) => {
                const code = row.CODECONTENT;
                if (code) {
                    rows.push(String(code).trim());
                }
            })
            .on('end', () => resolve(rows))
            .on('error', reject);
    });
}

/**
 * Loads the UPC codes from 'Book_Bulk.csv'.
 *
 * @returns {Promise<string[]>} The codes, or an empty array when the file
 *   could not be read (the error is logged), so callers can always iterate
 *   over the result.
 */
async function upcData() {
    try {
        // The file is comma-separated, which is readCsvAsync's default.
        return await readCsvAsync('Book_Bulk.csv');
    } catch (err) {
        console.error(err);
        return [];
    }
}


/**
 * Scrapes every UPC listed in the CSV file, one after another on a single
 * browser page, and prints the collected results.
 *
 * A UPC that fails to scrape is reported and skipped so that the remaining
 * codes are still processed. The browser is always closed at the end.
 */
async function main() {
    // Guard against a loader that yields nothing on failure.
    const allupcs = (await upcData()) || [];

    const browser = await puppeteer.launch({ headless: false, defaultViewport: null, args: ['--start-maximized']});
    const scrapedData = [];

    try {
        const page = await browser.newPage();

        // Sequential on purpose: all lookups share the same page.
        for (const upc of allupcs) {
            try {
                scrapedData.push(await getpageData(page, upc));
            } catch (err) {
                console.error(`Failed to scrape UPC ${upc}:`, err);
            }
        }
    } finally {
        // Without this the browser stays open and the Node process never exits.
        await browser.close();
    }

    console.log(scrapedData);
}

// Report unexpected failures instead of leaving the promise unhandled.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

4

1 回答 1

1

如您所见,CSV 解析器是异步的。“异步”意味着你不能这样做:

// Anti-example: the numbers on the right give the order in which the lines
// actually execute. The final console.log (3) runs before any 'data' event
// (5-9) has fired, so it prints an empty array.
const upcData = [];                           // 1
fs.createReadStream('Book_Bulk.csv')          // 2
    .pipe(parse({delimiter: ':'}))
    .on('data', (csvrow) => {                 // 5 6 7 8 9
        upcData.push(+csvrow.CODECONTENT);
    })
    .on('end', function() {                   // 10
      console.log(upcData);
    });
console.log(upcData);                         // 3
// call puppeteer or whatever                 // 4

我已经用数字标出了执行顺序。最后那个 console.log() 会在您设置好读取流之后立即运行。此时 upcData 还不包含任何内容。

但到了第 10 步,它就会包含数据,因为第 5 步及之后的各步会逐行填充它。

这意味着:无论您想用 upcData 做什么,都可以在 'end' 事件处理程序中进行。

    .on('end',function() {                    // 10
      console.log(upcData);
      for (let upc of upcData) {
        // call puppeteer or whatever
      }
    });

由于 CSV 解析器每触发一次 data 事件就会提供一行数据,因此您也可以直接在 data 事件处理程序中执行操作,而根本不需要构建 upcData 数组。

    .on('data', (csvrow) {                    // 5 6 7 8 9
        const upc = +csvrow.CODECONTENT;
        // call puppeteer or whatever
    })

如果您希望能够对整个过程使用 await,就必须先把它包装成一个 Promise。在这种情况下,关键步骤(Promise 的 resolve)同样发生在 end 回调中:

/**
 * Reads a whole CSV file and resolves with its parsed rows.
 *
 * @param {string} filename - Path of the CSV file.
 * @param {string} [delimiter=','] - Column separator used in the file.
 * @param {string} [encoding='utf-8'] - Encoding used to read the file.
 * @returns {Promise<object[]>} One object per row, as emitted by the parser.
 */
function readCsvAsync(filename, delimiter=',', encoding='utf-8') {
    // A synchronous throw inside the executor rejects the promise by itself,
    // so no extra try/catch is needed here.
    return new Promise((resolve, reject) => {
        const rows = [];
        fs.createReadStream(filename, {encoding: encoding})
            // pipe() does not forward errors: without this handler a missing
            // or unreadable file would crash the process instead of rejecting.
            .on('error', reject)
            // csv-parser names this option `separator`; `delimiter` is not a
            // recognised option and would be ignored.
            .pipe(parse({separator: delimiter}))
            .on('data', (row) => rows.push(row))
            .on('end', () => resolve(rows))
            .on('error', reject);
    });
}

/**
 * Entry point: loads every row of 'Book_Bulk.csv' before doing anything else.
 */
async function main() {
    try {
        // The file is comma-separated, which is readCsvAsync's default.
        const rows = await readCsvAsync('Book_Bulk.csv');
        // call puppeteer or whatever
    } catch (err) {
        console.error(err);
    }
}
于 2021-10-17T11:41:23.917 回答