2

我正在创建一个项目:在 Netlify 函数中抓取日文维基百科(Wikipedia JP)的数据,并用 TimelineJS 将其展示为时间线。

TimelineJS 需要符合其既定模式(schema)的 JSON,因此我从 Wikipedia JP 中抓取了多个元素来生成这份 JSON 数据。

我需要把这些元素放进一个数组,并在收到 HTTP 请求时将其序列化为文本,作为响应正文返回。

我写的代码在这里。

const axios = require('axios')
const cheerio = require('cheerio')
const moment = require('moment')
const ogs = require('open-graph-scraper')

// Base URL of Japanese Wikipedia articles; the handler appends the requested date to it.
const url = 'https://ja.wikipedia.org/wiki/'

// Switch moment to the Japanese locale for every moment instance created afterwards.
moment.locale('ja')

/**
 * Looks up the Open Graph image URL of a page.
 *
 * @param {Object} options - options object handed straight to `ogs`.
 * @returns {Promise<string|undefined>} the value of `data.ogImage.url` from the
 *   scraper result, or `undefined` when the lookup fails (the error is logged).
 */
const getOGPImagePath = async (options) => {
  try {
    const { data } = await ogs(options)
    return data.ogImage.url
  } catch (err) {
    // Best effort: a page without a usable image must not abort the caller.
    console.log(err)
  }
}

exports.handler = async (event, context) => {
  const date = event.queryStringParameters.date
  const endpoint = url + date
  const momentDate = moment(date, 'M月D日')
  const day = momentDate.format('D')
  const month = momentDate.format('M')
  const html = await axios.get(encodeURI(endpoint),{ headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100'}})
  const $ = await cheerio.load(html.data, { decodeEntities: false })
  let data = []

  $('.mw-parser-output > ul').first().find('li').not('.mw-empty-elt').each(async (i,elem) => {
    const year = String($(elem).text().match(/\d{1,4}年/)).replace(/年/, '')
    const content = String($(elem).text().match(/\-\s(.*)/))

    const contentHtml = $(elem).html().replace(/((^<a(.*)>\d{1,4}年&lt;\/a>(.*)\s\-\s)|(^\d{1,4}年(.*)\s\-\s))/,'')

    const $c = await cheerio.load(contentHtml, { decodeEntities: false })

    const mainUrl = await 'https://ja.wikipedia.org' + $c('a').first().attr('href')

    const options = {
      url: mainUrl
    }

    // console.log(mainUrl) <- works fine

    const ogImagePath = await getOGPImagePath(options)

    // console.log(ogImagePath) <- works fine

    data.push({
      media: {  
        url: ogImagePath,
      },
      start_date: { 
        year: year,
        month: month,
        day: day
      },
      text: {
        text: contentHtml
      },  
      background: ogImagePath
    })

    // console.log(data) <- works fine
  })

  console.log(data) // <- Does not works fine. The result is '[]' empty array.

  return {
    statusCode: 200,
    body: JSON.stringify({ events: data }),
    headers: {
      'Content-Type': 'application/json; charset=utf8',
      'Access-Control-Allow-Origin': "*"
    }
  };
}

有没有人有这种情况的解决方案?

4

0 回答 0