0

我喜欢威士忌,但我不是开发者。我尝试为从网站上抓取数据编写脚本,然后通知我有关该网站上的新产品。但是我遇到了麻烦,脚本无法正常工作。它没有通知我并且缺少新产品。有人能帮助我吗?我使用 cron 每 5 分钟执行一次我的脚本。

#!/usr/bin/python3
from bs4 import BeautifulSoup
import requests
import time
import pymongo
import difflib
import functools
import numpy as np
import telebot
URL = 'https://www.thewhiskyexchange.com/new-products/standard-whisky'
base = "https://www.thewhiskyexchange.com"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}


client = pymongo.MongoClient('localhost:27017')
db = client.Scrape.WhiskeyExchange
content = requests.get(URL, headers=headers)
soup = BeautifulSoup(content.text, 'html.parser')

bot = telebot.TeleBot('MyApiKey')
bot_chatID2 = 'MychatID'
bot_chatID = 'MyGroupChatID'


def scrapeItems():
    itemList = []

    r = requests.get(URL, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')

    for listing in soup.select('.product-list-item'):
        name = listing.select_one('.information p').text
        link = base + listing.select_one('.product-list-item a')['href']
        price = listing.select_one('.price')  # price
        if not price is None:  # not every listing has a price
            price = price.text
        else:
            price = np.NaN
        itemList.append(
            {'name': name,
             'link': link,
             'price': price,
             }
        )
    #print(itemList)
    return itemList


def newScrapeItems():
    newItemList = []

    r = requests.get(URL, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')

    for listing in soup.select('.product-list-item'):
        name = listing.select_one('.information p').text
        link = base + listing.select_one('.product-list-item a')['href']
        price = listing.select_one('.price')  # price
        if not price is None:  # not every listing has a price
            price = price.text
        else:
            price = np.NaN
        newItemList.append(
            {'name': name,
             'link': link,
             'price': price,
             }
        )
    # print(itemList)
    return newItemList

items = scrapeItems()
time.sleep(350)
newItems = newScrapeItems()
links = '\n '.join([i["link"] for i in newItems])

def comparingProducts():
    # compare two lists for difference
    if functools.reduce(lambda x, y: x and y, map(lambda p, q: p == q, items, newItems), True):
        print("Lists items and newItems has not difference between, exit...")
        bot.send_message(bot_chatID, links)
    else:
        print("Lists items and newItems has difference between!, doing next steps")
        list_difference = []
        for item in newItems:
           if item not in items:
                newLinks = '\n '.join([i["link"] for i in items])
                list_difference.append(item)
                bot.send_message(bot_chatID2, newLinks)
                try:
                    db.insert_many(item)
                    print(f'inserted {len(item)} articles')
                except:
                    print('an error occurred quotes were not stored to db')

comparingProducts()

我在聊天中使用 Telebot 发送通知。也许我的问题是无法正常使用 MongoDB,我应该检查 MongoDB 插入?

4

0 回答 0