
Hi, I'm using the Python Scrapy library to build a spider and extract data from a website. In my pipeline I use Flask-SQLAlchemy so that the spider adds the scraped data to a SQLite table. I'm trying to figure out how to keep the spider from inserting duplicate data, so I query the database to see whether the table already contains an item identical to the one the spider is about to add. If it does, I don't want the spider to add the data; it should just move on to the next item. Here is the code I have so far:

from flask import Flask, render_template, redirect, request, url_for
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy.orm import sessionmaker

import os
import sys

from sqlalchemy import Column, ForeignKey, Integer, String, DateTime, union
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL

app = Flask(__name__)

# Flask-SQLAlchemy
app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False
app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite://draft.db"
app.config["SQLALCHEMY_ECHO"] = True
db = SQLAlchemy(app)

def db_connect():
    return create_engine('sqlite:///draft.db')

Base = declarative_base()

class Events(Base):
    __tablename__ = 'events'
    id = Column(Integer, primary_key=True)
    title = Column(String(250), nullable=False)
    location = Column(String(250), nullable=False)
    date = Column(String(250), nullable=False)
    url = Column(String(250), nullable=False)

engine = create_engine('sqlite:///draft.db')

def create_events_table(engine): 
    Base.metadata.create_all(engine)

class GeneralPipeline(object):
    def __init__(self):
        engine = db_connect()
        self.Session = sessionmaker(bind=engine)    

    def process_item(self, item, spider):
        session = self.Session()
        event = Events(**item)

        # try to find existing rows that share the item's title or date
        duplicates = union(
            session.query(Events).filter_by(title=item["title"]),
            session.query(Events).filter_by(date=item["date"]),
        )

        if duplicates:
            session.close()

        else:
            try:
                session.add(event)
                session.commit()
            except:
                session.rollback()
                raise
            finally:
                session.close()

        return item

This raises the error raise TypeError("Boolean value of this clause is not defined"). I assume it refers to the if duplicates: statement, but I'm not sure why it doesn't work. I just want to check whether any duplicates exist, and if they do, the spider shouldn't add that item to the table. Any pointers on how to get this working?
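
Reading the SQLAlchemy source the error points at, my guess is that union() only builds a SQL clause and never executes it, so the clause itself has no defined truth value. Is something like the following the right direction instead, i.e. actually running the query and testing its result? (Untested sketch; the | between the column comparisons is just SQLAlchemy's standard OR.)

    def process_item(self, item, spider):
        session = self.Session()
        event = Events(**item)

        # execute the query and fetch at most one matching row; first()
        # returns a row object or None, which does have a boolean value
        duplicate = session.query(Events).filter(
            (Events.title == item["title"]) | (Events.date == item["date"])
        ).first()

        if duplicate is not None:
            session.close()
        else:
            try:
                session.add(event)
                session.commit()
            except:
                session.rollback()
                raise
            finally:
                session.close()

        return item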

Here is more of the stack trace, as requested:

TypeError: Boolean value of this clause is not defined
2016-12-06 21:02:34 [scrapy] ERROR: Error processing {'location': None, 'date': datetime.datetime(2017, 1, 21, 21, 0), 'url': 'http://www.trumba.com/events-calendar/ma/boston/harvard-events/harvard-event-calendar/harvard-activities/gazette', 'title': 'Varsity Show'}
Traceback (most recent call last):
  File "/usr/local/lib/python3.4/dist-packages/twisted/internet/defer.py", line 649, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/home/ubuntu/workspace/project/draft/draft/draft/pipelines.py", line 58, in process_item
    if duplicates:
  File "/usr/local/lib/python3.4/dist-packages/sqlalchemy/sql/elements.py", line 481, in __bool__
    raise TypeError("Boolean value of this clause is not defined")
TypeError: Boolean value of this clause is not defined
2016-12-06 21:02:34 [scrapy] INFO: Closing spider (finished)
2016-12-06 21:02:34 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 524,
 'downloader/request_count': 2,
 'downloader/request_method_count/GET': 2,
 'downloader/response_bytes': 352524,
 'downloader/response_count': 2,
 'downloader/response_status_count/200': 2,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2016, 12, 7, 2, 2, 34, 379384),
 'log_count/DEBUG': 3,
 'log_count/ERROR': 201,
 'log_count/INFO': 7,
 'response_received_count': 2,
 'scheduler/dequeued': 1,
 'scheduler/dequeued/memory': 1,
 'scheduler/enqueued': 1,
 'scheduler/enqueued/memory': 1,
 'start_time': datetime.datetime(2016, 12, 7, 2, 2, 32, 3552)}
2016-12-06 21:02:34 [scrapy] INFO: Spider closed (finished)
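
If it matters, another variant I was considering is SQLAlchemy's exists() construct, which should collapse the check into a single True/False query instead of fetching rows (again an untested sketch; the is_duplicate helper name is just for illustration):

    from sqlalchemy import exists

    def is_duplicate(session, item):
        # EXISTS subquery: the database returns a plain boolean
        return session.query(
            exists().where(
                (Events.title == item["title"]) | (Events.date == item["date"])
            )
        ).scalar()

Would either of these be the idiomatic way to skip duplicates in a pipeline?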