从 Scrapy 结果中,标题中有一个不需要的非 ASCII 代码\u2013
(又名character(150)
或en dash
),例如u'Director/Senior Director \u2013 Pathology'
. 我正在尝试使用管道删除\u2013
常规的,
. 但是下面的代码不起作用。也不会报告错误消息。
from datetime import datetime
from hashlib import md5
from scrapy.exceptions import DropItem
from twisted.enterprise import adbapi
import re
import string
class ReplaceASC2InTitlePipeline(object):
"""replace unwanted ASCII characters in titles"""
ascii_to_filter = ["\u2013",]
def process_item(self, item, spider):
for word in self.ascii_to_filter:
desc = item.get('title')
if (desc) and word in desc:
spider.log("\u2013 in '%s' was replace" % (item['title']) )
item['title']=item['title'].replace("\u2013", ",")
return item
else:
return item