我是 Scrapy 的新手,并试图从网页中提取内容,但在输出中得到了很多额外的字符。见附图。
如何更新我的代码以摆脱字符?我只需要从网页中提取 href。
我的代码:
class AttractionSpider(CrawlSpider):
name = "get-webcontent"
start_urls = [
'http://quotes.toscrape.com/page/1/'
]
rules = ()
def create_dirs(dir):
if not os.path.exists(dir):
os.makedirs(dir)
else:
shutil.rmtree(dir) #removes all the subdirectories!
os.makedirs(dir)
def __init__(self, name=None, **kwargs):
super(AttractionSpider, self).__init__(name, **kwargs)
self.items_buffer = {}
self.base_url = "http://quotes.toscrape.com/page/1/"
from scrapy.conf import settings
settings.overrides['DOWNLOAD_TIMEOUT'] = 360
def write_to_file(file_name, content_list):
with open(file_name, 'wb') as fp:
pickle.dump(content_list, fp)
def parse(self, response):
print ("Start scrapping webcontent....")
try:
str = ""
hxs = Selector(response)
links = hxs.xpath('//li//@href').extract()
with open('test1_href', 'wb') as fp:
pickle.dump(links, fp)
if not links:
return
log.msg("No Data to scrap")
for link in links:
v_url = ''.join( link.extract() )
if not v_url:
continue
else:
_url = self.base_url + v_url
except Exception as e:
log.msg("Parsing failed for URL {%s}"%format(response.request.url))
raise
def parse_details(self, response):
print ("Start scrapping Detailed Info....")
try:
hxs = Selector(response)
yield l_venue
except Exception as e:
log.msg("Parsing failed for URL {%s}"%format(response.request.url))
raise