I am using Scrapy to collect data from stox.vn. I have a urls.txt with roughly 800 URLs, and I feed all of them to my spider. At first it crawls and scrapes fine, but after a while it stops scraping and only crawls:
2013-06-27 03:24:28+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial/PV_Index?filter=1&unit=1000000&ticker=AAA> (referer: http://companyaz.stox.vn/Financial?cId=746&iId=150&iIdL=147&eId=1&tId=2status=1&id=-1&cat=&ticker=AAA)
2013-06-27 03:24:28+0700 [stox] DEBUG: Scraped from <200 http://companyaz.stox.vn/Financial/PV_Index?filter=1&unit=1000000&ticker=AAA>
{'chi_phi_ban_hang': u'-7453.41',
'chi_phi_khau_hao_TSCD': u'11890.11',
'chi_phi_quan_ly': u'-5913.60',
'chi_phi_tai_chinh': u'-10677.99',
'chi_phi_tien_lai_vay': u'-5672.17',
'doanh_thu_thuan': u'122008.75',
'gia_von_hang_ban': u'-90790.07',
'lai_co_dong_ct_me': u'11885.60',
'lai_gop': u'31218.69',
'lai_sau_thue': u'11885.60',
'lai_tu_hdkd': u'11376.31',
'loi_ich_CDTS': u'11885.60',
'qtime': u'20101',
'thu_nhap_tai_chinh': u'4202.63',
'thue_TNDN_hl': u'509.29',
'thue_TNDN_ht': u'0',
'ticker': 'AAA'}
.....
2013-06-27 03:24:31+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=446&iId=292&iIdL=280&eId=3&tId=3status=1&id=-1&cat=&ticker=ABI> (referer: None)
2013-06-27 03:24:33+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=1&iId=217&iIdL=202&eId=0&tId=2status=1&id=-1&cat=&ticker=ABT> (referer: None)
2013-06-27 03:24:36+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=164&iId=289&iIdL=279&eId=1&tId=0status=1&id=-1&cat=&ticker=ACB> (referer: None)
2013-06-27 03:24:38+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=522&iId=180&iIdL=170&eId=0&tId=2status=1&id=-1&cat=&ticker=ACC> (referer: None)
2013-06-27 03:24:40+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=486&iId=180&iIdL=170&eId=3&tId=2status=1&id=-1&cat=&ticker=ACE> (referer: None)
2013-06-27 03:24:42+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=2&iId=217&iIdL=202&eId=0&tId=2status=1&id=-1&cat=&ticker=ACL> (referer: None)
2013-06-27 03:24:44+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=858&iId=256&iIdL=241&eId=1&tId=2status=1&id=-1&cat=&ticker=ADC> (referer: None)
2013-06-27 03:24:47+0700 [stox] DEBUG: Crawled (200) <GET http://companyaz.stox.vn/Financial?cId=556&iId=180&iIdL=170&eId=3&tId=2status=1&id=-1&cat=&ticker=ADP> (referer: None)
Here is what I have in stox/spider/test.py:
import logging
from scrapy.log import ScrapyFileLogObserver
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from scrapy.http.cookies import CookieJar
from stox.items import StoxItem

class MySpider(BaseSpider):
    name = "stox"
    allowed_domains = ["stox.vn"]
    start_urls = [
        "http://companyaz.stox.vn/Financial?cId=113&iId=217&iIdL=202&eId=0&tId=2&status=1&id=-1&cats=&ticker=FPT",
        "http://companyaz.stox.vn/Financial?cId=113&iId=217&iIdL=202&eId=0&tId=2&status=1&id=-1&cats=&ticker=SSC",
    ]

    def __init__(self):
        super(MySpider, self).__init__()
        # write the log to testlog.log in addition to the console
        logfile = open('testlog.log', 'w')
        log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
        log_observer.start()

    def start_requests(self):
        # read the ~800 start URLs from urls.txt, one per line
        with open("urls.txt") as f:
            for url in f:
                yield Request(url.strip(), self.parse)

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        ticker = "".join(hxs.select(
            "//div[@class='stock-ticker-title']/label/text()").extract()).strip()
        my_start_url = ("http://companyaz.stox.vn/Financial/PV_Index"
                        "?filter=1&unit=1000000&ticker=%s" % ticker)
        # carry the cookies of the start URL over to the PV_Index request
        cookieJar = response.meta.setdefault('cookie_jar', CookieJar())
        cookieJar.extract_cookies(response, response.request)
        request = Request(my_start_url, callback=self.extractItem,
                          meta={'dont_merge_cookies': True,
                                'cookie_jar': cookieJar})
        cookieJar.add_cookie_header(request)  # apply Set-Cookie ourselves
        yield request

    def extractItem(self, response):
        items = []
        # extract the ticker from the URL (everything after "ticker=")
        ticker = response.url.split('ticker=')[-1]
        f = open("data/%s.csv" % ticker, 'w')
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//p[@data-time]/..")
        for title in titles:
            def num(xpath):
                # join the text nodes and convert "1.234,56" to "1234.56"
                return (''.join(title.select(xpath).extract())
                        .strip().replace('.', '').replace(',', '.'))
            item = StoxItem()
            item["ticker"] = ticker
            item["qtime"] = "".join(title.select("./p/@data-time").extract())
            item["doanh_thu_thuan"] = num("./div[1]/p[1]/text()")
            item["gia_von_hang_ban"] = num("./div[1]/p[2]/text()")
            item["lai_gop"] = num("./div[2]/p[1]/text()")
            item["thu_nhap_tai_chinh"] = num("./div[2]/p[2]/text()")
            item["chi_phi_tai_chinh"] = num("./div[2]/p[3]/text()")
            item["chi_phi_tien_lai_vay"] = num("./div[2]/p[4]/text()")
            item["chi_phi_ban_hang"] = num("./div[2]/p[5]/text()")
            item["chi_phi_quan_ly"] = num("./div[2]/p[6]/text()")
            item["lai_tu_hdkd"] = num("./div[3]/p[1]/text()")
            item["thue_TNDN_ht"] = num("./div[3]/p[2]/text()")
            item["thue_TNDN_hl"] = num("./div[3]/p[3]/text()")
            item["lai_sau_thue"] = num("./div[4]/p[1]/text()")
            item["loi_ich_CDTS"] = num("./div[4]/p[1]/text()")  # NB: same XPath as lai_sau_thue
            item["lai_co_dong_ct_me"] = num("./div[5]/p[1]/text()")
            item["chi_phi_khau_hao_TSCD"] = num("./div[6]/p[1]/text()")
            items.append(item)
            # write one CSV row per item
            row = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" % (
                item["ticker"], item["qtime"], item["doanh_thu_thuan"],
                item["gia_von_hang_ban"], item["lai_gop"],
                item["thu_nhap_tai_chinh"], item["chi_phi_tai_chinh"],
                item["chi_phi_tien_lai_vay"], item["chi_phi_ban_hang"],
                item["chi_phi_quan_ly"], item["lai_tu_hdkd"],
                item["thue_TNDN_ht"], item["thue_TNDN_hl"],
                item["lai_sau_thue"], item["loi_ich_CDTS"],
                item["lai_co_dong_ct_me"], item["chi_phi_khau_hao_TSCD"])
            f.write(row)
        f.close()
        return items
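As an aside, the hand-built format string could be replaced with the stdlib csv module. This is just a sketch of how I might tidy that part, unrelated to the crawling problem; the field order is assumed from my format string above, and write_items is a hypothetical helper name:

import csv

FIELDS = ["ticker", "qtime", "doanh_thu_thuan", "gia_von_hang_ban", "lai_gop",
          "thu_nhap_tai_chinh", "chi_phi_tai_chinh", "chi_phi_tien_lai_vay",
          "chi_phi_ban_hang", "chi_phi_quan_ly", "lai_tu_hdkd", "thue_TNDN_ht",
          "thue_TNDN_hl", "lai_sau_thue", "loi_ich_CDTS", "lai_co_dong_ct_me",
          "chi_phi_khau_hao_TSCD"]

def write_items(path, items):
    # hypothetical helper: one row per item, columns in the fixed order above
    with open(path, 'wb') as f:  # binary mode for the csv module on Python 2
        writer = csv.writer(f)
        for item in items:
            writer.writerow([item[k] for k in FIELDS])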
My settings.py:
BOT_NAME = 'stox'
SPIDER_MODULES = ['stox.spiders']
NEWSPIDER_MODULE = 'stox.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'stox (+http://www.yourdomain.com)'
#ITEM_PIPELINES = ['stox.pipelines.StoxPipeline']
DOWNLOAD_DELAY = 2
#DOWNLOAD_TIMEOUT = 180
#CONCURRENT_REQUESTS = 2
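For debugging I am thinking of adding these (standard Scrapy settings, shown here as a sketch; they are not in my actual settings.py yet):

# hypothetical debugging additions, not in my real settings.py
LOG_FILE = 'testlog.log'   # alternative to the ScrapyFileLogObserver in __init__
COOKIES_DEBUG = True       # log every Cookie / Set-Cookie header exchanged
CONCURRENT_REQUESTS = 2    # lower concurrency to see exactly when scraping stops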
I noticed that when I change CONCURRENT_REQUESTS, the spider scrapes exactly CONCURRENT_REQUESTS items and then only crawls. So I suspect something is wrong with the concurrent request handling (a slot that never gets released?).
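To check whether the second (PV_Index) request is even being scheduled, here is a minimal logging sketch I could drop into parse. It only uses self.log, which BaseSpider provides; the empty-ticker guess is my assumption, not something I have confirmed:

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    ticker = "".join(hxs.select(
        "//div[@class='stock-ticker-title']/label/text()").extract()).strip()
    # guess: if ticker comes out empty, every PV_Index URL below is identical
    # and Scrapy's duplicate filter would silently drop all but the first
    self.log("parse(%s): extracted ticker=%r" % (response.url, ticker))
    if not ticker:
        return
    # ... build and yield the PV_Index request as above ...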
Update: contents of urls.txt:
http://companyaz.stox.vn/Financial?cId=746&iId=150&iIdL=147&eId=1&tId=2status=1&id=-1&cat=&ticker=AAA
http://companyaz.stox.vn/Financial?cId=446&iId=292&iIdL=280&eId=3&tId=3status=1&id=-1&cat=&ticker=ABI
http://companyaz.stox.vn/Financial?cId=1&iId=217&iIdL=202&eId=0&tId=2status=1&id=-1&cat=&ticker=ABT
.....
Any help is much appreciated, thank you!
PS: I am new to Scrapy; apologies for my English.