我正在尝试从中国的微博平台新浪微博收集转发数据,代码如下所示。但是,我遇到了 IP 请求次数超出限制的问题。
为了解决这个问题,我必须在代码中加入 time.sleep()。可以看到,我在代码中添加了 time.sleep(10) 这一行,用来避免触发 IP 请求限制。这样,Python 每抓取一页转发(每页包含 200 条转发)后就会休眠 10 秒。
然而,这仍然不足以解决 IP 请求限制的问题。
因此,我计划更系统地控制请求频率:每抓取 20 页之后,让 Python 休眠 60 秒。非常感谢您的建议。
ids=[3388154704688495, 3388154704688494, 3388154704688492]
# Python 2 script: write the repost timeline of every weibo in ``ids`` to a
# CSV file, throttling requests to stay under the API / IP request limits.
# ``api``, ``ids``, ``csv`` and ``time`` must already be defined / imported
# before this point.
addressForSavingData = "C:/Python27/weibo/Weibo_repost/repostOwsSave1.csv"

REPOSTS_PER_PAGE = 200     # page size requested from repost_timeline
MIN_REMAINING_HITS = 205   # quota required before starting a new weibo
PAGE_PAUSE_SECONDS = 10    # pause before every page request
PAGES_PER_BATCH = 20       # after this many pages, take a longer pause ...
BATCH_PAUSE_SECONDS = 60   # ... of this many seconds


def _wait_for_rate_limit():
    """Sleep past the reported quota reset when too few API hits remain.

    The status is fetched once and reused, so the check itself costs a
    single request.
    """
    status = api.rate_limit_status()
    if status.remaining_hits < MIN_REMAINING_HITS:
        # presumably the number of seconds until the quota resets -- the
        # script has always slept for this long; confirm against the API docs
        sleep_time = status.reset_time_in_seconds
        print("%s %s" % (sleep_time, status.reset_time))
        time.sleep(sleep_time + 2)  # small margin past the reported reset


def _row_from_repost(repost):
    """Flatten one repost into the tuple that is written as a CSV row.

    Columns: repost id, reposting user id, original weibo id, original
    author id, repost text encoded as gb18030 bytes.
    """
    original = repost.retweeted_status
    return (
        repost.id,
        repost.user.id,
        original.id,
        original.user[u'id'],
        repost.text.encode('gb18030'),
    )


pages_fetched = 0  # counted across all weibos, since the limit is per client

# 'wb' is the mode the Python 2 csv module expects; the ``with`` block closes
# the file even if an API call raises part-way through.
with open(addressForSavingData, 'wb') as out_file:
    # One writer for the whole run instead of a new one per row.
    writer = csv.writer(out_file, delimiter=',', quotechar='|',
                        quoting=csv.QUOTE_MINIMAL)
    for weibo_id in ids:
        # Wait first, then process this same id. Previously a low quota made
        # the script sleep and then move on, silently skipping the weibo.
        _wait_for_rate_limit()
        for count_info in api.counts(ids=weibo_id):
            repost_count = count_info.rt
            print("%s %s" % (weibo_id, repost_count))
            # Pages are 1-based. "+ 1" covers the partially filled last page
            # (one empty page is requested when the count is an exact
            # multiple of the page size, as before).
            last_page = repost_count // REPOSTS_PER_PAGE + 1
            for page in range(1, last_page + 1):
                time.sleep(PAGE_PAUSE_SECONDS)
                reposts = api.repost_timeline(
                    id=weibo_id, count=REPOSTS_PER_PAGE, page=page)
                for repost in reposts:
                    row = _row_from_repost(repost)
                    try:
                        writer.writerow(row)
                    except (UnicodeError, csv.Error) as err:
                        # Best effort: drop the bad row, but say so instead
                        # of swallowing every exception silently.
                        print("skipping repost %s: %s" % (row[0], err))
                pages_fetched += 1
                if pages_fetched % PAGES_PER_BATCH == 0:
                    # Longer cool-down every PAGES_PER_BATCH pages; the
                    # per-page pause alone did not keep the client under
                    # the IP request limit.
                    time.sleep(BATCH_PAUSE_SECONDS)