您需要将响应头(headers)字典(或 `urlopen` 的返回结果)传递给 `shouldDownload`:
def shouldDownload(url, headers, prev_etag, prev_lastmod, prev_expires, prev_content_length):
    """Return True if the resource appears to have changed since the last fetch.

    The validators remembered from the previous download are compared with
    the matching HTTP response headers; any mismatch means the body should
    be downloaded again.

    Args:
        url: Address of the resource. Not used by the comparison; kept so
            the call signature stays stable.
        headers: Mapping of response headers -- a plain dict or the
            ``.headers`` of a ``urlopen()`` result. Header names are matched
            case-insensitively.
        prev_etag: ``ETag`` value recorded last time, or None.
        prev_lastmod: ``Last-Modified`` value recorded last time, or None.
        prev_expires: ``Expires`` value recorded last time, or None.
        prev_content_length: ``Content-Length`` value recorded last time, or
            None. Values are compared as-is, so store the raw header string
            rather than an int.

    Returns:
        True when at least one recorded value differs from the current
        header (a missing header counts as None), False when all four match.
    """
    # Header names are case-insensitive on the wire but a plain dict is not,
    # so normalise the keys once before looking anything up.
    current = {name.lower(): value for name, value in headers.items()}

    # Last-Modified is the *response* header; If-Modified-Since only appears
    # in requests and would never be found here.
    # Equivalent "optimistic" formulation: not (every validator is equal).
    return (
        prev_content_length != current.get("content-length")
        or prev_lastmod != current.get("last-modified")
        or prev_expires != current.get("expires")
        or prev_etag != current.get("etag")
    )
打开 URL 时执行此操作:
# urlopen() is expected to return once the response headers are available,
# without consuming the body until `.read()` is called, so the headers can be
# inspected cheaply through `.headers`.
# NOTE(review): confirm against your urllib2 version; worst case a small
# `read(50)` may be needed before the headers are populated.
s = urllib2.urlopen(MYURL)
try:
    # prev_etag, prev_lastmod, prev_expires and prev_content_length are the
    # header values remembered from the previous fetch of MYURL.
    if shouldDownload(MYURL, s.headers, prev_etag, prev_lastmod,
                      prev_expires, prev_content_length):
        source = s.read()
        # do stuff with source
    # Otherwise nothing has changed and the body is never read. `continue`
    # is only valid inside a loop; if this snippet is placed in a loop over
    # several URLs, add `else: continue` here when more statements follow.
# except HTTPError, etc if you need to
finally:
    # Always release the connection, whether or not the body was read.
    s.close()