我真的很喜欢使用 SimpleHTTPServer 在网络上轻松共享文件的方式,但我希望有像“下载整个目录”这样的选项。是否有一种简单的(单线)方法来实现这一点?
谢谢
我真的很喜欢使用 SimpleHTTPServer 在网络上轻松共享文件的方式,但我希望有像“下载整个目录”这样的选项。是否有一种简单的(单线)方法来实现这一点?
谢谢
我为你做了那个修改,我不知道是否有更好的方法可以做到这一点,但是:
只需保存文件(例如:ThreadedHTTPServer.py)并访问为:
$ python -m /path/to/ThreadedHTTPServer PORT
修改也以线程方式工作,因此您不会同时遇到下载和导航问题,代码没有组织但是:
from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
from SocketServer import ThreadingMixIn
import threading
import SimpleHTTPServer
import sys, os, zipfile
PORT = int(sys.argv[1])
def send_head(self):
"""Common code for GET and HEAD commands.
This sends the response code and MIME headers.
Return value is either a file object (which has to be copied
to the outputfile by the caller unless the command was HEAD,
and must be closed by the caller under all circumstances), or
None, in which case the caller has nothing further to do.
"""
path = self.translate_path(self.path)
f = None
if self.path.endswith('?download'):
tmp_file = "tmp.zip"
self.path = self.path.replace("?download","")
zip = zipfile.ZipFile(tmp_file, 'w')
for root, dirs, files in os.walk(path):
for file in files:
if os.path.join(root, file) != os.path.join(root, tmp_file):
zip.write(os.path.join(root, file))
zip.close()
path = self.translate_path(tmp_file)
elif os.path.isdir(path):
if not self.path.endswith('/'):
# redirect browser - doing basically what apache does
self.send_response(301)
self.send_header("Location", self.path + "/")
self.end_headers()
return None
else:
for index in "index.html", "index.htm":
index = os.path.join(path, index)
if os.path.exists(index):
path = index
break
else:
return self.list_directory(path)
ctype = self.guess_type(path)
try:
# Always read in binary mode. Opening files in text mode may cause
# newline translations, making the actual size of the content
# transmitted *less* than the content-length!
f = open(path, 'rb')
except IOError:
self.send_error(404, "File not found")
return None
self.send_response(200)
self.send_header("Content-type", ctype)
fs = os.fstat(f.fileno())
self.send_header("Content-Length", str(fs[6]))
self.send_header("Last-Modified", self.date_time_string(fs.st_mtime))
self.end_headers()
return f
def list_directory(self, path):
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
import cgi, urllib
"""Helper to produce a directory listing (absent index.html).
Return value is either a file object, or None (indicating an
error). In either case, the headers are sent, making the
interface the same as for send_head().
"""
try:
list = os.listdir(path)
except os.error:
self.send_error(404, "No permission to list directory")
return None
list.sort(key=lambda a: a.lower())
f = StringIO()
displaypath = cgi.escape(urllib.unquote(self.path))
f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">')
f.write("<html>\n<title>Directory listing for %s</title>\n" % displaypath)
f.write("<body>\n<h2>Directory listing for %s</h2>\n" % displaypath)
f.write("<a href='%s'>%s</a>\n" % (self.path+"?download",'Download Directory Tree as Zip'))
f.write("<hr>\n<ul>\n")
for name in list:
fullname = os.path.join(path, name)
displayname = linkname = name
# Append / for directories or @ for symbolic links
if os.path.isdir(fullname):
displayname = name + "/"
linkname = name + "/"
if os.path.islink(fullname):
displayname = name + "@"
# Note: a link to a directory displays with @ and links with /
f.write('<li><a href="%s">%s</a>\n'
% (urllib.quote(linkname), cgi.escape(displayname)))
f.write("</ul>\n<hr>\n</body>\n</html>\n")
length = f.tell()
f.seek(0)
self.send_response(200)
encoding = sys.getfilesystemencoding()
self.send_header("Content-type", "text/html; charset=%s" % encoding)
self.send_header("Content-Length", str(length))
self.end_headers()
return f
Handler = SimpleHTTPServer.SimpleHTTPRequestHandler
Handler.send_head = send_head
Handler.list_directory = list_directory
class ThreadedHTTPServer(ThreadingMixIn, HTTPServer):
"""Handle requests in a separate thread."""
if __name__ == '__main__':
server = ThreadedHTTPServer(('0.0.0.0', PORT), Handler)
print 'Starting server, use <Ctrl-C> to stop'
server.serve_forever()
查看来源,例如在线here。现在,如果您使用作为目录的 URL 调用服务器,index.html
则会提供其文件,或者,如果缺少该文件,list_directory
则调用该方法。大概,您想zip
用目录的内容制作一个文件(我想是递归的),然后提供它?显然,没有办法通过单行更改来做到这一点,因为您想替换现在的第 68-80 行(在 method 中send_head
)加上整个 method list_directory
,第 98-137 行 - 这至少已经改变了50 行;-)。
如果您可以更改几十行而不是一行,并且我描述的语义是您想要的,那么您当然可以使用ZipFilecStringIO.StringIO
类将所需的 zipfile 构建为对象,并使用os填充它.walk在有问题的目录上(假设您还想递归地获取所有子目录)。但它绝对不会是单行的;-)。
没有一个班轮可以做到这一点,“下载整个目录”作为tar或zip是什么意思?
无论如何,您可以按照以下步骤操作
将是一个有趣的练习:)
没有简单的方法。
另一种方法是使用下面的 python 脚本以递归方式下载整个文件夹。这适用于 Python 3。根据需要更改 URL。
import os
from pathlib import Path
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
def get_links(content):
soup = BeautifulSoup(content)
for a in soup.findAll('a'):
yield a.get('href')
def download(url):
path = urlparse(url).path.lstrip('/')
print(path)
r = requests.get(url)
if r.status_code != 200:
raise Exception('status code is {} for {}'.format(r.status_code, url))
content = r.text
if path.endswith('/'):
Path(path.rstrip('/')).mkdir(parents=True, exist_ok=True)
for link in get_links(content):
if not link.startswith('.'): # skip hidden files such as .DS_Store
download(urljoin(url, link))
else:
with open(path, 'w') as f:
f.write(content)
if __name__ == '__main__':
# the trailing / indicates a folder
url = 'http://ed470d37.ngrok.io/a/bc/'
download(url)
我喜欢@mononoke 的解决方案。但其中有几个问题。他们是
href
和text
不同,特别是对于非ASCII路径我试图解决这些问题:</p>
import os
from pathlib import Path
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
import math
def get_links(content):
soup = BeautifulSoup(content)
for a in soup.findAll('a'):
yield a.get('href'), a.get_text()
def download(url, path=None, overwrite=False):
if path is None:
path = urlparse(url).path.lstrip('/')
if url.endswith('/'):
r = requests.get(url)
if r.status_code != 200:
raise Exception('status code is {} for {}'.format(r.status_code, url))
content = r.text
Path(path.rstrip('/')).mkdir(parents=True, exist_ok=True)
for link, name in get_links(content):
if not link.startswith('.'): # skip hidden files such as .DS_Store
download(urljoin(url, link), os.path.join(path, name))
else:
if os.path.isfile(path):
print("#existing", path)
if not overwrite:
return
chunk_size = 1024*1024
r = requests.get(url, stream=True)
content_size = int(r.headers['content-length'])
total = math.ceil(content_size / chunk_size)
print("#", path)
with open(path, 'wb') as f:
c = 0
st = 100
for chunk in r.iter_content(chunk_size=chunk_size):
c += 1
if chunk:
f.write(chunk)
ap = int(c*st/total) - int((c-1)*st/total)
if ap > 0:
print("#" * ap, end="")
print("\r "," "*int(c*st/total), "\r", end="")
if __name__ == '__main__':
# the trailing / indicates a folder
url = 'http://ed470d37.ngrok.io/a/bc/'
download(url, "/data/bc")