python - 在 Python SimpleHTTPServer 中下载整个目录

Question

我真的很喜欢使用 SimpleHTTPServer 在网络上轻松共享文件的方式，但我希望有像“下载整个目录”这样的选项。是否有一种简单的（单线）方法来实现这一点？

谢谢

score 6 · Accepted Answer

我为你做了那个修改，我不知道是否有更好的方法可以做到这一点，但是：

只需保存文件（例如：ThreadedHTTPServer.py）并访问为：

$ python /path/to/ThreadedHTTPServer.py PORT

BPaste 原始版本

这个修改同时以多线程方式运行，因此下载和浏览可以同时进行、互不阻塞。代码还没有整理，不过如下：

from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler
from SocketServer import ThreadingMixIn
import threading
import SimpleHTTPServer
import sys, os, zipfile

# Port to listen on: the first command-line argument, or 8000 when no
# argument is given (previously a missing argument raised IndexError as
# soon as the module was loaded).
PORT = int(sys.argv[1]) if len(sys.argv) > 1 else 8000

def _zip_directory(path):
    """Return an open temporary file containing a zip archive of ``path``.

    Every regular file found by walking ``path`` is stored under its name
    relative to ``path``.  The archive lives in an anonymous temporary
    file, so concurrent requests never share or overwrite each other's
    archive and nothing is left behind in the served directory.  The file
    is returned positioned at offset 0; the caller must close it.
    """
    import tempfile

    archive = tempfile.TemporaryFile()
    try:
        bundle = zipfile.ZipFile(archive, 'w')
        try:
            for root, dirs, files in os.walk(path):
                for name in files:
                    full = os.path.join(root, name)
                    bundle.write(full, os.path.relpath(full, path))
        finally:
            bundle.close()
        # Make sure everything is on disk before the caller stats the file.
        archive.flush()
        archive.seek(0)
    except Exception:
        archive.close()
        raise
    return archive


def send_head(self):
    """Common code for GET and HEAD commands.

    This sends the response code and MIME headers.

    A request whose path ends in ``?download`` and resolves to a directory
    is answered with a zip archive of that directory tree.  Any other
    request is handled as a regular file / directory request.

    Return value is either a file object (which has to be copied
    to the outputfile by the caller unless the command was HEAD,
    and must be closed by the caller under all circumstances), or
    None, in which case the caller has nothing further to do.

    """
    # NOTE(review): the path is resolved while self.path may still carry
    # the '?download' suffix, as before; this presumably relies on
    # translate_path() ignoring the query string -- confirm.
    path = self.translate_path(self.path)
    f = None

    if self.path.endswith('?download') and os.path.isdir(path):
        # Strip only the trailing marker, not every occurrence of it.
        self.path = self.path[:-len('?download')]
        f = _zip_directory(path)
        ctype = "application/zip"
    else:
        if os.path.isdir(path):
            if not self.path.endswith('/'):
                # redirect browser - doing basically what apache does
                self.send_response(301)
                self.send_header("Location", self.path + "/")
                self.end_headers()
                return None
            for index in "index.html", "index.htm":
                index = os.path.join(path, index)
                if os.path.exists(index):
                    path = index
                    break
            else:
                return self.list_directory(path)
        ctype = self.guess_type(path)
        try:
            # Always read in binary mode. Opening files in text mode may
            # cause newline translations, making the actual size of the
            # content transmitted *less* than the content-length!
            f = open(path, 'rb')
        except IOError:
            self.send_error(404, "File not found")
            return None

    try:
        self.send_response(200)
        self.send_header("Content-type", ctype)
        fs = os.fstat(f.fileno())
        self.send_header("Content-Length", str(fs.st_size))
        self.send_header("Last-Modified", self.date_time_string(fs.st_mtime))
        self.end_headers()
    except Exception:
        # Do not leak the open file if the headers could not be sent.
        f.close()
        raise
    return f

def list_directory(self, path):
    """Helper to produce a directory listing (absent index.html).

    The page starts with a "Download Directory Tree as Zip" link that
    points at the current request path with ``?download`` appended,
    followed by one link per directory entry.

    Return value is either a file object, or None (indicating an
    error).  In either case, the headers are sent, making the
    interface the same as for send_head().

    """
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    import cgi
    import urllib

    try:
        entries = os.listdir(path)
    except os.error:
        self.send_error(404, "No permission to list directory")
        return None
    entries.sort(key=lambda a: a.lower())
    f = StringIO()
    displaypath = cgi.escape(urllib.unquote(self.path))
    # self.path comes straight from the request line, so escape it
    # (including double quotes) before placing it inside an attribute.
    download_href = cgi.escape(self.path + "?download", True)
    f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">')
    f.write("<html>\n<title>Directory listing for %s</title>\n" % displaypath)
    f.write("<body>\n<h2>Directory listing for %s</h2>\n" % displaypath)
    f.write('<a href="%s">%s</a>\n'
            % (download_href, 'Download Directory Tree as Zip'))
    f.write("<hr>\n<ul>\n")
    for name in entries:
        fullname = os.path.join(path, name)
        displayname = linkname = name
        # Append / for directories or @ for symbolic links
        if os.path.isdir(fullname):
            displayname = name + "/"
            linkname = name + "/"
        if os.path.islink(fullname):
            displayname = name + "@"
            # Note: a link to a directory displays with @ and links with /
        f.write('<li><a href="%s">%s</a>\n'
                % (urllib.quote(linkname), cgi.escape(displayname)))
    f.write("</ul>\n<hr>\n</body>\n</html>\n")
    length = f.tell()
    f.seek(0)
    self.send_response(200)
    encoding = sys.getfilesystemencoding()
    self.send_header("Content-type", "text/html; charset=%s" % encoding)
    self.send_header("Content-Length", str(length))
    self.end_headers()
    return f

# Install the customised implementations directly on the stock request
# handler class; Handler is the very same class object, not a subclass.
Handler = SimpleHTTPServer.SimpleHTTPRequestHandler
setattr(Handler, 'send_head', send_head)
setattr(Handler, 'list_directory', list_directory)

class ThreadedHTTPServer(ThreadingMixIn, HTTPServer):
    """HTTP server that handles each request in a separate thread."""

    # Run worker threads as daemons so that stopping the server with
    # Ctrl-C is not held up by downloads that are still in progress.
    daemon_threads = True

if __name__ == '__main__':
    # Listen on every interface on the configured port.
    server = ThreadedHTTPServer(('0.0.0.0', PORT), Handler)
    # The parenthesised single-argument form prints the same text whether
    # print is a statement (Python 2) or a function (Python 3).
    print('Starting server, use <Ctrl-C> to stop')
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the advertised way to stop; exit without a traceback.
        pass
    finally:
        # Release the listening socket.
        server.server_close()

score 5 · Accepted Answer

查看源代码，例如在线的 here。目前，如果您用一个指向目录的 URL 访问服务器，则会提供该目录下的 index.html 文件；如果缺少该文件，则会调用 list_directory 方法。大概，您是想用目录的内容（我想是递归地）制作一个 zip 文件，然后提供它？显然，没有办法通过单行更改来做到这一点，因为您要替换现在的第 68-80 行（在 send_head 方法中）加上整个 list_directory 方法（第 98-137 行）——这至少已经改动了 50 行；-)。

如果您可以接受更改几十行而不是一行，并且我描述的语义正是您想要的，那么您当然可以使用 ZipFile 类将所需的 zipfile 构建为一个 cStringIO.StringIO 对象，并在相关目录上使用 os.walk 来填充它（假设您还想递归地获取所有子目录）。但它绝对不会是单行的；-)。

score 4 · Accepted Answer

没有单行代码可以做到这一点。另外，“下载整个目录”是指打包成 tar 还是 zip？

无论如何，您可以按照以下步骤操作

从 SimpleHTTPRequestHandler 派生一个类，或者可能只是复制它的代码
更改 list_directory 方法以返回“下载整个文件夹”的链接
更改 copyfile 方法，以便为您的链接压缩整个目录并返回它
您可以缓存 zip，这样您就不会每次都压缩文件夹，而是查看是否有任何文件被修改

将是一个有趣的练习:)

score 2 · Accepted Answer

没有简单的方法。

另一种方法是使用下面的 python 脚本以递归方式下载整个文件夹。这适用于 Python 3。根据需要更改 URL。

import os
from pathlib import Path
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup

def get_links(content):
    """Yield the ``href`` value of every link in an HTML document.

    content: HTML markup, for example a directory-listing page.

    Anchors without an ``href`` attribute are skipped, so callers only
    ever receive strings.  The parser is named explicitly so the result
    does not depend on which optional parsers happen to be installed.
    """
    soup = BeautifulSoup(content, 'html.parser')
    for a in soup.find_all('a', href=True):
        yield a['href']

def download(url):
    """Recursively mirror ``url`` below the current working directory.

    url: address to fetch.  A URL whose path ends in ``/`` is treated as a
        folder: a matching local directory is created and every link on
        the returned page (except those starting with ``.``) is downloaded
        in turn.  Any other URL is saved as a file.

    The local path is the URL path without its leading slash.

    Raises Exception when the server does not answer with status 200.
    """
    url_path = urlparse(url).path
    path = url_path.lstrip('/')
    print(path)
    r = requests.get(url)
    if r.status_code != 200:
        raise Exception('status code is {} for {}'.format(r.status_code, url))
    # Test the URL path itself so that the site root ("/") is still
    # recognised as a folder after the leading slash has been stripped.
    if url_path.endswith('/'):
        Path(path.rstrip('/')).mkdir(parents=True, exist_ok=True)
        for link in get_links(r.text):
            # Ignore anchors without a target and skip hidden files such
            # as .DS_Store.
            if link and not link.startswith('.'):
                download(urljoin(url, link))
    else:
        # Write the raw bytes: going through the decoded text would
        # corrupt binary files.
        with open(path, 'wb') as f:
            f.write(r.content)


if __name__ == '__main__':
    # The trailing slash marks the URL as a folder to be mirrored.
    download('http://ed470d37.ngrok.io/a/bc/')

score 1 · Accepted Answer

我喜欢 @mononoke 的解决方案，但其中有几个问题，它们是：

以文本模式写入文件
有时href和text不同，特别是对于非ASCII路径
不逐块下载大文件

我试图解决这些问题：

import os
from pathlib import Path
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
import math

def get_links(content):
    """Yield ``(href, text)`` for every link in an HTML document.

    content: HTML markup, for example a directory-listing page.

    Anchors without an ``href`` attribute are skipped, so the first item
    of each pair is always a string.  The parser is named explicitly so
    the result does not depend on which optional parsers are installed.
    """
    soup = BeautifulSoup(content, 'html.parser')
    for a in soup.find_all('a', href=True):
        yield a['href'], a.get_text()

def download(url, path=None, overwrite=False):
    """Recursively download ``url`` to the local ``path``.

    url: address to fetch.  A URL ending in ``/`` is treated as a folder:
        the local directory is created and every link on the returned page
        (except those starting with ``.``) is downloaded into it, using
        the link text as the local name.  Any other URL is streamed to a
        file in 1 MiB chunks while a ``#`` progress bar is printed.
    path: local destination; defaults to the URL path without its leading
        slash.
    overwrite: when false, files that already exist locally are skipped.
        The setting also applies to everything below a folder.

    Raises Exception when the server does not answer with status 200.
    """
    if path is None:
        path = urlparse(url).path.lstrip('/')
    if url.endswith('/'):
        r = requests.get(url)
        if r.status_code != 200:
            raise Exception('status code is {} for {}'.format(r.status_code, url))
        Path(path.rstrip('/')).mkdir(parents=True, exist_ok=True)
        for link, name in get_links(r.text):
            # Ignore anchors without a target and skip hidden files such
            # as .DS_Store.
            if link and not link.startswith('.'):
                download(urljoin(url, link), os.path.join(path, name), overwrite)
        return

    if os.path.isfile(path):
        print("#existing", path)
        if not overwrite:
            return
    chunk_size = 1024*1024
    # The context manager releases the streamed connection when done.
    with requests.get(url, stream=True) as r:
        # Check the status before creating the file so that an error page
        # is never saved in place of the real content.
        if r.status_code != 200:
            raise Exception('status code is {} for {}'.format(r.status_code, url))
        # The header may be absent; total is then 0 and no progress bar
        # is drawn.  total is also 0 for an empty file.
        content_size = int(r.headers.get('content-length', 0))
        total = math.ceil(content_size / chunk_size)
        print("#", path)
        with open(path, 'wb') as f:
            c = 0
            st = 100  # width of a complete progress bar, in characters
            for chunk in r.iter_content(chunk_size=chunk_size):
                c += 1
                if chunk:
                    f.write(chunk)
                if total:
                    # Number of bar characters this chunk adds.
                    ap = int(c*st/total) - int((c-1)*st/total)
                    if ap > 0:
                        print("#" * ap, end="")
            # Blank out whatever part of the bar was drawn.
            drawn = int(c*st/total) if total else 0
            print("\r  ", " "*drawn, "\r", end="")
            
if __name__ == '__main__':
    # The trailing slash marks the URL as a folder; it is mirrored into
    # the local directory given as the second argument.
    download('http://ed470d37.ngrok.io/a/bc/', "/data/bc")

python - 在 Python SimpleHTTPServer 中下载整个目录

5 回答 5

Related

Reference