397

是否有用于工作线程的 Pool 类,类似于 multiprocessing 模块的 Pool 类?

例如,我喜欢这种并行化 map 函数的简单方法:

def long_running_func(p):
    """Call ``c_func_no_gil`` on ``p`` and return its result.

    The result is returned so that a pool's ``map`` over this function
    collects real values; without the ``return`` every mapped entry would
    be ``None``.
    """
    return c_func_no_gil(p)

# Create a pool of 4 worker processes.
p = multiprocessing.Pool(4)
# Apply long_running_func to each of 0..99; xs holds the per-item return values.
xs = p.map(long_running_func, range(100))

但是我想在没有创建新进程的开销的情况下做到这一点。

我知道 GIL。但是,在我的用例中,该函数将是一个 IO 密集型的 C 函数,其 Python 包装器会在实际函数调用之前释放 GIL。

我必须编写自己的线程池吗?

4

11 回答 11

511

我刚刚发现 multiprocessing 模块中实际上有一个基于线程的池接口,但是它有些隐藏,文档也不完善。

它可以通过以下方式导入

from multiprocessing.pool import ThreadPool

它是使用包装 Python 线程的虚拟 Process 类实现的。这个基于线程的 Process 类位于 multiprocessing.dummy 中,文档中对它只有简要提及。这个 dummy 模块据称基于线程提供了完整的 multiprocessing 接口。

于 2010-08-02T09:52:28.747 回答
261

在 Python 3 中,您可以使用concurrent.futures.ThreadPoolExecutor,即:

# Create an executor backed by at most 10 worker threads.
executor = ThreadPoolExecutor(max_workers=10)
# submit() schedules my_function to run and returns a Future for its result.
a = executor.submit(my_function)

有关更多信息和示例,请参阅文档

于 2012-07-17T19:42:40.387 回答
71

是的,它似乎(或多或少)具有相同的 API。

import multiprocessing

def worker(lnk):
    ....    
def start_process():
    .....
....

# Pick a process-backed or a thread-backed pool; both are built with the same
# keyword arguments and then used through the same map() call.
# NOTE(review): `multiprocessing.pool` is a submodule; confirm it is imported
# (e.g. `import multiprocessing.pool`) before the attribute access below.
if(PROCESS):
    pool = multiprocessing.Pool(processes=POOL_SIZE, initializer=start_process)
else:
    pool = multiprocessing.pool.ThreadPool(processes=POOL_SIZE, 
                                           initializer=start_process)

# Apply worker to every element of inputs using whichever pool was chosen.
pool.map(worker, inputs)
....
于 2012-11-30T19:42:33.810 回答
47

对于一些非常简单和轻量级的东西(从这里稍微修改):

from Queue import Queue
from threading import Thread


class Worker(Thread):
    """Thread executing tasks from a given tasks queue.

    Every queue item must be a ``(func, args, kargs)`` tuple. The thread is
    marked as a daemon and starts itself as soon as it is constructed.
    """

    def __init__(self, tasks):
        """Store the shared queue ``tasks`` and start the thread."""
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        """Take tasks off the queue and execute them, forever."""
        while True:
            func, args, kargs = self.tasks.get()
            try:
                func(*args, **kargs)
            except Exception as e:
                # Best effort: report the failure and keep the worker alive.
                # `except ... as` and the parenthesised single-argument print
                # are valid on both Python 2.6+ and Python 3.
                print(e)
            finally:
                # Always balance get() so Queue.join() can return.
                self.tasks.task_done()


class ThreadPool:
    """A group of worker threads that all pull work from one shared queue."""

    def __init__(self, num_threads):
        """Create the task queue and spawn ``num_threads`` workers on it."""
        # The queue's maximum size equals the number of workers.
        task_queue = Queue(num_threads)
        self.tasks = task_queue
        for _unused in range(num_threads):
            # Each Worker starts itself; no reference to it is kept.
            Worker(task_queue)

    def add_task(self, func, *args, **kargs):
        """Enqueue ``func`` together with its positional and keyword arguments."""
        task = (func, args, kargs)
        self.tasks.put(task)

    def wait_completion(self):
        """Block until every queued task has been marked as done."""
        self.tasks.join()

if __name__ == '__main__':
    # Demo: run 100 randomly long sleeps on a pool of 20 threads.
    from random import randrange
    from time import sleep

    delays = [randrange(1, 10) for i in range(100)]

    def wait_delay(d):
        """Print a notice and then sleep for ``d`` seconds."""
        # Parenthesised single-argument print behaves the same as a Python 2
        # print statement and is also valid Python 3.
        print('sleeping for (%d)sec' % d)
        sleep(d)

    pool = ThreadPool(20)

    for d in delays:
        pool.add_task(wait_delay, d)

    # Block until every queued sleep has finished.
    pool.wait_completion()

要支持任务完成时的回调,您只需将回调添加到任务元组即可。

于 2011-08-31T13:23:19.453 回答
26

嗨,要在 Python 中使用线程池,您可以使用这个库:

from multiprocessing.dummy import Pool as ThreadPool

然后,这个库的用法如下:

# Create a pool with `threads` workers.
pool = ThreadPool(threads)
# Apply `service` to every item of `tasks` and collect the return values.
results = pool.map(service, tasks)
# Stop accepting new work, then wait for the workers to exit.
pool.close()
pool.join()
# NOTE(review): `return` implies this fragment lives inside a function that is
# not shown here.
return results

threads 是您想要的线程数,tasks 是要映射到 service 函数的任务列表。

于 2017-07-28T04:58:38.653 回答
12

这是我最终使用的结果。这是上面 dgorissen 的类的修改版本。

文件:threadpool.py

from queue import Queue, Empty
import threading
from threading import Thread


class Worker(Thread):
    """Thread executing tasks from a given tasks queue.

    Each queue item must be a ``(func, args, kwargs)`` tuple. The thread can
    be asked to exit through :meth:`signal_exit`.
    """

    # Seconds to block on the queue before re-checking the exit signal.
    _TIMEOUT = 2

    def __init__(self, tasks, th_num):
        """Store the shared queue and thread number, then start the thread."""
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon, self.th_num = True, th_num
        self.done = threading.Event()
        self.start()

    def run(self):
        """Execute queued tasks until the exit event is set."""
        while not self.done.is_set():
            try:
                func, args, kwargs = self.tasks.get(block=True,
                                                    timeout=self._TIMEOUT)
            except Empty:
                # Nothing arrived within the timeout; loop back to re-check
                # whether an exit was requested.
                continue
            try:
                func(*args, **kwargs)
            except Exception as e:
                # Best effort: report the failure and keep the worker alive.
                print(e)
            finally:
                # Always balance get() so Queue.join() can return.
                self.tasks.task_done()

    def signal_exit(self):
        """Signal to thread to exit."""
        self.done.set()


class ThreadPool:
    """Pool of threads consuming tasks from a queue."""

    def __init__(self, num_threads, tasks=()):
        """Start ``num_threads`` workers and enqueue any initial ``tasks``.

        ``tasks`` is an iterable of ``(func, args, kwargs)`` tuples. The
        default is an immutable empty tuple rather than a shared mutable
        list.
        """
        self.tasks = Queue(num_threads)
        self.workers = []
        self.done = False
        # Workers are started before the initial tasks are queued, so the
        # bounded queue is already being drained while it is filled.
        self._init_workers(num_threads)
        for task in tasks:
            self.tasks.put(task)

    def _init_workers(self, num_threads):
        """Create ``num_threads`` workers, numbered from 0, on the shared queue."""
        for i in range(num_threads):
            self.workers.append(Worker(self.tasks, i))

    def add_task(self, func, *args, **kwargs):
        """Add a task to the queue."""
        self.tasks.put((func, args, kwargs))

    def _close_all_threads(self):
        """Signal all threads to exit and lose the references to them."""
        for workr in self.workers:
            workr.signal_exit()
        self.workers = []

    def wait_completion(self):
        """Wait for completion of all the tasks in the queue."""
        self.tasks.join()

    def __del__(self):
        """Ask the workers to exit when the pool is garbage-collected."""
        self._close_all_threads()


def create_task(func, *args, **kwargs):
    """Bundle a callable and its arguments into a ``(func, args, kwargs)`` tuple."""
    task = (func, args, kwargs)
    return task

使用线程池

from random import randrange
from time import sleep

# Thirty random delays; randrange(1, 10) yields whole numbers from 1 to 9.
delays = [randrange(1, 10) for i in range(30)]

def wait_delay(d):
    """Print a notice and then sleep for ``d`` seconds."""
    print('sleeping for (%d)sec' % d)
    sleep(d)

# Start a pool with 20 worker threads.
pool = ThreadPool(20)
# Queue one wait_delay task per delay value (the index `i` is unused).
for i, d in enumerate(delays):
    pool.add_task(wait_delay, d)
# Block until every queued task has been marked as done.
pool.wait_completion()
于 2018-05-10T05:04:03.497 回答
7

是的,有一个类似于多处理池的线程池,但是它有些隐藏并且没有正确记录。您可以通过以下方式导入它:-

from multiprocessing.pool import ThreadPool

下面给你看一个简单的例子:

def test_multithread_stringio_read_csv(self):
    """Parse identical in-memory CSV buffers on 8 threads and compare results.

    Every parsed frame is checked against the first one with
    ``tm.assert_frame_equal``.
    """
    # see gh-11786
    max_row_range = 10000
    num_files = 100

    # Every buffer holds the same content, so encode the payload only once.
    payload = '\n'.join(
        '%d,%d,%d' % (i, i, i) for i in range(max_row_range)
    ).encode()
    files = [BytesIO(payload) for _ in range(num_files)]

    # read all files in many threads
    pool = ThreadPool(8)
    try:
        results = pool.map(self.read_csv, files)
    finally:
        # Release the worker threads even when a read fails.
        pool.close()
        pool.join()
    first_result = results[0]

    for result in results:
        tm.assert_frame_equal(first_result, result)
于 2020-10-15T14:33:39.000 回答
5

另一种方法是将任务添加到线程池的队列中

import concurrent.futures
# Leaving the `with` block shuts the executor down after submitted work ends.
with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
    for i in range(10):
        # NOTE(review): `arg1, arg2,....` is placeholder pseudo-code; the first
        # argument to submit() must be the callable to run.
        a = executor.submit(arg1, arg2,....)
于 2020-06-15T20:28:24.843 回答
3

没有内置的基于线程的池。但是,使用 Queue 类可以非常快地实现一个生产者/消费者队列。

来自: https ://docs.python.org/2/library/queue.html

from threading import Thread
from Queue import Queue
def worker():
    """Process items from the module-level queue ``q`` forever."""
    while True:
        item = q.get()
        try:
            do_work(item)
        finally:
            # Always balance get(): if do_work() raises, the item is still
            # marked done so q.join() cannot wait on it forever.
            q.task_done()

# Queue shared with worker() through the module-level name `q`.
q = Queue()
# Start the worker threads.
for i in range(num_worker_threads):
     t = Thread(target=worker)
     # Daemon threads do not keep the interpreter alive at exit.
     t.daemon = True
     t.start()

# Feed every item produced by source() to the workers.
for item in source():
    q.put(item)

q.join()       # block until all tasks are done
于 2010-06-13T21:30:35.630 回答
3

创建新进程的开销很小,尤其是当它只有 4 个时。我不认为这会是您的应用程序的性能热点。保持简单,只在确有必要、且性能分析结果指明的地方进行优化。

于 2010-06-13T22:24:45.890 回答
0

如果你不介意执行其他人的代码,这里是我的:

注意:您可能需要删除许多额外的代码 [添加以更好地说明和演示其工作原理]

注意:方法名和变量名采用 Python 命名约定(snake_case),而不是 camelCase。

工作流程:

  1. MultiThread 类会按指定的线程数量创建线程实例,并让它们共享锁、工作队列、退出标志和结果列表。
  2. 一旦创建了所有实例,MultiThread 将启动 SingleThread。
  3. 我们可以使用 MultiThread 添加工作(它会处理加锁)。
  4. SingleThreads 将使用中间的锁来处理工作队列。
  5. 完成工作后,您可以使用共享布尔值销毁所有线程。
  6. 在这里,工作可以是任何东西。它可以使用给定的参数自动导入(取消注释导入行)和处理模块。
  7. 结果将被添加到 results 中,我们可以使用 get_results 获取。

代码:

import threading
import queue


class SingleThread(threading.Thread):
    """Worker thread that polls a shared queue and runs the work items it finds.

    A work item must expose ``module``, ``operation``, ``args`` and
    ``kwargs`` attributes. The attribute named ``operation`` is looked up on
    ``module`` and called with ``*args, **kwargs``; the stringified return
    value is appended to the shared ``results`` list.
    """

    # Seconds to sleep when the queue is empty, so idle workers do not spin
    # at full speed while contending for the shared lock.
    _IDLE_SLEEP = 0.01

    def __init__(self, name, work_queue, lock, exit_flag, results):
        """Store the shared state; the thread is not started here.

        Args:
            name: Thread name, also used in progress messages.
            work_queue: Queue holding the pending work items.
            lock: Lock guarding access to ``work_queue``.
            exit_flag: Object whose truthiness tells the thread to stop.
            results: List that receives each stringified result.
        """
        threading.Thread.__init__(self)
        self.name = name
        self.work_queue = work_queue
        self.lock = lock
        self.exit_flag = exit_flag
        self.results = results

    def run(self):
        """Poll the queue and process work until ``exit_flag`` is truthy."""
        # Function-scope import: `time` is not among this file's imports.
        import time

        while not self.exit_flag:
            # `with` releases the lock even if the queue access raises.
            with self.lock:
                has_work = not self.work_queue.empty()
                if has_work:
                    work = self.work_queue.get()
            if not has_work:
                time.sleep(self._IDLE_SLEEP)
                continue
            self._run_work(work)

    def _run_work(self, work):
        """Run one work item, reporting a failure instead of dying on it."""
        module, operation, args, kwargs = work.module, work.operation, work.args, work.kwargs
        print("Processing : " + operation + " with parameters " + str(args) + " and " + str(kwargs) + " by " + self.name + "\n")
        try:
            result = str(getattr(module, operation)(*args, **kwargs))
        except Exception as exc:
            # A bad operation name or bad arguments must not kill the worker;
            # report the failure and go on to the next item.
            print("Error : " + repr(exc) + " for operation " + operation + " and input " + str(args) + " " + str(kwargs))
            return
        print("Result : " + result + " for operation " + operation + " and input " + str(args) + " " + str(kwargs))
        self.results.append(result)

class MultiThread:
    """Owns a group of ``SingleThread`` workers and the state they share."""

    def __init__(self, no_of_threads):
        """Create the shared state and start ``no_of_threads`` workers."""
        self.exit_flag = bool_instance()
        self.queue_lock = threading.Lock()
        self.threads = []
        self.work_queue = queue.Queue()
        self.results = []
        for index in range(no_of_threads):
            # Workers are named "Thread1", "Thread2", ...
            thread = SingleThread("Thread" + str(index + 1), self.work_queue, self.queue_lock, self.exit_flag, self.results)
            thread.start()
            self.threads.append(thread)

    def add_work(self, work):
        """Queue ``work`` for processing by one of the worker threads."""
        with self.queue_lock:
            # Use the public put(): the private _put() skips the queue's own
            # locking, waiter notification and unfinished-task accounting.
            self.work_queue.put(work)

    def destroy(self):
        """Raise the shared exit flag and wait for every worker to finish."""
        self.exit_flag.value = True
        for thread in self.threads:
            thread.join()

    def get_results(self):
        """Return the shared list of results collected so far."""
        return self.results


class Work:
    """One unit of work: an operation name to call on a module or object."""

    def __init__(self, module, operation, args, kwargs=None):
        """Store the call description.

        Args:
            module: Object on which ``operation`` is looked up.
            operation: Name of the attribute to call.
            args: Positional arguments for the call.
            kwargs: Keyword arguments for the call. When omitted, each
                instance gets its own empty dict instead of sharing one
                mutable default.
        """
        self.module = module
        self.operation = operation
        self.args = args
        self.kwargs = {} if kwargs is None else kwargs


class SimpleOperations:
    """Small arithmetic helpers used as example work targets."""

    def sum(self, *args):
        """Return the total of all arguments after converting each to ``int``."""
        total = 0
        for arg in args:
            total += int(arg)
        return total

    @staticmethod
    def mul(a, b, c=0):
        """Return ``int(a) * int(b) + int(c)``."""
        product = int(a) * int(b)
        return product + int(c)


class bool_instance:
    """Mutable holder for a single boolean, shareable between objects.

    Only the ``value`` attribute may be assigned, and only with a real
    ``bool``; the instance's truthiness is that value.
    """

    def __init__(self, value=False):
        """Initialise the holder with ``value``."""
        self.value = value

    def __setattr__(self, key, value):
        """Accept only ``value = True/False``; raise AttributeError otherwise."""
        if key != "value":
            raise AttributeError("Only value can be set!")
        if isinstance(value, bool):
            # Write to the instance dict directly so this hook is not re-entered.
            vars(self)[key] = value
            return
        raise AttributeError("Only True/False can be set!")

    def __bool__(self):
        """Return the stored boolean."""
        return self.value

if __name__ == "__main__":
    # Interactive demo: each input line is "<operation> <arg> <arg> ...";
    # the line "break" (or end of input) stops the workers and prints results.
    multi_thread = MultiThread(5)
    multi_thread.add_work(Work(SimpleOperations(), "mul", [2, 3], {"c":4}))
    while True:
        try:
            data_input = input()
        except EOFError:
            # End of input: fall through to destroy() so the non-daemon
            # worker threads are told to exit.
            break
        if data_input == "break":
            break
        work = data_input.split()
        if not work:
            # Blank or whitespace-only line: there is no operation name.
            continue
        multi_thread.add_work(Work(SimpleOperations(), work[0], work[1:], {}))
    multi_thread.destroy()
    print(multi_thread.get_results())
于 2020-12-03T13:19:48.010 回答