
I have reserved some nodes on a SLURM cluster and want to run a Python script on them. On one node (the server), the script should fill a queue with jobs and dispatch those jobs to the clients. Most of the time this works fine, but occasionally the script hangs. Pressing Ctrl+C then reveals that in such cases one (or sometimes more) nodes appear to be stuck in <Finalize object, dead>:

^Csrun: interrupt (one more within 1 sec to abort)
srun: task 30: running
srun: tasks 0-29,31-39: exited
^Csrun: sending Ctrl-C to job 1075185.14
Exception KeyboardInterrupt: KeyboardInterrupt() in <Finalize object, dead> ignored
srun: Job step aborted: Waiting up to 2 seconds for job step to finish.
slurmd[cluster-112]: *** STEP 1075185.14 KILLED AT 2014-04-03T09:11:23 WITH SIGNAL 9 ***

I don't know what the cause could be. It looks like it might be something related to the garbage collector (see the note after the script below).

This is the script I am running:

#!/usr/bin/env python

import os
import multiprocessing.managers
import Queue
import sys
import subprocess
import socket
import errno
import time

class QueueManager(multiprocessing.managers.SyncManager):
    pass

def worker(i, my_slurm_proc_id):
    print 'hello %i (proc=%i)' % (i, my_slurm_proc_id)
    time.sleep(0.1)
    pass

def run_server(first_slurm_node, N_procs):
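    # Runs on the first SLURM node: fill the queue with 5000 jobs, expose the queue
    # and the semaphore through the manager, then wait for the clients to drain it.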
    queue = Queue.Queue()
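    # BoundedSemaphore(N_procs - 1) is used as a simple barrier: each client holds
    # one slot while it is working and releases it when it is done.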
    barrier = multiprocessing.BoundedSemaphore(N_procs-1)

    QueueManager.register('get_queue', callable=lambda: queue)
    QueueManager.register('get_barrier', callable=lambda: barrier)

    for i in range(5000):
        queue.put(i)

    m = QueueManager(address=(first_slurm_node, 50000), authkey='abracadabra')

    m.start()
    for i in range(N_procs-1):
        barrier.acquire(True)
    m.get_queue().join() # somehow just 'queue.join()' doesn't work here

def run_client(my_slurm_proc_id, first_slurm_node):
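    # Runs on every other SLURM rank: connect to the manager on the first node and
    # pull jobs from the shared queue until it is empty.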
    QueueManager.register('get_queue')
    QueueManager.register('get_barrier')

    m = QueueManager(address=(first_slurm_node, 50000), authkey='abracadabra')
    m.connect()

    barrier = m.get_barrier()
    barrier.acquire(True)

    queue = m.get_queue()

    while not queue.empty():
        try:
            data = queue.get_nowait()
        except Queue.Empty:
            break

        worker(data, my_slurm_proc_id)
        queue.task_done()

    queue = None
    barrier.release()
    barrier = None

def main():
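    # SLURM rank 0 acts as the server; every other rank acts as a client.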
    slurm_job_nodelist = subprocess.check_output('scontrol show hostname'.split(' ') + [os.environ['SLURM_JOB_NODELIST']]).split('\n')
    master_node = slurm_job_nodelist[0]
    my_slurm_proc_id = int(os.environ['SLURM_PROCID'])
    N_procs = int(os.environ['SLURM_NPROCS'])

    if my_slurm_proc_id == 0:
        run_server(master_node, N_procs)
    else:
        run_client(my_slurm_proc_id, master_node)

if __name__ == '__main__':
    main()
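
Note: the <Finalize object, dead> in the output above is the repr of one of multiprocessing's internal Finalize callbacks; the KeyboardInterrupt was apparently delivered while such a cleanup callback was running. As a minimal sketch of how run_server could tear the manager down explicitly instead of leaving that to a finalizer at interpreter exit (the try/finally and the shutdown() call are illustrative additions, not part of the script above):

def run_server(first_slurm_node, N_procs):
    queue = Queue.Queue()
    barrier = multiprocessing.BoundedSemaphore(N_procs - 1)

    QueueManager.register('get_queue', callable=lambda: queue)
    QueueManager.register('get_barrier', callable=lambda: barrier)

    for i in range(5000):
        queue.put(i)

    m = QueueManager(address=(first_slurm_node, 50000), authkey='abracadabra')
    m.start()
    try:
        for i in range(N_procs - 1):
            barrier.acquire(True)
        m.get_queue().join()
    finally:
        m.shutdown()  # stop the manager process explicitly rather than via its finalizer

Whether an explicit shutdown() avoids the occasional hang is untested here; it only removes the reliance on garbage-collection-time cleanup that the message hints at.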
