我在 SLURM 集群上预留了一些节点,并希望在这些节点上运行一个 Python 脚本。其中一个节点作为服务器:脚本在该节点上把作业放入队列,并将这些作业分派给各个客户端。大多数时候这都能正常工作,但脚本偶尔会卡住不动。此时按下 Ctrl+C,可以看到有一个(有时是多个)节点似乎卡在了 <Finalize object, dead> 中:
^Csrun: interrupt (one more within 1 sec to abort)
srun: task 30: running
srun: tasks 0-29,31-39: exited
^Csrun: sending Ctrl-C to job 1075185.14
Exception KeyboardInterrupt: KeyboardInterrupt() in <Finalize object, dead> ignored
srun: Job step aborted: Waiting up to 2 seconds for job step to finish.
slurmd[cluster-112]: *** STEP 1075185.14 KILLED AT 2014-04-03T09:11:23 WITH SIGNAL 9 ***
我不知道这是什么原因造成的。从输出来看,这似乎与垃圾回收器有关。
这是我运行的脚本:
#!/usr/bin/env python
import errno
import multiprocessing.managers
import os
import Queue
import socket
import subprocess
import sys
import time
class QueueManager(multiprocessing.managers.SyncManager):
    """Manager type used by both the server and the clients of this script.

    It adds nothing to ``SyncManager``; it exists so that the ``get_queue``
    and ``get_barrier`` registrations made in ``run_server`` and
    ``run_client`` are attached to this subclass.
    """
def worker(i, my_slurm_proc_id):
    """Process one job: print a greeting, then sleep for 0.1 seconds.

    Args:
        i: The job payload taken from the queue (formatted with ``%i``).
        my_slurm_proc_id: Integer id of the process running the job.

    Returns:
        None.
    """
    # Single-argument parenthesised form prints the same text whether
    # ``print`` is a statement (Python 2) or a function.
    print('hello %i (proc=%i)' % (i, my_slurm_proc_id))
    # Requires the module-level ``import time``.
    time.sleep(0.1)
def run_server(first_slurm_node, N_procs):
    """Serve a job queue and a semaphore to the clients, then wait for them.

    The queue is filled with the integers 0..4999 before the manager is
    started, and the manager listens on port 50000 of ``first_slurm_node``.

    Args:
        first_slurm_node: Host name or address the manager binds to.
        N_procs: Total number of processes, this server included; the
            semaphore is created with ``N_procs - 1`` permits.

    Returns:
        None, once every queued job has been marked done.
    """
    queue = Queue.Queue()
    barrier = multiprocessing.BoundedSemaphore(N_procs - 1)
    QueueManager.register('get_queue', callable=lambda: queue)
    QueueManager.register('get_barrier', callable=lambda: barrier)
    for i in range(5000):
        queue.put(i)

    m = QueueManager(address=(first_slurm_node, 50000), authkey='abracadabra')
    m.start()
    try:
        # NOTE(review): this loop runs right after start(); whether the
        # clients have already taken their permits by then depends on
        # timing -- confirm the intended handshake.
        for _ in range(N_procs - 1):
            barrier.acquire(True)
        # Join through the manager's proxy, not the local ``queue`` object:
        # start() serves the queue from a separate manager process, so
        # (presumably) the clients' task_done() calls never reach the local
        # object and a plain ``queue.join()`` would not return.
        m.get_queue().join()
    finally:
        # Stop the manager process explicitly instead of leaving it to the
        # interpreter's exit-time finalizers, including when the wait above
        # is interrupted.
        m.shutdown()
def run_client(my_slurm_proc_id, first_slurm_node):
    """Connect to the server's manager and process jobs until none remain.

    Args:
        my_slurm_proc_id: Integer id of this process, passed to ``worker``.
        first_slurm_node: Host name or address of the manager (port 50000).

    Returns:
        None, once ``get_nowait()`` reports the queue as empty.
    """
    QueueManager.register('get_queue')
    QueueManager.register('get_barrier')
    m = QueueManager(address=(first_slurm_node, 50000), authkey='abracadabra')
    m.connect()

    barrier = m.get_barrier()
    barrier.acquire(True)
    try:
        queue = m.get_queue()
        # get_nowait() already signals an empty queue via Queue.Empty, so a
        # separate empty() check would only add a second remote call per
        # job (and its answer could be stale by the time get_nowait() runs).
        while True:
            try:
                data = queue.get_nowait()
            except Queue.Empty:
                break
            worker(data, my_slurm_proc_id)
            queue.task_done()
        # Drop the queue proxy before releasing the permit (order kept from
        # the original; presumably so the proxy is cleaned up while the
        # manager is still running -- verify).
        queue = None
    finally:
        # Always hand the permit back, even if a job raised; otherwise the
        # server would wait on this client's permit indefinitely.
        barrier.release()
    barrier = None
def main():
    """Read the SLURM environment and run as server (proc 0) or client.

    The node list from ``SLURM_JOB_NODELIST`` is passed to
    ``scontrol show hostname``; its output is split on newlines and the
    first entry is used as the server host.
    """
    hostname_cmd = ['scontrol', 'show', 'hostname',
                    os.environ['SLURM_JOB_NODELIST']]
    hostnames = subprocess.check_output(hostname_cmd).split('\n')
    master_node = hostnames[0]

    rank = int(os.environ['SLURM_PROCID'])
    N_procs = int(os.environ['SLURM_NPROCS'])

    # Process 0 hosts the queue; every other process is a client of it.
    if rank != 0:
        run_client(rank, master_node)
    else:
        run_server(master_node, N_procs)


if __name__ == '__main__':
    main()