全部。
在我的 Slurm 集群中,当 srun 或 sbatch 作业请求多个节点的资源时,将无法正确提交。
这个 Slurm 集群有 4 个节点,每个节点有 4 个 GPU。
我可以同时使用 4 个 GPU 执行多个作业。
但我无法运行请求 5 个或更多 GPU 的作业。
下面的 sinfo 输出会显示 cise3 的状态为 down,这是另一个问题。
错误信息:
sbatch: error: Batch job submission failed: Requested node configuration is not available
start.sh:
#!/bin/bash
# Slurm batch script: activates a Python virtualenv and runs code20.py.
# -o names the job's output file; --partition selects the cup-hpc partition.
#SBATCH -o code20.out
#SBATCH --partition=cup-hpc
# Ask for 3 nodes with one task per node and 40 CPUs per task.
#SBATCH --nodes=3
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=40
# NOTE(review): per the sbatch documentation, --gres counts are per node, not
# per job. The nodes in slurm.conf advertise Gres=gpu:rtx4000:4, so gpu:5
# asks for more GPUs than any single node has -- presumably the cause of the
# "Requested node configuration is not available" error. TODO confirm.
#SBATCH --gres=gpu:5
#SBATCH --mem-per-cpu=100mb
# Activate the TensorFlow virtualenv before launching the program.
source /home/slurm/tensorflow_prj/tf_gpu_cluster/bin/activate
# NOTE(review): the program is started directly rather than through srun, so
# it presumably runs only once, on the first allocated node -- verify.
python3 /nfs/code/code20.py
slurm.conf:
NodeName=cise1 NodeAddr=10.18.19.191 CPUs=40 RealMemory=94887 Sockets=2 CoresPerSocket=10 ThreadsPerCore=2 State=UNKNOWN Gres=gpu:rtx4000:4
NodeName=cise2 NodeAddr=10.18.19.107 CPUs=40 RealMemory=94889 Sockets=2 CoresPerSocket=10 ThreadsPerCore=2 State=UNKNOWN Gres=gpu:rtx4000:4
NodeName=cise3 NodeAddr=10.18.19.47 CPUs=40 RealMemory=94889 Sockets=2 CoresPerSocket=10 ThreadsPerCore=2 State=UNKNOWN Gres=gpu:rtx4000:4
NodeName=cise4 NodeAddr=10.18.19.183 CPUs=40 RealMemory=94889 Sockets=2 CoresPerSocket=10 ThreadsPerCore=2 State=UNKNOWN Gres=gpu:rtx4000:4
PartitionName=cup-hpc Nodes=cise[1-4] Default=YES MaxTime=INFINITE State=UP
gres.conf:
# Configure support for four GPUs (with MPS), plus bandwidth
AutoDetect=nvml
Name=gpu File=/dev/nvidia0
Name=gpu File=/dev/nvidia1
Name=gpu File=/dev/nvidia2
Name=gpu File=/dev/nvidia3
sinfo 输出:
[root@localhost nfs]# sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
cup-hpc* up infinite 1 down* cise3
cup-hpc* up infinite 3 idle cise[1-2,4]
scontrol show nodes 输出:
[root@localhost nfs]# scontrol show nodes
NodeName=cise1 Arch=x86_64 CoresPerSocket=10
CPUAlloc=0 CPUTot=40 CPULoad=0.01
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:rtx4000:4
NodeAddr=10.18.19.191 NodeHostName=cise1 Version=20.02.1
OS=Linux 4.18.0-80.el8.x86_64 #1 SMP Tue Jun 4 09:19:46 UTC 2019
RealMemory=94887 AllocMem=0 FreeMem=83727 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=cup-hpc
BootTime=2020-04-13T08:34:13 SlurmdStartTime=2020-04-17T14:49:20
CfgTRES=cpu=40,mem=94887M,billing=40
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
NodeName=cise2 Arch=x86_64 CoresPerSocket=10
CPUAlloc=0 CPUTot=40 CPULoad=0.00
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:rtx4000:4
NodeAddr=10.18.19.107 NodeHostName=cise2 Version=20.02.1
OS=Linux 4.18.0-80.el8.x86_64 #1 SMP Tue Jun 4 09:19:46 UTC 2019
RealMemory=94889 AllocMem=0 FreeMem=83405 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=cup-hpc
BootTime=2020-04-13T08:33:51 SlurmdStartTime=2020-04-17T14:49:33
CfgTRES=cpu=40,mem=94889M,billing=40
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
NodeName=cise3 Arch=x86_64 CoresPerSocket=10
CPUAlloc=0 CPUTot=40 CPULoad=0.00
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:rtx4000:4
NodeAddr=10.18.19.47 NodeHostName=cise3 Version=20.02.1
OS=Linux 4.18.0-80.el8.x86_64 #1 SMP Tue Jun 4 09:19:46 UTC 2019
RealMemory=94889 AllocMem=0 FreeMem=83456 Sockets=2 Boards=1
State=DOWN* ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=cup-hpc
BootTime=2020-04-13T08:31:48 SlurmdStartTime=2020-04-17T15:10:16
CfgTRES=cpu=40,mem=94889M,billing=40
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
Reason=Not responding [slurm@2020-04-17T15:17:58]
NodeName=cise4 Arch=x86_64 CoresPerSocket=10
CPUAlloc=0 CPUTot=40 CPULoad=0.00
AvailableFeatures=(null)
ActiveFeatures=(null)
Gres=gpu:rtx4000:4
NodeAddr=10.18.19.183 NodeHostName=cise4 Version=20.02.1
OS=Linux 4.18.0-80.el8.x86_64 #1 SMP Tue Jun 4 09:19:46 UTC 2019
RealMemory=94889 AllocMem=0 FreeMem=83432 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=cup-hpc
BootTime=2020-04-13T08:36:40 SlurmdStartTime=2020-04-17T14:49:23
CfgTRES=cpu=40,mem=94889M,billing=40
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s