我正在按照链接 https://kubernetes.dask.org/en/latest/ 中的说明,在 Kubernetes 集群上运行 dask 数组。运行示例代码时,worker pod 显示错误状态,如下所示:
步骤:
在 3 个节点(1 个主节点和 2 个工作节点)上安装了 Kubernetes。
pip install dask-kubernetes
dask_example.py 带有运行 dask 数组的代码(与链接上给出的示例相同)
带有 pod 配置的 worker-spec.yml 文件(与链接上给出的示例相同)
(base) [root@k8s-master example]# ls
dask_example.py worker-spec.yml
(base) [root@k8s-master example]# nohup python dask_example.py &
[1] 3660
(base) [root@k8s-master example]# cat nohup.out
distributed.scheduler - INFO - Clear task state
distributed.scheduler - INFO - Scheduler at: tcp://172.16.0.76:40119
distributed.scheduler - INFO - Receive client connection: Client-df4caa18-0bc8-11ea-8e4c-12bd5ffa93ff
distributed.core - INFO - Starting established connection
(base) [root@k8s-master example]# kubectl get pods -o wide --all-namespaces
NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
default workerpod 1/1 Running 0 70s 10.32.0.2 worker-node1 <none> <none>
kube-system coredns-5644d7b6d9-l4jsd 1/1 Running 0 8m19s 10.32.0.4 k8s-master <none> <none>
kube-system coredns-5644d7b6d9-q679h 1/1 Running 0 8m19s 10.32.0.3 k8s-master <none> <none>
kube-system etcd-k8s-master 1/1 Running 0 7m16s 172.16.0.76 k8s-master <none> <none>
kube-system kube-apiserver-k8s-master 1/1 Running 0 7m1s 172.16.0.76 k8s-master <none> <none>
kube-system kube-controller-manager-k8s-master 1/1 Running 0 7m27s 172.16.0.76 k8s-master <none> <none>
kube-system kube-proxy-ctgj8 1/1 Running 0 5m7s 172.16.0.114 worker-node2 <none> <none>
kube-system kube-proxy-f78bm 1/1 Running 0 8m18s 172.16.0.76 k8s-master <none> <none>
kube-system kube-proxy-ksk59 1/1 Running 0 5m15s 172.16.0.31 worker-node1 <none> <none>
kube-system kube-scheduler-k8s-master 1/1 Running 0 7m2s 172.16.0.76 k8s-master <none> <none>
kube-system weave-net-q2zwn 2/2 Running 0 6m22s 172.16.0.76 k8s-master <none> <none>
kube-system weave-net-r9tzs 2/2 Running 0 5m15s 172.16.0.31 worker-node1 <none> <none>
kube-system weave-net-tm8xx 2/2 Running 0 5m7s 172.16.0.114 worker-node2 <none> <none>
(base) [root@k8s-master example]# kubectl get pods -o wide --all-namespaces
NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
default workerpod 0/1 Error 0 4m23s 10.32.0.2 worker-node1 <none> <none>
kube-system coredns-5644d7b6d9-l4jsd 1/1 Running 0 11m 10.32.0.4 k8s-master <none> <none>
kube-system coredns-5644d7b6d9-q679h 1/1 Running 0 11m 10.32.0.3 k8s-master <none> <none>
kube-system etcd-k8s-master 1/1 Running 0 10m 172.16.0.76 k8s-master <none> <none>
kube-system kube-apiserver-k8s-master 1/1 Running 0 10m 172.16.0.76 k8s-master <none> <none>
kube-system kube-controller-manager-k8s-master 1/1 Running 0 10m 172.16.0.76 k8s-master <none> <none>
kube-system kube-proxy-ctgj8 1/1 Running 0 8m20s 172.16.0.114 worker-node2 <none> <none>
kube-system kube-proxy-f78bm 1/1 Running 0 11m 172.16.0.76 k8s-master <none> <none>
kube-system kube-proxy-ksk59 1/1 Running 0 8m28s 172.16.0.31 worker-node1 <none> <none>
kube-system kube-scheduler-k8s-master 1/1 Running 0 10m 172.16.0.76 k8s-master <none> <none>
kube-system weave-net-q2zwn 2/2 Running 0 9m35s 172.16.0.76 k8s-master <none> <none>
kube-system weave-net-r9tzs 2/2 Running 0 8m28s 172.16.0.31 worker-node1 <none> <none>
kube-system weave-net-tm8xx 2/2 Running 0 8m20s 172.16.0.114 worker-node2 <none> <none>
(base) [root@k8s-master example]# cat nohup.out
distributed.scheduler - INFO - Clear task state
distributed.scheduler - INFO - Scheduler at: tcp://172.16.0.76:40119
distributed.scheduler - INFO - Receive client connection: Client-df4caa18-0bc8-11ea-8e4c-12bd5ffa93ff
distributed.core - INFO - Starting established connection
(base) [root@k8s-master example]# kubectl describe pod workerpod
Name: workerpod
Namespace: default
Priority: 0
Node: worker-node1/172.16.0.31
Start Time: Wed, 20 Nov 2019 19:06:36 +0000
Labels: app=dask
dask.org/cluster-name=dask-root-99dcf768-4
dask.org/component=worker
foo=bar
user=root
Annotations: <none>
Status: Failed
IP: 10.32.0.2
IPs:
IP: 10.32.0.2
Containers:
dask:
Container ID: docker://578dc575fc263c4a3889a4f2cb5e06cd82a00e03cfc6acfd7a98fef703421390
Image: daskdev/dask:latest
Image ID: docker-pullable://daskdev/dask@sha256:0a936daa94c82cea371c19a2c90c695688ab4e1e7acc905f8b30dfd419adfb6f
Port: <none>
Host Port: <none>
Args:
dask-worker
--nthreads
2
--no-bokeh
--memory-limit
6GB
--death-timeout
60
State: Terminated
Reason: Error
Exit Code: 1
Started: Wed, 20 Nov 2019 19:06:38 +0000
Finished: Wed, 20 Nov 2019 19:08:20 +0000
Ready: False
Restart Count: 0
Limits:
cpu: 2
memory: 6G
Requests:
cpu: 2
memory: 6G
Environment:
EXTRA_PIP_PACKAGES: fastparquet git+https://github.com/dask/distributed
DASK_SCHEDULER_ADDRESS: tcp://172.16.0.76:40119
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from default-token-p9f9v (ro)
Conditions:
Type Status
Initialized True
Ready False
ContainersReady False
PodScheduled True
Volumes:
default-token-p9f9v:
Type: Secret (a volume populated by a Secret)
SecretName: default-token-p9f9v
Optional: false
QoS Class: Guaranteed
Node-Selectors: <none>
Tolerations: k8s.dask.org/dedicated=worker:NoSchedule
k8s.dask.org_dedicated=worker:NoSchedule
node.kubernetes.io/not-ready:NoExecute for 300s
node.kubernetes.io/unreachable:NoExecute for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 5m47s default-scheduler Successfully assigned default/workerpod to worker-node1
Normal Pulled 5m45s kubelet, worker-node1 Container image "daskdev/dask:latest" already present on machine
Normal Created 5m45s kubelet, worker-node1 Created container dask
Normal Started 5m45s kubelet, worker-node1 Started container dask
(base) [root@k8s-master example]#
(base) [root@k8s-master example]# kubectl get events
LAST SEEN TYPE REASON OBJECT MESSAGE
21m Normal Starting node/k8s-master Starting kubelet.
21m Normal NodeHasSufficientMemory node/k8s-master Node k8s-master status is now: NodeHasSufficientMemory
21m Normal NodeHasNoDiskPressure node/k8s-master Node k8s-master status is now: NodeHasNoDiskPressure
21m Normal NodeHasSufficientPID node/k8s-master Node k8s-master status is now: NodeHasSufficientPID
21m Normal NodeAllocatableEnforced node/k8s-master Updated Node Allocatable limit across pods
21m Normal RegisteredNode node/k8s-master Node k8s-master event: Registered Node k8s-master in Controller
21m Normal Starting node/k8s-master Starting kube-proxy.
18m Normal Starting node/worker-node1 Starting kubelet.
18m Normal NodeHasSufficientMemory node/worker-node1 Node worker-node1 status is now: NodeHasSufficientMemory
18m Normal NodeHasNoDiskPressure node/worker-node1 Node worker-node1 status is now: NodeHasNoDiskPressure
18m Normal NodeHasSufficientPID node/worker-node1 Node worker-node1 status is now: NodeHasSufficientPID
18m Normal NodeAllocatableEnforced node/worker-node1 Updated Node Allocatable limit across pods
18m Normal Starting node/worker-node1 Starting kube-proxy.
18m Normal RegisteredNode node/worker-node1 Node worker-node1 event: Registered Node worker-node1 in Controller
17m Normal NodeReady node/worker-node1 Node worker-node1 status is now: NodeReady
18m Normal Starting node/worker-node2 Starting kubelet.
18m Normal NodeHasSufficientMemory node/worker-node2 Node worker-node2 status is now: NodeHasSufficientMemory
18m Normal NodeHasNoDiskPressure node/worker-node2 Node worker-node2 status is now: NodeHasNoDiskPressure
18m Normal NodeHasSufficientPID node/worker-node2 Node worker-node2 status is now: NodeHasSufficientPID
18m Normal NodeAllocatableEnforced node/worker-node2 Updated Node Allocatable limit across pods
18m Normal Starting node/worker-node2 Starting kube-proxy.
17m Normal RegisteredNode node/worker-node2 Node worker-node2 event: Registered Node worker-node2 in Controller
17m Normal NodeReady node/worker-node2 Node worker-node2 status is now: NodeReady
14m Normal Scheduled pod/workerpod Successfully assigned default/workerpod to worker-node1
14m Normal Pulled pod/workerpod Container image "daskdev/dask:latest" already present on machine
14m Normal Created pod/workerpod Created container dask
14m Normal Started pod/workerpod Started container dask
(base) [root@k8s-master example]#
更新 - 添加 pod 日志(如 Dawid Kruk 所建议):
(base) [root@k8s-master example]# kubectl logs workerpod
+ '[' '' ']'
+ '[' -e /opt/app/environment.yml ']'
+ echo 'no environment.yml'
+ '[' '' ']'
+ '[' 'fastparquet git+https://github.com/dask/distributed' ']'
+ echo 'EXTRA_PIP_PACKAGES environment variable found. Installing.'
+ /opt/conda/bin/pip install fastparquet git+https://github.com/dask/distributed
no environment.yml
EXTRA_PIP_PACKAGES environment variable found. Installing.
Collecting git+https://github.com/dask/distributed
Cloning https://github.com/dask/distributed to /tmp/pip-req-build-i3_1vo06
Running command git clone -q https://github.com/dask/distributed /tmp/pip-req-build-i3_1vo06
fatal: unable to access 'https://github.com/dask/distributed/': Could not resolve host: github.com
ERROR: Command errored out with exit status 128: git clone -q https://github.com/dask/distributed /tmp/pip-req-build-i3_1vo06 Check the logs for full command output.
+ exec dask-worker --nthreads 2 --no-bokeh --memory-limit 6GB --death-timeout 60
/opt/conda/lib/python3.7/site-packages/distributed/cli/dask_worker.py:252: UserWarning: The --bokeh/--no-bokeh flag has been renamed to --dashboard/--no-dashboard.
"The --bokeh/--no-bokeh flag has been renamed to --dashboard/--no-dashboard. "
distributed.nanny - INFO - Start Nanny at: 'tcp://10.32.0.2:45097'
distributed.worker - INFO - Start worker at: tcp://10.32.0.2:36389
distributed.worker - INFO - Listening to: tcp://10.32.0.2:36389
distributed.worker - INFO - Waiting to connect to: tcp://172.16.0.76:43389
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Threads: 2
distributed.worker - INFO - Memory: 6.00 GB
distributed.worker - INFO - Local Directory: /worker-55rpow8j
distributed.worker - INFO - -------------------------------------------------
distributed.worker - INFO - Waiting to connect to: tcp://172.16.0.76:43389
distributed.worker - INFO - Waiting to connect to: tcp://172.16.0.76:43389
distributed.worker - INFO - Waiting to connect to: tcp://172.16.0.76:43389
distributed.worker - INFO - Waiting to connect to: tcp://172.16.0.76:43389
distributed.worker - INFO - Waiting to connect to: tcp://172.16.0.76:43389
distributed.nanny - INFO - Closing Nanny at 'tcp://10.32.0.2:45097'
distributed.worker - INFO - Stopping worker at tcp://10.32.0.2:36389
distributed.worker - INFO - Closed worker has not yet started: None
distributed.dask_worker - INFO - Timed out starting worker
distributed.dask_worker - INFO - End worker
(base) [root@k8s-master example]# git
usage: git [--version] [--help] [-C <path>] [-c <name>=<value>]
[--exec-path[=<path>]] [--html-path] [--man-path] [--info-path]
在我看来,问题出在 dask pod 与工作节点之间的连接上,但我看到工作节点处于就绪(Ready)状态,并且其他 pod(nginx)也在工作节点上正常运行:
(base) [root@k8s-master example]# kubectl get nodes
NAME STATUS ROLES AGE VERSION
k8s-master Ready master 20h v1.16.3
worker-node1 Ready worker 20h v1.16.2
worker-node2 Ready worker 20h v1.16.2
NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
default nginx-deployment-54f57cf6bf-b9nfd 1/1 Running 0 34s 10.32.0.2 worker-node1 <none> <none>
default nginx-deployment-54f57cf6bf-pnp59 1/1 Running 0 34s 10.40.0.0 worker-node2 <none> <none>
default workerpod 0/1 Error 0 56m 10.32.0.2 worker-node1 <none> <none>
kube-system coredns-5644d7b6d9-l4jsd 1/1 Running 0 21h 10.32.0.4 k8s-master <none> <none>
kube-system coredns-5644d7b6d9-q679h 1/1 Running 0 21h 10.32.0.3 k8s-master <none> <none>
kube-system etcd-k8s-master 1/1 Running 0 21h 172.16.0.76 k8s-master <none> <none>
kube-system kube-apiserver-k8s-master 1/1 Running 0 21h 172.16.0.76 k8s-master <none> <none>
kube-system kube-controller-manager-k8s-master 1/1 Running 0 21h 172.16.0.76 k8s-master <none> <none>
kube-system kube-proxy-ctgj8 1/1 Running 0 21h 172.16.0.114 worker-node2 <none> <none>
kube-system kube-proxy-f78bm 1/1 Running 0 21h 172.16.0.76 k8s-master <none> <none>
kube-system kube-proxy-ksk59 1/1 Running 0 21h 172.16.0.31 worker-node1 <none> <none>
kube-system kube-scheduler-k8s-master 1/1 Running 0 21h 172.16.0.76 k8s-master <none> <none>
kube-system weave-net-q2zwn 2/2 Running 0 21h 172.16.0.76 k8s-master <none> <none>
kube-system weave-net-r9tzs 2/2 Running 0 21h 172.16.0.31 worker-node1 <none> <none>
kube-system weave-net-tm8xx 2/2 Running 0 21h 172.16.0.114 worker-node2 <none> <none>
更新 2 - 添加了 nslookup 输出(由 VAS 建议),参考:https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/#create-a-simple-pod-to-use-as-a-test-environment
(base) [root@k8s-master example]# kubectl exec -ti workerpod -- nslookup kubernetes.default
OCI runtime exec failed: exec failed: container_linux.go:345: starting container process caused "exec: \"nslookup\": executable file not found in $PATH": unknown
command terminated with exit code 126
如何将该可执行文件(nslookup)添加到 pod 中?我的主机上已经安装了它:
(base) [root@k8s-master example]# nslookup github.com
Server: 172.31.0.2
Address: 172.31.0.2#53
Non-authoritative answer:
Name: github.com
Address: 140.82.114.3
更新 3:使用 dnsutils 镜像执行 nslookup(由 VAS 建议)
(base) [root@k8s-master example]# kubectl run dnsutils -it --rm=true --restart=Never --image=tutum/dnsutils cat /etc/resolv.conf
nameserver 10.96.0.10
search default.svc.cluster.local svc.cluster.local cluster.local ec2.internal
options ndots:5
pod "dnsutils" deleted
(base) [root@k8s-master example]# kubectl run dnsutils -it --restart=Never --image=tutum/dnsutils nslookup github.com
If you don't see a command prompt, try pressing enter.
;; connection timed out; no servers could be reached
pod default/dnsutils terminated (Error)
(base) [root@k8s-master example]# kubectl logs dnsutils
;; connection timed out; no servers could be reached
(base) [root@k8s-master example]#
更新 4:
(base) [root@k8s-master example]# kubectl exec -ti busybox -- nslookup kubernetes.default
Server: 10.96.0.10
Address 1: 10.96.0.10
nslookup: can't resolve 'kubernetes.default'
command terminated with exit code 1
(base) [root@k8s-master example]# kubectl get pods --namespace=kube-system -l k8s-app=kube-dns
NAME READY STATUS RESTARTS AGE
coredns-5644d7b6d9-l4jsd 1/1 Running 0 25h
coredns-5644d7b6d9-q679h 1/1 Running 0 25h
(base) [root@k8s-master example]# kubectl get pods --namespace=kube-system -l k8s-app=kube-dns
NAME READY STATUS RESTARTS AGE
coredns-5644d7b6d9-l4jsd 1/1 Running 0 25h
coredns-5644d7b6d9-q679h 1/1 Running 0 25h
(base) [root@k8s-master example]# for p in $(kubectl get pods --namespace=kube-system -l k8s-app=kube-dns -o name); do kubectl logs --namespace=kube-system $p; done
.:53
2019-11-20T19:01:42.161Z [INFO] plugin/reload: Running configuration MD5 = f64cb9b977c7dfca58c4fab108535a76
2019-11-20T19:01:42.161Z [INFO] CoreDNS-1.6.2
2019-11-20T19:01:42.161Z [INFO] linux/amd64, go1.12.8, 795a3eb
CoreDNS-1.6.2
linux/amd64, go1.12.8, 795a3eb
.:53
2019-11-20T19:01:41.862Z [INFO] plugin/reload: Running configuration MD5 = f64cb9b977c7dfca58c4fab108535a76
2019-11-20T19:01:41.862Z [INFO] CoreDNS-1.6.2
2019-11-20T19:01:41.862Z [INFO] linux/amd64, go1.12.8, 795a3eb
CoreDNS-1.6.2
linux/amd64, go1.12.8, 795a3eb
(base) [root@k8s-master example]# kubectl get service
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
kubernetes ClusterIP 10.96.0.1 <none> 443/TCP 26h
(base) [root@k8s-master example]# kubectl get ep kube-dns --namespace=kube-system
NAME ENDPOINTS AGE
kube-dns 10.32.0.3:53,10.32.0.4:53,10.32.0.3:53 + 3 more... 26h
(base) [root@k8s-master example]# kubectl -n kube-system edit configmap coredns
Edit cancelled, no changes made.
来自工作节点 1 的 nslookup
[root@worker-node1 ec2-user]# nslookup 10.96.0.10
Server: 172.31.0.2
Address: 172.31.0.2#53
Non-authoritative answer:
10.0.96.10.in-addr.arpa name = ip-10-96-0-10.ec2.internal.
Authoritative answers can be found from:
[root@worker-node1 ec2-user]# nslookup 10.96.0.1
Server: 172.31.0.2
Address: 172.31.0.2#53
Non-authoritative answer:
1.0.96.10.in-addr.arpa name = ip-10-96-0-1.ec2.internal.
Authoritative answers can be found from:
[root@worker-node1 ec2-user]#
来自工作节点 2 的 nslookup
[root@worker-node2 ec2-user]# nslookup 10.96.0.10
Server: 172.31.0.2
Address: 172.31.0.2#53
Non-authoritative answer:
10.0.96.10.in-addr.arpa name = ip-10-96-0-10.ec2.internal.
Authoritative answers can be found from:
[root@worker-node2 ec2-user]# nslookup 10.96.0.1
Server: 172.31.0.2
Address: 172.31.0.2#53
Non-authoritative answer:
1.0.96.10.in-addr.arpa name = ip-10-96-0-1.ec2.internal.
Authoritative answers can be found from: