1

今天我重启了我的一个 k8s 工作节点。现在无法获取在该节点上启动的任何 pod 的指标,但 kubectl top nodes 工作正常。

$ kubectl top pods
W0413 03:16:04.917900  596110 top_pod.go:266] Metrics not available for pod default/cluster-registry-84f8b6b45c-xmzr4, age: 1h32m29.917882167s
error: Metrics not available for pod default/cluster-registry-84f8b6b45c-xmzr4, age: 1h32m29.917882167s
$ kubectl logs -f metrics-server-596fcd4bcd-fgk86 -n kube-system

E0412 20:16:07.413028       1 reststorage.go:160] unable to fetch pod metrics for pod default/runner-registry-74bdcf4f9b-8kkzn: no metrics known for pod
E0412 20:17:07.413399       1 reststorage.go:160] unable to fetch pod metrics for pod default/runner-registry-74bdcf4f9b-8kkzn: no metrics known for pod

我尝试使用 --v=4 参数启动 metrics-server,但没有发现任何有用的信息。其他节点上 pod 的指标是正常的。

k8s -v1.17.4

metrics-server-amd64:v0.3.6 使用以下参数启动:

--kubelet-insecure-tls
--kubelet-preferred-address-types=InternalIP

更新:节点名称是 sms-crm-stg-2。下面是 kubectl get --raw /api/v1/nodes/sms-crm-stg-2/proxy/stats/summary 的输出:

$ kubectl get --raw /api/v1/nodes/sms-crm-stg-2/proxy/stats/summary
{
 "node": {
  "nodeName": "sms-crm-stg-2",
  "systemContainers": [
   {
    "name": "pods",
    "startTime": "2020-04-12T17:50:25Z",
    "cpu": {
     "time": "2020-04-14T10:53:20Z",
     "usageNanoCores": 12877941,
     "usageCoreNanoSeconds": 4387476849484
    },
    "memory": {
     "time": "2020-04-14T10:53:20Z",
     "availableBytes": 16520691712,
     "usageBytes": 154824704,
     "workingSetBytes": 136818688,
     "rssBytes": 68583424,
     "pageFaults": 0,
     "majorPageFaults": 0
    }
   },
   {
    "name": "kubelet",
    "startTime": "2020-04-12T17:49:18Z",
    "cpu": {
     "time": "2020-04-14T10:53:05Z",
     "usageNanoCores": 18983004,
     "usageCoreNanoSeconds": 2979656573959
    },
    "memory": {
     "time": "2020-04-14T10:53:05Z",
     "usageBytes": 374534144,
     "workingSetBytes": 353353728,
     "rssBytes": 325005312,
     "pageFaults": 133278612,
     "majorPageFaults": 536505
    }
   },
   {
    "name": "runtime",
    "startTime": "2020-04-12T17:48:35Z",
    "cpu": {
     "time": "2020-04-14T10:53:03Z",
     "usageNanoCores": 15982086,
     "usageCoreNanoSeconds": 1522750008369
    },
    "memory": {
     "time": "2020-04-14T10:53:03Z",
     "usageBytes": 306790400,
     "workingSetBytes": 297889792,
     "rssBytes": 280047616,
     "pageFaults": 53437788,
     "majorPageFaults": 255703
    }
   }
  ],
  "startTime": "2020-04-12T17:48:19Z",
  "cpu": {
   "time": "2020-04-14T10:53:20Z",
   "usageNanoCores": 110654764,
   "usageCoreNanoSeconds": 29602969518334
  },
  "memory": {
   "time": "2020-04-14T10:53:20Z",
   "availableBytes": 1377738752,
   "usageBytes": 15835013120,
   "workingSetBytes": 15279771648,
   "rssBytes": 14585233408,
   "pageFaults": 3309653,
   "majorPageFaults": 16969
  },
  "network": {
   "time": "2020-04-14T10:53:20Z",
   "name": "",
   "interfaces": [
    {
     "name": "br-6edcec7930f0",
     "rxBytes": 0,
     "rxErrors": 0,
     "txBytes": 0,
     "txErrors": 0
    },
    {
     "name": "cali63387897a01",
     "rxBytes": 131540393,
     "rxErrors": 0,
     "txBytes": 71581241,
     "txErrors": 0
    },
    {
     "name": "cali75b3a97cfc0",
     "rxBytes": 194967,
     "rxErrors": 0,
     "txBytes": 54249,
     "txErrors": 0
    },
    {
     "name": "cali382d1538876",
     "rxBytes": 666667,
     "rxErrors": 0,
     "txBytes": 780072,
     "txErrors": 0
    },
    {
     "name": "br-0b3d0a271eb2",
     "rxBytes": 0,
     "rxErrors": 0,
     "txBytes": 0,
     "txErrors": 0
    },
    {
     "name": "cali7c48479e916",
     "rxBytes": 139682733,
     "rxErrors": 0,
     "txBytes": 205172367,
     "txErrors": 0
    },
    {
     "name": "cali346a5d86923",
     "rxBytes": 112517660,
     "rxErrors": 0,
     "txBytes": 232383,
     "txErrors": 0
    },
    {
     "name": "br-5d30bcdbc231",
     "rxBytes": 0,
     "rxErrors": 0,
     "txBytes": 0,
     "txErrors": 0
    },
    {
     "name": "tunl0",
     "rxBytes": 195091257,
     "rxErrors": 0,
     "txBytes": 215334849,
     "txErrors": 0
    },
    {
     "name": "ens160",
     "rxBytes": 3241985272,
     "rxErrors": 0,
     "txBytes": 3548616264,
     "txErrors": 0
    }
   ]
  },
  "fs": {
   "time": "2020-04-14T10:53:20Z",
   "availableBytes": 9231872000,
   "capacityBytes": 24109666304,
   "usedBytes": 14877794304,
   "inodesFree": 23363080,
   "inodes": 23556096,
   "inodesUsed": 193016
  },
  "runtime": {
   "imageFs": {
    "time": "2020-04-14T10:53:20Z",
    "availableBytes": 9231872000,
    "capacityBytes": 24109666304,
    "usedBytes": 6145920764,
    "inodesFree": 23363080,
    "inodes": 23556096,
    "inodesUsed": 193016
   }
  },
  "rlimit": {
   "time": "2020-04-14T10:53:22Z",
   "maxpid": 32768,
   "curproc": 1608
  }
 },
 "pods": []
}

"pods": []是空的,所以看起来是节点问题,而不是指标服务器。

4

1 回答 1

1

OP 确认 metrics-server 的问题是由故障节点引起的。添加一个新节点后问题得到解决。

于 2020-04-17T07:21:20.713 回答