Cloud Pak for AIOps health dashboard

This dashboard focuses on the high level health of Cloud Pak for AIOps, its storage and the cluster that it is running on. This dashboard answers the following questions:

  1. Does the cluster have enough resources?
  2. How much storage is used?
  3. Is the Cloud Pak for AIOps instance healthy?

To import this JSON dashboard, see Importing dashboard by using JSON.

You can import the following JSON into Grafana.

{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 10,
"links": [],
"liveNow": false,
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 8,
"panels": [],
"title": "Cluster Health",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Percentage of CPU usage across all nodes taken at 5 minute averages",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"color": "rgba(50, 172, 45, 0.97)",
"text": "0"
}
},
"type": "special"
}
],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(50, 172, 45, 0.97)",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 65
},
{
"color": "rgba(245, 54, 54, 0.9)",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 8,
"x": 0,
"y": 1
},
"id": 6,
"links": [],
"maxDataPoints": 100,
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",node=~\"^.*$\"}[5m])) / sum (machine_cpu_cores{ node=~\"^.*$\"}) * 100",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}
],
"title": "CPU usage (5m avg)",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Percentage of memory usage across all nodes",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"color": "rgba(50, 172, 45, 0.97)",
"text": "0"
}
},
"type": "special"
}
],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(50, 172, 45, 0.97)",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 65
},
{
"color": "rgba(245, 54, 54, 0.9)",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 8,
"x": 8,
"y": 1
},
"id": 4,
"links": [],
"maxDataPoints": 100,
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum (container_memory_working_set_bytes{id=\"/\",node=~\"^.*$\"}) / sum (machine_memory_bytes{node=~\"^.*$\"}) * 100",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}
],
"title": "Memory usage",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Percentage of filesystem usage across all nodes",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"color": "rgba(50, 172, 45, 0.97)",
"text": "0"
}
},
"type": "special"
}
],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(50, 172, 45, 0.97)",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 65
},
{
"color": "rgba(245, 54, 54, 0.9)",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 8,
"x": 16,
"y": 1
},
"id": 7,
"links": [],
"maxDataPoints": 100,
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum (container_fs_usage_bytes{device=~\"^/dev/.*$\",id=\"/\",node=~\"^.*$\"}) / sum (container_fs_limit_bytes{device=~\"^/dev/.*$\",id=\"/\",node=~\"^.*$\"}) * 100",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "",
"metric": "",
"refId": "A",
"step": 10
}
],
"title": "Filesystem usage",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Sum of the 5 min avg of the cumulative cpu time consumed in seconds for all nodes",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 0,
"y": 6
},
"id": 11,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"editorMode": "code",
"expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",node=~\"^.*$\"}[5m]))",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"range": true,
"refId": "A",
"step": 10
}
],
"title": "Used",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Total number of logical cores across all nodes",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 4,
"y": 6
},
"id": 12,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"editorMode": "code",
"expr": "sum (machine_cpu_cores{node=~\"^.*$\"})",
"interval": "10s",
"intervalFactor": 1,
"range": true,
"refId": "A",
"step": 10
}
],
"title": "Total",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 8,
"y": 6
},
"id": 9,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum (container_memory_working_set_bytes{id=\"/\",node=~\"^.*$\"})",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}
],
"title": "Used",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 12,
"y": 6
},
"id": 10,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum (machine_memory_bytes{node=~\"^.*$\"})",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}
],
"title": "Total",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 16,
"y": 6
},
"id": 13,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum (container_fs_usage_bytes{device=~\"^/dev/.*$\",id=\"/\",node=~\"^.*$\"})",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}
],
"title": "Used",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 20,
"y": 6
},
"id": 14,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum (container_fs_limit_bytes{device=~\"^/dev/.*$\",id=\"/\",node=~\"^.*$\"})",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}
],
"title": "Total",
"type": "stat"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 9
},
"id": 16,
"panels": [],
"title": "Node Health",
"type": "row"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Sum of nodes in a \"ready\" state across a period of time",
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 10
},
"hiddenSeries": false,
"id": 2058,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "9.4.7",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "Total",
"fillBelowTo": "Unavailable"
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum(kube_node_info{node=~\".*\"})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Total",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum(kube_node_status_condition{condition=\"Ready\", node=~\".*\", status!=\"true\"})",
"format": "time_series",
"instant": false,
"intervalFactor": 2,
"legendFormat": "Unavailable",
"refId": "B"
}
],
"thresholds": [],
"timeRegions": [],
"title": "Available Nodes",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:95",
"decimals": 0,
"format": "short",
"logBase": 1,
"show": true
},
{
"$$hashKey": "object:96",
"format": "short",
"logBase": 1,
"show": true
}
],
"yaxis": {
"align": false
}
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Number of nodes experiencing low disk space (default value of 85%)",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"color": "#299c46",
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#299c46",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 1
},
{
"color": "#d44a3a"
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 12,
"y": 10
},
"id": 2056,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum(kube_node_status_condition{condition=\"DiskPressure\", node=~\".*\", status!=\"false\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"title": "Disk Pressure",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Number of nodes experiencing memory pressure",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"color": "#299c46",
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#299c46",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 1
},
{
"color": "#d44a3a"
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 16,
"y": 10
},
"id": 2059,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum(kube_node_status_condition{condition=\"MemoryPressure\", node=~\".*\", status!=\"false\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"title": "Memory Pressure",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Number of nodes experiencing PID pressure",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"color": "#299c46",
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#299c46",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 1
},
{
"color": "#d44a3a"
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 20,
"y": 10
},
"id": 2063,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum(kube_node_status_condition{condition=\"PIDPressure\", node=~\".*\", status!=\"false\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"title": "PID Pressure",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Number of nodes that are unschedulable. New pods cannot be scheduled on nodes in this state.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"color": "#299c46",
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#299c46",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 1
},
{
"color": "#d44a3a"
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 12,
"y": 13
},
"id": 2026,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum(kube_node_spec_unschedulable{node=~\".*\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"title": "Unschedulable",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Number of nodes that are out of disk storage",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"color": "#299c46",
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#299c46",
"value": null
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 16,
"y": 13
},
"id": 2025,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum(kube_node_status_condition{condition=\"OutOfDisk\", node=~\".*\", status=\"true\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"title": "Out of Disk",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Number of nodes with a \"NetworkUnavailable\" status",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"color": "#299c46",
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#299c46",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 1
},
{
"color": "#d44a3a"
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 20,
"y": 13
},
"id": 2060,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum(kube_node_status_condition{condition=\"NetworkUnavailable\", node=~\".*\", status!=\"false\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"title": "Network Unavailable",
"type": "stat"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 16
},
"id": 2065,
"panels": [],
"title": "Storage Health",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Count of PVCs in a \"bound\" state",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "green",
"value": null
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 17
},
"id": 2067,
"options": {
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"editorMode": "code",
"expr": "count(kube_persistentvolumeclaim_status_phase{namespace=\"$namespace\",phase=\"Bound\"})",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Bound PVC",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "red",
"value": null
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 17
},
"id": 2070,
"options": {
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"editorMode": "code",
"expr": "sum by (phase)(kube_persistentvolumeclaim_status_phase{namespace=\"$namespace\",phase!~\"Bound\"})",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Unbound PVC",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "PVC usage percentage for each PVC in the namespace, thresholds of 85% & 95% usage",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"displayName": "${__field.labels.persistentvolumeclaim}",
"mappings": [],
"max": 100,
"thresholds": {
"mode": "percentage",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "#EAB839",
"value": 85
},
{
"color": "red",
"value": 95
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 14,
"w": 24,
"x": 0,
"y": 23
},
"id": 2071,
"options": {
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"text": {
"titleSize": 14
}
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"editorMode": "code",
"expr": "sort_desc(kubelet_volume_stats_used_bytes{namespace=\"$namespace\"}/kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\"} * 100)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "PVC Usage",
"type": "gauge"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Max and current PVC usage",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 13,
"w": 24,
"x": 0,
"y": 37
},
"hiddenSeries": false,
"id": 56,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "9.4.7",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum by (persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace=\"$namespace\"}/kubelet_volume_stats_capacity_bytes{namespace=\"$namespace\"} * 100)",
"interval": "",
"legendFormat": "{{ persistentvolumeclaim }}",
"refId": "A"
}
],
"thresholds": [],
"timeRegions": [],
"title": "PVC Usage (max & current)",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:539",
"format": "percent",
"logBase": 1,
"show": true
},
{
"$$hashKey": "object:540",
"format": "short",
"logBase": 1,
"show": true
}
],
"yaxis": {
"align": false
}
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 50
},
"id": 2069,
"panels": [],
"title": "Pod Health",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Number of pods in a \"running\" state in $namespace namespace",
"fieldConfig": {
"defaults": {
"color": {
"fixedColor": "#629e51",
"mode": "fixed"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 12,
"x": 0,
"y": 51
},
"id": 2030,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum(kube_pod_status_phase{namespace=\"$namespace\", phase=\"Running\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"refId": "A"
}
],
"title": "Pods Running",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Number of pods in a \"succeeded\" state in $namespace namespace",
"fieldConfig": {
"defaults": {
"color": {
"fixedColor": "#629e51",
"mode": "fixed"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 12,
"x": 12,
"y": 51
},
"id": 2033,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum(kube_pod_status_phase{namespace=\"$namespace\", phase=\"Succeeded\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"refId": "A"
}
],
"title": "Pods Succeeded",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Number of pods in a \"failed\" state in $namespace namespace",
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 12,
"x": 0,
"y": 56
},
"id": 2032,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum(kube_pod_status_phase{namespace=\"$namespace\", phase=\"Failed\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"refId": "A"
}
],
"title": "Pods Failed",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Number of pods in an \"unknown\" state in $namespace namespace",
"fieldConfig": {
"defaults": {
"color": {
"fixedColor": "#629e51",
"mode": "fixed"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "0"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 12,
"x": 12,
"y": 56
},
"id": 2034,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"expr": "sum(kube_pod_status_phase{namespace=\"$namespace\", phase=\"Unknown\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"refId": "A"
}
],
"title": "Pods Unknown",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"description": "Number of pods in a \"pending\" state in $namespace namespace",
"fieldConfig": {
"defaults": {
"color": {
"fixedColor": "red",
"mode": "fixed"
},
"custom": {
"align": "auto",
"cellOptions": {
"type": "auto"
},
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 60
},
"id": 2073,
"options": {
"footer": {
"countRows": false,
"fields": "",
"reducer": [
"sum"
],
"show": false
},
"frameIndex": 0,
"showHeader": true,
"sortBy": [
{
"desc": true,
"displayName": "{pod=\"aimanager-aio-chatops-slack-integrator-944797d46-79dv6\", reason=\"InvalidImageName\"}"
}
]
},
"pluginVersion": "9.4.7",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"editorMode": "builder",
"expr": "sum(kube_pod_container_status_waiting_reason{namespace=\"$namespace\"}) by (pod, reason)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Pods pending",
"transformations": [
{
"filter": {
"id": "byRefId",
"options": ""
},
"id": "limit",
"options": {
"limitField": 1
}
},
{
"id": "joinByLabels",
"options": {
"value": "reason"
}
}
],
"type": "table"
}
],
"refresh": "",
"revision": 1,
"schemaVersion": 38,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "katamari",
"value": "katamari"
},
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"definition": "label_values(kube_namespace_status_phase{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)",
"hide": 0,
"includeAll": false,
"multi": false,
"name": "namespace",
"options": [],
"query": {
"query": "label_values(kube_namespace_status_phase{job=\"kube-state-metrics\", cluster=\"$cluster\"}, namespace)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": {
"isNone": true,
"selected": false,
"text": "None",
"value": ""
},
"datasource": {
"type": "prometheus",
"uid": "P1809F7CD0C75ACF3"
},
"definition": "label_values(up{job=\"kube-state-metrics\"}, cluster)",
"hide": 2,
"includeAll": false,
"multi": false,
"name": "cluster",
"options": [],
"query": {
"query": "label_values(up{job=\"kube-state-metrics\"}, cluster)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-30m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "CP4AIOps Health",
"uid": "80acb6b1-edee-48e7-b9f6-92daf1dc4deb",
"version": 31,
"weekStart": ""
}