Alerts

Inactive (15) Pending (0) Firing (1)

/etc/prometheus/rules/ansible_managed.rules > ansible managed alert rules

Watchdog (1 active)

# Heartbeat rule: `vector(1)` always returns a sample, so this alert is
# permanently active (pending for the first 10m, then firing).
alert: Watchdog
expr: 'vector(1)'
for: 10m
labels:
  severity: warning
annotations:
  summary: 'Ensure entire alerting pipeline is functional'
  # Literal block: the line breaks below are part of the annotation value.
  description: |-
    This is an alert meant to ensure that the entire alerting pipeline is functional.
    This alert is always firing, therefore it should always be firing in Alertmanager
    and always fire against a receiver. There are integrations with various notification
    mechanisms that send a notification when this alert is not firing. For example the
    "DeadMansSnitch" integration in PagerDuty.

Labels	State	Active Since	Value
alertname="Watchdog" severity="warning"	firing	2024-06-15 10:19:59 +0000 UTC	1
Annotations
description: This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the "DeadMansSnitch" integration in PagerDuty.
summary: Ensure entire alerting pipeline is functional

InstanceDown (0 active)

# Fires at critical severity once `up == 0` has held for 5 minutes on a series.
alert: InstanceDown
expr: 'up == 0'
for: 5m
labels:
  severity: critical
annotations:
  summary: 'Instance {{ $labels.instance }} down'
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }}
    has been down for more than 5 minutes.

NodeClockNotSynchronising (0 active)

# Fires when the 5-minute minimum of node_timex_sync_status is 0 and that
# condition has held for 10 minutes.
alert: NodeClockNotSynchronising
expr: 'min_over_time(node_timex_sync_status[5m]) == 0'
for: 10m
labels:
  severity: warning
annotations:
  summary: 'Clock not synchronising.'
  message: >-
    Clock on {{ $labels.instance }} is not synchronising.
    Ensure NTP is configured on this host.

NodeClockSkewDetected (0 active)

# Fires when node_timex_offset_seconds is beyond +/-0.05 and its 5-minute
# derivative shows the offset is not shrinking (same sign as the offset, or
# zero), sustained for 10 minutes.
alert: NodeClockSkewDetected
expr: >-
  (node_timex_offset_seconds > 0.05
  and deriv(node_timex_offset_seconds[5m]) >= 0)
  or
  (node_timex_offset_seconds < -0.05
  and deriv(node_timex_offset_seconds[5m]) <= 0)
for: 10m
labels:
  severity: warning
annotations:
  # The figure quoted here must match the 0.05 bound used in expr; the text
  # previously claimed 300s, which the expression never tested for.
  message: >-
    Clock on {{ $labels.instance }} is out of sync by more than 0.05s.
    Ensure NTP is configured correctly on this host.
  summary: Clock skew detected.

NodeFilesystemAlmostOutOfFiles (0 active)

# Critical tier: fires when a filesystem that is not read-only has had
# files_free / files below 3% for 1 hour.
alert: NodeFilesystemAlmostOutOfFiles
expr: >-
  (node_filesystem_files_free{fstype!="",job="node"}
  / node_filesystem_files{fstype!="",job="node"}
  * 100 < 3
  and node_filesystem_readonly{fstype!="",job="node"} == 0)
for: 1h
labels:
  severity: critical
annotations:
  summary: 'Filesystem has less than 3% inodes left.'
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available inodes left.

NodeFilesystemAlmostOutOfFiles (0 active)

# Warning tier: fires when a filesystem that is not read-only has had
# files_free / files below 5% for 1 hour.
alert: NodeFilesystemAlmostOutOfFiles
expr: >-
  (node_filesystem_files_free{fstype!="",job="node"}
  / node_filesystem_files{fstype!="",job="node"}
  * 100 < 5
  and node_filesystem_readonly{fstype!="",job="node"} == 0)
for: 1h
labels:
  severity: warning
annotations:
  summary: 'Filesystem has less than 5% inodes left.'
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available inodes left.

NodeFilesystemAlmostOutOfSpace (0 active)

# Warning tier: fires when a filesystem that is not read-only has had
# avail_bytes / size_bytes below 5% for 1 hour.
alert: NodeFilesystemAlmostOutOfSpace
expr: >-
  (node_filesystem_avail_bytes{fstype!="",job="node"}
  / node_filesystem_size_bytes{fstype!="",job="node"}
  * 100 < 5
  and node_filesystem_readonly{fstype!="",job="node"} == 0)
for: 1h
labels:
  severity: warning
annotations:
  summary: 'Filesystem has less than 5% space left.'
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available space left.

NodeFilesystemAlmostOutOfSpace (0 active)

# Critical tier: fires when a filesystem that is not read-only has had
# avail_bytes / size_bytes below 3% for 1 hour.
alert: NodeFilesystemAlmostOutOfSpace
expr: >-
  (node_filesystem_avail_bytes{fstype!="",job="node"}
  / node_filesystem_size_bytes{fstype!="",job="node"}
  * 100 < 3
  and node_filesystem_readonly{fstype!="",job="node"} == 0)
for: 1h
labels:
  severity: critical
annotations:
  summary: 'Filesystem has less than 3% space left.'
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available space left.

NodeFilesystemFilesFillingUp (0 active)

# Warning tier: fires when files_free / files is below 40%, a linear
# extrapolation of the last 6h of files_free drops below zero within
# 24 * 60 * 60 seconds, and the filesystem is not read-only - all for 1 hour.
alert: NodeFilesystemFilesFillingUp
expr: >-
  (node_filesystem_files_free{fstype!="",job="node"}
  / node_filesystem_files{fstype!="",job="node"}
  * 100 < 40
  and predict_linear(node_filesystem_files_free{fstype!="",job="node"}[6h], 24 * 60 * 60) < 0
  and node_filesystem_readonly{fstype!="",job="node"} == 0)
for: 1h
labels:
  severity: warning
annotations:
  summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available inodes left
    and is filling up.

NodeFilesystemFilesFillingUp (0 active)

# Critical tier: fires when files_free / files is below 20%, a linear
# extrapolation of the last 6h of files_free drops below zero within
# 4 * 60 * 60 seconds, and the filesystem is not read-only - all for 1 hour.
alert: NodeFilesystemFilesFillingUp
expr: >-
  (node_filesystem_files_free{fstype!="",job="node"}
  / node_filesystem_files{fstype!="",job="node"}
  * 100 < 20
  and predict_linear(node_filesystem_files_free{fstype!="",job="node"}[6h], 4 * 60 * 60) < 0
  and node_filesystem_readonly{fstype!="",job="node"} == 0)
for: 1h
labels:
  severity: critical
annotations:
  summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available inodes left
    and is filling up fast.

NodeFilesystemSpaceFillingUp (0 active)

# Critical tier: fires when avail_bytes / size_bytes is below 20%, a linear
# extrapolation of the last 6h of avail_bytes drops below zero within
# 4 * 60 * 60 seconds, and the filesystem is not read-only - all for 1 hour.
alert: NodeFilesystemSpaceFillingUp
expr: >-
  (node_filesystem_avail_bytes{fstype!="",job="node"}
  / node_filesystem_size_bytes{fstype!="",job="node"}
  * 100 < 20
  and predict_linear(node_filesystem_avail_bytes{fstype!="",job="node"}[6h], 4 * 60 * 60) < 0
  and node_filesystem_readonly{fstype!="",job="node"} == 0)
for: 1h
labels:
  severity: critical
annotations:
  summary: 'Filesystem is predicted to run out of space within the next 4 hours.'
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available space left
    and is filling up fast.

NodeFilesystemSpaceFillingUp (0 active)

# Warning tier: fires when avail_bytes / size_bytes is below 40%, a linear
# extrapolation of the last 6h of avail_bytes drops below zero within
# 24 * 60 * 60 seconds, and the filesystem is not read-only - all for 1 hour.
alert: NodeFilesystemSpaceFillingUp
expr: >-
  (node_filesystem_avail_bytes{fstype!="",job="node"}
  / node_filesystem_size_bytes{fstype!="",job="node"}
  * 100 < 40
  and predict_linear(node_filesystem_avail_bytes{fstype!="",job="node"}[6h], 24 * 60 * 60) < 0
  and node_filesystem_readonly{fstype!="",job="node"} == 0)
for: 1h
labels:
  severity: warning
annotations:
  summary: 'Filesystem is predicted to run out of space within the next 24 hours.'
  description: >-
    Filesystem on {{ $labels.device }} at {{ $labels.instance }}
    has only {{ printf "%.2f" $value }}% available space left
    and is filling up.

NodeHighNumberConntrackEntriesUsed (0 active)

# Fires when node_nf_conntrack_entries exceeds 75% of
# node_nf_conntrack_entries_limit. No `for:` is set, so there is no hold
# period before the alert becomes active.
alert: NodeHighNumberConntrackEntriesUsed
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75'
labels:
  severity: warning
annotations:
  summary: 'Number of conntrack are getting close to the limit'
  description: >-
    {{ $value | humanizePercentage }} of conntrack entries are used

NodeNetworkReceiveErrs (0 active)

# Fires when the 2-minute increase of node_network_receive_errs_total stays
# above 10 for 1 hour.
alert: NodeNetworkReceiveErrs
expr: 'increase(node_network_receive_errs_total[2m]) > 10'
for: 1h
labels:
  severity: warning
annotations:
  summary: 'Network interface is reporting many receive errors.'
  description: >-
    {{ $labels.instance }} interface {{ $labels.device }}
    has encountered {{ printf "%.0f" $value }} receive errors
    in the last two minutes.

NodeNetworkTransmitErrs (0 active)

# Fires when the 2-minute increase of node_network_transmit_errs_total stays
# above 10 for 1 hour.
alert: NodeNetworkTransmitErrs
expr: 'increase(node_network_transmit_errs_total[2m]) > 10'
for: 1h
labels:
  severity: warning
annotations:
  summary: 'Network interface is reporting many transmit errors.'
  description: >-
    {{ $labels.instance }} interface {{ $labels.device }}
    has encountered {{ printf "%.0f" $value }} transmit errors
    in the last two minutes.

RebootRequired (0 active)

# Fires whenever node_reboot_required is above 0. No `for:` is set, so there
# is no hold period before the alert becomes active.
alert: RebootRequired
expr: 'node_reboot_required > 0'
labels:
  severity: warning
annotations:
  summary: 'Instance {{ $labels.instance }} - reboot required'
  description: >-
    {{ $labels.instance }} requires a reboot.