# ansible/group_vars/monitor_servers/main.yaml
#
# (file-viewer metadata: 414 lines, 12 KiB, YAML)

---
# Machine roles assigned to hosts in the monitor_servers group.
# NOTE(review): presumably consumed by the node_exporter role — confirm.
node_exporter_machine_roles:
  - monitor
  - stats
# Externally visible URLs for the Prometheus and Alertmanager web UIs, and the
# route prefix each service is told to use.
prometheus_web_external_url: 'https://monitor.kill0.net/prometheus'
alertmanager_web_external_url: 'https://monitor.kill0.net/alertmanager'
prometheus_web_route_prefix: '/'
alertmanager_web_route_prefix: '/'
# Prometheus server configuration: global settings, the Alertmanager to
# notify, every scrape job, and the rule files to load.
prometheus_config:
  global:
    scrape_interval: 15s
    external_labels:
      # Label values are strings; quoted so the value is not typed as an
      # integer by the YAML loader.
      cluster: "1"
      region: dallas
      provider: linode
      replica: A
  alerting:
    alertmanagers:
      - static_configs:
          - targets:
              - localhost:9093
  scrape_configs:
    # --- Self-monitoring of the monitoring stack -------------------------
    - job_name: prometheus
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:9090
    - job_name: alertmanager
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:9093
    - job_name: pushgateway
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:9091
    # --- Per-host exporters ----------------------------------------------
    - job_name: node
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:9100
            - mine0.kill0.net:9100
      relabel_configs:
        # Strip the ":port" suffix so `instance` is just the host name.
        - source_labels: [__address__]
          target_label: instance
          regex: (.+):\d+
          replacement: $1
    - job_name: mtail
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:3903
            - mine0.kill0.net:3903
      relabel_configs:
        # Strip the ":port" suffix so `instance` is just the host name.
        - source_labels: [__address__]
          target_label: instance
          regex: (.+):\d+
          replacement: $1
    # Metrics of the blackbox exporters themselves (not probe results).
    - job_name: blackbox
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:9115
            - mine0.kill0.net:9115
    # --- Blackbox probes -------------------------------------------------
    # Each probe job below uses the same relabeling: the listed target is
    # copied into the `target` URL parameter and the `instance` label, then
    # the scrape address is pointed at the blackbox exporter on 127.0.0.1:9115.
    - job_name: blackbox-icmp4
      metrics_path: /probe
      params:
        module:
          - icmpv4
      static_configs:
        - targets:
            - dns.google
            - vpn-home.kill0.net
            - ping-home.kill0.net
            - 10.255.0.16
            - vpn1-sch.corp.nmi.com
            - vpn-chi.ops.nmi.com
            - vpn-ash.ops.nmi.com
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
    - job_name: blackbox-icmp6
      metrics_path: /probe
      params:
        module:
          - icmpv6
      static_configs:
        - targets:
            - dns.google
            - ping-home.kill0.net
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
    - job_name: blackbox-tcp4
      metrics_path: /probe
      params:
        module:
          - tcp_connect4
      static_configs:
        - targets:
            - mine0.kill0.net:25565
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
    - job_name: blackbox-tcp6
      metrics_path: /probe
      params:
        module:
          - tcp_connect6
      static_configs:
        - targets:
            - mine0.kill0.net:25565
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
    - job_name: blackbox-http
      metrics_path: /probe
      params:
        module:
          - http_2xx
      static_configs:
        - targets:
            - https://cavi.cc
            - https://git.kill0.net
            - https://stats.kill0.net
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
    # --- Thanos components -----------------------------------------------
    - job_name: thanos-sidecar
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10902"
    - job_name: thanos-query
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10904"
    # NOTE(review): this is the same target as thanos-sidecar above
    # (localhost:10902). Two components cannot share one port on the same
    # host, so one of the two is probably wrong — confirm the store port.
    - job_name: thanos-store
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10902"
    - job_name: thanos-compact
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10912"
  rule_files:
    - rules.yaml
# Prometheus recording and alerting rules, grouped by subsystem.
# Annotation strings are wrapped in {% raw %}…{% endraw %} so that Ansible's
# Jinja2 pass leaves the Prometheus `{{ … }}` templates untouched.
prometheus_rules_config:
  groups:
    - name: alertmanager.rules
      rules:
        # Fires when no `up` series exists at all for the alertmanager job.
        - alert: PrometheusAlertmanagerJobMissing
          expr: absent(up{job="alertmanager"})
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "{% raw %} Prometheus AlertManager job missing (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
        # Always-firing alert (`vector(1)`) used as an end-to-end heartbeat.
        - alert: PrometheusAlertmanagerE2eDeadManSwitch
          expr: vector(1)
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "{% raw %}Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
    - name: node.rules
      rules:
        # Evaluates to 1 or 0 from the UTC month, day and hour.
        # NOTE(review): presumably 1 while US daylight saving time is in
        # effect; the date arithmetic was not re-verified in this review.
        - record: is_dst
          expr: |
            (vector(0) and (month() < 3 or month() > 11))
            or
            (vector(1) and (month() > 3 and month() < 11))
            or
            (vector(1) and month() == 3 and (day_of_month() - day_of_week()) >= 8 and absent(day_of_week() == 0 and day_of_month() >= 8 and day_of_month() <= 14))
            or
            (vector(1) and month() == 11 and (day_of_month() - day_of_week()) <= 0)
            or
            (vector(1) and month() == 3 and day_of_month() >= 8 and day_of_month() <= 14 and day_of_week() == 0 and hour() >= 8)
            or
            (vector(1) and month() == 11 and day_of_month() >= 1 and day_of_month() <= 7 and day_of_week() == 0 and hour() < 7)
            or
            vector(0)
        # Unix time shifted back by 6 hours, or by 5 hours when is_dst is 1.
        - record: america_chicago_time
          expr: time() - ((6 * 3600) - (3600 * is_dst))
        - record: america_chicago_hour
          expr: hour(america_chicago_time)
        - alert: InstanceDown
          expr: up{job="node"} == 0
          for: 1m
        - alert: ThanosServiceDown
          expr: up{job=~"thanos.+"} == 0
          labels:
            severity: critical
        # Filesystem more than 80% used. The exclusion matches on `fstype`
        # (as FileSystemReadOnly does): "fuse.lxcfs" and "tmpfs" are
        # filesystem types, so a `mountpoint` matcher would exclude nothing.
        - alert: FileSystemUsage
          expr: ((node_filesystem_size_bytes{fstype!~"fuse.lxcfs|tmpfs"} - node_filesystem_free_bytes) / node_filesystem_size_bytes) > 0.80
          for: 1m
        - alert: FileSystemReadOnly
          expr: node_filesystem_readonly{fstype!~"fuse.lxcfs|tmpfs"} == 1
        - alert: RebootRequired
          expr: node_reboot_required > 0
          for: 15m
        - alert: AptUpgradesPending
          expr: apt_upgrades_pending > 0
          for: 1d
        # Last "system" restic run is more than 7200 s (2 h) old.
        - alert: ResticSystemJobLastRun
          expr: (time() - node_restic_last_run_time{restic_job="system"}) > 7200
          for: 2h
        # Last "minecraft" restic run is more than 86400 s (24 h) old.
        - alert: ResticMinecraftJobLastRun
          expr: (time() - node_restic_last_run_time{restic_job=~"minecraft"}) > 86400
          for: 2h
        - alert: MinecraftUnitInactive
          expr: node_systemd_unit_state{name="minecraft.service",state="inactive"} == 1
          for: 15m
        - alert: GiteaUnitInactive
          expr: node_systemd_unit_state{name="gitea.service",state="inactive"} == 1
          for: 15m
        - alert: MaintenanceMode
          expr: maintenance_mode == 1
          for: 1m
    - name: blackbox.rules
      rules:
        # Any failed probe from a job that is not one of the ICMP jobs.
        - alert: ServiceDown
          expr: probe_success{job!~"blackbox-icmp[0-9]"} == 0
          for: 1m
        # Failed probe from one of the ICMP jobs.
        - alert: PingDown
          expr: probe_success{job=~"blackbox-icmp[0-9]"} == 0
          for: 15s
        # Two CertExpiry rules share the alert name and differ in threshold
        # and severity: under 30 days is a warning, under 14 days critical.
        - alert: CertExpiry
          expr: ((probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) / 86400) < 30
          for: 15s
          labels:
            severity: warning
          annotations:
            # summary: Certificates expiring in < 30 days
            summary: "{% raw %}Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}SSL certificate expires in 30 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
        - alert: CertExpiry
          expr: ((probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) / 86400) < 14
          for: 15s
          labels:
            severity: critical
          annotations:
            # summary: Certificates expiring in < 14 days
            summary: "{% raw %}Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}SSL certificate expires in 14 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
# Probe modules for the blackbox exporter. The module names here are the ones
# referenced by `params.module` in the blackbox-* scrape jobs.
blackbox_exporter_config:
  modules:
    # ICMP probes, one module per IP protocol version.
    icmpv4:
      prober: icmp
      timeout: 5s
      icmp:
        preferred_ip_protocol: ip4
    icmpv6:
      prober: icmp
      timeout: 5s
      icmp:
        preferred_ip_protocol: ip6
    # TCP connect probes, one module per IP protocol version.
    tcp_connect4:
      prober: tcp
      timeout: 5s
      tcp:
        preferred_ip_protocol: ip4
    tcp_connect6:
      prober: tcp
      timeout: 5s
      tcp:
        preferred_ip_protocol: ip6
    # HTTP GET probe.
    http_2xx:
      prober: http
      timeout: 5s
      http:
        method: GET
# route:
# receiver: pushover-receiver
# mute_time_intervals:
# - quiet_hours
# routes:
# - receiver: blackhole
# match:
# alertname: MaintenanceMode
# #- receiver: blackhole
# # match:
# # alertname: QuietHours
# receivers:
# - name: blackhole
# - name: pushover-receiver
# pushover_configs:
# - token: "{{ vault_pushover_token }}"
# user_key: "{{ vault_pushover_user_key }}"
# inhibit_rules:
# - source_match:
# alertname: MaintenanceMode
# #- source_match:
# # alertname: QuietHours
# time_intervals:
# - name: quiet_hours
# times:
# - start_time: 03:00
# end_time: 15:00
# Alertmanager configuration: inhibition, receivers, routing tree and the
# quiet-hours time interval.
alertmanager_config:
  inhibit_rules:
    # A firing MaintenanceMode alert is the inhibition source; no target
    # matchers or `equal` labels are given.
    - source_match:
        alertname: MaintenanceMode
  receivers:
    # Receiver with no integrations: alerts routed here are dropped.
    - name: blackhole
    - name: pushover-receiver
      pushover_configs:
        # Credentials are read from Ansible Vault instead of being committed
        # in plaintext.
        # NOTE(review): vault_pushover_token and vault_pushover_user_key must
        # be defined in the vault for this group — confirm. Rotate any
        # Pushover credentials that were ever committed to the repository.
        - token: "{{ vault_pushover_token }}"
          user_key: "{{ vault_pushover_user_key }}"
  route:
    receiver: pushover-receiver
    routes:
      - match:
          alertname: MaintenanceMode
        receiver: blackhole
      - match:
          alertname: PrometheusAlertmanagerE2eDeadManSwitch
        receiver: blackhole
      # Catch-all child route: everything else goes to Pushover, muted during
      # quiet_hours. The mute is set on this child route, not the root route.
      - receiver: pushover-receiver
        mute_time_intervals:
          - quiet_hours
  time_intervals:
    - name: quiet_hours
      time_intervals:
        - times:
            # Quoted so the loader keeps them as strings, not sexagesimal
            # integers.
            # NOTE(review): presumably UTC — confirm against the intended
            # local quiet hours.
            - start_time: "03:00"
              end_time: "15:00"
# Paths whose disk usage is tracked.
# NOTE(review): presumably exported through node_exporter by a `du` helper
# script — confirm against the role that reads this variable.
node_exporter_du_directories:
  - /var/log/syslog
  - /var/spool/rsyslog
  - /var/lib/influxdb
  - /var/lib/prometheus
  - /var/lib/loki
# Networks in the "loki" firewall ipset.
# NOTE(review): presumably the sources allowed to reach Loki — confirm.
firewall_ipset_loki:
  - 10.255.0.0/24
# Configuration for the karma alert dashboard.
karma_config:
  alertmanager:
    interval: 60s
    servers:
      # The local Alertmanager, accessed through karma's proxy with write
      # access (readonly: false).
      - name: local
        uri: http://localhost:9093
        timeout: 10s
        proxy: true
        readonly: false
        healthcheck:
          filters:
            # Health check keyed on the always-firing dead man's switch alert.
            dms:
              - alertname=PrometheusAlertmanagerE2eDeadManSwitch
  grid:
    sorting:
      order: label
      reverse: false
      label: cluster
      customValues:
        labels:
          # Numeric rank per severity value.
          severity:
            critical: 1
            warning: 2
            info: 3
    auto:
      order:
        - severity
  labels:
    color:
      custom:
        # Fixed colours for each severity label value.
        severity:
          - value: info
            color: "#87c4e0"
          - value: warning
            color: "#ffae42"
          - value: critical
            color: "#ff220c"
  alertAcknowledgement:
    enabled: true
# Thanos object-storage bucket settings, taken from the vault.
thanos_bucket_config: "{{ vault_thanos_bucket_config }}"
# Listen address for kthxbye: port 8081, no host part. Quoted because the
# value starts with a colon.
kthxbye_listen: ':8081'