ansible/group_vars/monitor_servers/main.yaml

597 lines
17 KiB
YAML

---
node_exporter_machine_roles:
- monitor
- stats
prometheus_web_external_url: https://monitor.kill0.net/prometheus
alertmanager_web_external_url: https://monitor.kill0.net/alertmanager
prometheus_web_route_prefix: /
alertmanager_web_route_prefix: /
prometheus_file_sd_config_d_files: []
prometheus_config:
global:
scrape_interval: 15s
external_labels:
cluster: 1
region: dallas
provider: linode
replica: A
remote_write:
- url: http://localhost:9009/api/v1/push
headers:
X-Scope-OrgID: kill0-net
alerting:
alertmanagers:
- static_configs:
- targets:
- localhost:9093
scrape_configs:
- job_name: prometheus
scrape_interval: 5s
static_configs:
- targets:
- localhost:9090
- job_name: alertmanager
scrape_interval: 5s
static_configs:
- targets:
- localhost:9093
- job_name: pushgateway
scrape_interval: 5s
static_configs:
- targets:
- jump0.kill0.net:9091
- job_name: node
scrape_interval: 5s
static_configs:
- targets:
- jump0.kill0.net:9100
- mine0.kill0.net:9100
relabel_configs:
- source_labels: [__address__]
target_label: instance
regex: (.+):\d+
replacement: $1
- job_name: mtail
scrape_interval: 5s
static_configs:
- targets:
- jump0.kill0.net:3903
- mine0.kill0.net:3903
relabel_configs:
- source_labels: [__address__]
target_label: instance
regex: (.+):\d+
replacement: $1
- job_name: blackbox
scrape_interval: 5s
static_configs:
- targets:
- jump0.kill0.net:9115
- mine0.kill0.net:9115
- job_name: blackbox-icmp4
metrics_path: /probe
params:
module:
- icmpv4
static_configs:
- targets:
- dns.google
- vpn-home.kill0.net
- ping-home.kill0.net
- 169.254.0.2
- vpn1-sch.corp.nmi.com
- gp-chi.ops.nmi.com
- gp-ash.ops.nmi.com
- 172.16.100.1
- 172.16.100.2
- 172.16.10.16
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9115 # The blackbox exporter's real hostname:port.
- job_name: blackbox-icmp6
metrics_path: /probe
params:
module:
- icmpv6
static_configs:
- targets:
- dns.google
- ping-home.kill0.net
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9115 # The blackbox exporter's real hostname:port.
- job_name: blackbox-tcp4
metrics_path: /probe
params:
module:
- tcp_connect4
static_configs:
- targets:
- mine0.kill0.net:25565
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9115 # The blackbox exporter's real hostname:port.
- job_name: blackbox-tcp6
metrics_path: /probe
params:
module:
- tcp_connect6
static_configs:
- targets:
- mine0.kill0.net:25565
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9115 # The blackbox exporter's real hostname:port.
- job_name: blackbox-http
metrics_path: /probe
params:
module:
- http_2xx
static_configs:
- targets:
- https://cavi.cc
- https://git.kill0.net
- https://stats.kill0.net
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9115 # The blackbox exporter's real hostname:port.
- job_name: thanos-sidecar
scrape_interval: 5s
static_configs:
- targets:
- "localhost:10902"
- job_name: thanos-query
scrape_interval: 5s
static_configs:
- targets:
- "localhost:10904"
- job_name: thanos-store
scrape_interval: 5s
static_configs:
- targets:
- "localhost:10902"
- job_name: thanos-compact
scrape_interval: 5s
static_configs:
- targets:
- "localhost:10912"
- job_name: grafana
scrape_interval: 5s
static_configs:
- targets:
- "localhost:3002"
# - job_name: process-exporter
# scrape_interval: 5s
# static_configs:
# - targets:
# - "localhost:9256"
- job_name: loki
scrape_interval: 5s
static_configs:
- targets:
- "localhost:3100"
- job_name: promtail
scrape_interval: 5s
static_configs:
- targets:
- jump0.kill0.net:9080
- mine0.kill0.net:9080
- job_name: gitea
scrape_interval: 5s
static_configs:
- targets:
- localhost:3001
- job_name: karma
scrape_interval: 5s
static_configs:
- targets:
- localhost:8080
- job_name: kthxbye
scrape_interval: 5s
static_configs:
- targets:
- localhost:8081
- job_name: smokeping
scrape_interval: 5s
static_configs:
- targets:
- localhost:9374
- job_name: mimir
scrape_interval: 5s
static_configs:
- targets:
- localhost:9009
- &snmp_job
job_name: snmp
static_configs:
- targets:
- 172.16.100.1
- 172.16.100.2
metrics_path: /snmp
params:
auth: [public_v2]
module:
- if_mib
- ip_mib
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 127.0.0.1:9116
- job_name: snmp_exporter
static_configs:
- targets:
- localhost:9116
- <<: *snmp_job
job_name: snmp-long
scrape_interval: 30s
scrape_timeout: 30s
static_configs:
- targets: []
rule_files:
- rules.yaml
prometheus_rules_config:
groups:
- name: alertmanager.rules
rules:
- alert: PrometheusAlertmanagerJobMissing
expr: absent(up{job="alertmanager"})
for: 0m
labels:
severity: warning
annotations:
summary: "{% raw %} Prometheus AlertManager job missing (instance {{ $labels.instance }}){% endraw %}"
description: "{% raw %}A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
- alert: PrometheusAlertmanagerE2eDeadManSwitch
expr: vector(1)
for: 0m
labels:
severity: critical
annotations:
summary: "{% raw %}Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}){% endraw %}"
description: "{% raw %}Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
- name: node.rules
rules:
- record: is_dst
expr: |
(vector(0) and (month() < 3 or month() > 11))
or
(vector(1) and (month() > 3 and month() < 11))
or
(vector(1) and month() == 3 and (day_of_month() - day_of_week()) >= 8 and absent(day_of_week() == 0 and day_of_month() >= 8 and day_of_month() <= 14))
or
(vector(1) and month() == 11 and (day_of_month() - day_of_week()) <= 0)
or
(vector(1) and month() == 3 and day_of_month() >= 8 and day_of_month() <= 14 and day_of_week() == 0 and hour() >= 8)
or
(vector(1) and month() == 11 and day_of_month() >= 1 and day_of_month() <= 7 and day_of_week() == 0 and hour() < 7)
or
vector(0)
- record: america_chicago_time
expr: time() - ((6 * 3600) - (3600 * is_dst))
- record: america_chicago_hour
expr: hour(america_chicago_time)
- alert: InstanceDown
expr: up{job="node"} == 0
for: 1m
- alert: ThanosServiceDown
expr: up{job=~"thanos.+"} == 0
labels:
severity: critical
- alert: Down
expr: up == 0
labels:
severity: critical
- alert: FileSystemUsage
expr: ((node_filesystem_size_bytes{mountpoint!~"fuse.lxcfs|tmpfs"} - node_filesystem_free_bytes) / node_filesystem_size_bytes) > 0.80
for: 1m
- alert: FileSystemReadOnly
expr: node_filesystem_readonly{fstype!~"fuse.lxcfs|tmpfs"} == 1
- alert: RebootRequired
expr: node_reboot_required > 0
for: 15m
- alert: AptUpgradesPending
expr: apt_upgrades_pending > 0
for: 1d
- alert: ResticSystemJobLastRun
expr: (time() - node_restic_last_run_time{restic_job="system"}) > 7200
for: 2h
- alert: ResticMinecraftJobLastRun
expr: (time() - node_restic_last_run_time{restic_job=~"minecraft"}) > 86400
for: 2h
- alert: MinecraftUnitInactive
expr: node_systemd_unit_state{name="minecraft.service",state="inactive"} == 1
for: 15m
- alert: GiteaUnitInactive
expr: node_systemd_unit_state{name="gitea.service",state="inactive"} == 1
for: 15m
- alert: MaintenanceMode
expr: maintenance_mode == 1
for: 1m
- name: blackbox.rules
rules:
- alert: ServiceDown
expr: probe_success{job!~"blackbox-icmp[0-9]"} == 0
for: 1m
- alert: PingDown
expr: probe_success{job=~"blackbox-icmp[0-9]"} == 0
for: 15s
- alert: CertExpiry
expr: ((probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) / 86400) < 30
for: 15s
labels:
severity: warning
annotations:
# summary: Certificates expiring in < 30 days
summary: "{% raw %}Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}){% endraw %}"
description: "{% raw %}SSL certificate expires in 30 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
- alert: CertExpiry
expr: ((probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) / 86400) < 14
for: 15s
labels:
severity: critical
annotations:
# summary: Certificates expiring in < 14 days
summary: "{% raw %}Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}){% endraw %}"
description: "{% raw %}SSL certificate expires in 14 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
- name: snmp.rules
rules:
- alert: PortDown
expr: ifAdminStatus{ifName=~"(Gi|eth).+", ifAlias!~".+laptop|notebook.+"} == 1 and ifOperStatus == 2
for: 1m
- alert: PortFlapping
expr: changes(ifOperStatus{ifName=~"(Gi|eth).+"}[5m]) > 2
blackbox_exporter_config:
modules:
icmpv4:
prober: icmp
timeout: 5s
icmp:
preferred_ip_protocol: ip4
icmpv6:
prober: icmp
timeout: 5s
icmp:
preferred_ip_protocol: ip6
tcp_connect4:
prober: tcp
timeout: 5s
tcp:
preferred_ip_protocol: ip4
tcp_connect6:
prober: tcp
timeout: 5s
tcp:
preferred_ip_protocol: ip6
http_2xx:
prober: http
timeout: 5s
http:
method: GET
alertmanager_config:
inhibit_rules:
- source_match:
alertname: MaintenanceMode
receivers:
- name: blackhole
- name: pushover-receiver
pushover_configs:
- token: "{{ vault_alertmanager_pushover_token }}"
user_key: 28G1x3lT4oUtlck50R1H3e6j8kDHjb
- name: discord
discord_configs:
- webhook_url: "{{ vault_alertmanager_discord_webhook_url }}"
route:
repeat_interval: 24h
receiver: pushover-receiver
routes:
- match:
alertname: MaintenanceMode
receiver: blackhole
- match:
alertname: PrometheusAlertmanagerE2eDeadManSwitch
receiver: blackhole
- receiver: pushover-receiver
mute_time_intervals:
- quiet_hours
continue: true
- receiver: discord
time_intervals:
- name: quiet_hours
time_intervals:
- times:
- start_time: "03:00"
end_time: "15:00"
node_exporter_du_directories:
- /var/log/syslog
- /var/spool/rsyslog
- /var/lib/influxdb
- /var/lib/prometheus
- /var/lib/loki
firewall_ipset_loki:
- 169.254.0.0/24
karma_config:
alertmanager:
interval: 60s
servers:
- name: local
uri: http://localhost:9093
timeout: 10s
proxy: true
readonly: false
healthcheck:
filters:
dms:
- alertname=PrometheusAlertmanagerE2eDeadManSwitch
grid:
sorting:
order: label
reverse: false
label: cluster
customValues:
labels:
severity:
critical: 1
warning: 2
info: 3
auto:
order:
- severity
labels:
color:
custom:
severity:
- value: info
color: "#87c4e0"
- value: warning
color: "#ffae42"
- value: critical
color: "#ff220c"
alertAcknowledgement:
enabled: true
thanos_bucket_config: "{{ vault_thanos_bucket_config }}"
kthxbye_listen: :8081
loki_storage_config:
tsdb_shipper:
active_index_directory: "{{ loki_var_path }}/tsdb-index"
cache_location: "{{ loki_var_path }}/tsdb-cache"
gcs:
bucket_name: kill0-net-loki
service_account: "{{ vault_loki_gcs_service_account | string }}"
loki_schema_config:
configs:
- from: 2023-08-11
index:
period: 24h
prefix: index_
object_store: gcs
schema: v12
store: tsdb
- from: 2024-04-10
index:
period: 24h
prefix: index_
object_store: gcs
schema: v13
store: tsdb
loki_query_scheduler:
max_outstanding_requests_per_tenant: 32768
loki_querier:
max_concurrent: 16
loki_compactor:
working_directory: "{{ loki_var_path }}/retention"
delete_request_store: gcs
compaction_interval: 10m
retention_enabled: true
retention_delete_delay: 2h
retention_delete_worker_count: 150
loki_ruler:
alertmanager_url: http://localhost:9093
storage:
type: gcs
gcs:
bucket_name: kill0-net-loki
service_account: "{{ vault_loki_gcs_service_account | string }}"
ring:
kvstore:
store: inmemory
enable_api: true
rsyslog_d:
- name: loki
priority: 10
content: |
if $hostname == [ "ap0", "coresw0", "fw0", "power0", "172.16.100.1", "172.16.100.2" ] then {
action(
type="omfwd"
target="localhost"
port="1514"
protocol="tcp"
action.resumeretrycount="-1"
queue.type="linkedlist"
queue.size="1000000"
queue.filename="loki-fwd"
queue.saveonshutdown="on"
keepalive="on"
template="RSYSLOG_SyslogProtocol23Format"
tcp_framing="octet-counted"
)
}
smokeping_prober_config:
targets:
- hosts:
- dns.google
- vpn-home.kill0.net
- ping-home.kill0.net
- vpn1-sch.corp.nmi.com
- gp-chi.ops.nmi.com
- gp-ash.ops.nmi.com
- 169.254.0.2
- 172.16.100.1
- 172.16.100.2
- 172.16.10.16
network: ip4
- hosts:
- dns.google
- ping-home.kill0.net
- fc00::ffff:169.255.0.2
- fc00::ffff:169.255.0.16
network: ip6
mimir_common:
storage:
backend: gcs
gcs:
bucket_name: kill0-net-mimir
service_account: "{{ vault_mimir_gcs_service_account | string }}"
mimir_blocks_storage:
storage_prefix: blocks
mimir_alertmanager_storage:
storage_prefix: alertmanager
mimir_ruler_storage:
storage_prefix: ruler