597 lines
17 KiB
YAML
597 lines
17 KiB
YAML
---
|
|
# Role names listed for node_exporter on this host; the consumer of this
# variable lives outside this file.
node_exporter_machine_roles: [monitor, stats]
|
|
|
|
# Externally visible URLs of the Prometheus and Alertmanager web UIs; both sit
# under a path on the same host name.
prometheus_web_external_url: "https://monitor.kill0.net/prometheus"
alertmanager_web_external_url: "https://monitor.kill0.net/alertmanager"

# Route prefixes are "/" even though the external URLs carry a path.
# NOTE(review): presumably a reverse proxy strips /prometheus and
# /alertmanager before forwarding — confirm.
prometheus_web_route_prefix: "/"
alertmanager_web_route_prefix: "/"

# Empty list: no file_sd entries are defined in this file.
prometheus_file_sd_config_d_files: []
|
|
|
|
# Prometheus server configuration: global settings, remote write, alerting
# target, scrape jobs and rule files.
prometheus_config:
  global:
    scrape_interval: 15s
    external_labels:
      # Quoted so the label value stays a string instead of being typed as an
      # integer by the YAML loader.
      cluster: "1"
      region: dallas
      provider: linode
      replica: A

  # Samples are pushed to localhost:9009, the same address the `mimir` job
  # below scrapes.
  remote_write:
    - url: http://localhost:9009/api/v1/push
      headers:
        X-Scope-OrgID: kill0-net

  alerting:
    alertmanagers:
      - static_configs:
          - targets:
              - localhost:9093

  scrape_configs:
    - job_name: prometheus
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:9090

    - job_name: alertmanager
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:9093

    - job_name: pushgateway
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:9091

    - job_name: node
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:9100
            - mine0.kill0.net:9100
      # Rewrite `instance` to the host part of the address (port removed).
      relabel_configs:
        - source_labels: [__address__]
          target_label: instance
          regex: (.+):\d+
          replacement: $1

    - job_name: mtail
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:3903
            - mine0.kill0.net:3903
      # Rewrite `instance` to the host part of the address (port removed).
      relabel_configs:
        - source_labels: [__address__]
          target_label: instance
          regex: (.+):\d+
          replacement: $1

    # Scrapes the blackbox exporters themselves; the probe jobs follow.
    - job_name: blackbox
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:9115
            - mine0.kill0.net:9115

    # Probe jobs: each listed target is copied into the `target` URL parameter
    # and the `instance` label, then the scrape address is replaced with the
    # local blackbox exporter.
    - job_name: blackbox-icmp4
      metrics_path: /probe
      params:
        module:
          - icmpv4
      static_configs:
        - targets:
            - dns.google
            - vpn-home.kill0.net
            - ping-home.kill0.net
            - 169.254.0.2
            - vpn1-sch.corp.nmi.com
            - gp-chi.ops.nmi.com
            - gp-ash.ops.nmi.com
            - 172.16.100.1
            - 172.16.100.2
            - 172.16.10.16
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.

    - job_name: blackbox-icmp6
      metrics_path: /probe
      params:
        module:
          - icmpv6
      static_configs:
        - targets:
            - dns.google
            - ping-home.kill0.net
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.

    - job_name: blackbox-tcp4
      metrics_path: /probe
      params:
        module:
          - tcp_connect4
      static_configs:
        - targets:
            - mine0.kill0.net:25565
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.

    - job_name: blackbox-tcp6
      metrics_path: /probe
      params:
        module:
          - tcp_connect6
      static_configs:
        - targets:
            - mine0.kill0.net:25565
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.

    - job_name: blackbox-http
      metrics_path: /probe
      params:
        module:
          - http_2xx
      static_configs:
        - targets:
            - https://cavi.cc
            - https://git.kill0.net
            - https://stats.kill0.net
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.

    - job_name: thanos-sidecar
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10902"

    - job_name: thanos-query
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10904"

    # NOTE(review): same target as thanos-sidecar (localhost:10902) — confirm
    # the store's actual HTTP port.
    - job_name: thanos-store
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10902"

    - job_name: thanos-compact
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10912"

    - job_name: grafana
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:3002"

    # - job_name: process-exporter
    #   scrape_interval: 5s
    #   static_configs:
    #     - targets:
    #         - "localhost:9256"

    - job_name: loki
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:3100"

    - job_name: promtail
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:9080
            - mine0.kill0.net:9080

    - job_name: gitea
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:3001

    - job_name: karma
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:8080

    - job_name: kthxbye
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:8081

    - job_name: smokeping
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:9374

    - job_name: mimir
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:9009

    # SNMP job; anchored so `snmp-long` below can reuse it through a merge key.
    # Targets are passed to the exporter at 127.0.0.1:9116 via the `target`
    # URL parameter.
    - &snmp_job
      job_name: snmp
      static_configs:
        - targets:
            - 172.16.100.1
            - 172.16.100.2
      metrics_path: /snmp
      params:
        auth: [public_v2]
        module:
          - if_mib
          - ip_mib
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9116

    # Scrapes the SNMP exporter process itself.
    - job_name: snmp_exporter
      static_configs:
        - targets:
            - localhost:9116

    # Copy of the snmp job with a 30s interval/timeout; its target list is
    # currently empty. The merge is shallow: keys set here replace the merged
    # ones wholesale.
    - <<: *snmp_job
      job_name: snmp-long
      scrape_interval: 30s
      scrape_timeout: 30s
      static_configs:
        - targets: []

  rule_files:
    - rules.yaml
|
|
|
|
# Prometheus recording and alerting rules, grouped by subsystem.
# Annotation strings are wrapped in {% raw %} so Jinja leaves the Prometheus
# {{ ... }} templates untouched.
prometheus_rules_config:
  groups:
    - name: alertmanager.rules
      rules:
        - alert: PrometheusAlertmanagerJobMissing
          expr: absent(up{job="alertmanager"})
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "{% raw %} Prometheus AlertManager job missing (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"

        # Always-firing alert (vector(1)) used as an end-to-end check.
        - alert: PrometheusAlertmanagerE2eDeadManSwitch
          expr: vector(1)
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "{% raw %}Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"

    - name: node.rules
      rules:
        # 1 while daylight saving time applies, 0 otherwise, derived from the
        # current month, day of month, day of week and hour.
        - record: is_dst
          expr: |
            (vector(0) and (month() < 3 or month() > 11))
            or
            (vector(1) and (month() > 3 and month() < 11))
            or
            (vector(1) and month() == 3 and (day_of_month() - day_of_week()) >= 8 and absent(day_of_week() == 0 and day_of_month() >= 8 and day_of_month() <= 14))
            or
            (vector(1) and month() == 11 and (day_of_month() - day_of_week()) <= 0)
            or
            (vector(1) and month() == 3 and day_of_month() >= 8 and day_of_month() <= 14 and day_of_week() == 0 and hour() >= 8)
            or
            (vector(1) and month() == 11 and day_of_month() >= 1 and day_of_month() <= 7 and day_of_week() == 0 and hour() < 7)
            or
            vector(0)

        # time() shifted back by 6 hours, or by 5 hours while is_dst is 1.
        - record: america_chicago_time
          expr: time() - ((6 * 3600) - (3600 * is_dst))

        - record: america_chicago_hour
          expr: hour(america_chicago_time)

        - alert: InstanceDown
          expr: up{job="node"} == 0
          for: 1m

        - alert: ThanosServiceDown
          expr: up{job=~"thanos.+"} == 0
          labels:
            severity: critical

        - alert: Down
          expr: up == 0
          labels:
            severity: critical

        # Fires above 80% usage. The exclusion filters on `fstype` (as
        # FileSystemReadOnly does): "fuse.lxcfs" and "tmpfs" are filesystem
        # types, so matching them against `mountpoint` never excluded anything.
        - alert: FileSystemUsage
          expr: ((node_filesystem_size_bytes{fstype!~"fuse.lxcfs|tmpfs"} - node_filesystem_free_bytes) / node_filesystem_size_bytes) > 0.80
          for: 1m

        - alert: FileSystemReadOnly
          expr: node_filesystem_readonly{fstype!~"fuse.lxcfs|tmpfs"} == 1

        - alert: RebootRequired
          expr: node_reboot_required > 0
          for: 15m

        - alert: AptUpgradesPending
          expr: apt_upgrades_pending > 0
          for: 1d

        # Last run of the "system" restic job is more than 7200 seconds old.
        - alert: ResticSystemJobLastRun
          expr: (time() - node_restic_last_run_time{restic_job="system"}) > 7200
          for: 2h

        # Last run of the "minecraft" restic job is more than 86400 seconds old.
        - alert: ResticMinecraftJobLastRun
          expr: (time() - node_restic_last_run_time{restic_job=~"minecraft"}) > 86400
          for: 2h

        - alert: MinecraftUnitInactive
          expr: node_systemd_unit_state{name="minecraft.service",state="inactive"} == 1
          for: 15m

        - alert: GiteaUnitInactive
          expr: node_systemd_unit_state{name="gitea.service",state="inactive"} == 1
          for: 15m

        # Alertmanager inhibits on and blackholes this alert name (see
        # alertmanager_config in this file).
        - alert: MaintenanceMode
          expr: maintenance_mode == 1
          for: 1m

    - name: blackbox.rules
      rules:
        # Any failed probe that is not from an ICMP job.
        - alert: ServiceDown
          expr: probe_success{job!~"blackbox-icmp[0-9]"} == 0
          for: 1m

        # Failed probes from the ICMP jobs.
        - alert: PingDown
          expr: probe_success{job=~"blackbox-icmp[0-9]"} == 0
          for: 15s

        # Two CertExpiry rules share one name and differ by severity:
        # warning below 30 days, critical below 14 days.
        - alert: CertExpiry
          expr: ((probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) / 86400) < 30
          for: 15s
          labels:
            severity: warning
          annotations:
            # summary: Certificates expiring in < 30 days
            summary: "{% raw %}Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}SSL certificate expires in 30 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"

        - alert: CertExpiry
          expr: ((probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) / 86400) < 14
          for: 15s
          labels:
            severity: critical
          annotations:
            # summary: Certificates expiring in < 14 days
            summary: "{% raw %}Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}SSL certificate expires in 14 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"

    - name: snmp.rules
      rules:
        # Interface is administratively up (1) but operationally down (2).
        # NOTE(review): label regexes are fully anchored, so the ifAlias
        # pattern excludes aliases matching ".+laptop" or "notebook.+" only —
        # confirm that is the intended exclusion.
        - alert: PortDown
          expr: ifAdminStatus{ifName=~"(Gi|eth).+", ifAlias!~".+laptop|notebook.+"} == 1 and ifOperStatus == 2
          for: 1m

        # More than two ifOperStatus changes within 5 minutes.
        - alert: PortFlapping
          expr: changes(ifOperStatus{ifName=~"(Gi|eth).+"}[5m]) > 2
|
|
|
|
# Blackbox exporter probe modules. Every module uses a 5s timeout; the ICMP
# and TCP modules come in an IPv4 and an IPv6 flavour. The module names are
# the ones referenced by the blackbox-* scrape jobs in prometheus_config.
blackbox_exporter_config:
  modules:
    icmpv4:
      prober: icmp
      timeout: 5s
      icmp: {preferred_ip_protocol: ip4}

    icmpv6:
      prober: icmp
      timeout: 5s
      icmp: {preferred_ip_protocol: ip6}

    tcp_connect4:
      prober: tcp
      timeout: 5s
      tcp: {preferred_ip_protocol: ip4}

    tcp_connect6:
      prober: tcp
      timeout: 5s
      tcp: {preferred_ip_protocol: ip6}

    http_2xx:
      prober: http
      timeout: 5s
      http: {method: GET}
|
|
|
|
# Alertmanager configuration: inhibition, receivers, routing tree and the
# quiet-hours time interval.
alertmanager_config:
  # A firing MaintenanceMode alert is the inhibition source; no target
  # matchers are given.
  inhibit_rules:
    - source_match:
        alertname: MaintenanceMode

  receivers:
    # Receiver with no integrations: alerts routed here go nowhere.
    - name: blackhole

    - name: pushover-receiver
      pushover_configs:
        - token: "{{ vault_alertmanager_pushover_token }}"
          # NOTE(review): the user key is stored in clear text while the token
          # comes from vault — consider moving it to a vaulted variable.
          user_key: "28G1x3lT4oUtlck50R1H3e6j8kDHjb"

    - name: discord
      discord_configs:
        - webhook_url: "{{ vault_alertmanager_discord_webhook_url }}"

  route:
    repeat_interval: 24h
    receiver: pushover-receiver
    routes:
      # MaintenanceMode and the dead man's switch are dropped.
      - match:
          alertname: MaintenanceMode
        receiver: blackhole
      - match:
          alertname: PrometheusAlertmanagerE2eDeadManSwitch
        receiver: blackhole

      # Everything else goes to Pushover (muted during quiet_hours) and,
      # because of `continue: true`, also on to the Discord route.
      - receiver: pushover-receiver
        mute_time_intervals:
          - quiet_hours
        continue: true
      - receiver: discord

  time_intervals:
    # NOTE(review): no location is set, so these times are presumably
    # interpreted as UTC — confirm.
    - name: quiet_hours
      time_intervals:
        - times:
            - start_time: "03:00"
              end_time: "15:00"
|
|
|
|
# Paths listed for disk-usage collection; the collector that reads this list
# is defined outside this file.
node_exporter_du_directories:
  - "/var/log/syslog"
  - "/var/spool/rsyslog"
  - "/var/lib/influxdb"
  - "/var/lib/prometheus"
  - "/var/lib/loki"
|
|
|
|
# Networks in the "loki" firewall ipset.
# NOTE(review): presumably the sources allowed to reach Loki — confirm against
# the firewall role.
firewall_ipset_loki:
  - "169.254.0.0/24"
|
|
|
|
# karma (Alertmanager dashboard) configuration.
karma_config:
  alertmanager:
    interval: 60s
    servers:
      - name: local
        uri: "http://localhost:9093"
        timeout: 10s
        proxy: true
        readonly: false
        # Health check keyed on the always-firing dead man's switch alert.
        healthcheck:
          filters:
            dms:
              - alertname=PrometheusAlertmanagerE2eDeadManSwitch

  grid:
    sorting:
      order: label
      reverse: false
      label: cluster
      customValues:
        labels:
          # Numeric values assigned to each severity for sorting.
          severity:
            critical: 1
            warning: 2
            info: 3
    auto:
      order:
        - severity

  labels:
    color:
      custom:
        # One fixed colour per severity value.
        severity:
          - value: info
            color: "#87c4e0"
          - value: warning
            color: "#ffae42"
          - value: critical
            color: "#ff220c"

  alertAcknowledgement:
    enabled: true
|
|
|
|
# Thanos bucket configuration, taken from the vaulted variable.
thanos_bucket_config: "{{ vault_thanos_bucket_config }}"

# kthxbye listen address: no host part, port 8081 (the port the `kthxbye`
# scrape job targets on localhost).
kthxbye_listen: ":8081"
|
|
|
|
# Loki storage: TSDB shipper directories under loki_var_path, plus the GCS
# bucket and its service account (from vault).
loki_storage_config:
  tsdb_shipper:
    active_index_directory: "{{ loki_var_path }}/tsdb-index"
    cache_location: "{{ loki_var_path }}/tsdb-cache"
  gcs:
    bucket_name: kill0-net-loki
    service_account: "{{ vault_loki_gcs_service_account | string }}"
|
|
|
|
# Loki schema periods. Both use the tsdb store on GCS with 24h index periods;
# the second entry moves from schema v12 to v13.
# The `from` dates are quoted so YAML 1.1 loaders keep them as strings instead
# of resolving them to date objects.
loki_schema_config:
  configs:
    - from: "2023-08-11"
      index:
        period: 24h
        prefix: index_
      object_store: gcs
      schema: v12
      store: tsdb

    - from: "2024-04-10"
      index:
        period: 24h
        prefix: index_
      object_store: gcs
      schema: v13
      store: tsdb
|
|
|
|
# Loki query scheduler: per-tenant cap on outstanding requests.
loki_query_scheduler: {max_outstanding_requests_per_tenant: 32768}

# Loki querier concurrency.
loki_querier: {max_concurrent: 16}
|
|
|
|
# Loki compactor: runs every 10 minutes with retention enabled; delete
# requests are stored in GCS and the working directory is under loki_var_path.
loki_compactor:
  working_directory: "{{ loki_var_path }}/retention"
  delete_request_store: gcs
  compaction_interval: 10m
  retention_enabled: true
  retention_delete_delay: 2h
  retention_delete_worker_count: 150
|
|
|
|
# Loki ruler: sends alerts to the local Alertmanager, keeps rules in the same
# GCS bucket as loki_storage_config, uses an in-memory ring and enables the
# ruler API.
loki_ruler:
  alertmanager_url: "http://localhost:9093"
  storage:
    type: gcs
    gcs:
      bucket_name: kill0-net-loki
      service_account: "{{ vault_loki_gcs_service_account | string }}"
  ring:
    kvstore:
      store: inmemory
  enable_api: true
|
|
|
|
# rsyslog drop-in definitions (name, priority and file content).
rsyslog_d:
  # Forwards messages from the listed host names/addresses to localhost:1514
  # over TCP with octet-counted framing and the RSYSLOG_SyslogProtocol23Format
  # template. The action has a linked-list queue that is saved on shutdown and
  # an unlimited retry count (-1).
  - name: loki
    priority: 10
    content: |
      if $hostname == [ "ap0", "coresw0", "fw0", "power0", "172.16.100.1", "172.16.100.2" ] then {
        action(
          type="omfwd"
          target="localhost"
          port="1514"
          protocol="tcp"
          action.resumeretrycount="-1"
          queue.type="linkedlist"
          queue.size="1000000"
          queue.filename="loki-fwd"
          queue.saveonshutdown="on"
          keepalive="on"
          template="RSYSLOG_SyslogProtocol23Format"
          tcp_framing="octet-counted"
        )
      }
|
|
|
|
# smokeping_prober targets, one group per address family.
smokeping_prober_config:
  targets:
    # IPv4 group.
    - network: ip4
      hosts:
        - dns.google
        - vpn-home.kill0.net
        - ping-home.kill0.net
        - vpn1-sch.corp.nmi.com
        - gp-chi.ops.nmi.com
        - gp-ash.ops.nmi.com
        - 169.254.0.2
        - 172.16.100.1
        - 172.16.100.2
        - 172.16.10.16

    # IPv6 group.
    - network: ip6
      hosts:
        - dns.google
        - ping-home.kill0.net
        - fc00::ffff:169.255.0.2
        - fc00::ffff:169.255.0.16
|
|
|
|
# Mimir common storage: GCS backend, bucket name and service account (from
# vault).
mimir_common:
  storage:
    backend: gcs
    gcs:
      bucket_name: kill0-net-mimir
      service_account: "{{ vault_mimir_gcs_service_account | string }}"
|
|
|
|
# Storage prefixes for Mimir's blocks, alertmanager and ruler storage.
mimir_blocks_storage: {storage_prefix: blocks}

mimir_alertmanager_storage: {storage_prefix: alertmanager}

mimir_ruler_storage: {storage_prefix: ruler}
|