2022-08-30 12:41:17 +00:00
---
# Roles assigned to this machine for node_exporter.
node_exporter_machine_roles: [monitor, stats]
# External URLs and route prefixes for the Prometheus and Alertmanager web UIs.
# The external URLs carry a sub-path while the route prefixes are "/".
# NOTE(review): presumably a reverse proxy strips the sub-path — confirm.
prometheus_web_external_url: https://monitor.kill0.net/prometheus
prometheus_web_route_prefix: /
alertmanager_web_external_url: https://monitor.kill0.net/alertmanager
alertmanager_web_route_prefix: /
2024-04-14 23:22:41 +00:00
# No file-based service-discovery files are defined (explicit empty list).
prometheus_file_sd_config_d_files: []
2022-08-30 12:41:17 +00:00
# Prometheus server configuration: global settings, remote write, alerting,
# scrape jobs and rule files.
prometheus_config:
  global:
    scrape_interval: 15s
    external_labels:
      # Quoted so the label value stays a string instead of a YAML integer.
      cluster: "1"
      region: dallas
      provider: linode
      replica: A
  remote_write:
    # Same host:port as the `mimir` scrape job below.
    - url: http://localhost:9009/api/v1/push
      headers:
        X-Scope-OrgID: kill0-net
  alerting:
    alertmanagers:
      - static_configs:
          - targets:
              - localhost:9093
  scrape_configs:
    - job_name: prometheus
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:9090
    - job_name: alertmanager
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:9093
    - job_name: pushgateway
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:9091
    - job_name: node
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:9100
            - mine0.kill0.net:9100
      relabel_configs:
        # Strip the ":port" suffix so `instance` is the bare host name.
        - source_labels: [__address__]
          target_label: instance
          regex: '(.+):\d+'
          replacement: $1
    - job_name: mtail
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:3903
            - mine0.kill0.net:3903
      relabel_configs:
        # Strip the ":port" suffix so `instance` is the bare host name.
        - source_labels: [__address__]
          target_label: instance
          regex: '(.+):\d+'
          replacement: $1
    # Scrapes the exporters on port 9115 directly; the blackbox-* jobs below
    # send their probes through 127.0.0.1:9115.
    - job_name: blackbox
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:9115
            - mine0.kill0.net:9115
    - job_name: blackbox-icmp4
      metrics_path: /probe
      params:
        module:
          - icmpv4
      static_configs:
        - targets:
            - dns.google
            - vpn-home.kill0.net
            - ping-home.kill0.net
            - 169.254.0.2
            - vpn1-sch.corp.nmi.com
            - gp-chi.ops.nmi.com
            - gp-ash.ops.nmi.com
            - 172.16.100.1
            - 172.16.100.2
            - 172.16.10.16
      relabel_configs:
        # Pass the listed target as the `target` URL parameter, keep it as the
        # `instance` label, then point the scrape at the exporter itself.
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
    - job_name: blackbox-icmp6
      metrics_path: /probe
      params:
        module:
          - icmpv6
      static_configs:
        - targets:
            - dns.google
            - ping-home.kill0.net
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
    - job_name: blackbox-tcp4
      metrics_path: /probe
      params:
        module:
          - tcp_connect4
      static_configs:
        - targets:
            - mine0.kill0.net:25565
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
    - job_name: blackbox-tcp6
      metrics_path: /probe
      params:
        module:
          - tcp_connect6
      static_configs:
        - targets:
            - mine0.kill0.net:25565
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
    - job_name: blackbox-http
      metrics_path: /probe
      params:
        module:
          - http_2xx
      static_configs:
        - targets:
            - https://cavi.cc
            - https://git.kill0.net
            - https://stats.kill0.net
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
    - job_name: thanos-sidecar
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10902"
    - job_name: thanos-query
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10904"
    # NOTE(review): same target as thanos-sidecar above (localhost:10902), so
    # both jobs scrape one endpoint — confirm the store's actual port.
    - job_name: thanos-store
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10902"
    - job_name: thanos-compact
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10912"
    - job_name: grafana
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:3002"
    # - job_name: process-exporter
    #   scrape_interval: 5s
    #   static_configs:
    #     - targets:
    #         - "localhost:9256"
    - job_name: loki
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:3100"
    - job_name: promtail
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:9080
            - mine0.kill0.net:9080
    - job_name: gitea
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:3001
    - job_name: karma
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:8080
    - job_name: kthxbye
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:8081
    - job_name: smokeping
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:9374
    - job_name: mimir
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:9009
    # Anchored so the `snmp-long` job at the end of this list can reuse these
    # settings through a merge key.
    - &snmp_job
      job_name: snmp
      static_configs:
        - targets:
            - 172.16.100.1
            - 172.16.100.2
      metrics_path: /snmp
      params:
        auth: [public_v2]
        module:
          - if_mib
          - ip_mib
      relabel_configs:
        # Same target-rewrite pattern as the blackbox-* jobs, aimed at the
        # address that the `snmp_exporter` job scrapes.
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9116
    - job_name: snmp_exporter
      static_configs:
        - targets:
            - localhost:9116
    # Copy of the snmp job with a 30s interval/timeout; the merge is shallow
    # and the keys below override the merged ones. The target list is empty.
    - <<: *snmp_job
      job_name: snmp-long
      scrape_interval: 30s
      scrape_timeout: 30s
      static_configs:
        - targets: []
  rule_files:
    - rules.yaml
# Prometheus recording and alerting rules, grouped by subsystem.
# Annotation templates are wrapped in {% raw %}…{% endraw %} so the
# `{{ $labels }}` / `{{ $value }}` placeholders survive Jinja rendering.
prometheus_rules_config:
  groups:
    - name: alertmanager.rules
      rules:
        - alert: PrometheusAlertmanagerJobMissing
          expr: absent(up{job="alertmanager"})
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "{% raw %} Prometheus AlertManager job missing (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
        # Always-firing alert (vector(1)); see its description annotation.
        - alert: PrometheusAlertmanagerE2eDeadManSwitch
          expr: vector(1)
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "{% raw %}Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
    - name: node.rules
      rules:
        # 1 or 0 depending on month, day-of-month, day-of-week and hour; used
        # as the daylight-saving flag by america_chicago_time below.
        - record: is_dst
          expr: |
            (vector(0) and (month() < 3 or month() > 11))
            or
            (vector(1) and (month() > 3 and month() < 11))
            or
            (vector(1) and month() == 3 and (day_of_month() - day_of_week()) >= 8 and absent(day_of_week() == 0 and day_of_month() >= 8 and day_of_month() <= 14))
            or
            (vector(1) and month() == 11 and (day_of_month() - day_of_week()) <= 0)
            or
            (vector(1) and month() == 3 and day_of_month() >= 8 and day_of_month() <= 14 and day_of_week() == 0 and hour() >= 8)
            or
            (vector(1) and month() == 11 and day_of_month() >= 1 and day_of_month() <= 7 and day_of_week() == 0 and hour() < 7)
            or
            vector(0)
        # time() shifted back 6 hours, or 5 hours while is_dst is 1.
        - record: america_chicago_time
          expr: time() - ((6 * 3600) - (3600 * is_dst))
        - record: america_chicago_hour
          expr: hour(america_chicago_time)
        - alert: InstanceDown
          expr: up{job="node"} == 0
          for: 1m
        - alert: ThanosServiceDown
          expr: up{job=~"thanos.+"} == 0
          labels:
            severity: critical
        - alert: Down
          expr: up == 0
          labels:
            severity: critical
        # Filters on `fstype`, like FileSystemReadOnly below: "fuse.lxcfs" and
        # "tmpfs" are filesystem types, so a `mountpoint` matcher excluded
        # nothing.
        - alert: FileSystemUsage
          expr: ((node_filesystem_size_bytes{fstype!~"fuse.lxcfs|tmpfs"} - node_filesystem_free_bytes) / node_filesystem_size_bytes) > 0.80
          for: 1m
        - alert: FileSystemReadOnly
          expr: node_filesystem_readonly{fstype!~"fuse.lxcfs|tmpfs"} == 1
        - alert: RebootRequired
          expr: node_reboot_required > 0
          for: 15m
        - alert: AptUpgradesPending
          expr: apt_upgrades_pending > 0
          for: 1d
        # Last run more than 7200 s (2 h) ago.
        - alert: ResticSystemJobLastRun
          expr: (time() - node_restic_last_run_time{restic_job="system"}) > 7200
          for: 2h
        # Last run more than 86400 s (24 h) ago.
        - alert: ResticMinecraftJobLastRun
          expr: (time() - node_restic_last_run_time{restic_job=~"minecraft"}) > 86400
          for: 2h
        - alert: MinecraftUnitInactive
          expr: node_systemd_unit_state{name="minecraft.service",state="inactive"} == 1
          for: 15m
        - alert: GiteaUnitInactive
          expr: node_systemd_unit_state{name="gitea.service",state="inactive"} == 1
          for: 15m
        - alert: MaintenanceMode
          expr: maintenance_mode == 1
          for: 1m
    - name: blackbox.rules
      rules:
        # Any failed probe that does not come from a blackbox-icmpN job.
        - alert: ServiceDown
          expr: probe_success{job!~"blackbox-icmp[0-9]"} == 0
          for: 1m
        - alert: PingDown
          expr: probe_success{job=~"blackbox-icmp[0-9]"} == 0
          for: 15s
        # Two CertExpiry alerts share one name: warning below 30 days,
        # critical below 14 days.
        - alert: CertExpiry
          expr: ((probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) / 86400) < 30
          for: 15s
          labels:
            severity: warning
          annotations:
            # summary: Certificates expiring in < 30 days
            summary: "{% raw %}Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}SSL certificate expires in 30 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
        - alert: CertExpiry
          expr: ((probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) / 86400) < 14
          for: 15s
          labels:
            severity: critical
          annotations:
            # summary: Certificates expiring in < 14 days
            summary: "{% raw %}Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}SSL certificate expires in 14 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
    - name: snmp.rules
      rules:
        # NOTE(review): label regexes are fully anchored, so the ifAlias
        # pattern only matches aliases ending in "laptop" or starting with
        # "notebook" — confirm that ".*(laptop|notebook).*" was not intended.
        - alert: PortDown
          expr: ifAdminStatus{ifName=~"(Gi|eth).+", ifAlias!~".+laptop|notebook.+"} == 1 and ifOperStatus == 2
          for: 1m
        - alert: PortFlapping
          expr: changes(ifOperStatus{ifName=~"(Gi|eth).+"}[5m]) > 2
2022-08-30 12:41:17 +00:00
# Blackbox exporter probe modules; the module names are the ones referenced
# by `params.module` in the blackbox-* scrape jobs.
# NOTE(review): `ip_protocol_fallback` is not set on the ip4/ip6-specific
# modules — confirm whether falling back to the other IP family is wanted.
blackbox_exporter_config:
  modules:
    icmpv4:
      prober: icmp
      timeout: 5s
      icmp:
        preferred_ip_protocol: ip4
    icmpv6:
      prober: icmp
      timeout: 5s
      icmp:
        preferred_ip_protocol: ip6
    tcp_connect4:
      prober: tcp
      timeout: 5s
      tcp:
        preferred_ip_protocol: ip4
    tcp_connect6:
      prober: tcp
      timeout: 5s
      tcp:
        preferred_ip_protocol: ip6
    http_2xx:
      prober: http
      timeout: 5s
      http:
        method: GET
# Alertmanager configuration: inhibition, receivers, routing tree and the
# quiet-hours time interval.
alertmanager_config:
  inhibit_rules:
    # Source side only: no target matchers or `equal` labels are given.
    - source_match:
        alertname: MaintenanceMode
  receivers:
    # Receiver with no notification configs.
    - name: blackhole
    - name: pushover-receiver
      pushover_configs:
        - token: "{{ vault_alertmanager_pushover_token }}"
          # NOTE(review): stored in plain text while the token above comes
          # from a vault_ variable — consider moving this one as well.
          user_key: 28G1x3lT4oUtlck50R1H3e6j8kDHjb
    - name: discord
      discord_configs:
        - webhook_url: "{{ vault_alertmanager_discord_webhook_url }}"
  route:
    repeat_interval: 24h
    receiver: pushover-receiver
    routes:
      - match:
          alertname: MaintenanceMode
        receiver: blackhole
      - match:
          alertname: PrometheusAlertmanagerE2eDeadManSwitch
        receiver: blackhole
      # Pushover is muted during quiet_hours; `continue: true` lets the alert
      # also reach the discord route that follows.
      - receiver: pushover-receiver
        mute_time_intervals:
          - quiet_hours
        continue: true
      - receiver: discord
  time_intervals:
    # NOTE(review): no location is given for these times — presumably UTC;
    # confirm.
    - name: quiet_hours
      time_intervals:
        - times:
            - start_time: "03:00"
              end_time: "15:00"
2022-08-31 18:30:59 +00:00
2022-08-30 12:41:17 +00:00
# Paths listed for node_exporter disk-usage reporting.
node_exporter_du_directories:
  - /var/log/syslog
  - /var/spool/rsyslog
  - /var/lib/influxdb
  - /var/lib/prometheus
  - /var/lib/loki
# Networks in the firewall ipset for Loki, in CIDR notation.
firewall_ipset_loki:
  - 169.254.0.0/24
2022-08-30 12:41:17 +00:00
# karma (Alertmanager dashboard) configuration.
karma_config:
  alertmanager:
    interval: 60s
    servers:
      - name: local
        uri: http://localhost:9093
        timeout: 10s
        proxy: true
        readonly: false
        healthcheck:
          filters:
            # Health check keyed on the always-firing dead-man-switch alert.
            dms:
              - alertname=PrometheusAlertmanagerE2eDeadManSwitch
  grid:
    sorting:
      order: label
      reverse: false
      label: cluster
      customValues:
        labels:
          # Numeric sort values per severity label value.
          severity:
            critical: 1
            warning: 2
            info: 3
    auto:
      order:
        - severity
  labels:
    color:
      custom:
        severity:
          - value: info
            color: "#87c4e0"
          - value: warning
            color: "#ffae42"
          - value: critical
            color: "#ff220c"
  alertAcknowledgement:
    enabled: true
# Thanos bucket configuration, taken from the corresponding vault_ variable.
thanos_bucket_config: "{{ vault_thanos_bucket_config }}"
# kthxbye listen address. Quoted because the value begins with ":".
# The port matches the `kthxbye` scrape target (localhost:8081).
kthxbye_listen: ":8081"
2024-04-14 23:22:41 +00:00
2024-04-14 23:23:28 +00:00
# Loki storage: TSDB shipper directories under loki_var_path, plus the GCS
# bucket and service account.
loki_storage_config:
  tsdb_shipper:
    active_index_directory: "{{ loki_var_path }}/tsdb-index"
    cache_location: "{{ loki_var_path }}/tsdb-cache"
  gcs:
    bucket_name: kill0-net-loki
    service_account: "{{ vault_loki_gcs_service_account | string }}"
# Loki schema periods. The `from` dates are quoted so YAML loaders keep them
# as strings instead of converting them to date objects.
loki_schema_config:
  configs:
    - from: "2023-08-11"
      index:
        period: 24h
        prefix: index_
      object_store: gcs
      schema: v12
      store: tsdb
    - from: "2024-04-10"
      index:
        period: 24h
        prefix: index_
      object_store: gcs
      schema: v13
      store: tsdb
# Loki query scheduler and querier limits.
loki_query_scheduler:
  max_outstanding_requests_per_tenant: 32768
loki_querier:
  max_concurrent: 16
# Loki compactor: retention is enabled, with delete requests stored in GCS.
loki_compactor:
  working_directory: "{{ loki_var_path }}/retention"
  delete_request_store: gcs
  compaction_interval: 10m
  retention_enabled: true
  retention_delete_delay: 2h
  retention_delete_worker_count: 150
# Loki ruler: sends alerts to the local Alertmanager and keeps its rule
# storage in the same GCS bucket as loki_storage_config.
loki_ruler:
  alertmanager_url: http://localhost:9093
  storage:
    type: gcs
    gcs:
      bucket_name: kill0-net-loki
      service_account: "{{ vault_loki_gcs_service_account | string }}"
  ring:
    kvstore:
      store: inmemory
  enable_api: true
# rsyslog drop-in snippets. The `loki` snippet forwards messages whose
# $hostname is in the listed set to localhost:1514 over TCP, using a
# linked-list queue that is saved on shutdown.
rsyslog_d:
  - name: loki
    priority: 10
    content: |
      if $hostname == [ "ap0", "coresw0", "fw0", "power0", "172.16.100.1", "172.16.100.2" ] then {
        action(
          type="omfwd"
          target="localhost"
          port="1514"
          protocol="tcp"
          action.resumeretrycount="-1"
          queue.type="linkedlist"
          queue.size="1000000"
          queue.filename="loki-fwd"
          queue.saveonshutdown="on"
          keepalive="on"
          template="RSYSLOG_SyslogProtocol23Format"
          tcp_framing="octet-counted"
        )
      }
2024-04-14 23:22:41 +00:00
# smokeping_prober targets, split into one IPv4 group and one IPv6 group.
smokeping_prober_config:
  targets:
    - hosts:
        - dns.google
        - vpn-home.kill0.net
        - ping-home.kill0.net
        - vpn1-sch.corp.nmi.com
        - gp-chi.ops.nmi.com
        - gp-ash.ops.nmi.com
        - 169.254.0.2
        - 172.16.100.1
        - 172.16.100.2
        - 172.16.10.16
      network: ip4
    - hosts:
        - dns.google
        - ping-home.kill0.net
        # NOTE(review): these embed 169.255.x.x while the IPv4 group uses
        # 169.254.0.2 — confirm the addresses are as intended.
        - fc00::ffff:169.255.0.2
        - fc00::ffff:169.255.0.16
      network: ip6
2024-04-14 23:23:41 +00:00
# Mimir common storage: GCS backend, with the bucket and service account.
mimir_common:
  storage:
    backend: gcs
    gcs:
      bucket_name: kill0-net-mimir
      service_account: "{{ vault_mimir_gcs_service_account | string }}"
# Storage prefixes for the Mimir blocks, alertmanager and ruler stores.
mimir_blocks_storage:
  storage_prefix: blocks
mimir_alertmanager_storage:
  storage_prefix: alertmanager
mimir_ruler_storage:
  storage_prefix: ruler