---
# Variables for the monitoring host: Prometheus, Alertmanager, blackbox/SNMP
# exporters, karma, Loki, smokeping and Mimir settings. Values containing
# `{{ ... }}` are Jinja expressions; `{% raw %}` guards Prometheus' own
# `{{ ... }}` alert templates from being rendered by Jinja.

node_exporter_machine_roles:
  - monitor
  - stats

prometheus_web_external_url: https://monitor.kill0.net/prometheus
alertmanager_web_external_url: https://monitor.kill0.net/alertmanager
prometheus_web_route_prefix: /
alertmanager_web_route_prefix: /

prometheus_file_sd_config_d_files: []

# Prometheus server configuration (global settings, remote write, alerting
# targets, scrape jobs and rule files).
prometheus_config:
  global:
    scrape_interval: 15s
    external_labels:
      # Quoted: label values are strings, a bare 1 would be typed as an int.
      cluster: "1"
      region: dallas
      provider: linode
      replica: A
  remote_write:
    # Same port as the `mimir` scrape job below.
    - url: http://localhost:9009/api/v1/push
      headers:
        X-Scope-OrgID: kill0-net
  alerting:
    alertmanagers:
      - static_configs:
          - targets:
              - localhost:9093
  scrape_configs:
    - job_name: prometheus
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:9090
    - job_name: alertmanager
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:9093
    - job_name: pushgateway
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:9091
    - job_name: node
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:9100
            - mine0.kill0.net:9100
      relabel_configs:
        # Use the scrape address without its ":port" suffix as `instance`.
        - source_labels: [__address__]
          target_label: instance
          regex: '(.+):\d+'
          replacement: $1
    - job_name: mtail
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:3903
            - mine0.kill0.net:3903
      relabel_configs:
        # Use the scrape address without its ":port" suffix as `instance`.
        - source_labels: [__address__]
          target_label: instance
          regex: '(.+):\d+'
          replacement: $1
    - job_name: blackbox
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:9115
            - mine0.kill0.net:9115
    # Probe jobs: each listed target is moved into the `target` URL parameter
    # and the `instance` label, then the scrape itself is redirected to the
    # blackbox exporter on 127.0.0.1:9115.
    - job_name: blackbox-icmp4
      metrics_path: /probe
      params:
        module:
          - icmpv4
      static_configs:
        - targets:
            - dns.google
            - vpn-home.kill0.net
            - ping-home.kill0.net
            - 169.254.0.2
            - vpn1-sch.corp.nmi.com
            - gp-chi.ops.nmi.com
            - gp-ash.ops.nmi.com
            - 172.16.100.1
            - 172.16.100.2
            - 172.16.10.16
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
    - job_name: blackbox-icmp6
      metrics_path: /probe
      params:
        module:
          - icmpv6
      static_configs:
        - targets:
            - dns.google
            - ping-home.kill0.net
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
    - job_name: blackbox-tcp4
      metrics_path: /probe
      params:
        module:
          - tcp_connect4
      static_configs:
        - targets:
            - mine0.kill0.net:25565
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
    - job_name: blackbox-tcp6
      metrics_path: /probe
      params:
        module:
          - tcp_connect6
      static_configs:
        - targets:
            - mine0.kill0.net:25565
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
    - job_name: blackbox-http
      metrics_path: /probe
      params:
        module:
          - http_2xx
      static_configs:
        - targets:
            - https://cavi.cc
            - https://git.kill0.net
            - https://stats.kill0.net
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
    - job_name: thanos-sidecar
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10902"
    - job_name: thanos-query
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10904"
    - job_name: thanos-store
      scrape_interval: 5s
      static_configs:
        - targets:
            # NOTE(review): identical to the thanos-sidecar target above, so
            # this job scrapes the same endpoint twice -- confirm the store
            # gateway's real HTTP port.
            - "localhost:10902"
    - job_name: thanos-compact
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10912"
    - job_name: grafana
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:3002"
    # - job_name: process-exporter
    #   scrape_interval: 5s
    #   static_configs:
    #     - targets:
    #         - "localhost:9256"
    - job_name: loki
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:3100"
    - job_name: promtail
      scrape_interval: 5s
      static_configs:
        - targets:
            - jump0.kill0.net:9080
            - mine0.kill0.net:9080
    - job_name: gitea
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:3001
    - job_name: karma
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:8080
    - job_name: kthxbye
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:8081
    - job_name: smokeping
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:9374
    - job_name: mimir
      scrape_interval: 5s
      static_configs:
        - targets:
            - localhost:9009
    # SNMP job, anchored so `snmp-long` below can reuse it via a merge key.
    # Targets are rewritten the same way as the blackbox probe jobs, but the
    # scrape is redirected to 127.0.0.1:9116 (the `snmp_exporter` job's port).
    - &snmp_job
      job_name: snmp
      static_configs:
        - targets:
            - 172.16.100.1
            - 172.16.100.2
      metrics_path: /snmp
      params:
        auth: [public_v2]
        module:
          - if_mib
          - ip_mib
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: 127.0.0.1:9116
    - job_name: snmp_exporter
      static_configs:
        - targets:
            - localhost:9116
    # Copy of the snmp job with a 30s interval/timeout; the merge is shallow,
    # so the keys listed here replace the anchored ones. Currently no targets.
    - <<: *snmp_job
      job_name: snmp-long
      scrape_interval: 30s
      scrape_timeout: 30s
      static_configs:
        - targets: []
  rule_files:
    - rules.yaml

# Recording and alerting rules, grouped by subsystem.
prometheus_rules_config:
  groups:
    - name: alertmanager.rules
      rules:
        - alert: PrometheusAlertmanagerJobMissing
          expr: absent(up{job="alertmanager"})
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "{% raw %} Prometheus AlertManager job missing (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
        # Always-firing alert; routed to the `blackhole` receiver and used by
        # karma's `dms` healthcheck filter further down.
        - alert: PrometheusAlertmanagerE2eDeadManSwitch
          expr: vector(1)
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "{% raw %}Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
    - name: node.rules
      rules:
        # 1 while daylight saving time is in effect, 0 otherwise; consumed by
        # america_chicago_time below.
        - record: is_dst
          expr: |
            (vector(0) and (month() < 3 or month() > 11))
            or (vector(1) and (month() > 3 and month() < 11))
            or (vector(1) and month() == 3 and (day_of_month() - day_of_week()) >= 8 and absent(day_of_week() == 0 and day_of_month() >= 8 and day_of_month() <= 14))
            or (vector(1) and month() == 11 and (day_of_month() - day_of_week()) <= 0)
            or (vector(1) and month() == 3 and day_of_month() >= 8 and day_of_month() <= 14 and day_of_week() == 0 and hour() >= 8)
            or (vector(1) and month() == 11 and day_of_month() >= 1 and day_of_month() <= 7 and day_of_week() == 0 and hour() < 7)
            or vector(0)
        # Current time shifted back 6 hours, or 5 hours while is_dst is 1.
        - record: america_chicago_time
          expr: time() - ((6 * 3600) - (3600 * is_dst))
        - record: america_chicago_hour
          expr: hour(america_chicago_time)
        - alert: InstanceDown
          expr: up{job="node"} == 0
          for: 1m
        - alert: ThanosServiceDown
          expr: up{job=~"thanos.+"} == 0
          labels:
            severity: critical
        - alert: Down
          expr: up == 0
          labels:
            severity: critical
        # Fires above 80% usage. The exclusion matches on `fstype`, as
        # FileSystemReadOnly does: fuse.lxcfs and tmpfs are filesystem types,
        # so matching them against `mountpoint` excluded nothing.
        - alert: FileSystemUsage
          expr: >-
            ((node_filesystem_size_bytes{fstype!~"fuse.lxcfs|tmpfs"}
            - node_filesystem_free_bytes)
            / node_filesystem_size_bytes) > 0.80
          for: 1m
        - alert: FileSystemReadOnly
          expr: node_filesystem_readonly{fstype!~"fuse.lxcfs|tmpfs"} == 1
        - alert: RebootRequired
          expr: node_reboot_required > 0
          for: 15m
        - alert: AptUpgradesPending
          expr: apt_upgrades_pending > 0
          for: 1d
        # Last "system" restic run is more than 7200s (2h) old.
        - alert: ResticSystemJobLastRun
          expr: (time() - node_restic_last_run_time{restic_job="system"}) > 7200
          for: 2h
        # Last "minecraft" restic run is more than 86400s (1d) old.
        - alert: ResticMinecraftJobLastRun
          expr: (time() - node_restic_last_run_time{restic_job=~"minecraft"}) > 86400
          for: 2h
        - alert: MinecraftUnitInactive
          expr: node_systemd_unit_state{name="minecraft.service",state="inactive"} == 1
          for: 15m
        - alert: GiteaUnitInactive
          expr: node_systemd_unit_state{name="gitea.service",state="inactive"} == 1
          for: 15m
        # Source alert of the Alertmanager inhibit rule defined below.
        - alert: MaintenanceMode
          expr: maintenance_mode == 1
          for: 1m
    - name: blackbox.rules
      rules:
        - alert: ServiceDown
          expr: probe_success{job!~"blackbox-icmp[0-9]"} == 0
          for: 1m
        - alert: PingDown
          expr: probe_success{job=~"blackbox-icmp[0-9]"} == 0
          for: 15s
        # Two CertExpiry rules share one alert name and differ by threshold
        # and severity: < 30 days is a warning, < 14 days is critical.
        - alert: CertExpiry
          expr: ((probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) / 86400) < 30
          for: 15s
          labels:
            severity: warning
          annotations:
            # summary: Certificates expiring in < 30 days
            summary: "{% raw %}Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}SSL certificate expires in 30 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
        - alert: CertExpiry
          expr: ((probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) / 86400) < 14
          for: 15s
          labels:
            severity: critical
          annotations:
            # summary: Certificates expiring in < 14 days
            summary: "{% raw %}Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}SSL certificate expires in 14 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}{% endraw %}"
    - name: snmp.rules
      rules:
        # Interface is administratively up (1) but operationally down (2);
        # interfaces whose alias matches the laptop/notebook pattern are
        # skipped.
        - alert: PortDown
          expr: >-
            ifAdminStatus{ifName=~"(Gi|eth).+", ifAlias!~".+laptop|notebook.+"} == 1
            and ifOperStatus == 2
          for: 1m
        - alert: PortFlapping
          expr: changes(ifOperStatus{ifName=~"(Gi|eth).+"}[5m]) > 2

# Blackbox exporter probe modules; the names are the `module` params used by
# the blackbox-* scrape jobs above.
blackbox_exporter_config:
  modules:
    icmpv4:
      prober: icmp
      timeout: 5s
      icmp:
        preferred_ip_protocol: ip4
    icmpv6:
      prober: icmp
      timeout: 5s
      icmp:
        preferred_ip_protocol: ip6
    tcp_connect4:
      prober: tcp
      timeout: 5s
      tcp:
        preferred_ip_protocol: ip4
    tcp_connect6:
      prober: tcp
      timeout: 5s
      tcp:
        preferred_ip_protocol: ip6
    http_2xx:
      prober: http
      timeout: 5s
      http:
        method: GET

# Alertmanager configuration: inhibition, receivers, routing tree and the
# quiet-hours time interval.
alertmanager_config:
  inhibit_rules:
    # Only a source matcher is given (no target matcher, no `equal`).
    - source_match:
        alertname: MaintenanceMode
  receivers:
    # Receiver with no integrations: alerts routed here notify nobody.
    - name: blackhole
    - name: pushover-receiver
      pushover_configs:
        - token: "{{ vault_alertmanager_pushover_token }}"
          # NOTE(review): stored in clear text while the token above comes
          # from vault -- consider moving this to a vault variable as well.
          user_key: "28G1x3lT4oUtlck50R1H3e6j8kDHjb"
    - name: discord
      discord_configs:
        - webhook_url: "{{ vault_alertmanager_discord_webhook_url }}"
  route:
    repeat_interval: 24h
    receiver: pushover-receiver
    routes:
      - match:
          alertname: MaintenanceMode
        receiver: blackhole
      - match:
          alertname: PrometheusAlertmanagerE2eDeadManSwitch
        receiver: blackhole
      # Pushover is muted during quiet_hours; `continue: true` lets the
      # alert fall through to the discord route as well.
      - receiver: pushover-receiver
        mute_time_intervals:
          - quiet_hours
        continue: true
      - receiver: discord
  time_intervals:
    # NOTE(review): no `location` is set, so these times are presumably
    # interpreted as UTC -- confirm.
    - name: quiet_hours
      time_intervals:
        - times:
            - start_time: "03:00"
              end_time: "15:00"

node_exporter_du_directories:
  - /var/log/syslog
  - /var/spool/rsyslog
  - /var/lib/influxdb
  - /var/lib/prometheus
  - /var/lib/loki

firewall_ipset_loki:
  - 169.254.0.0/24

# karma (Alertmanager dashboard) configuration.
karma_config:
  alertmanager:
    interval: 60s
    servers:
      - name: local
        uri: http://localhost:9093
        timeout: 10s
        proxy: true
        readonly: false
        healthcheck:
          filters:
            dms:
              - alertname=PrometheusAlertmanagerE2eDeadManSwitch
  grid:
    sorting:
      order: label
      reverse: false
      label: cluster
      customValues:
        labels:
          severity:
            critical: 1
            warning: 2
            info: 3
    auto:
      order:
        - severity
  labels:
    color:
      custom:
        severity:
          - value: info
            color: "#87c4e0"
          - value: warning
            color: "#ffae42"
          - value: critical
            color: "#ff220c"
  alertAcknowledgement:
    enabled: true

thanos_bucket_config: "{{ vault_thanos_bucket_config }}"

# Quoted because the value starts with ":"; same port as the kthxbye scrape
# job above.
kthxbye_listen: ":8081"

loki_storage_config:
  tsdb_shipper:
    active_index_directory: "{{ loki_var_path }}/tsdb-index"
    cache_location: "{{ loki_var_path }}/tsdb-cache"
  gcs:
    bucket_name: kill0-net-loki
    service_account: "{{ vault_loki_gcs_service_account | string }}"

# Loki schema periods. `from` is quoted so it stays a string instead of being
# resolved to a YAML date.
loki_schema_config:
  configs:
    - from: "2023-08-11"
      index:
        period: 24h
        prefix: index_
      object_store: gcs
      schema: v12
      store: tsdb
    - from: "2024-04-10"
      index:
        period: 24h
        prefix: index_
      object_store: gcs
      schema: v13
      store: tsdb

loki_query_scheduler:
  max_outstanding_requests_per_tenant: 32768

loki_querier:
  max_concurrent: 16

loki_compactor:
  working_directory: "{{ loki_var_path }}/retention"
  delete_request_store: gcs
  compaction_interval: 10m
  retention_enabled: true
  retention_delete_delay: 2h
  retention_delete_worker_count: 150

loki_ruler:
  alertmanager_url: http://localhost:9093
  storage:
    type: gcs
    gcs:
      bucket_name: kill0-net-loki
      service_account: "{{ vault_loki_gcs_service_account | string }}"
  ring:
    kvstore:
      store: inmemory
  enable_api: true

# rsyslog drop-in: messages whose hostname is in the list are forwarded over
# TCP to localhost:1514, with a linked-list queue that is saved on shutdown
# and unlimited retries.
rsyslog_d:
  - name: loki
    priority: 10
    content: |
      if $hostname == [
        "ap0",
        "coresw0",
        "fw0",
        "power0",
        "172.16.100.1",
        "172.16.100.2"
      ] then {
        action(
          type="omfwd"
          target="localhost"
          port="1514"
          protocol="tcp"
          action.resumeretrycount="-1"
          queue.type="linkedlist"
          queue.size="1000000"
          queue.filename="loki-fwd"
          queue.saveonshutdown="on"
          keepalive="on"
          template="RSYSLOG_SyslogProtocol23Format"
          tcp_framing="octet-counted"
        )
      }

# smokeping_prober targets, one group per IP family.
smokeping_prober_config:
  targets:
    - hosts:
        - dns.google
        - vpn-home.kill0.net
        - ping-home.kill0.net
        - vpn1-sch.corp.nmi.com
        - gp-chi.ops.nmi.com
        - gp-ash.ops.nmi.com
        - 169.254.0.2
        - 172.16.100.1
        - 172.16.100.2
        - 172.16.10.16
      network: ip4
    - hosts:
        - dns.google
        - ping-home.kill0.net
        # NOTE(review): these embed 169.255.x.x while the rest of the file
        # uses 169.254.x.x -- confirm this is intentional.
        - 'fc00::ffff:169.255.0.2'
        - 'fc00::ffff:169.255.0.16'
      network: ip6

# Mimir object storage: one GCS bucket, separated by per-component prefixes.
mimir_common:
  storage:
    backend: gcs
    gcs:
      bucket_name: kill0-net-mimir
      service_account: "{{ vault_mimir_gcs_service_account | string }}"

mimir_blocks_storage:
  storage_prefix: blocks

mimir_alertmanager_storage:
  storage_prefix: alertmanager

mimir_ruler_storage:
  storage_prefix: ruler