---
# Machine roles declared for this host. How they are used is defined by
# whatever consumes the node_exporter_* variables.
node_exporter_machine_roles:
  - monitor
  - stats

# Public URLs under which Prometheus and Alertmanager are reached.
prometheus_web_external_url: https://monitor.kill0.net/prometheus
alertmanager_web_external_url: https://monitor.kill0.net/alertmanager
# Both services serve from "/" even though the external URLs carry a path
# prefix. NOTE(review): presumably a reverse proxy strips /prometheus and
# /alertmanager before forwarding — confirm against the proxy configuration.
prometheus_web_route_prefix: /
alertmanager_web_route_prefix: /

# Prometheus server configuration: global settings, alerting targets, scrape
# jobs and rule files.
prometheus_config:
  global:
    # Default scrape interval; jobs that set their own scrape_interval
    # override it.
    scrape_interval: 15s
    # Labels identifying this Prometheus instance.
    external_labels:
      cluster: 1
      region: dallas
      provider: linode
      replica: A
  alerting:
    # Alerts are delivered to a single Alertmanager at localhost:9093.
    alertmanagers:
      - static_configs:
          - targets:
              - localhost:9093
  # Scrape jobs. The jobs in this first group scrape their targets directly
  # every 5s instead of using the global interval.
  scrape_configs:
    - job_name: prometheus
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:9090"
    - job_name: alertmanager
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:9093"
    - job_name: pushgateway
      scrape_interval: 5s
      static_configs:
        - targets:
            - "jump0.kill0.net:9091"
    - job_name: node
      scrape_interval: 5s
      static_configs:
        - targets:
            - "jump0.kill0.net:9100"
            - "mine0.kill0.net:9100"
      relabel_configs:
        # Strip the ":port" suffix so `instance` holds only the host part.
        - source_labels:
            - __address__
          regex: '(.+):\d+'
          replacement: '$1'
          target_label: instance
    - job_name: mtail
      scrape_interval: 5s
      static_configs:
        - targets:
            - "jump0.kill0.net:3903"
            - "mine0.kill0.net:3903"
      relabel_configs:
        # Strip the ":port" suffix so `instance` holds only the host part.
        - source_labels:
            - __address__
          regex: '(.+):\d+'
          replacement: '$1'
          target_label: instance
    # Scrapes the exporters on port 9115 directly (default metrics path), as
    # opposed to the /probe jobs that follow.
    - job_name: blackbox
      scrape_interval: 5s
      static_configs:
        - targets:
            - "jump0.kill0.net:9115"
            - "mine0.kill0.net:9115"
    # Probe jobs. Each one lists the endpoints to probe as "targets" and then
    # relabels so that:
    #   1. the listed target becomes the `target` URL parameter,
    #   2. `instance` carries the probed target,
    #   3. the scrape itself goes to the exporter at 127.0.0.1:9115.
    # No scrape_interval is set here, so the global interval applies.
    - job_name: blackbox-icmp4
      metrics_path: /probe
      params:
        module:
          - icmpv4
      static_configs:
        - targets:
            - "dns.google"
            - "vpn-home.kill0.net"
            - "ping-home.kill0.net"
            - "10.255.0.16"
            - "vpn1-sch.corp.nmi.com"
            - "vpn-chi.ops.nmi.com"
            - "vpn-ash.ops.nmi.com"
      relabel_configs:
        - source_labels:
            - __address__
          target_label: __param_target
        - source_labels:
            - __param_target
          target_label: instance
        # The blackbox exporter's real hostname:port.
        - replacement: "127.0.0.1:9115"
          target_label: __address__
    - job_name: blackbox-icmp6
      metrics_path: /probe
      params:
        module:
          - icmpv6
      static_configs:
        - targets:
            - "dns.google"
            - "ping-home.kill0.net"
      relabel_configs:
        - source_labels:
            - __address__
          target_label: __param_target
        - source_labels:
            - __param_target
          target_label: instance
        # The blackbox exporter's real hostname:port.
        - replacement: "127.0.0.1:9115"
          target_label: __address__
    - job_name: blackbox-tcp4
      metrics_path: /probe
      params:
        module:
          - tcp_connect4
      static_configs:
        - targets:
            - "mine0.kill0.net:25565"
      relabel_configs:
        - source_labels:
            - __address__
          target_label: __param_target
        - source_labels:
            - __param_target
          target_label: instance
        # The blackbox exporter's real hostname:port.
        - replacement: "127.0.0.1:9115"
          target_label: __address__
    - job_name: blackbox-tcp6
      metrics_path: /probe
      params:
        module:
          - tcp_connect6
      static_configs:
        - targets:
            - "mine0.kill0.net:25565"
      relabel_configs:
        - source_labels:
            - __address__
          target_label: __param_target
        - source_labels:
            - __param_target
          target_label: instance
        # The blackbox exporter's real hostname:port.
        - replacement: "127.0.0.1:9115"
          target_label: __address__
    - job_name: blackbox-http
      metrics_path: /probe
      params:
        module:
          - http_2xx
      static_configs:
        - targets:
            - "https://cavi.cc"
            - "https://git.kill0.net"
            - "https://stats.kill0.net"
      relabel_configs:
        - source_labels:
            - __address__
          target_label: __param_target
        - source_labels:
            - __param_target
          target_label: instance
        # The blackbox exporter's real hostname:port.
        - replacement: "127.0.0.1:9115"
          target_label: __address__
    # Thanos components and Grafana, all scraped on localhost every 5s.
    - job_name: thanos-sidecar
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10902"
    - job_name: thanos-query
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10904"
    # NOTE(review): this job scrapes localhost:10902, the same target as
    # thanos-sidecar. Every other Thanos job here uses a distinct port, so this
    # looks like a copy/paste slip — confirm the store's HTTP port. As written,
    # up{job="thanos-store"} reflects whatever listens on 10902.
    - job_name: thanos-store
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10902"
    - job_name: thanos-compact
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:10912"
    - job_name: grafana
      scrape_interval: 5s
      static_configs:
        - targets:
            - "localhost:3002"
  # Rule file loaded by Prometheus. NOTE(review): presumably rendered from
  # prometheus_rules_config — confirm against the role's templates.
  rule_files:
    - rules.yaml

# Prometheus alerting and recording rules.
#
# Annotation strings are wrapped in {% raw %}...{% endraw %} so that Jinja
# leaves the Prometheus template expressions ({{ $labels }}, {{ $value }})
# untouched.
prometheus_rules_config:
  groups:
    - name: alertmanager.rules
      rules:
        # Fires immediately when no `up` series exists for the alertmanager
        # job.
        - alert: PrometheusAlertmanagerJobMissing
          expr: absent(up{job="alertmanager"})
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "{% raw %}Prometheus AlertManager job missing (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}{% endraw %}"
        # Always-firing alert (vector(1)) used as an end-to-end check of the
        # Prometheus -> Alertmanager path.
        - alert: PrometheusAlertmanagerE2eDeadManSwitch
          expr: vector(1)
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "{% raw %}Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}{% endraw %}"
    - name: node.rules
      rules:
        # is_dst evaluates to 1 or 0. PromQL date functions work in UTC. The
        # clauses, in order:
        #   - 0 for months before March or after November;
        #   - 1 for months strictly between March and November;
        #   - 1 in March once the most recent Sunday (day_of_month -
        #     day_of_week) falls on or after the 8th, excluding the Sunday
        #     dated 8-14 itself;
        #   - 1 in November while no Sunday has occurred yet this month;
        #   - 1 on the March Sunday dated 8-14 from 08:00 UTC onwards;
        #   - 1 on the November Sunday dated 1-7 before 07:00 UTC;
        #   - 0 otherwise.
        # NOTE(review): this matches the US rule (second Sunday of March to
        # first Sunday of November, switching at 02:00 local) — confirm.
        - record: is_dst
          expr: |
            (vector(0) and (month() < 3 or month() > 11))
            or
            (vector(1) and (month() > 3 and month() < 11))
            or
            (vector(1) and month() == 3 and (day_of_month() - day_of_week()) >= 8 and absent(day_of_week() == 0 and day_of_month() >= 8 and day_of_month() <= 14))
            or
            (vector(1) and month() == 11 and (day_of_month() - day_of_week()) <= 0)
            or
            (vector(1) and month() == 3 and day_of_month() >= 8 and day_of_month() <= 14 and day_of_week() == 0 and hour() >= 8)
            or
            (vector(1) and month() == 11 and day_of_month() >= 1 and day_of_month() <= 7 and day_of_week() == 0 and hour() < 7)
            or
            vector(0)
        # Current time shifted back by 6 hours, or by 5 hours while is_dst
        # is 1.
        - record: america_chicago_time
          expr: time() - ((6 * 3600) - (3600 * is_dst))
        # Hour of day derived from the shifted timestamp above.
        - record: america_chicago_hour
          expr: hour(america_chicago_time)
        # A target of the "node" job has reported up == 0 for 1m.
        - alert: InstanceDown
          expr: up{job="node"} == 0
          for: 1m
        # Any job whose name starts with "thanos" reports up == 0. There is no
        # `for:` clause, so no hold period applies.
        - alert: ThanosServiceDown
          expr: up{job=~"thanos.+"} == 0
          labels:
            severity: critical
        # Fires when a filesystem has been more than 80% used for 1m.
        # The exclusion list holds filesystem *types*, so it is matched against
        # `fstype` (as the FileSystemReadOnly rule does). Matching it against
        # `mountpoint` never excluded anything, because no mount path equals
        # "tmpfs" or "fuse.lxcfs".
        - alert: FileSystemUsage
          expr: ((node_filesystem_size_bytes{fstype!~"fuse.lxcfs|tmpfs"} - node_filesystem_free_bytes) / node_filesystem_size_bytes) > 0.80
          for: 1m
        # A filesystem (other than fuse.lxcfs/tmpfs types) is flagged read-only.
        - alert: FileSystemReadOnly
          expr: node_filesystem_readonly{fstype!~"fuse.lxcfs|tmpfs"} == 1
        - alert: RebootRequired
          expr: node_reboot_required > 0
          for: 15m
        # Pending upgrades must persist for a full day before alerting.
        - alert: AptUpgradesPending
          expr: apt_upgrades_pending > 0
          for: 1d
        # Last "system" restic run is older than 7200s (2h). With the 2h hold
        # period this fires roughly 4h after the last recorded run.
        - alert: ResticSystemJobLastRun
          expr: (time() - node_restic_last_run_time{restic_job="system"}) > 7200
          for: 2h
        # Last "minecraft" restic run is older than 86400s (1 day), held for 2h.
        - alert: ResticMinecraftJobLastRun
          expr: (time() - node_restic_last_run_time{restic_job=~"minecraft"}) > 86400
          for: 2h
        - alert: MinecraftUnitInactive
          expr: node_systemd_unit_state{name="minecraft.service",state="inactive"} == 1
          for: 15m
        - alert: GiteaUnitInactive
          expr: node_systemd_unit_state{name="gitea.service",state="inactive"} == 1
          for: 15m
        # Fires while maintenance_mode == 1. Alertmanager routes this alert to
        # the blackhole receiver and uses it as an inhibition source.
        - alert: MaintenanceMode
          expr: maintenance_mode == 1
          for: 1m
    - name: blackbox.rules
      rules:
        # Any failed probe from a job that is not one of the blackbox-icmpN
        # jobs.
        - alert: ServiceDown
          expr: probe_success{job!~"blackbox-icmp[0-9]"} == 0
          for: 1m
        # Failed probe from a blackbox-icmpN job.
        - alert: PingDown
          expr: probe_success{job=~"blackbox-icmp[0-9]"} == 0
          for: 15s
        # Two CertExpiry rules share one alert name and differ by threshold and
        # severity: fewer than 30 days left -> warning, fewer than 14 days
        # left -> critical. The expression converts seconds-until-expiry to
        # days.
        - alert: CertExpiry
          expr: ((probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) / 86400) < 30
          for: 15s
          labels:
            severity: warning
          annotations:
            # summary: Certificates expiring in < 30 days
            summary: "{% raw %}Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}SSL certificate expires in 30 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}{% endraw %}"
        - alert: CertExpiry
          expr: ((probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) / 86400) < 14
          for: 15s
          labels:
            severity: critical
          annotations:
            # summary: Certificates expiring in < 14 days
            summary: "{% raw %}Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}){% endraw %}"
            description: "{% raw %}SSL certificate expires in 14 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}{% endraw %}"

# Blackbox exporter probe modules. The module names are the ones selected by
# the `params.module` entries of the blackbox-* scrape jobs in
# prometheus_config. Every module uses a 5s timeout.
blackbox_exporter_config:
  modules:
    # ICMP probes, one module per IP protocol version.
    icmpv4:
      prober: icmp
      timeout: 5s
      icmp:
        preferred_ip_protocol: ip4
    icmpv6:
      prober: icmp
      timeout: 5s
      icmp:
        preferred_ip_protocol: ip6
    # TCP connect probes, one module per IP protocol version.
    tcp_connect4:
      prober: tcp
      timeout: 5s
      tcp:
        preferred_ip_protocol: ip4
    tcp_connect6:
      prober: tcp
      timeout: 5s
      tcp:
        preferred_ip_protocol: ip6
    # HTTP GET probe.
    http_2xx:
      prober: http
      timeout: 5s
      http:
        method: GET

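# NOTE: the commented-out block below looks like an earlier draft of
# alertmanager_config kept for reference; it is not loaded.
#  route: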
#    receiver: pushover-receiver
#    mute_time_intervals:
#      - quiet_hours
#    routes:
#      - receiver: blackhole
#        match:
#          alertname: MaintenanceMode
#      #- receiver: blackhole
#      #  match:
#      #    alertname: QuietHours
#  receivers:
#    - name: blackhole
#    - name: pushover-receiver
#      pushover_configs:
#        - token: "{{ vault_pushover_token }}"
#          user_key: "{{ vault_pushover_user_key }}"
#  inhibit_rules:
#    - source_match:
#        alertname: MaintenanceMode
#    #- source_match:
#    #    alertname: QuietHours
#  time_intervals:
#    - name: quiet_hours
#      times:
#        - start_time: 03:00
#          end_time: 15:00

# Alertmanager routing and notification settings.
#
# The Pushover credentials are read from vault-encrypted variables
# (vault_pushover_token / vault_pushover_user_key) rather than being stored in
# clear text. Those variables must be defined in the vault. Credentials that
# were previously committed in plain text should be treated as exposed and
# rotated.
alertmanager_config:
  inhibit_rules:
    # MaintenanceMode is the inhibition source. No target matcher or `equal`
    # list is given. NOTE(review): presumably this suppresses every other alert
    # while MaintenanceMode fires — confirm this is intended.
    - source_match:
        alertname: MaintenanceMode
  receivers:
    # Receiver with no notifier configured: alerts routed here go nowhere.
    - name: blackhole
    - name: pushover-receiver
      pushover_configs:
        - token: "{{ vault_pushover_token }}"
          user_key: "{{ vault_pushover_user_key }}"
  route:
    receiver: pushover-receiver
    # Child routes are listed in order: the two bookkeeping alerts go to the
    # blackhole, and the final matcher-less route sends the rest to Pushover,
    # muted during quiet_hours.
    routes:
      - match:
          alertname: MaintenanceMode
        receiver: blackhole
      - match:
          alertname: PrometheusAlertmanagerE2eDeadManSwitch
        receiver: blackhole
      - receiver: pushover-receiver
        mute_time_intervals:
          - quiet_hours
  time_intervals:
    # NOTE(review): no `location` is set, so these times are presumably
    # interpreted as UTC — confirm for the deployed Alertmanager version.
    - name: quiet_hours
      time_intervals:
        - times:
            - start_time: "03:00"
              end_time: "15:00"

# Paths listed for disk-usage reporting. NOTE(review): presumably measured and
# exported through node_exporter — confirm against the consuming role.
node_exporter_du_directories:
  - /var/log/syslog
  - /var/spool/rsyslog
  - /var/lib/influxdb
  - /var/lib/prometheus
  - /var/lib/loki

# Networks in the "loki" firewall ipset. NOTE(review): presumably the sources
# allowed to reach Loki — confirm against the firewall role.
firewall_ipset_loki:
  - 10.255.0.0/24

# Karma (Alertmanager dashboard) configuration.
karma_config:
  alertmanager:
    interval: 60s
    servers:
      # The Alertmanager instance on this host.
      - name: local
        uri: http://localhost:9093
        timeout: 10s
        proxy: true
        readonly: false
        # Health check keyed on the always-firing dead man switch alert
        # defined in prometheus_rules_config.
        healthcheck:
          filters:
            dms:
              - alertname=PrometheusAlertmanagerE2eDeadManSwitch
  grid:
    sorting:
      order: label
      reverse: false
      label: cluster
      # Numeric ranks for severity values: critical first, then warning, then
      # info.
      customValues:
        labels:
          severity:
            critical: 1
            warning: 2
            info: 3
    auto:
      order:
        - severity
  labels:
    color:
      # Fixed colours per severity value.
      custom:
        severity:
          - value: info
            color: "#87c4e0"
          - value: warning
            color: "#ffae42"
          - value: critical
            color: "#ff220c"
  alertAcknowledgement:
    enabled: true

# Thanos bucket configuration, taken from a vault_-prefixed variable that is
# defined outside this file.
thanos_bucket_config: "{{ vault_thanos_bucket_config }}"

# Listen address for kthxbye: empty host part, port 8081. Quoted because the
# value begins with the ":" indicator character.
kthxbye_listen: ":8081"