diff --git a/alertmanager/defaults/main.yml b/alertmanager/defaults/main.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f7aa68113a96317da2b873a84f441ae4bf587690
--- /dev/null
+++ b/alertmanager/defaults/main.yml
@@ -0,0 +1,36 @@
+---
+# Extra command-line arguments for alertmanager; templates/default.j2 renders
+# this into ARGS="..." in /etc/default/prometheus-alertmanager.
+prometheus_alertmanager_args: ""
+# Rendered into the "global" section of alertmanager.yml.
+prometheus_smtp_smarthost: mail.example.org:25
+prometheus_smtp_from: monitoring@example.org
+# Rendered under the top-level "route" key of alertmanager.yml.
+prometheus_alert_route:
+  group_by: ['alertname', 'cluster', 'service']
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 3h
+  receiver: admins-mail
+  routes:
+    - match:
+        severity: critical
+      receiver: admins-pager
+# Rendered under the top-level "inhibit_rules" key of alertmanager.yml.
+prometheus_alert_inhibit_rules:
+  - source_match:
+      severity: 'critical'
+    target_match:
+      severity: 'warning'
+    equal: ['alertname', 'cluster', 'service']
+# Rendered under the top-level "receivers" key of alertmanager.yml.
+# NOTE(review): the example.org addresses and the service key look like
+# placeholders; override them per deployment. The key is quoted so that it
+# stays a string.
+prometheus_alert_receivers:
+  - name: admins-mail
+    email_configs:
+      - to: admin@example.org
+  - name: admins-pager
+    pagerduty_configs:
+      - service_key: '42'
diff --git a/alertmanager/handlers/main.yml b/alertmanager/handlers/main.yml
new file mode 100644
index 0000000000000000000000000000000000000000..94a8c21a636aa943a469d880eb671a209bc09a1a
--- /dev/null
+++ b/alertmanager/handlers/main.yml
@@ -0,0 +1,13 @@
+---
+
+# Notified by the task that templates /etc/default/prometheus-alertmanager.
+- name: Restart alertmanager
+  systemd:
+    name: prometheus-alertmanager.service
+    state: restarted
+
+# Notified by the task that templates /etc/prometheus/alertmanager.yml.
+- name: Reload alertmanager
+  systemd:
+    name: prometheus-alertmanager.service
+    state: reloaded
diff --git a/alertmanager/tasks/main.yml b/alertmanager/tasks/main.yml
new file mode 100644
index 0000000000000000000000000000000000000000..71f61859f31c7ac77e740124360203f2b989899c
--- /dev/null
+++ b/alertmanager/tasks/main.yml
@@ -0,0 +1,33 @@
+---
+
+- name: Install alertmanager
+  apt:
+    name: prometheus-alertmanager
+    state: present
+
+- name: Configure alertmanager command arguments
+  template:
+    src: default.j2
+    dest: /etc/default/prometheus-alertmanager
+    # Explicit mode: without it, the mode of a newly created file depends on
+    # the umask in effect on the target.
+    mode: "0644"
+  notify:
+    - Restart alertmanager
+  tags:
+    - config
+
+- name: Configure alertmanager
+  template:
+    src: alertmanager.yml.j2
+    dest: /etc/prometheus/alertmanager.yml
+    # Explicit mode: without it, the mode of a newly created file depends on
+    # the umask in effect on the target.
+    # NOTE(review): receivers may carry credentials (e.g. a PagerDuty
+    # service_key); consider a tighter mode plus owner/group once the
+    # account the service runs as is confirmed.
+    mode: "0644"
+  notify:
+    - Reload alertmanager
+  tags:
+    - config
diff --git a/alertmanager/templates/alertmanager.yml.j2 b/alertmanager/templates/alertmanager.yml.j2
new file mode 100644
index 0000000000000000000000000000000000000000..78cf56ec2e297bb378fc84ace777c31f8873c70d
--- /dev/null
+++ b/alertmanager/templates/alertmanager.yml.j2
@@ -0,0 +1,15 @@
+global:
+  smtp_smarthost: '{{ prometheus_smtp_smarthost }}'
+  smtp_from: '{{ prometheus_smtp_from }}'
+
+templates:
+  - '/etc/prometheus/alertmanager_templates/*.tmpl'
+
+route:
+{{ prometheus_alert_route|to_nice_yaml(indent=2)|indent(2, true) }}
+
+inhibit_rules:
+{{ prometheus_alert_inhibit_rules|to_nice_yaml(indent=2)|indent(2, true) }}
+
+receivers:
+{{ prometheus_alert_receivers|to_nice_yaml(indent=2)|indent(2, true) }}
diff --git a/alertmanager/templates/default.j2 b/alertmanager/templates/default.j2
new file mode 100644
index 0000000000000000000000000000000000000000..dac90e65cbbb48599d707ce5f06c8ee0ff029c79
--- /dev/null
+++ b/alertmanager/templates/default.j2
@@ -0,0 +1,65 @@
+# Set the command-line arguments to pass to the server.
+ARGS="{{ prometheus_alertmanager_args }}"
+
+# The alert manager supports the following options:
+
+#  --config.file="/etc/prometheus/alertmanager.yml"
+#    Alertmanager configuration file name.
+#  --storage.path="/var/lib/prometheus/alertmanager/"
+#    Base path for data storage.
+#  --data.retention=120h
+#    How long to keep data for.
+#  --alerts.gc-interval=30m
+#    Interval between alert GC.
+#  --log.level=info
+#    Only log messages with the given severity or above.
+#  --web.external-url=WEB.EXTERNAL-URL
+#    The URL under which Alertmanager is externally reachable (for example,
+#    if Alertmanager is served via a reverse proxy). Used for generating
+#    relative and absolute links back to Alertmanager itself. If the URL has
+#    a path portion, it will be used to prefix all HTTP endpoints served by
+#    Alertmanager. If omitted, relevant URL components will be derived
+#    automatically.
+#  --web.route-prefix=WEB.ROUTE-PREFIX
+#    Prefix for the internal routes of web endpoints. Defaults to path of
+#    --web.external-url.
+#  --web.listen-address=":9093"
+#    Address to listen on for the web interface and API.
+#  --web.ui-path="/usr/share/prometheus/alertmanager/ui/"
+#    Path to static UI directory.
+#  --template.default="/usr/share/prometheus/alertmanager/default.tmpl"
+#    Path to default notification template.
+#  --cluster.listen-address="0.0.0.0:9094"
+#    Listen address for cluster.
+#  --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS
+#    Explicit address to advertise in cluster.
+#  --cluster.peer=CLUSTER.PEER ...
+#    Initial peers (may be repeated).
+#  --cluster.peer-timeout=15s
+#    Time to wait between peers to send notifications.
+#  --cluster.gossip-interval=200ms
+#    Interval between sending gossip messages. By lowering this value (more
+#    frequent) gossip messages are propagated across the cluster more
+#    quickly at the expense of increased bandwidth.
+#  --cluster.pushpull-interval=1m0s
+#    Interval for gossip state syncs. Setting this interval lower (more
+#    frequent) will increase convergence speeds across larger clusters at
+#    the expense of increased bandwidth usage.
+#  --cluster.tcp-timeout=10s
+#    Timeout for establishing a stream connection with a remote node for a
+#    full state sync, and for stream read and write operations.
+#  --cluster.probe-timeout=500ms
+#    Timeout to wait for an ack from a probed node before assuming it is
+#    unhealthy. This should be set to 99-percentile of RTT (round-trip time)
+#    on your network.
+#  --cluster.probe-interval=1s
+#    Interval between random node probes. Setting this lower (more frequent)
+#    will cause the cluster to detect failed nodes more quickly at the
+#    expense of increased bandwidth usage.
+#  --cluster.settle-timeout=1m0s
+#    Maximum time to wait for cluster connections to settle before
+#    evaluating notifications.
+#  --cluster.reconnect-interval=10s
+#    Interval between attempting to reconnect to lost peers.
+#  --cluster.reconnect-timeout=6h0m0s
+#    Length of time to attempt to reconnect to a lost peer.
diff --git a/prometheus/defaults/main.yml b/prometheus/defaults/main.yml
index 73de8fb9c25da9001c6f2bd8c7ec41a4d1dc1207..9e759d360515811b33461ebb1be71d26b54d9c40 100644
--- a/prometheus/defaults/main.yml
+++ b/prometheus/defaults/main.yml
@@ -3,3 +3,10 @@ prometheus_args: ""
 # This variable intentionally left null. They need to exist, so that the
 # template works, but they need to have a value (empty string does not work).
 prometheus_rules: null
+# Rendered under "alerting.alert_relabel_configs" by prometheus.yml.j2.
+prometheus_alert_relabel_configs: []
+# Rendered under "alerting.alertmanagers" by prometheus.yml.j2.
+prometheus_alertmanagers:
+  - static_configs:
+      - targets:
+          - 'localhost:9093'
diff --git a/prometheus/templates/prometheus.yml.j2 b/prometheus/templates/prometheus.yml.j2
index 67daddacfa99cd28f48cc2c24f9ece603ed235f3..a6e70959f6dcfaa6be95886310b922f9d078b741 100644
--- a/prometheus/templates/prometheus.yml.j2
+++ b/prometheus/templates/prometheus.yml.j2
@@ -4,8 +4,11 @@ global:
   external_labels:
     monitor: '{{ ansible_fqdn }}'
 
-# TODO
-# alerting:
+alerting:
+  alert_relabel_configs:
+{{ prometheus_alert_relabel_configs|to_nice_yaml(indent=2)|indent(4, true) }}
+  alertmanagers:
+{{ prometheus_alertmanagers|to_nice_yaml(indent=2)|indent(4, true) }}
 
 rule_files:
   - /etc/prometheus/rules/*.yml