参考文章: 使用企业微信机器人接收 Prometheus 报警信息
GitHub 仓库地址: prometheus-flask
创建 Prometheus-flask 程序
Alertmanager 告警信息发送到企业微信群的流程: 先发送到 prometheus-flask 程序,再由 prometheus-flask 对信息格式解析后再发送到企业微信群中。
创建 app/main.py 文件,服务启动后为监听在 5000 端口的服务:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# @Author: zhoubin
# @Email: bin.zhou@vonechain.com
# @Description: 起一个 webhook 监听服务,接收 Prometheus 告警信息,并转发到企业微信机器人
from flask import Flask, request
from flask_json import as_json
from flask import jsonify
from gevent import pywsgi
import sys
import Alert
import argparse
import requests
import json
import datetime
app = Flask(__name__)
@app.route('/alertinfo', methods=['POST'])
def alert_data():
    """Receive an Alertmanager webhook call and forward it to the WeCom robot.

    The raw request body is decoded as JSON, echoed to stdout and handed to
    ``Alert.send_alert`` together with the robot webhook URL taken from the
    module-level ``args`` (which is only populated by the ``__main__``
    section of this file).

    The view is registered on ``/alertinfo`` because that is the path the
    Alertmanager ``webhook_configs`` entries post to; without a route the
    function would never be called.

    :return: the decoded payload, echoed back to the caller
    """
    payload = json.loads(request.get_data())
    print(payload)
    Alert.send_alert(payload, args.key)
    return payload
def status():
    """Return the fixed string ``'ok'``.

    NOTE(review): no route decorator is visible for this function, so as
    shown it is not reachable over HTTP. It is presumably meant to be a
    health-check endpoint; confirm the intended path before registering it.

    :return: the literal ``'ok'``
    """
    return 'ok'
# Script entry point: parse the command line, then build the gevent WSGI
# server for the Flask app. The statement that actually starts serving
# follows directly after this section.
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--port", type=int, help="The service port")
parser.add_argument("-k", "--key", type=str, help="The webhook url key")
# `args` becomes a module-level name here; alert_data() reads args.key.
args = parser.parse_args()
# Both options are required: show the usage text and stop if either is missing.
# NOTE(review): the exit status is 0 even though start-up was refused.
if not args.port or not args.key:
parser.print_help()
sys.exit(0)
else:
print("服务开始运行,监听地址: {}; 监听端口 {} ".format('0.0.0.0', args.port))
# NOTE(review): this writes the full robot webhook value to stdout.
print("Robot key: {}".format(args.key))
# Listen on all interfaces on the requested port.
server = pywsgi.WSGIServer(('0.0.0.0', args.port), app)
server.serve_forever()
创建 app/Alert.py 文件,Alert.py 是将 main.py 转发过来的数据转换成企业微信群机器人可识别的数据后发送给机器人的代码:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# @Author: zhoubin
# @Email: bin.zhou@vonechain.com
# @Description: 解析 Prometheus 告警数据,并发送通知到企业微信机器人
import requests
import json
import datetime
from doctest import debug_script
from pydoc import describe
from flask import jsonify
from dateutil import parser
def parse_time(*args):
    """Convert Prometheus notification timestamps to UTC+8 wall-clock time.

    Every argument is parsed with ``dateutil.parser``. A value that carries
    a UTC offset is first normalized to UTC, so the fixed eight-hour shift
    is applied to the right instant even when the offset is not ``Z``.
    Sub-second precision is dropped.

    :param args: timestamp strings, e.g. ``startsAt`` / ``endsAt``
    :return: list of naive ``datetime`` objects (+8 hours), in argument order
    """
    shifted = []
    for raw in args:
        moment = parser.parse(raw)
        if moment.tzinfo is not None:
            # Normalize to UTC and drop tzinfo so the result stays naive,
            # exactly like the values this function has always returned.
            moment = moment.astimezone(datetime.timezone.utc).replace(tzinfo=None)
        shifted.append(moment.replace(microsecond=0) + datetime.timedelta(hours=8))
    return shifted
def alert(status, alertnames, levels, times, ins, instance, description):
    """Build the WeCom robot message body for a firing alert.

    :param status: alert status, shown in the title line
    :param alertnames: alert name
    :param levels: alert severity
    :param times: indexable of timestamps; only ``times[0]`` (start) is shown
    :param ins: heading used for the affected-object line
    :param instance: value printed after ``ins``
    :param description: detailed description of the alert
    :return: JSON string of a ``markdown`` type message
    """
    template = (
        "\n"
        "## <font color=\"red\">告警通知: {0}</font>\n\n"
        "**告警名称:** <font color=\"warning\">{1}</font>\n\n"
        "**告警级别:** {2}\n\n"
        "**告警时间:** {3}\n\n"
        "**{4}:** {5}\n\n"
        "**告警详情:** <font color=\"comment\">{6}</font>"
    )
    content = template.format(
        status, alertnames, levels, times[0], ins, instance, description
    )
    # json.dumps keeps its default ASCII escaping, so the returned string
    # contains only ASCII characters even though the template is Chinese.
    return json.dumps({
        "msgtype": "markdown",
        "markdown": {"content": content},
    })
def recive(status, alertnames, levels, times, ins, instance, description):
    """Build the WeCom robot message body for a resolved alert.

    :param status: alert status, shown in the title line
    :param alertnames: alert name
    :param levels: alert severity
    :param times: indexable of timestamps; ``times[0]`` is the start time and
        ``times[1]`` the recovery time. When only one entry is available
        (for example a one-character placeholder string) it is used for both.
    :param ins: heading used for the affected-object line
    :param instance: value printed after ``ins``
    :param description: detailed description of the alert
    :return: JSON string of a ``markdown`` type message
    """
    started = times[0]
    # A single-entry `times` used to raise IndexError on times[1].
    recovered = times[1] if len(times) > 1 else times[0]
    template = (
        "\n"
        "## <font color=\"info\">恢复通知: {0}</font>\n\n"
        "**告警名称:** <font color=\"warning\">{1}</font>\n\n"
        "**告警级别:** {2}\n\n"
        "**告警时间:** {3}\n\n"
        "**恢复时间:** {4}\n\n"
        "**{5}:** {6}\n\n"
        "**告警详情:** <font color=\"comment\">{7}</font>"
    )
    content = template.format(
        status, alertnames, levels, started, recovered, ins, instance, description
    )
    # json.dumps keeps its default ASCII escaping, so the returned string
    # contains only ASCII characters even though the template is Chinese.
    return json.dumps({
        "msgtype": "markdown",
        "markdown": {"content": content},
    })
def webhook_url(params, url_key):
    """POST a prepared message to the WeCom robot webhook.

    :param params: request body, the JSON string built by ``alert``/``recive``
    :param url_key: full webhook URL of the robot
    """
    headers = {"Content-type": "application/json"}
    # The URL comes from the command line and may carry stray whitespace.
    url = "{}".format(url_key).strip()
    # Body and headers must be passed by keyword: the third positional
    # parameter of requests.post is `json`, not `headers`, so the header
    # was previously never sent. The timeout keeps a stalled webhook call
    # from blocking the caller indefinitely.
    response = requests.post(url, data=params, headers=headers, timeout=10)
    if response.status_code != 200:
        print("Webhook request failed: HTTP {} {}".format(
            response.status_code, response.text))
def send_alert(json_re,url_key):
"""
Send one robot notification for each entry of ``json_re['alerts']``.

Firing alerts are rendered with ``alert`` and resolved alerts with
``recive``; the rendered message is posted through ``webhook_url``.

:param json_re: decoded notification payload; must contain an ``alerts`` list
:param url_key: robot webhook URL, passed through to ``webhook_url``
"""
# NOTE(review): labels['severity'] and annotations['description'] are indexed
# directly below, so an alert lacking either one raises KeyError.
# NOTE(review): an alert with neither an "instance" nor a "namespace" label
# whose name does not contain "Watchdog" matches no branch and is skipped.
for i in json_re['alerts']:
if i['status'] == 'firing':
# Prefer the instance label as the affected object ...
if "instance" in i['labels']:
webhook_url(
alert(
i['status'],
i['labels']['alertname'],
i['labels']['severity'],
parse_time(i['startsAt']),
'故障实例',
i['labels']['instance'],
i['annotations']['description']
),
url_key
)
# ... otherwise fall back to the namespace label.
elif "namespace" in i['labels']:
webhook_url(
alert(
i['status'],
i['labels']['alertname'],
i['labels']['severity'],
parse_time(i['startsAt']),
'命名空间',
i['labels']['namespace'],
i['annotations']['description']
),
url_key
)
# Watchdog alerts without those labels are sent with '0' placeholders.
elif "Watchdog" in i['labels']['alertname']:
webhook_url(
alert(i['status'],
i['labels']['alertname'],
'0', '0', '0', '0', '0'
),
url_key
)
elif i['status'] == 'resolved':
# Same label precedence as above; both start and end time are converted.
if "instance" in i['labels']:
webhook_url(
recive(
i['status'],
i['labels']['alertname'],
i['labels']['severity'],
parse_time(i['startsAt'],i['endsAt']),
'故障实例',
i['labels']['instance'],
i['annotations']['description']
),
url_key)
elif "namespace" in i['labels']:
webhook_url(
recive(
i['status'],
i['labels']['alertname'],
i['labels']['severity'],
parse_time(i['startsAt'],i['endsAt']),
'命名空间',
i['labels']['namespace'],
i['annotations']['description']
),
url_key
)
# NOTE(review): `times` is the one-character string '0' here; this is only
# safe if recive() tolerates a `times` value with a single entry.
elif "Watchdog" in i['labels']['alertname']:
webhook_url(
recive(
i['status'],
i['labels']['alertname'],
'0', '0', '0', '0', '0'
),
url_key
)
创建 requirements.txt 文件,内容如下:
flask_json == 0.3.4
flask == 2.0.1
requests == 2.19.1
gevent == 21.12.0
python-dateutil == 2.8.2
创建 Dockerfile 文件,内容如下:
FROM python:3.7.7 as builder
COPY ./requirements.txt /app/requirements.txt
RUN pip install -U pip -i https://pypi.tuna.tsinghua.edu.cn/simple && \
pip install -r /app/requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple && \
find /usr/local/lib -name '*.pyc' -delete
FROM python:3.7.7-slim
COPY ./app /app
COPY --from=builder /usr/local/lib/python3.7/site-packages /usr/local/lib/python3.7/site-packages
WORKDIR /app
ENTRYPOINT ["python", "/app/main.py"]
CMD ["-p 5000", "-k https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=e95221dc-baa4-497d-8a9b-8907d8645e0e"]
构建并推送镜像到 Hub 仓库:
# 构建镜像
docker build -t registry-changsha.vonebaas.com/publics/prometheus-flask:latest .
# 上传镜像
docker push registry-changsha.vonebaas.com/publics/prometheus-flask:latest
部署 Prometheus-flask 程序到 k8s 集群,创建资源清单文件 prometheus-webhook-dingtalk.yaml,内容如下:
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
run: prometheus-webhook-dingtalk
name: prometheus-webhook-dingtalk
namespace: monitoring
spec:
selector:
matchLabels:
run: prometheus-webhook-dingtalk
template:
metadata:
labels:
run: prometheus-webhook-dingtalk
spec:
containers:
- image: registry-changsha.vonebaas.com/publics/prometheus-flask:latest
name: prometheus-webhook-dingtalk
args:
- "-p 5000"
- "-k https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=e95221dc-baa4-497d-8a9b-8907d8645e0e"
ports:
- containerPort: 5000
protocol: TCP
imagePullPolicy: Always
apiVersion: v1
kind: Service
metadata:
labels:
run: prometheus-webhook-dingtalk
name: prometheus-webhook-dingtalk
namespace: monitoring
spec:
ports:
- port: 5000
protocol: TCP
targetPort: 5000
selector:
run: prometheus-webhook-dingtalk
type: ClusterIP
执行以下命令创建服务:
kubectl apply -f prometheus-webhook-dingtalk.yaml
配置 Alertmanager
进入 kube-prometheus/manifests 目录,修改 alertmanager-secret.yaml 文件,配置告警通知信息:
apiVersion: v1
kind: Secret
metadata:
name: alertmanager-main
namespace: monitoring
stringData:
alertmanager.yaml: |-
"global":
"resolve_timeout": "5m"
smtp_from: "15626580887@163.com"
smtp_smarthost: "smtp.163.com:465"
smtp_hello: "163.com"
smtp_auth_username: "15626580887@163.com"
smtp_auth_password: "xxxxxxssasasdU"
smtp_require_tls: false
"templates":
- '/etc/alertmanager/config/*.tmpl'
"inhibit_rules":
- "equal":
- "namespace"
- "alertname"
- "instance"
"source_match":
"severity": "critical"
"target_match_re":
"severity": "warning|info"
- "equal":
- "namespace"
- "alertname"
- "instance"
"source_match":
"severity": "warning"
"target_match_re":
"severity": "info"
"receivers":
- "name": "email"
"email_configs":
- to: "bin.zhou@vonechain.com,jiaojiao.zou@vonechain.com"
send_resolved: true
html: '{{ template "email.html" . }}'
- "name": 'webhook'
"webhook_configs":
- url: "http://prometheus-webhook-dingtalk:5000/alertinfo"
send_resolved: true
- "name": "Default"
"email_configs":
- to: "bin.zhou@vonechain.com,jiaojiao.zou@vonechain.com"
send_resolved: true
html: '{{ template "email.html" . }}'
"webhook_configs":
- url: "http://prometheus-webhook-dingtalk:5000/alertinfo"
send_resolved: true
- "name": "Critical"
"email_configs":
- to: "bin.zhou@vonechain.com,jiaojiao.zou@vonechain.com"
send_resolved: true
html: '{{ template "email.html" . }}'
"webhook_configs":
- url: "http://prometheus-webhook-dingtalk:5000/alertinfo"
send_resolved: true
"route":
"group_by":
- "namespace"
- "job"
- "alertname"
"group_interval": "5m"
"group_wait": "30s"
"receiver": "Default"
"repeat_interval": "1h"
"routes":
- "receiver": "webhook"
"match":
"severity": "warning|info"
"continue": true
- "receiver": "Critical"
"match":
"severity": "critical"
- "receiver": "webhook"
"match":
"alertname": "Watchdog"
wechat.tmpl: |
{{ define "wechat.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 }}
===异常告警===
告警类型: {{ $alert.Labels.alertname }}
告警级别: {{ $alert.Labels.severity }}
告警详情: {{ $alert.Annotations.description}}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- if gt (len $alert.Labels.instance) 0 }}
实例信息: {{ $alert.Labels.instance }}
{{- end }}
{{- if gt (len $alert.Labels.namespace) 0 }}
命名空间: {{ $alert.Labels.namespace }}
{{- end }}
{{- if gt (len $alert.Labels.node) 0 }}
节点信息: {{ $alert.Labels.node }}
{{- end }}
{{- if gt (len $alert.Labels.pod) 0 }}
实例名称: {{ $alert.Labels.pod }}
{{- end }}
===END===
{{- end }}
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 }}
===异常恢复===
告警类型: {{ $alert.Labels.alertname }}
告警级别: {{ $alert.Labels.severity }}
告警详情: {{ $alert.Annotations.description}}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- if gt (len $alert.Labels.instance) 0 }}
实例信息: {{ $alert.Labels.instance }}
{{- end }}
{{- if gt (len $alert.Labels.namespace) 0 }}
命名空间: {{ $alert.Labels.namespace }}
{{- end }}
{{- if gt (len $alert.Labels.node) 0 }}
节点信息: {{ $alert.Labels.node }}
{{- end }}
{{- if gt (len $alert.Labels.pod) 0 }}
实例名称: {{ $alert.Labels.pod }}
{{- end }}
===END===
{{- end }}
{{- end }}
{{- end }}
{{- end }}
email.tmpl: |
{{ define "email.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= <span style=color:red;font-size:24px;font-weight:bold;> 监控告警 </span>=========<br>
<span style=font-size:16px;font-weight:bold;> 告警程序: </span> Alertmanager <br>
<span style=font-size:16px;font-weight:bold;> 告警类型: </span> {{ $alert.Labels.alertname }} <br>
<span style=font-size:16px;font-weight:bold;> 告警级别: </span> {{ $alert.Labels.severity }} 级 <br>
<span style=font-size:16px;font-weight:bold;> 告警状态: </span> {{ .Status }} <br>
<span style=font-size:16px;font-weight:bold;> 故障主机: </span> {{ $alert.Labels.instance }} {{ $alert.Labels.device }} <br>
<span style=font-size:16px;font-weight:bold;> 告警主题: </span> {{ .Annotations.summary }} <br>
<span style=font-size:16px;font-weight:bold;> 告警详情: </span> {{ $alert.Annotations.message }}{{ $alert.Annotations.description}} <br>
<span style=font-size:16px;font-weight:bold;> 主机标签: </span> {{ range .Labels.SortedPairs }} <br> [{{ .Name }}: {{ .Value | html }} ]{{ end }}<br>
<span style=font-size:16px;font-weight:bold;> 故障时间: </span> {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
============== End ============
<br>
<br>
<div>
<div>
<p style=color:red;font-size:16px>(这是一封自动发送的邮件,请勿回复。)</p>
</div>
<div align=right style="border-top:solid 1px gray" id=bottomTime>
<p>Prometheus 监控平台</p>
<label>{{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02" }}</label><br>
</div>
</div>
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= <span style=color:#00FF00;font-size:24px;font-weight:bold;> 告警恢复 </span>=========<br>
<span style=font-size:16px;font-weight:bold;> 告警程序: </span> Alertmanager <br>
<span style=font-size:16px;font-weight:bold;> 告警主题: </span> {{ $alert.Annotations.summary }}<br>
<span style=font-size:16px;font-weight:bold;> 告警主机: </span> {{ .Labels.instance }} <br>
<span style=font-size:16px;font-weight:bold;> 告警类型: </span> {{ .Labels.alertname }}<br>
<span style=font-size:16px;font-weight:bold;> 告警级别: </span> {{ $alert.Labels.severity }} 级 <br>
<span style=font-size:16px;font-weight:bold;> 告警状态: </span> {{ .Status }}<br>
<span style=font-size:16px;font-weight:bold;> 告警详情: </span> {{ $alert.Annotations.message }}{{ $alert.Annotations.description}}<br>
<span style=font-size:16px;font-weight:bold;> 故障时间: </span> {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
<span style=font-size:16px;font-weight:bold;> 恢复时间: </span> {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
============== End ============
<br>
<br>
<div>
<div>
<p style=color:red;font-size:16px>(这是一封自动发送的邮件,请勿回复。)</p>
</div>
<div align=right style="border-top:solid 1px gray" id=bottomTime>
<p>Prometheus 监控平台</p>
<label>{{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02" }}</label><br>
</div>
</div>
{{- end }}
{{- end }}
{{ end }}
type: Opaque
更新 Alertmanager 配置:
kubectl replace -f alertmanager-secret.yaml
打开 Alertmanager web 界面,查看 Status 界面,检查配置是否更新
修改 prometheus-rules.yaml 文件,将所有的 annotations.message 字段改成 annotations.description (Alert.py 读取的是 annotations.description 字段):
# 批量修改
sed -i s/message/description/g prometheus-rules.yaml
# 更新资源配置
kubectl replace -f prometheus-rules.yaml