prometheus普罗米修斯监控钉钉报警
2022-03-25 本文已影响0人
王宣成
vim /tmp/prometheus.yml
scrape_configs:
- job_name: "prometheus"
static_configs:
- targets: ['101.43.104.115:9090']
- job_name: 'node'
static_configs:
- targets: ['101.43.104.115:9100']
- job_name: 'alertmanager'
static_configs:
- targets: ['101.43.104.115:9093']
- job_name: 'docker'
static_configs:
- targets: ['101.43.104.115:8080']
alerting:
alertmanagers:
- static_configs:
- targets:
- 101.43.104.115:9093
rule_files:
- "/etc/prometheus/rules.yml"
vim /tmp/rules.yml
groups:
- name: Host
rules:
- alert: HostMemory Usage
expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 80
for: 1m
labels:
name: Memory
severity: Warning
annotations:
summary: " {{ $labels.appname }} "
description: "主机内存使用率超过80%."
value: "{{ $value }}"
- alert: HostCPU Usage
expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance,appname) > 0.80
for: 1m
labels:
name: CPU
severity: Warning
annotations:
summary: " {{ $labels.appname }} "
description: "主机CPU使用率超过80%."
value: "{{ $value }}"
- alert: HostLoad
expr: node_load5 > 4
for: 1m
labels:
name: Load
severity: Warning
annotations:
summary: "{{ $labels.appname }} "
description: " 主机负载5分钟超过4."
value: "{{ $value }}"
- alert: HostFilesystem Usage
expr: 1-(node_filesystem_free_bytes / node_filesystem_size_bytes) > 0.8
for: 1m
labels:
name: Disk
severity: Warning
annotations:
summary: " {{ $labels.appname }} "
description: " 主机 [ {{ $labels.mountpoint }} ]分区使用超过80%."
value: "{{ $value }}%"
- alert: HostDiskio
expr: irate(node_disk_writes_completed_total{job=~"Host"}[1m]) > 10
for: 1m
labels:
name: Diskio
severity: Warning
annotations:
summary: " {{ $labels.appname }} "
description: " 主机 [{{ $labels.device }}]磁盘1分钟平均写入IO负载较高."
value: "{{ $value }}iops"
- alert: Network_receive
expr: irate(node_network_receive_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576 > 3
for: 1m
labels:
name: Network_receive
severity: Warning
annotations:
summary: " {{ $labels.appname }} "
description: " 主机 [{{ $labels.device }}] 网卡5分钟平均接收流量超过3Mbps."
value: "{{ $value }}3Mbps"
- alert: Network_transmit
expr: irate(node_network_transmit_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576 > 3
for: 1m
labels:
name: Network_transmit
severity: Warning
annotations:
summary: " {{ $labels.appname }} "
description: " 主机 [{{ $labels.device }}] 网卡5分钟内平均发送流量超过3Mbps."
value: "{{ $value }}3Mbps"
vim /tmp/alertmanager.yml
global:
resolve_timeout: 5m
route:
receiver: webhook
group_wait: 30s
group_interval: 5m
repeat_interval: 5m
group_by: [alertname]
routes:
- receiver: webhook
group_wait: 10s
receivers:
- name: webhook
webhook_configs:
- url: http://101.43.104.115:8060/dingtalk/webhook1/send
send_resolved: true
docker run -d --name alertmanager -p 9093:9093 -v /tmp/alertmanager.yml:/etc/alertmanager/alertmanager.yml prom/alertmanager
docker run -d --name node-exporter -p 9100:9100 -v /proc:/host/proc -v /sys:/host/sys -v /:/rootfs prom/node-exporter --path.procfs /host/proc --path.sysfs /host/sys --collector.filesystem.ignored-mount-points "^/(sys|proc|dev|host|etc)($|/)"
docker run \
--volume=/:/rootfs:ro \
--volume=/var/run:/var/run:ro \
--volume=/sys:/sys:ro \
--volume=/var/lib/docker/:/var/lib/docker:ro \
--volume=/dev/disk/:/dev/disk:ro \
--volume=/cgroup:/cgroup:ro \
--privileged=true \
--publish=8080:8080 \
--detach=true \
--name=cadvisor \
google/cadvisor
# access_token=钉钉机器人token
docker run -d --name dingtalk --restart always -p 8060:8060 timonwong/prometheus-webhook-dingtalk:v0.3.0 --ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=xxxxx"
docker run -d --name prometheus -p 9090:9090 -v /tmp/prometheus.yml:/etc/prometheus/prometheus.yml -v /tmp/rules.yml:/etc/prometheus/rules.yml prom/prometheus:latest
docker run -d \
-p 3000:3000 \
--restart=always \
--name=grafana \
-e "GF_SECURITY_ADMIN_PASSWORD=admin" \
-e "GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,raintank-worldping-app,grafana-piechart-panel" \
grafana/grafana
添加设置钉钉机器人
image.png# 手动拉高系统的CPU使用率
cat /dev/zero>/dev/null
等待一会提示警告
image.pnghttp://101.43.104.115:9090/alerts
http://101.43.104.115:9090/targets
http://101.43.104.115:9090/rules
image.png
image访问grafana 账号: admin 密码: admin
http://101.43.104.115:3000/admin/users?search=open
image.png
image
image
image
image
image
image.png导入模板id:8919