5. Prometheus config

2019-10-07  Plenari

prometheus.yml

global:
  scrape_interval: 1m
  scrape_timeout: 10s
  evaluation_interval: 1m
  external_labels:
    monitor: codelab-monitor
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 127.0.0.1:9093
    scheme: http
    timeout: 10s
  - static_configs:
    - targets:
      - localhost:9093
    scheme: http
    timeout: 10s
rule_files:
- /storage/config/alert.rules
scrape_configs:
- job_name: NODE
  scrape_interval: 15s
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  static_configs:
  - targets:
    - 192.168.119.132:9100
    labels:
      group: NODE_exporter
- job_name: snmp
  params:
    module:
    - default
  scrape_interval: 1m
  scrape_timeout: 10s
  metrics_path: /snmp
  scheme: http
  static_configs:
  - targets:
    - 10.55.173.34
    - 192.168.119.132
  relabel_configs:
  - source_labels: [__address__]
    separator: ;
    regex: (.*)
    target_label: __param_target
    replacement: $1
    action: replace
  - source_labels: [__param_target]
    separator: ;
    regex: (.*)
    target_label: instance
    replacement: $1
    action: replace
  - source_labels: []
    separator: ;
    regex: (.*)
    target_label: __address__
    replacement: 127.0.0.1:9116
    action: replace
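
The three relabel_configs entries in the snmp job implement the usual snmp_exporter pattern: the original target address is copied into the target URL parameter, reused as the instance label, and the scrape address itself is overwritten with the exporter listening on 127.0.0.1:9116. A sketch of how the labels end up for the first target, assuming snmp_exporter is running on its default port:

# before relabeling:  __address__    = 10.55.173.34
# rule 1:             __param_target = 10.55.173.34    (copied from __address__)
# rule 2:             instance       = 10.55.173.34    (copied from __param_target)
# rule 3:             __address__    = 127.0.0.1:9116  (scrape goes to the exporter)
#
# effective scrape request:
#   GET http://127.0.0.1:9116/snmp?module=default&target=10.55.173.34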

alert.rules

ALERT cpu_threshold_exceeded
  IF (100 * (1 - avg(irate(node_cpu{mode="idle"}[5m])) BY (instance))) > 70
  FOR 1m
  LABELS {title="CPU usage too high", urgency="immediate"}
  ANNOTATIONS {description="Current CPU usage on this server is {{ $value }}, exceeding the configured limit", summary="CPU usage on server {{ $labels.instance }} is too high"}
ALERT net_device_down
  IF up{job="snmp"} == 0
  FOR 1m
  LABELS {title="Network device offline", urgency="immediate"}
  ANNOTATIONS {description="This network device has been unreachable for more than 1 minute, please check it promptly", summary="Network device at IP {{ $labels.instance }} is unreachable"}
ALERT server_down
  IF up{job="NODE"} == 0
  FOR 1m
  LABELS {title="Server offline", urgency="immediate"}
  ANNOTATIONS {description="This server has been unreachable for more than 1 minute; the server may be down, or the node_exporter service may have a problem", summary="Server at IP {{ $labels.instance }} is unreachable"}
ALERT windows_server_down
  IF up{job="WIN_NODE"} == 0
  FOR 1m
  LABELS {title="Server offline", urgency="immediate"}
  ANNOTATIONS {description="This Windows server has been unreachable for more than 1 minute; the server may be down, or the node_exporter service may have a problem", summary="Server at IP {{ $labels.instance }} is unreachable"}
ALERT NodeRebootingFrequently
  IF changes(node_boot_time{job="NODE"}[1h]) > 3
  LABELS {title="Server rebooting frequently", urgency="immediate"}
  ANNOTATIONS {description="This server has rebooted {{$value}} times in the past hour, exceeding the limit of 3", summary="Server {{$labels.instance}} is rebooting too frequently"}
ALERT DiskWillFillIn4Hours
  IF predict_linear(node_filesystem_avail{fstype="ext4",job="NODE"}[1h], 4 * 3600) < 0
  FOR 5m
  LABELS {title="Disk running out of space", urgency="immediate"}
  ANNOTATIONS {description="Partition {{$labels.device}} mounted at {{$labels.mountpoint}} on server {{$labels.instance}} will run out of free space within 4 hours", summary="Free disk space on server {{$labels.instance}} is expected to run out within 4 hours"}
ALERT HighErrorRate
  IF sum(rate(http_requests_total{status=~"5.."}[1m])) BY (job, path) / sum(rate(http_requests_total[1m])) BY (job, path) * 100 > 1
  LABELS {title="Too many HTTP 5xx errors", urgency="immediate"}
  ANNOTATIONS {description="Web path {{$labels.path}} in job {{$labels.job}} has {{$value}}% 5xx errors", summary="Too many 5xx errors in the web service"}
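
The rules above use the Prometheus 1.x rule syntax. Prometheus 2.x only accepts YAML rule files (old files can be converted with promtool), so on 2.x the server_down rule would look roughly like this sketch, with node_alerts as an assumed group name:

groups:
- name: node_alerts
  rules:
  - alert: server_down
    expr: up{job="NODE"} == 0
    for: 1m
    labels:
      title: "Server offline"
      urgency: immediate
    annotations:
      summary: "Server at IP {{ $labels.instance }} is unreachable"
      description: "This server has been unreachable for more than 1 minute; the server may be down, or the node_exporter service may have a problem"

If you switch, remember to point rule_files at the converted YAML file.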

alertmanager.yml

global:
  resolve_timeout: 1m
  smtp_from: alertmanager@example.org
  smtp_smarthost: localhost:25
  smtp_auth_username: alertmanager
  smtp_auth_password: <secret>
  smtp_auth_secret: null
  smtp_auth_identity: ""
  smtp_require_tls: true
  slack_api_url: null
  pagerduty_url: ""
  hipchat_url: ""
  hipchat_auth_token: null
  opsgenie_api_host: ""
  victorops_api_url: https://alert.victorops.com/integrations/generic/20131114/alert/
route:
  receiver: luhya
  group_by:
  - alertname
  group_wait: 1s
  group_interval: 5m
  repeat_interval: 1h
receivers:
- name: luhya
  email_configs:
  - send_resolved: false
    to: admin@com.cn
    from: alertmanager@example.org
    smarthost: localhost:25
    auth_username: alertmanager
    auth_password: <secret>
    auth_secret: null
    auth_identity: ""
    headers:
      From: alertmanager@example.org
      Subject: '{{ template "email.default.subject" . }}'
      To: admin@com.cn
    html: '{{ template "email.default.html" . }}'
    require_tls: true
  webhook_configs:
  - send_resolved: true
    url: http://127.0.0.1/portal/api/1.0/alert
templates: []
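
With this route, alerts are grouped by alertname; the first notification for a new group is sent after group_wait (1s here, which effectively disables the grouping delay), alerts that later join an existing group are batched every group_interval (5m), and still-firing alerts are re-sent every repeat_interval (1h). Since every rule above sets an urgency label, a child route can use it to tune notification frequency; a minimal sketch, assuming immediate alerts should be repeated more often (the child route inherits the luhya receiver):

route:
  receiver: luhya
  group_by: [alertname]
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  routes:
  - match:
      urgency: immediate
    repeat_interval: 30m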