Elves 自动化运维
2020-01-21 本文已影响0人
柘龙义
为什么需要自动化运维
# 大批量机器靠人工运维有以下弊端(以配置更新为例):
# 运维时间长: 每一台都需要远程上去更改
# 容易出错: 人工输入命令极其容易出错
# 结果反馈不明显: 需要靠人工自己判断
# 回退麻烦: 出错回退无法保证
# ps: 离职交接的时候极其繁琐
为什么选用Elves
# 安装简单
# 界面管理
# 扩展性极其强悍(使用编程)
# 上手快
# 运维范围广
# 安全管理
Elves安装
# 没有安装docker可以安装(默认的docker版本会比较低),已经有docker跳过这一步
centos:
yum install -y yum-utils device-mapper-persistent-data lvm2
yum-config-manager \
--add-repo \
https://download.docker.com/linux/centos/docker-ce.repo
yum makecache fast
yum install -y --setopt=obsoletes=0 \
docker-ce-18.06.1.ce-3.el7
# 没有安装docker-compose的可以安装,已经有docker-compose跳过这一步
centos: yum install -y docker-compose
# docker安装 Elves
git clone https://github.com/elves-project/docker.git
cd docker
chmod u+x ./control
./control build # 下载Base镜像并构建新镜像。 心细的小伙伴可以自己把镜像tag成自己的,再安装避免rebuild。
./control start # 调用docker-compose启动各容器,也可以docker-compose up -d.
./control insertsql # 插入Elves 数据表结构. 失败了可以手动执行,手动查看标注
./control restart # 组件依赖mysql,重启容器刷新程序.
# 为了正常执行,更改ftp目录权限
docker exec -it vsftp bash # 进入容器
chown -R ftpuser:ftpuser /data/ # 修改权限
# 标注
Elves-Dashboard页面端口: 8004
Elves-supervisor页面端口:9092 ; user/password: admin@gyyx.cn/admin
Rabbitmq 页面端口:15672 ; user/password: admin/1q2w3e4r
Nginx 页面端口:80
Ftp 端口:21 ; user/passwd: ftpuser/1q2w3e4r
使用Elves
注册主机
git clone https://github.com/elves-project/agent.git
cd agent
cp conf/cfg.example.json conf/cfg.json
vi conf/cfg.json # 更改配置ip,asset以及服务器的配置
chmod u+x ./control
./control start # 在服务界面上查看添加结果,可能需要几分钟同步心跳。
访问 Elves-supervisor: 本机: 127.0.0.1:9092
elves-supervisor # 如上图:
# agent列表: 注册主机的列表
# app管理: 我们运维逻辑,zip包管理
# auth管理: app 密钥,使用app的时候需要验证
上传app
app# 如图:
# 点击右上角 + 添加
# 添加完成后点app右边的编辑图标(橙色小笔)上传app包(zip,制作参考 “app 制作”)
# 上传成功后点击app中间(淡蓝色)选择版本启用
# 启用成功后点击app最右边(深蓝色)选择需要运维的主机
# 之后到auth管理设置app的秘钥
# 之后通过命令启用即可自动运维(启动命令参看 “app 启动”)
app (以machineCheck,开发语言选python2 为例)
- machineCheck 检测主机进程的线程,且这里不能使用python3,因为其他机器大概率用的是python2,不然会有版本问题
# 需要特定的目录结构(这里选择实时反馈型)
tree machineCheck
├── appcfg.json
├── app-worker.py
└── machineCheck.py
appcfg.json 配置文件
{
"Processor":{
"Commnet" : "This Is Processor CFG , Do Not Use For Other",
"Addr" : "127.0.0.1",
"Port" : 10010,
"Timeout" : 0
}
}
app-worker.py app的入口
#!/usr/bin/python
# coding=utf-8
# Author: toryzen
#
# app worker入口
import sys
import json
import base64
import os
import traceback
# Make modules that sit next to this script importable. sys.path entries must
# be directories, so add the script's folder rather than the script file.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
def agentExec(app,func,jsonParam=""):
    """Import an app module, call one of its methods and print the outcome.

    The module named ``app`` is imported, the class with the same name is
    looked up inside it and instantiated, and its ``func`` method is called
    with the decoded parameter. That method must accept one argument and
    return a ``(flag, result)`` pair.

    Args:
        app: Name of the module to import and of the class inside it.
        func: Name of the method to call on the class instance.
        jsonParam: Optional base64-encoded JSON text. When it is empty the
            method receives an empty string.

    Any exception raised while decoding, importing or calling is caught and
    reported through elvesPrint as flag "false" with the traceback as result.
    """
    try:
        param = ""
        if jsonParam != "":
            # Decode the payload to text and parse it directly. Slicing the
            # quotes off repr() only works for payloads that repr() does not
            # escape (no quotes, backslashes or non-ASCII bytes).
            param = json.loads(base64.b64decode(jsonParam).decode("utf-8"))
        agentObj = __import__(app)
        agentClass = getattr(agentObj, app)  # class that holds the app logic
        obj = agentClass()
        mtd = getattr(obj, func)  # method requested by the caller
        flag, result = mtd(param)  # app methods must accept the param argument
    except Exception:
        # Boundary of the worker: report every failure instead of crashing.
        flag, result = "false", traceback.format_exc()
    elvesPrint(flag, result)
def elvesPrint(flag,result):
    """Print the flag and result wrapped in the Elves worker tags.

    Args:
        flag: Outcome marker; converted with str().
        result: Payload or error text; converted with str().
    """
    # A single parenthesised argument prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print("<ElvesWFlag>"+str(flag)+"</ElvesWFlag> <ElvesWResult>"+str(result)+"</ElvesWResult>")
if __name__ == '__main__':
    # Expected command line: <app> <func> [base64-encoded JSON param].
    cli_args = sys.argv[1:]
    if len(cli_args) in (2, 3):
        agentExec(*cli_args)
    else:
        elvesPrint("false", "param error")
machineCheck.py 实现逻辑
#!/usr/bin/python
# coding: utf-8
import logging
import socket
import traceback
import commands
import os
import json
# Make sure the log directory exists before a file handler is opened in it.
if not os.path.isdir('/var/log/elves/machinecheck'):
    os.makedirs('/var/log/elves/machinecheck')

# Module logger: INFO and above, appended to info.log as "<time> - <message>".
formatter = logging.Formatter('%(asctime)s - %(message)s')
handler = logging.FileHandler('/var/log/elves/machinecheck/info.log')
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
logger.addHandler(handler)
class machineCheck:
    """Elves app that reports processes owning too many threads.

    The class name has to match the prefix of the uploaded zip package,
    because the worker looks up a class named after the app.
    """

    @staticmethod
    def threads_over(limit=1500):
        """Return the pids that appear more than ``limit`` times in ``ps -xH``.

        The thread count of a process is taken as the number of ps output
        lines that start with its pid.

        Args:
            limit: Threshold a pid's line count must exceed to be reported.

        Returns:
            List of pid strings, in no particular order.
        """
        pids = commands.getoutput("ps -xH|awk '{ print $1}'").split('\n')
        # Tally every pid in one pass instead of calling list.count() once
        # per distinct pid.
        line_counts = {}
        for pid in pids:
            line_counts[pid] = line_counts.get(pid, 0) + 1
        return [pid for pid, count in line_counts.items() if count > limit]

    @staticmethod
    def write_log(flag, result):
        """Write the outcome to the module logger.

        Any flag other than 'success' or 'error' is logged as a generic
        execution failure.
        """
        message = 'status: %s, message: %s' % (flag, result)
        if flag not in ('success', 'error'):
            message = 'status: error, message: Function exec failed!'
        logger.info(message)

    def check(self, params=""):
        """Check thread counts, alert when needed and report the outcome.

        Args:
            params: Unused; present because the worker always passes one
                argument to the app method.

        Returns:
            A ``(flag, text)`` tuple where flag is "true" for a healthy host
            and "false" otherwise, and text is
            ``'status: <success|error>, message: <detail>!'``.
        """
        return_flag, return_result = ('error', 'Internal Error!')
        try:
            over_threads = machineCheck.threads_over()
            ip = socket.gethostbyname(socket.getfqdn(socket.gethostname()))
            if over_threads:
                return_flag, return_result = ('error', 'have process\' threads more than 1500! Check /var/log/elves/machinecheck/error')
                over_threads_command = ["pid: " + pid + commands.getoutput('cat /proc/%s/cmdline' % pid) + '\n' for pid in over_threads]
                with open('/var/log/elves/machinecheck/error', 'a') as f:
                    json.dump(over_threads_command, f)  # keep the offending command lines
                # Push an alert to the DingTalk robot webhook.
                os.system("""curl 'https://oapi.dingtalk.com/robot/send?access_token=xxxxxx' -H 'Content-Type: application/json' -d '{"msgtype": "text","text": {"content": "threads too mush in %s"}}'""" % ip)
            else:
                return_flag, return_result = ('success', 'host is healthy')
        except Exception:
            return_flag, return_result = ('error', traceback.format_exc())

        # Single exit point: the alarm branch falls through to here, so the
        # caller always gets a (flag, text) tuple and no return statement
        # sits inside a finally clause.
        machineCheck.write_log(return_flag, return_result)
        return_result = 'status: %s, message: %s!' % (return_flag, return_result)
        if return_flag == "success":
            return_flag = "true"
        else:
            return_flag = "false"
        return (return_flag, return_result)
if __name__ == '__main__':
    # Running this module directly does nothing.
    pass
制作zip包
cd machineCheck
zip ../machineCheck_1.0.zip * # 这里1.0是版本,elves会自己获取。
chown 1000:1000 machineCheck_1.0.zip # 让浏览器可以加载。
app 启动
这里还是以machineCheck为例,我写成了python3脚本调用。启动需要访问openapi(统一入口,8080端口),api详情查看 “Elves Api”
签名
# 调用前需要普及个概念,Elves交互时的签名认证
# Elves 签名使用md5签名.
# 拼凑签名字段: 请求路径 + ? + 参数(按字母排序,并且不带sign_type和sign) + auth_key(在服务端9092端口的auth管理界面)
# 签名: hashlib.md5(签名字段.encode('utf-8')).hexdigest() # python中,下面案例详细讲解
运行app进行测试
machineCheck.py
#!/usr/bin/env python3
# coding: utf-8
import os
import json
import time
import hashlib
import requests
# Where the request goes.
server_url = 'http://127.0.0.1:8080'
path = '/api/v2/rt/exec'

# What to run and on which host.
ip = '10.1.9.173'  # ip of the host to operate on
app = 'machineCheck'
func = 'check'
param = ''

# Credentials used for signing.
auth_id = '0906DDE6518477A8'  # id shown on the auth management page
authkey = 'FF6DB1AB43393D3F'  # key shown on the auth management page
sign_type = "MD5"
def get_sign():
    """Build the query string for the real-time exec call and sign it.

    The signed text is the request path, "?", the parameters in alphabetical
    order (without sign_type and sign) and finally the auth key; the
    signature is the hex MD5 digest of that text encoded as UTF-8.

    Returns:
        (params, sign): the query string without sign_type/sign, and the
        signature.
    """
    param_json = json.dumps(param)
    timestamp = int(time.time())
    # Every field is joined with a literal "&"; the keys must read exactly
    # "param" and "timestamp" for the signature to verify.
    params = "app=%s&auth_id=%s&func=%s&ip=%s&param=%s&timestamp=%s" % (app, auth_id, func, ip, param_json, timestamp)
    paths = path + '?' + params
    sign = hashlib.md5((paths + authkey).encode('utf-8')).hexdigest()
    return params, sign
def send_bytes(sign, params):
    """POST the signed request to the server and return the response text."""
    query = "?%s&sign_type=MD5&sign=%s" % (params, sign)
    reply = requests.post(server_url + path + query)
    return reply.text
if __name__ == "__main__":
    # Sign the request, send it and show the server's answer.
    query, signature = get_sign()
    answer = send_bytes(signature, query)
    print(answer)
使用定时任务来启动
add-cron.py
#!/usr/bin/env python3
import os
import json
import time
import hashlib
import requests
import sys
import json
import subprocess
# Base URL of the server; written as a canonical dotted-quad address.
server_url = "http://127.0.0.1:8080"
# The target host ip is taken from the first command-line argument.
if len(sys.argv) > 1:
    ip = sys.argv[1]
else:
    ip = None
    print('Error: exp. python x.py $IP')
    sys.exit(-1)

# Request description.
path = "/api/v2/cron/add"  # request path
app = "machineCheck"  # app name
func = "check"  # method to call
rule = "0 0 */1 * * ?"  # cron rule
mode = "NP"

# Signing material.
auth_id = "0906DDE6518477A8"  # id shown on the auth management page
authkey = "FF6DB1AB43393D3F"  # key shown on the auth management page
timestamp = int(time.time())  # request timestamp
sign_type = 'MD5'
def get_sign():
    """Build the query string for the cron-add call and sign it.

    The signed text is the request path, "?", the parameters in alphabetical
    order (without sign_type and sign) and finally the auth key; the
    signature is the hex MD5 digest of that text encoded as UTF-8.

    Returns:
        (params, sign): the query string without sign_type/sign, and the
        signature.
    """
    # Every field is joined with a literal "&"; the last key must read
    # exactly "timestamp" for the signature to verify.
    params = "app=%s&auth_id=%s&func=%s&ip=%s&mode=%s&rule=%s&timestamp=%s" % (app, auth_id, func, ip, mode, rule, timestamp)
    paths = path + '?' + params
    sign = hashlib.md5((paths + authkey).encode('utf-8')).hexdigest()
    return params, sign
def send_bytes(sign, params):
    """POST the signed cron-add request and return the response text."""
    query = "?%s&sign_type=MD5&sign=%s" % (params, sign)
    reply = requests.post(server_url + path + query)
    return reply.text
if __name__ == "__main__":
    params, sign = get_sign()
    message = send_bytes(sign, params)
    print('Add cron:', message)

    # A rejected request may carry no "result" object. Stop here instead of
    # recording and starting a cron whose id is None.
    cron_id = (json.loads(message).get('result') or {}).get('id')
    if cron_id is None:
        print('Error: no cron id in the response')
        sys.exit(-1)

    # Record "<cron_id> <ip>" locally. Writing the file directly keeps the
    # command-line supplied ip away from a shell.
    with open('./cron_id', 'a') as f:
        f.write('%s %s\n' % (cron_id, ip))

    # Start the cron through the sibling script. An argument list avoids
    # shell parsing of the id; stderr is folded into the captured output.
    started = subprocess.run(
        [sys.executable, './start-cron.py', str(cron_id)],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    output = started.stdout
    if output.endswith('\n'):
        output = output[:-1]
    print('Start cron:', output)
start-cron.py
#!/usr/bin/env python3
import os
import json
import time
import hashlib
import requests
import sys
# The id of the cron to start is taken from the first command-line argument.
if len(sys.argv) > 1:
    cron_id = sys.argv[1]
else:
    cron_id = None
    print('Error: exp. python x.py $cron_id')
    sys.exit(-1)

# Request description.
server_url = 'http://127.0.0.1:8080'
path = '/api/v2/cron/start'

# Signing material.
auth_id = '0906DDE6518477A8'
authkey = 'FF6DB1AB43393D3F'
timestamp = int(time.time())
sign_type = "MD5"
def get_sign():
    """Build the query string for the cron-start call and sign it.

    The signed text is the request path, "?", the parameters in alphabetical
    order (without sign_type and sign) and finally the auth key; the
    signature is the hex MD5 digest of that text encoded as UTF-8.

    Returns:
        (params, sign): the query string without sign_type/sign, and the
        signature.
    """
    # Every field is joined with a literal "&"; the last key must read
    # exactly "timestamp" for the signature to verify.
    params = "auth_id=%s&cron_id=%s&timestamp=%s" % (auth_id, cron_id, timestamp)
    paths = path + '?' + params
    sign = hashlib.md5((paths + authkey).encode('utf-8')).hexdigest()
    return params, sign
def send_bytes(sign, params):
    """POST the signed cron-start request and return the response text."""
    query = "?%s&sign_type=MD5&sign=%s" % (params, sign)
    reply = requests.post(server_url + path + query)
    return reply.text
if __name__ == "__main__":
    # Sign the request, send it and show the server's answer.
    query, signature = get_sign()
    answer = send_bytes(signature, query)
    print(answer)
标注
# 签名的时候一定要保证参数齐全且顺序排列正确,不然签名验证不通过
# cron规则如果报mysql字段范围错误,可以手动更改mysql字段
alter table task_cron modify column mode varchar(6) not null;
# cron 规则遵循quartz cron,和linux cron有区别,周那里使用? 代替
参考文献
Elves 官网: [https://gy-games.gitbooks.io/elves/module/elves-app.html](https://gy-games.gitbooks.io/elves/module/elves-app.html)
Elves-Api: [https://gy-games.gitbooks.io/elves/api.html](https://gy-games.gitbooks.io/elves/api.html)
Elves-docker: [https://github.com/elves-project/docker](https://github.com/elves-project/docker)