Linux GPU Prometheus监控脚本
monitor.sh
GPU跨平台通用监控脚本
功能: Usage: monitor.sh fast|mem|gpu|temp|all|[pathToLog sleepTimeNum]
注意: ./monitor.sh fast速度最快
#!/bin/bash
#. /etc/profile
#. ~/.bash_profile
#. ~/.bashrc
# Pre-flight check: run nvidia-smi once and abort if it is missing or fails.
# The command is resolved through PATH, exactly like every later call in this
# script, so the check passes precisely when those calls can find the tool.
# Status messages go to stderr so that stdout carries only metric lines when
# the script's output is redirected or piped.
if nvidia-smi > /dev/null; then
  echo "nvidia-smi check pass $(date)" >&2
else
  echo 'nvidia-smi not exists' >&2
  exit 1
fi
# Print the number of GPUs reported by `nvidia-smi -L`.
# Only lines that start with "GPU " are counted, so indented MIG device lines
# and messages such as "No devices were found" do not inflate the count.
# Outputs: the GPU count on stdout (0 when nothing matches).
get_gpu_list() {
  local gpu_count
  # grep -c still prints 0 when there is no match; its non-zero status is
  # deliberately ignored.
  gpu_count=$(nvidia-smi -L | grep -c '^GPU ')
  echo "${gpu_count:-0}"
}
# Print the UUID of one GPU.
# Arguments: $1 - GPU index passed to `nvidia-smi -i`
# Outputs:   the 4th field of the first "GPU UUID : <value>" line of
#            `nvidia-smi -q`, or an empty line when no such line exists.
get_uuid() {
  local gpu_uuid
  gpu_uuid=$(nvidia-smi -q -i "$1" \
    | awk '$1 == "GPU" && $2 == "UUID" { print $4; exit }')
  echo "$gpu_uuid"
}
# Print the memory usage ratio (used / total) of one GPU.
# Arguments: $1 - GPU index passed to `nvidia-smi -i`
# Outputs:   second value divided by first value, taken from the third field
#            of the first two lines of `nvidia-smi -q -d MEMORY` that contain
#            "Total" or "Used". Prints NaN when fewer than two such lines
#            exist or the first value is zero / non-numeric, instead of
#            letting awk abort on a division by zero.
get_memory_usage() {
  nvidia-smi -q -d MEMORY -i "$1" | awk '
    /Total|Used/ {
      vals[++n] = $3
      if (n == 2) exit
    }
    END {
      if (n < 2 || vals[1] + 0 == 0) {
        print "NaN"
      } else {
        print vals[2] / vals[1]
      }
    }'
}
# Print the memory figures of one GPU as a single space-separated line.
# Arguments: $1 - GPU index passed to `nvidia-smi -i`
# Outputs:   the third field of the first three lines of
#            `nvidia-smi -q -d MEMORY` that contain "Total", "Used" or "Free",
#            in the order they appear (an empty line when none match).
get_memory_detail() {
  local mem_fields
  mem_fields=$(nvidia-smi -q -d MEMORY -i "$1" | awk '
    /Total|Used|Free/ {
      printf "%s%s", (n++ ? " " : ""), $3
      if (n == 3) exit
    }')
  echo "$mem_fields"
}
# Print the GPU utilization of one GPU.
# Arguments: $1 - GPU index passed to `nvidia-smi -i`
# Outputs:   the third field of the last line in the window made of the
#            "GPU Utilization" heading plus the five lines after it, from
#            `nvidia-smi -q -d UTILIZATION`.
# NOTE(review): presumably that line is the "Avg" sample of the
# "GPU Utilization Samples" section - confirm against the driver in use.
get_volatile_gpu() {
  local gpu_util
  gpu_util=$(nvidia-smi -q -d UTILIZATION -i "$1" \
    | grep -A 5 "GPU Utilization" \
    | tail -1 \
    | awk '{print $3}')
  echo "$gpu_util"
}
# Print the current temperature of one GPU.
# Arguments: $1 - GPU index passed to `nvidia-smi -i`
# Outputs:   the 5th field of the first "GPU Current ..." line of
#            `nvidia-smi -q -d Temperature` (the number in
#            "GPU Current Temp : <n> C"), or an empty line when absent.
get_temperature() {
  local gpu_temp
  gpu_temp=$(nvidia-smi -q -d Temperature -i "$1" \
    | awk '/GPU Current/ { print $5; exit }')
  echo "$gpu_temp"
}
# Print the identifier used for the pod label: the output of `hostname`.
get_pod_id() {
  local node_name
  node_name=$(hostname)
  echo $node_name
}
# Print one sample line in Prometheus text format:
#   <name>{podid="<pod>",gpu="<index>",uuid="<uuid>"} <value>
# Arguments:
#   $1 - metric name
#   $2 - pod id
#   $3 - GPU index
#   $4 - GPU uuid
#   $5 - sample value
output() {
  local sample
  printf -v sample '%s{podid="%s",gpu="%s",uuid="%s"}' "$1" "$2" "$3" "$4"
  # Left unquoted so that an empty value adds no trailing separator.
  echo $sample $5
}
# Print one dcgm_mem_usage sample per GPU in Prometheus text format, e.g.
#   dcgm_mem_usage{podid="localhost",gpu="0",uuid="GPU-..."} 0.25
# The GPU count and pod id do not change while the loop runs, so they are
# looked up once instead of on every iteration. Arguments to output are quoted
# so an empty uuid or value cannot shift the remaining arguments.
mem_prm() {
  local name="dcgm_mem_usage"
  local gpu_total pod_id i uuid value
  gpu_total=$(get_gpu_list)
  pod_id=$(get_pod_id)
  for ((i = 0; i < gpu_total; i++)); do
    uuid=$(get_uuid "$i")
    value=$(get_memory_usage "$i")
    output "$name" "$pod_id" "$i" "$uuid" "$value"
  done
}
# Print dcgm_fb_total, dcgm_fb_used and dcgm_fb_free samples for every GPU in
# Prometheus text format, e.g.
#   dcgm_fb_total{podid="localhost",gpu="0",uuid="GPU-..."} 16160
# The three values are the first, second and third words printed by
# get_memory_detail. The GPU count and pod id are looked up once, outside the
# loop, and all arguments to output are quoted so empty fields keep their
# position.
mem_detail_prm() {
  local gpu_total pod_id i uuid detail fb_total fb_used fb_free extra
  gpu_total=$(get_gpu_list)
  pod_id=$(get_pod_id)
  for ((i = 0; i < gpu_total; i++)); do
    uuid=$(get_uuid "$i")
    detail=$(get_memory_detail "$i")
    # Split the detail line on whitespace; anything past three words is ignored.
    read -r fb_total fb_used fb_free extra <<< "$detail"
    output "dcgm_fb_total" "$pod_id" "$i" "$uuid" "$fb_total"
    output "dcgm_fb_used" "$pod_id" "$i" "$uuid" "$fb_used"
    output "dcgm_fb_free" "$pod_id" "$i" "$uuid" "$fb_free"
  done
}
# Print one dcgm_gpu_utilization sample per GPU in Prometheus text format,
# e.g.
#   dcgm_gpu_utilization{podid="localhost",gpu="0",uuid="GPU-..."} 3
# The GPU count and pod id are looked up once, outside the loop, and all
# arguments to output are quoted so empty fields keep their position.
gpu_prm() {
  local name="dcgm_gpu_utilization"
  local gpu_total pod_id i uuid value
  gpu_total=$(get_gpu_list)
  pod_id=$(get_pod_id)
  for ((i = 0; i < gpu_total; i++)); do
    uuid=$(get_uuid "$i")
    value=$(get_volatile_gpu "$i")
    output "$name" "$pod_id" "$i" "$uuid" "$value"
  done
}
# Print one dcgm_temp sample per GPU in Prometheus text format, e.g.
#   dcgm_temp{podid="localhost",gpu="0",uuid="GPU-..."} 35
# The GPU count and pod id are looked up once, outside the loop, and all
# arguments to output are quoted so empty fields keep their position.
# NOTE(review): fast() publishes the temperature as dcgm_gpu_temp while this
# function uses dcgm_temp; the name is kept as-is here so existing queries do
# not break - confirm which one is intended.
temp_prm() {
  local name="dcgm_temp"
  local gpu_total pod_id i uuid value
  gpu_total=$(get_gpu_list)
  pod_id=$(get_pod_id)
  for ((i = 0; i < gpu_total; i++)); do
    uuid=$(get_uuid "$i")
    value=$(get_temperature "$i")
    output "$name" "$pod_id" "$i" "$uuid" "$value"
  done
}
# Run every per-metric collector in turn: memory usage ratio, memory detail,
# GPU utilization, then temperature.
allinone() {
  local collector
  for collector in mem_prm mem_detail_prm gpu_prm temp_prm; do
    "$collector"
  done
}
# Fast collector: derive every metric from a single `nvidia-smi -q` call.
#
# The report is filtered down to the lines containing
# 'Minor Number|UUID|GPU Current Temp|Gpu|Total|Used|Free'; the first word
# after the ':' of each such line is consumed positionally. The code assumes
# exactly 14 such values per GPU, used as follows:
#   slot 0  -> uuid                    slot 1  -> GPU index (minor number)
#   slot 2  -> dcgm_fb_total           slot 3  -> dcgm_fb_used
#   slot 4  -> dcgm_fb_free            slot 8  -> dcgm_gpu_utilization
#   slot 13 -> dcgm_gpu_temp           other slots are skipped
# NOTE(review): the 14-values-per-GPU layout depends on the nvidia-smi report
# format; verify it against the driver version in use.
#
# HELP/TYPE comment lines are printed only while the first GPU's values are
# being processed, i.e. once per metric.
#
# The report is piped straight into the filter instead of going through the
# fixed file /tmp/1, so concurrent runs cannot overwrite each other's data and
# no predictable temp file is left behind. All working variables are local.
#
# Globals: HOSTNAME (read) - used as the pod id label.
# Outputs: Prometheus text-format lines on stdout, via output().
fast() {
  local value metric help_text
  local slot=0      # position of the current value within one GPU's 14 values
  local seen=0      # values consumed so far; below 14 means "first GPU"
  local gpu_uuid=''
  local gpu_index=0

  # Word splitting is intentional here: every filtered line yields one word.
  for value in $(nvidia-smi -q \
    | grep -E 'Minor Number|UUID|GPU Current Temp|Gpu|Total|Used|Free' \
    | cut -d ':' -f2 \
    | awk '{print $1}'); do
    metric=''
    help_text=''
    case "$slot" in
      0) gpu_uuid=$value ;;
      1) gpu_index=$value ;;
      2)
        metric='dcgm_fb_total'
        help_text='Framebuffer memory total (in MiB).'
        ;;
      3)
        metric='dcgm_fb_used'
        help_text='Framebuffer memory used (in MiB).'
        ;;
      4)
        metric='dcgm_fb_free'
        help_text='Framebuffer memory free (in MiB).'
        ;;
      8)
        metric='dcgm_gpu_utilization'
        help_text='GPU utilization (in %).'
        ;;
      13)
        metric='dcgm_gpu_temp'
        help_text='GPU temperature (in C).'
        ;;
    esac

    if [[ -n "$metric" ]]; then
      if ((seen < 14)); then
        printf '%s\n' "# HELP $metric $help_text"
        printf '%s\n' "# TYPE $metric gauge"
      fi
      output "$metric" "${HOSTNAME}" "$gpu_index" "$gpu_uuid" "$value"
    fi

    # Wrap around after the 14th value so the next GPU starts at slot 0.
    slot=$(((slot + 1) % 14))
    seen=$((seen + 1))
  done
}
# Run a collector forever, overwriting the output file on every pass.
# Arguments:
#   $1 - name of the collector function to call (fast or allinone)
#   $2 - output file; when empty, /run/prometheus/<hostname>_dcgm.prom is used
#        and the /run/prometheus directory is created if it is missing
#   $3 - seconds to sleep between passes; defaults to 15 when empty
_run_collector_loop() {
  local collector=$1
  local outfile=$2
  local interval=${3:-15}
  if [[ -z "$outfile" ]]; then
    mkdir -p /run/prometheus
    outfile="/run/prometheus/$(hostname)_dcgm.prom"
  fi
  while true; do
    "$collector" > "$outfile"
    sleep "$interval"
  done
}

# Command-line dispatch.
#   help | mem | gpu | temp | fast | all   print once to stdout and exit
#   onebyone [pathToLog [sleepTimeNum]]    loop forever using allinone
#   [pathToLog [sleepTimeNum]]             loop forever using fast (default)
# In the onebyone mode the path and interval are the arguments AFTER the mode
# word ($2 and $3); $1 is the mode word itself and is never used as a path.
case "$1" in
  "help")
    echo 'Useage: monitor.sh fast|mem|gpu|temp|all|[pathToLog sleepTimeNum]'
    ;;
  "mem")
    mem_prm
    mem_detail_prm
    ;;
  "gpu")
    gpu_prm
    ;;
  "temp")
    temp_prm
    ;;
  "fast")
    fast
    ;;
  "all")
    allinone
    ;;
  "onebyone")
    _run_collector_loop allinone "$2" "$3"
    ;;
  *)
    _run_collector_loop fast "$1" "$2"
    ;;
esac