简述
2 zabbix服务搭建
非本文重点,略过
3MegaCli工具安装
yum -y install MegaCli
4功能实现
5阈值设置
Media Error Count on Every Disk <=30
Other Error Count on Every Disk <=1000
Predictive Failure Count On Every Disk <=2
Firmware State on Every Disk !=Unconfigured(bad),Failed
Raid Level State != Degraded
6硬盘自动发现
zabbix提供的自发现接口需要使用json格式
# Build the Zabbix low-level-discovery JSON: one {#SLOT_NUM} entry for every
# line of the MegaCli64 -PDlist output that contains "Slot" (the value is the
# last field of that line).
# awk prints ",\n" in front of every entry, so the first entry is preceded by a
# line holding only ","; the trailing sed removes exactly that line.
# The extra $1 passed to printf has no matching %s and is not printed.
MegaCli64 -PDlist -aAll -NoLog|grep Slot|awk 'BEGIN{printf "{\"data\":[\n\n"} {printf ",\n{ \"{#SLOT_NUM}\":\"%s\"}", $NF, $1;} END{ printf "\n\t]\n}\n";}' | sed '/^,$/d'
执行代码后格式如下
#MegaCli64 -PDlist -aAll -NoLog|grep Slot|awk 'BEGIN{printf "{\"data\":[\n\n"} {printf ",\n{ \"{#SLOT_NUM}\":\"%s\"}", $NF, $1;} END{ printf "\n\t]\n}\n";}' | sed '/^,$/d'
{"data":[
{ "{#SLOT_NUM}":"0"},
{ "{#SLOT_NUM}":"1"},
{ "{#SLOT_NUM}":"2"},
{ "{#SLOT_NUM}":"3"},
{ "{#SLOT_NUM}":"4"},
{ "{#SLOT_NUM}":"5"},
{ "{#SLOT_NUM}":"6"},
{ "{#SLOT_NUM}":"7"},
{ "{#SLOT_NUM}":"8"},
{ "{#SLOT_NUM}":"9"},
{ "{#SLOT_NUM}":"10"},
{ "{#SLOT_NUM}":"11"}
	]
}
7硬盘数据收集脚本
cat diskcheck_megacli.sh
#!/bin/bash
# Zabbix helper script: collects physical-disk health data from MegaCli.
# By xiangjunyu 20151101
#
# Arguments:
#   $1 - option to report (read further down in the script)
#   $2 - slot number of the disk to report on (stored in SLOT_NUM)
# Files written:
#   /tmp/pdinfo.txt     - filtered attribute dump of all drives on adapter 0
#   /tmp/pdinfo<slot>   - the attribute lines of the drive in slot <slot>
. ~/.bash_profile > /dev/null
# Dump the attributes of interest for every physical drive on adapter 0.
# The sed strips the "[0x... Sectors]" suffix from the "Raw Size" line.
/opt/MegaRAID/MegaCli/MegaCli64 -Pdlist -a0|grep -Ei '(Slot Number|Media Error Count|Other Error Count|Predictive Failure Count|Raw Size|Firmware state)'|sed -e "s:\[0x.*Sectors\]::g" >/tmp/pdinfo.txt
# Write one file per drive, named after the drive's real slot number.
# Keying on the "Slot Number" value (rather than on the position of the drive
# in the listing) keeps /tmp/pdinfo<slot> correct when slot numbers are not
# contiguous or do not start at 0, and no separate drive count is needed.
awk -F: '
  /^[[:space:]]*Slot Number/ {
    if (out != "") close(out)
    slot = $2
    gsub(/[^0-9]/, "", slot)
    out = (slot == "") ? "" : "/tmp/pdinfo" slot
  }
  out != "" { print > out }
' /tmp/pdinfo.txt
# Slot number requested by the caller; selects which /tmp/pdinfo<slot> is read.
SLOT_NUM=$2
#######################################
# Parse the attribute file of one physical disk into global variables.
# Globals:
#   SLOT_NUM (read)  - suffix of the file to parse: /tmp/pdinfo${SLOT_NUM}
#   SLOTNUMNAME/SLOTNUM, MECNAME/MEC, OECNAME/OEC, PFCNAME/PFC,
#   RAWNAME/SIZE, FIRMWARENAME/FIRMWARESTATUS (written) - label and value
#   of the "Slot", "Media", "Other", "Predictive", "Raw" and "Firmware"
#   lines respectively.
# Each line is treated as "label: value"; only the text between the first
# and the second colon is taken as the value. Labels and values are stored
# with surrounding whitespace removed, so callers can compare them exactly.
# Returns: non-zero if the file cannot be opened.
#######################################
DATAFORMATE()
{
  local line name value
  # "|| [[ -n ... ]]" keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "${line}" ]]; do
    name="${line%%:*}"
    if [[ "${line}" == *:* ]]; then
      value="${line#*:}"
      value="${value%%:*}"
    else
      value=""
    fi
    # Trim leading, then trailing, whitespace of label and value.
    name="${name#"${name%%[![:space:]]*}"}"
    name="${name%"${name##*[![:space:]]}"}"
    value="${value#"${value%%[![:space:]]*}"}"
    value="${value%"${value##*[![:space:]]}"}"
    case "${name}" in
      Slot*)
        SLOTNUMNAME="${name}"
        SLOTNUM="${value}"
        ;;
      Media*)
        MECNAME="${name}"
        MEC="${value}"
        ;;
      Other*)
        OECNAME="${name}"
        OEC="${value}"
        ;;
      Predictive*)
        PFCNAME="${name}"
        PFC="${value}"
        ;;
      Raw*)
        RAWNAME="${name}"
        SIZE="${value}"
        ;;
      Firmware*)
        FIRMWARENAME="${name}"
        FIRMWARESTATUS="${value}"
        ;;
    esac
  done < "/tmp/pdinfo${SLOT_NUM}"
}
#######################################
# Report whether any logical drive is degraded.
# Outputs: exactly one line on stdout: -1 if the MegaCli -LDInfo output
#          contains "Degraded", otherwise 0.
#######################################
CHECKRAIDLEVEL()
{
  # grep -q: only the exit status is wanted. Without -q the matching line
  # itself would be printed ahead of the -1 and pollute the item value.
  if /opt/MegaRAID/MegaCli/MegaCli64 -LDInfo -Lall -aALL | grep -q Degraded; then
    echo -1
  else
    echo 0
  fi
}
# Dispatch on the requested option ($1) and print a single value for Zabbix.
#   mec / oec / pfc - Media / Other / Predictive Failure count of the disk
#   firm            - -1 if the disk is Unconfigured(bad) or Failed, else 0
#   rdlevel         - output of CHECKRAIDLEVEL
OPTION=$1
case "${OPTION}" in
  mec)
    DATAFORMATE
    # Deliberately unquoted: word splitting drops any padding around the value.
    echo ${MEC}
    ;;
  oec)
    DATAFORMATE
    echo ${OEC}
    ;;
  pfc)
    DATAFORMATE
    echo ${PFC}
    ;;
  firm)
    DATAFORMATE
    # Substring match, so padding around the state text does not matter.
    case "${FIRMWARESTATUS}" in
      *"Unconfigured(bad)"*|*Failed*)
        echo -1
        ;;
      *)
        echo 0
        ;;
    esac
    ;;
  rdlevel)
    CHECKRAIDLEVEL
    ;;
  *)
    # Single quotes keep the $slot_num placeholders literal in the usage text.
    echo 'Please select option: mec $slot_num ;oec $slot_num;pfc $slot_num;firm $slot_num;rdlevel'
    ;;
esac
# Remove the per-disk temporary files created earlier in the script.
rm -rf /tmp/pdinfo*
8zabbix_agent配置
Include=/etc/zabbix/zabbix_agentd.conf.d/
UnsafeUserParameters=1
echo 'zabbix ALL=(ALL) NOPASSWD: ALL' > /etc/sudoers.d/zabbix
# Physical-disk low-level discovery (emits the {#SLOT_NUM} JSON)
UserParameter=raid.pd.discovery,MegaCli64 -PDlist -aAll -NoLog|grep Slot|awk 'BEGIN{printf "{\"data\":[\n\n"} {printf ",\n{ \"{#SLOT_NUM}\":\"%s\"}", $NF, $1;} END{ printf "\n\t]\n}\n";}' | sed '/^,$/d'
# Each definition must be on ONE line: key,command.
# The [*] suffix makes the key accept parameters, so $1 is replaced by the
# slot number given in the item key, e.g. raid.phy.mec[3].
# NOTE(review): MegaCli usually needs root privileges - confirm how the agent
# user is allowed to run these commands.
# Media Error Count of the disk in slot $1
UserParameter=raid.phy.mec[*],/opt/zabbix-2.4.1/externalscripts/diskcheck_megacli.sh mec $1
# Other Error Count of the disk in slot $1
UserParameter=raid.phy.oec[*],/opt/zabbix-2.4.1/externalscripts/diskcheck_megacli.sh oec $1
# Predictive Failure Count of the disk in slot $1
UserParameter=raid.phy.pfc[*],/opt/zabbix-2.4.1/externalscripts/diskcheck_megacli.sh pfc $1
# Firmware state of the disk in slot $1; returns -1 on a faulty disk
UserParameter=raid.phy.firms[*],/opt/zabbix-2.4.1/externalscripts/diskcheck_megacli.sh firm $1
# RAID array state; returns -1 when an array is degraded
UserParameter=raid.level.state,/opt/zabbix-2.4.1/externalscripts/diskcheck_megacli.sh rdlevel
9 Zabbix Server 配置
9.1 新建模板
新建一个zabbix模板Raid.Phy.Megacli
Template name:Raid.Phy.Megacli
9.2 在模板中新建一个Discovery rule
Name:Physical disk discovery
Type:Zabbix agent(active)
Key:raid.pd.discovery
Update interval (in sec):3600
Keep lost resources period (in days):30
Description:Find physical disk
Enabled: ✔
9.3 在Discovery rule中新建Item
按照自己需要建立Item,我这里建四个
Media Error Count On Slot {#SLOT_NUM}
Other Error Count On Slot {#SLOT_NUM}
Predictive Error Count On Slot {#SLOT_NUM}
Firmware State On Slot {#SLOT_NUM}
这里列出一个Item详细参数
Name:Media Error Count On Slot $1
Type:Zabbix agent(active)
Key:raid.phy.mec[{#SLOT_NUM}] #这里的key注意和disk.conf里的匹配
Applications:MegaRaid #自己新建一个Application
Enabled: ✔
其余的默认即可
9.4 在Discovery rule中新建Trigger
Name:{HOST.NAME}Error Count On Slot {#SLOT_NUM}
Expression:{Raid.Phy.Megacli:raid.phy.mec[{#SLOT_NUM}].last(#1,0)}>30 or {Raid.Phy.Megacli:raid.phy.oec[{#SLOT_NUM}].last(#1,0)}>1000 or {Raid.Phy.Megacli:raid.phy.pfc[{#SLOT_NUM}].last(#1,0)}>2
Description:Media Error Count >30 Other Error Count >1000 Predictive Failure Count >2
Severity:Average #根据自己想要的告警等级设定
Enabled: ✔
其余的默认即可
Name:{HOST.NAME}Firmware State On Slot {#SLOT_NUM}
Expression:{Raid.Phy.Megacli:raid.phy.firms[{#SLOT_NUM}].last(#1,0)}=-1
Severity:Average #根据自己想要的告警等级设定
Enabled: ✔
其余的默认即可
9.5 监控raid等级状态Item
在Raid.Phy.Megacli模板中,新建一个Item
Name:Raid State
Type:Zabbix agent(active)
Key:raid.level.state #这里的key注意和disk.conf里的匹配
Applications:MegaRaid #自己新建的Application
Enabled: ✔
其余的默认即可
9.6 监控Raid等级Triggers
在Raid.Phy.Megacli模板中,新建一个Trigger
Name:{HOST.NAME}Raid State
Expression:{Raid.Phy.Megacli:raid.level.state.last(#1,0)}=-1
Severity:Average #根据自己想要的告警等级设定
Enabled: ✔
其余的默认即可
原创:余祥军