Skip to content

Commit

Permalink
Related-Bug: #1449230. Fix provides galera node recovery on node / ra…
Browse files Browse the repository at this point in the history
…ck / power / network failures and data consistency after the failures

Related-Bug: #1463538. Add cmon failure-domain cluster discovery so that cmon (the cluster monitor) can detect and monitor MySQL.

Change-Id: I96b3f786a6858ec75bcedeb493e8be836a172f4c
  • Loading branch information
sanju-a committed Jun 10, 2015
1 parent 06e5772 commit ad0d54c
Show file tree
Hide file tree
Showing 2 changed files with 167 additions and 30 deletions.
Expand Up @@ -56,6 +56,12 @@ lock() {
|| return 1
}

# Release a named lock by deleting its lock file.
# Globals:   LOCKFILE_DIR (read) - directory that holds the lock files
# Arguments: $1 - lock name; the file removed is $LOCKFILE_DIR/<name>.lock
# The removal is handed to a background subshell, so the function returns
# without waiting for the file to disappear.
unlock() {
  local name=$1
  local target="${LOCKFILE_DIR}/${name}.lock"
  ( exec rm -rf "$target" ) &
}

eexit() {
local error_str="$@"
echo $error_str
Expand Down Expand Up @@ -83,29 +89,32 @@ log_info_msg() {

galera_check()
{
$MYSQL_BIN -u $MYSQL_USERNAME -p${MYSQL_PASSWORD} -e "$MYSQL_WSREP_STATE" 2> >( cat <() > $STDERR_FILE )
$MYSQL_BIN --connect_timeout 5 -u $MYSQL_USERNAME -p${MYSQL_PASSWORD} -e "$MYSQL_WSREP_STATE" 2> >( cat <() > $STDERR_FILE )
error=`cat ${STDERR_FILE} | awk '{print $1}'`
if [[ $error == "ERROR" ]]; then
checkNKill
fi
(exec rm -rf $STDERR_FILE)&
for (( i=0; i<${DIPS_SIZE}; i++ ))
do
wval=$($MYSQL_BIN -u $MYSQL_USERNAME -p${MYSQL_PASSWORD} -h ${DIPS[i]} -e "$MYSQL_WSREP_STATE" | awk '{print $2}' | sed '1d')
cval=$($MYSQL_BIN -u $MYSQL_USERNAME -p${MYSQL_PASSWORD} -h ${DIPS[i]} -e "$MYSQL_CLUSTER_STATE" | awk '{print $2}' | sed '1d')
if [[ $wval == $SYNCED ]] & [[ $cval == $STATUS ]]; then
status=$(ping -c 1 -w 1 -W 1 -n ${DIPS[i]} | grep packet | awk '{print $6}' | cut -c1)
if [[ $status == 0 ]]; then
wval=$($MYSQL_BIN --connect_timeout 5 -u $MYSQL_USERNAME -p${MYSQL_PASSWORD} -h ${DIPS[i]} -e "$MYSQL_WSREP_STATE" | awk '{print $2}' | sed '1d')
cval=$($MYSQL_BIN --connect_timeout 5 -u $MYSQL_USERNAME -p${MYSQL_PASSWORD} -h ${DIPS[i]} -e "$MYSQL_CLUSTER_STATE" | awk '{print $2}' | sed '1d')
fi
if [[ $wval == $SYNCED ]] & [[ $cval == $STATUS ]]; then
ret="y"
break
else
ret="n"
fi
fi
done
echo $ret
}
checkNKill()
{
$CMON_MON_STOP
#$CMON_MON_STOP
cmonpid=$(pidof cmon)
if [ -n "$cmonpid" ]; then
Expand Down Expand Up @@ -165,15 +174,9 @@ log_info_msg "Bootstraping galera cluster."
cmd="service mysql start --wsrep_recover"
log_info_msg "Starting mysql recovery: $cmd"
setsid $cmd >> $LOGFILE
if [ -f $GRA_FILE ] && [ ! -s "$GRA_FILE" ]; then
uuid=$(cat $GRA_FILE | grep uuid | awk '{print $2}')
gtid=$(grep "Recovered position: $uuid" /var/log/mysql/error.log | awk '{print $7}' | cut -d ":" -f 2 | tail -1)
echo $gtid > $GTID_FILE
else
log_info_msg "$GRA_FILE not found. Recover mysql without grastate"
gtid=$(grep "Recovered position: " /var/log/mysql/error.log | awk '{print $7}' | cut -d ":" -f 2 | tail -1)
echo $gtid > $GTID_FILE
fi
log_info_msg "Recover mysql GTID"
gtid=$(grep "Recovered position: " /var/log/mysql/error.log | awk '{print $7}' | cut -d ":" -f 2 | tail -1)
echo $gtid > $GTID_FILE
fi
if [[ $galchk == "y" ]]; then
log_info_msg "One of the galera cluster node is up. Cluster monitor will initialize the galera cluster."
Expand All @@ -192,10 +195,10 @@ if [[ $boot == $DONOR ]]; then
log_info_msg "bootstrapping this instance of mysql as DONOR based on the GTID"
cmd="service mysql start --wsrep_cluster_address=gcomm://"
setsid $cmd >> $LOGFILE
(ssh -o StrictHostKeyChecking=no $VIP "$CMON_STOP")&
(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 $VIP "$CMON_STOP")&
else
bootstrap
$CMON_MON_START
#(exec $CMON_MON_START)&
fi
if [ -f $SST_FILE ]; then
Expand All @@ -205,6 +208,7 @@ fi
if [ -f $GRA_FILE ]; then
(exec rm -rf "$GRA_FILE")&
fi
unlock $PROGNAME
}
main
159 changes: 146 additions & 13 deletions contrail_provisioning/openstack/scripts/contrail-cmon-monitor.sh
Expand Up @@ -12,15 +12,26 @@ RUN_STATE="isrunning"
CMON_SVC_CHECK=$(pgrep -xf '/usr/local/cmon/sbin/cmon -r /var/run/cmon')
RUN_CMON="service cmon start"
STOP_CMON="service cmon stop"
RESTART_CMON="service cmon restart"
mysql_host=$VIP
mysql_port=33306
MYSQL_SVC_CHECK="service mysql status"
MYSQL_SVC_STOP="service mysql stop"
HAP_RESTART="service haproxy restart"
cmon_run=0
viponme=0
eviponme=0
haprestart=0
galerastate=0
recluster=false
RMQ_MONITOR="/opt/contrail/bin/contrail-rmq-monitor.sh"
RMQ_MONITOR_STOP="/opt/contrail/bin/contrail-rmq-monitor.sh STOP"
rstcnt="/tmp/ha-chk/rmq-rst-cnt"
numrst="/tmp/ha-chk/rmq-num-rst"
cleanuppending="/tmp/ha-chk/rmq_mnesia_cleanup_pending"
MYID="/etc/contrail/galeraid"
cmonerror="CmonCron could not initialize"
cmonlog="/var/log/cmon.log"

NOVA_SCHED_CHK="supervisorctl -s unix:///tmp/supervisord_openstack.sock status nova-scheduler"
NOVA_CONS_CHK="supervisorctl -s unix:///tmp/supervisord_openstack.sock status nova-console"
Expand Down Expand Up @@ -52,6 +63,13 @@ SYNCED=4
STATUS="Primary"
RMQ_SRVR_STATUS="supervisorctl -s unix:///tmp/supervisord_support_service.sock status rabbitmq-server"
RMQ_SRVR_RST="supervisorctl -s unix:///tmp/supervisord_support_service.sock restart rabbitmq-server"
RECLUSTER="/opt/contrail/bin/contrail-bootstrap-galera.sh"
ERROR2002="Can't connect to local MySQL server"
ERROR1205="Lock wait timeout exceeded"
STDERR="/tmp/galera-chk/stderr"
RECLUSTRUN="/tmp/galera/recluster"
RMQSTOP="/tmp/ha-chk/rmqstopped"
cmondisco="/etc/mysql/.cmondiscoinit"

timestamp() {
date
Expand All @@ -72,6 +90,11 @@ log_info_msg() {
echo "$(timestamp): INFO: $msg" >> $LOGFILE
}

# Failure thresholds derived from the number of configured database IPs:
#   cSize     - DIPS_SIZE minus one
#   nFailures - half of cSize (integer division)
cSize=$(( ${DIPS_SIZE} - 1 ))
nFailures=$(( $cSize / 2 ))

vip_info() {
for y in $MYIPS
do
for (( i=0; i<${DIPS_SIZE}; i++ ))
Expand All @@ -96,7 +119,9 @@ for y in $MYIPS
break
fi
done
}

ka_vip_del() {
# This is to prevent a bug in keepalived
# that does not remove VRRP IP on it being down
ka=$(pidof keepalived)
Expand All @@ -115,13 +140,14 @@ if [[ $kps == 0 ]]; then
(exec $ecmd)&
fi
fi
}

galera_check()
{
for (( i=0; i<${DIPS_SIZE}; i++ ))
do
wval=$($MYSQL_BIN -u $MYSQL_USERNAME -p${MYSQL_PASSWORD} -h ${DIPS[i]} -e "$MYSQL_WSREP_STATE" | awk '{print $2}' | sed '1d')
cval=$($MYSQL_BIN -u $MYSQL_USERNAME -p${MYSQL_PASSWORD} -h ${DIPS[i]} -e "$MYSQL_CLUSTER_STATE" | awk '{print $2}' | sed '1d')
wval=$($MYSQL_BIN --connect_timeout 10 -u $MYSQL_USERNAME -p${MYSQL_PASSWORD} -h ${DIPS[i]} -e "$MYSQL_WSREP_STATE" | awk '{print $2}' | sed '1d')
cval=$($MYSQL_BIN --connect_timeout 10 -u $MYSQL_USERNAME -p${MYSQL_PASSWORD} -h ${DIPS[i]} -e "$MYSQL_CLUSTER_STATE" | awk '{print $2}' | sed '1d')
if [[ $wval == $SYNCED ]] & [[ $cval == $STATUS ]]; then
ret="y"
break
Expand All @@ -146,6 +172,7 @@ verify_cmon() {
fi
}

procs_check() {
# These checks will eventually be replaced when we have nodemgr plugged in
# for openstack services
# CHECK FOR NOVA SCHD
Expand Down Expand Up @@ -189,26 +216,34 @@ verify_cmon() {
(exec $RMQ_SRVR_RST)&
log_info_msg "RabbitMQ restarted becuase of the state $state"
fi
}

#Failure supported
cSize=$((${DIPS_SIZE} - 1))
nFailures=$(($cSize / 2))

bootstrap() {
# Check for the state of mysql and remove any
# stale gtid files
galerastate=$(galera_check)
if [ $galerastate == "y" ] && [ -f $GTID_FILE ]; then
if [ $galerastate == "y" ]; then
if [ -f $GTID_FILE ]; then
(exec rm -rf $GTID_FILE)&
log_info_msg "Removed GTID File"
log_info_msg "Removed GTID file"
fi
if [ -f $RECLUSTRUN ]; then
(exec rm -rf $RECLUSTRUN)&
log_info_msg "Removed recluster file"
fi
else
if [ ! -f $GTID_FILE ] && [ -z "$mypid" ]; then
recluster=true
fi
fi

if [ -f $GTID_FILE ]; then
for (( i=0; i<${DIPS_SIZE}; i++ ))
do
gtidfile=$(ssh -o StrictHostKeyChecking=no ${DIPS[i]} "ls $GTID_FILE | cut -d "/" -f 4")
if [[ $gtidfile != "" ]]; then
gtid[i]=$(ssh -o StrictHostKeyChecking=no ${DIPS[i]} "cat $GTID_FILE")
fi
gtidfile=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 ${DIPS[i]} "ls $GTID_FILE | cut -d "/" -f 4")
if [[ $gtidfile != "" ]]; then
gtid[i]=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 ${DIPS[i]} "cat $GTID_FILE")
fi
done

gtids=${#gtid[@]}
Expand All @@ -231,7 +266,9 @@ if [ -f $GTID_FILE ]; then
fi
fi
fi
}

chkNRun_cluster_mon() {
cmon_run=$(verify_cmon)
# Check for cmon and if its the VIP node let cmon run or start it
if [ $viponme -eq 1 ]; then
Expand All @@ -244,6 +281,11 @@ if [ $viponme -eq 1 ]; then
sleep $PERIODIC_RMQ_CHK_INTER
(exec $RMQ_MONITOR)&
fi
cerr=$(grep "$cmonerror" "$cmonlog" | wc -l)
if [[ $cerr != 0 ]]; then
(exec rm -rf "$cmonlog")&
(exec $RESTART_CMON)&
fi
else
if [ $cmon_run == "y" ]; then
(exec $STOP_CMON)&
Expand Down Expand Up @@ -274,8 +316,19 @@ else
log_info_msg "Restarted HAP becuase of stale dips"
fi
fi
# Remove the bookkeeping files named by cleanuppending, rstcnt and numrst
# when they exist. Each removal runs in a background subshell.
# `[` needs its closing `]` as a separate word: with the bracket glued to the
# variable the test fails with "missing ]" and the file is never removed.
if [ -f "$cleanuppending" ]; then
(exec rm -rf "$cleanuppending")&
fi
if [ -f "$rstcnt" ]; then
(exec rm -rf "$rstcnt")&
fi
if [ -f "$numrst" ]; then
(exec rm -rf "$numrst")&
fi
fi
}

cleanup() {
#Cleanup if there exists sockets in CLOSE_WAIT
clssoc=$(netstat -natp | grep 33306 | grep CLOSE_WAIT | wc -l)
if [[ $clssoc -ne 0 ]]; then
Expand All @@ -288,5 +341,85 @@ if [[ $clssoc -ne 0 ]]; then
xargs kill -9
log_info_msg "Cleaned connections to mysql that were in CLOSE_WAIT"
fi
}

#######################################
# Decide whether Galera must be re-clustered and whether RabbitMQ has to be
# stopped or restarted, based on peer reachability and on the error reported
# by the local mysqld.
# Globals read:    DIPS, DIPS_SIZE, cSize, galerastate, MYSQL_BIN,
#                  MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_WSREP_STATE,
#                  ERROR2002, ERROR1205, STDERR, RECLUSTRUN, RECLUSTER,
#                  RMQ_MONITOR_STOP, RMQ_SRVR_RST, RMQSTOP
# Globals written: noconn, status, mypid, err1, err2, epmd, recluster
# Arguments: none
#######################################
reCluster() {
  # RE-CLUSTER
  local i
  local isolated=false
  noconn=0
  status=()
  err1=
  err2=

  # Probe every database IP once. ping's exit status is used instead of its
  # summary text: a failure that prints no statistics line (or an extended
  # one) would otherwise not be counted as a lost peer.
  for (( i=0; i<${DIPS_SIZE:-0}; i++ ))
  do
    if ping -c 1 -w 1 -W 1 -n "${DIPS[i]}" >/dev/null 2>&1; then
      status[i]=0
    else
      status[i]=1
      noconn=$(( noconn + 1 ))
    fi
  done

  if [[ -n "$cSize" && "$noconn" -ge "$cSize" ]]; then
    isolated=true
  fi

  # mysqld is running but the cluster check did not pass: look at the error
  # the local server returns.
  mypid=$(pidof mysqld)
  if [[ -n "$mypid" && "$galerastate" == "n" ]]; then
    # Redirect stderr straight into the file; a process substitution would
    # run asynchronously and the file could be read before it is written.
    $MYSQL_BIN --connect_timeout 5 -u "$MYSQL_USERNAME" -p"${MYSQL_PASSWORD}" -e "$MYSQL_WSREP_STATE" 2> "$STDERR"
    err1=$(grep -- "$ERROR2002" "$STDERR" | awk '{print $2; exit}')
    err2=$(grep -- "$ERROR1205" "$STDERR" | awk '{print $2; exit}')
    if [[ "$err1" == 2002 || "$err2" == 1205 ]]; then
      recluster=true
    fi
    (exec rm -rf "$STDERR")&
  fi

  # The marker file keeps a recluster from being launched more than once.
  if [[ ! -f "$RECLUSTRUN" ]]; then
    if [[ "$isolated" == true || "$recluster" == true ]]; then
      log_info_msg "Connectivity lost with $noconn peers"
      log_info_msg "Mysql Galera Error $err1 $err2 requires reclustering"
      log_info_msg "Reclustering MySql Galera"
      (exec $RECLUSTER)&
      touch "$RECLUSTRUN"
    fi
  fi

  epmd=$(pidof epmd)
  if [[ "$isolated" == true && -n "$epmd" ]]; then
    log_info_msg "Stop RMQ"
    (exec $RMQ_MONITOR_STOP)&
    touch "$RMQSTOP"
  else
    # NOTE(review): this branch is also taken while peers are still
    # unreachable if epmd is not running - confirm that is intended.
    if [[ -f "$RMQSTOP" ]]; then
      (exec $RMQ_SRVR_RST)&
      (exec rm -rf "$RMQSTOP")&
    fi
  fi
}
#######################################
# One-time step for the node whose id file (MYID) contains 1: once the Galera
# check has reported "y", run MYSQL_SVC_STOP over ssh on every database IP
# other than MYIP, then create the cmondisco marker so it is not repeated.
# Globals read:    MYID, galerastate, cmondisco, DIPS, DIPS_SIZE, MYIP,
#                  MYSQL_SVC_STOP
# Globals written: myid
# Arguments: none
#######################################
cmonFailDomainDiscovery() {
  local i peer
  if [[ -f "$MYID" ]]; then
    myid=$(cat "$MYID")
  fi
  # Quoted comparisons: an empty galerastate simply fails the check instead
  # of raising a `[` syntax error.
  [[ "$myid" == 1 && "$galerastate" == "y" ]] || return 0
  [[ ! -f "$cmondisco" ]] || return 0

  for (( i=0; i<${DIPS_SIZE:-0}; i++ ))
  do
    peer=${DIPS[i]}
    # An empty MYIP or peer used to make the unquoted `[` test error out and
    # skip the ssh; keep that outcome explicitly so mysql is never stopped
    # when this node cannot tell itself apart from its peers.
    if [[ -z "$MYIP" || -z "$peer" || "$MYIP" == "$peer" ]]; then
      continue
    fi
    (ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 "$peer" "$MYSQL_SVC_STOP")
  done
  touch "$cmondisco"
}
# Run one monitoring pass: call every stage in a fixed order, ignoring each
# stage's exit status, then end the script with status 0.
main()
{
  local _main_step
  for _main_step in \
    vip_info \
    bootstrap \
    chkNRun_cluster_mon \
    ka_vip_del \
    procs_check \
    cleanup \
    reCluster \
    cmonFailDomainDiscovery
  do
    "$_main_step"
  done
  exit 0
}
# Script entry point: run the monitoring pass defined by main.
main
# main ends with its own `exit 0`; this line only takes effect if main returns.
exit 0

0 comments on commit ad0d54c

Please sign in to comment.