Skip to content

Commit

Permalink
Close-Bug: #1449230. Fix provides galera node recovery on node / rack…
Browse files Browse the repository at this point in the history
… / power / network failures and data consistency after the failures

Change-Id: Ia4adc59ee8e1194d04d0b3c6b1919a84aeaaab18
  • Loading branch information
sanju-a committed Apr 29, 2015
1 parent 2f6d28e commit 311b5c4
Show file tree
Hide file tree
Showing 2 changed files with 185 additions and 177 deletions.
188 changes: 72 additions & 116 deletions contrail_provisioning/openstack/scripts/contrail-bootstrap-galera.sh
Expand Up @@ -8,6 +8,8 @@ readonly PROGNAME=$(basename "$0")
# Directory and file-descriptor number handed to the single-instance lock.
readonly LOCKFILE_DIR="/tmp/galera-chk" LOCK_FD="200"

# First command-line argument; main compares it against $DONOR.
readonly boot="$1"

# Log file that the log_*_msg helpers append to.
LOGFILE="/var/log/galera-bootstrap.log"
RETRIES="3"
MYSQL_USERNAME="cmon"
Expand All @@ -17,13 +19,12 @@ MYIDFILE="/etc/contrail/galeraid"
# Files in the mysql data directory that the recovery code checks and removes.
SST_FILE='/var/lib/mysql/rsync_sst_complete'
GRA_FILE='/var/lib/mysql/grastate.dat'

# bootstrap writes the recovered GTID to GTID_FILE.
GTID_DIR='/tmp/galera'
GTID_FILE="${GTID_DIR}/gtid"

# Saved wsrep / cluster state, read by verify_wsrepstate and verify_clusterstatus.
galera_state_file='/tmp/galera-chk/wsrep.state'
cluster_state_file='/tmp/galera-chk/cluster.state'

# Retry budget: the polling loops run RETRY_TIMEOUT / RETRY_INTERVAL times.
RETRY_TIMEOUT='60'
RETRY_INTERVAL='5'

# Status queries and the values they are compared against.
MYSQL_WSREP_STATE="show status like 'wsrep_local_state';"
MYSQL_CLUSTER_STATE="show status like 'wsrep_cluster_status';"
SYNCED='4'
STATUS='Primary'

# Service command lines and the keyword matched against the first argument.
MYSQL_STOP='service mysql stop'
CMON_MON_STOP='service contrail-hamon stop'
DONOR='DONOR'
Expand All @@ -35,12 +36,8 @@ if [ ! -f "$LOCKFILE_DIR" ] ; then
mkdir -p $LOCKFILE_DIR
fi

# Create an empty wsrep state file unless one is already present.
[ -f "$galera_state_file" ] || touch "$galera_state_file"

if [ ! -f "$cluster_state_file" ] ; then
touch "$cluster_state_file"
# Make sure the directory that holds $GTID_FILE exists.
# -d is the directory test; the previous -f (regular file) test is never true
# for a directory, so the guard did not describe what it was guarding.
if [ ! -d "$GTID_DIR" ] ; then
    mkdir -p -- "$GTID_DIR"
fi

lock() {
Expand All @@ -64,7 +61,7 @@ eexit() {
}

# Print the current date and time on ONE line.
# The log helpers splice "$(timestamp)" into the front of a single log line,
# so emitting both a time-only stamp and a full date would split every entry
# across two lines. Only the full `date` output is kept.
timestamp() {
    date
}

log_error_msg() {
Expand All @@ -82,73 +79,26 @@ log_info_msg() {
echo "$(timestamp): INFO: $msg" >> $LOGFILE
}

# Poll for mysqld and report its PID once it has stayed up for the whole
# retry window.
# Globals:  RETRY_TIMEOUT, RETRY_INTERVAL (read) - window length and poll period
# Outputs:  the PID(s) printed by `pidof mysqld` after the last poll, or an
#           empty line as soon as a poll finds no mysqld
# Returns:  0
verify_mysql() {
    local attempt pid retry_count
    retry_count=$(( RETRY_TIMEOUT / RETRY_INTERVAL ))
    # Arithmetic loop instead of `eval echo {1..$n}`: no eval, and a count of 0
    # means zero polls (brace expansion {1..0} counts down and yields "1 0").
    for (( attempt = 1; attempt <= retry_count; attempt++ )); do
        sleep "$RETRY_INTERVAL"
        pid=$(pidof mysqld)
        if [ -z "$pid" ]; then
            # mysqld disappeared: report "no PID" immediately.
            echo "$pid"
            return
        fi
        log_info_msg "Checking for consistent mysql PID: $pid."
    done
    echo "$pid"
}


# Query the wsrep state of every host in DIPS and of the local server, saving
# the reported values to the state files and calling checkNKill when a query
# reports an error.
# Globals read: DIPS, DIPS_SIZE, MYSQL_BIN, MYSQL_USERNAME, MYSQL_PASSWORD,
#               MYSQL_WSREP_STATE, MYSQL_CLUSTER_STATE, STDERR_FILE,
#               galera_state_file, cluster_state_file
# Globals written: i, error
# NOTE(review): DIPS, DIPS_SIZE, MYSQL_BIN, MYSQL_PASSWORD and STDERR_FILE are
# not defined in the visible part of the file - confirm where they are set.
galera_check()
{
for (( i=0; i<${DIPS_SIZE}; i++ ))
do
# Probe the peer; stderr is routed through a process substitution that targets
# $STDERR_FILE.
# NOTE(review): `cat <()` hands cat an empty process substitution as its file
# operand, so it is unclear that mysql's stderr ever reaches $STDERR_FILE -
# verify before relying on the ERROR check below.
$MYSQL_BIN -u $MYSQL_USERNAME -p${MYSQL_PASSWORD} -h ${DIPS[i]} -e "$MYSQL_WSREP_STATE" 2> >( cat <() > $STDERR_FILE )
# First whitespace-separated field of the captured stderr.
error=`cat ${STDERR_FILE} | awk '{print $1}'`
if [[ $error == "ERROR" ]]; then
checkNKill
fi
# Keep column 2 of the result and drop the header row. Each iteration
# overwrites the state files, so after the loop they hold the last host's
# values. The `>` / `2>` redirections here attach to the final `sed`, not to
# the mysql client.
$MYSQL_BIN -u $MYSQL_USERNAME -p${MYSQL_PASSWORD} -h ${DIPS[i]} -e "$MYSQL_WSREP_STATE" | awk '{print $2}' | sed '1d' > "$galera_state_file" 2> >( cat <() > $STDERR_FILE )
$MYSQL_BIN -u $MYSQL_USERNAME -p${MYSQL_PASSWORD} -h ${DIPS[i]} -e "$MYSQL_CLUSTER_STATE" | awk '{print $2}' | sed '1d' > "$cluster_state_file" 2> >( cat <() > $STDERR_FILE )
done
# Same probe without -h, i.e. against the client's default host.
$MYSQL_BIN -u $MYSQL_USERNAME -p${MYSQL_PASSWORD} -e "$MYSQL_WSREP_STATE" 2> >( cat <() > $STDERR_FILE )
error=`cat ${STDERR_FILE} | awk '{print $1}'`
if [[ $error == "ERROR" ]]; then
checkNKill
fi
# Remove the stderr capture file in a background subshell.
(exec rm -rf $STDERR_FILE)&
}
# Compare the wsrep state stored in $galera_state_file with $SYNCED.
# Globals:  galera_state_file, SYNCED (read); wsrepstate (written)
# Outputs:  "y" when the stored value matches $SYNCED, "n" otherwise
# Returns:  1 on a match, 0 otherwise (callers in this file use the printed
#           letter, not the status)
verify_wsrepstate()
{
    # Expansions are left unquoted so word-splitting and pattern matching
    # behave exactly as before.
    wsrepstate=$(cat $galera_state_file)
    if [[ $wsrepstate != $SYNCED ]]; then
        echo "n"
        return 0
    fi
    echo "y"
    return 1
}
# Compare the cluster status stored in $cluster_state_file with $STATUS.
# Globals:  cluster_state_file, STATUS (read); clusterstatus (written)
# Outputs:  "y" when the stored value matches $STATUS, "n" otherwise
# Returns:  1 on a match, 0 otherwise (callers in this file use the printed
#           letter, not the status)
verify_clusterstatus()
{
    # Expansions are left unquoted so word-splitting and pattern matching
    # behave exactly as before.
    clusterstatus=$(cat $cluster_state_file)
    if [[ $clusterstatus != $STATUS ]]; then
        echo "n"
        return 0
    fi
    echo "y"
    return 1
}
# Remove stale SST / grastate files when the saved node state is unhealthy,
# then probe every peer in DIPS and report whether any of them is healthy.
# Globals:  SST_FILE, GRA_FILE, DIPS, DIPS_SIZE, MYSQL_BIN, MYSQL_USERNAME,
#           MYSQL_PASSWORD, MYSQL_WSREP_STATE, MYSQL_CLUSTER_STATE, SYNCED,
#           STATUS (read); wsrepstate_run, clusterstatus_run, ret (written)
# Outputs:  "y" when at least one peer reports wsrep_local_state == $SYNCED AND
#           wsrep_cluster_status == $STATUS; "n" otherwise (including when
#           there are no peers to probe)
# NOTE(review): the peer-probe loop looks like it belongs in galera_check
# (bootstrap reads `galchk=$(galera_check)` and tests it for "y"/"n") - confirm
# against the repository version of this script.
cleanup_state()
{
    local i wval cval

    wsrepstate_run=$(verify_wsrepstate)
    clusterstatus_run=$(verify_clusterstatus)
    # The second test previously read $clusterstate_run, a name that is never
    # assigned, so a bad cluster status could not trigger the cleanup.
    if [[ $wsrepstate_run == "n" ]] || [[ $clusterstatus_run == "n" ]]; then
        # Deleted in the foreground so the files are gone before the caller
        # goes on to start mysql.
        if [ -f "$SST_FILE" ]; then
            rm -f -- "$SST_FILE"
        fi
        if [ -f "$GRA_FILE" ]; then
            rm -f -- "$GRA_FILE"
        fi
    fi

    # Default answer, also used when DIPS is empty.
    ret="n"
    for (( i = 0; i < ${DIPS_SIZE:-0}; i++ ))
    do
        # Column 2 of the result with the header row dropped.
        # $MYSQL_BIN stays unquoted in case it carries client options.
        wval=$($MYSQL_BIN -u "$MYSQL_USERNAME" -p"${MYSQL_PASSWORD}" -h "${DIPS[i]}" -e "$MYSQL_WSREP_STATE" | awk '{print $2}' | sed '1d')
        cval=$($MYSQL_BIN -u "$MYSQL_USERNAME" -p"${MYSQL_PASSWORD}" -h "${DIPS[i]}" -e "$MYSQL_CLUSTER_STATE" | awk '{print $2}' | sed '1d')
        # `&&`, not `&`: a single ampersand runs the first test in the
        # background and lets the second test alone decide the branch.
        if [[ $wval == $SYNCED ]] && [[ $cval == $STATUS ]]; then
            ret="y"
            break
        fi
    done
    echo $ret
}
checkNKill()
Expand All @@ -172,72 +122,78 @@ if [ -n "$mysqlpid" ]; then
fi
}
# Stop mysqld gracefully and force-kill it only if it survives the stop.
# Globals:  MYSQL_STOP (read) - command line used for the graceful stop;
#           mysqlpid (written)
# Outputs:  one INFO log line (via log_info_msg) when a forced kill is needed
checkNKillMysql()
{
    mysqlpid=$(pidof mysqld)
    if [ -z "$mysqlpid" ]; then
        # Nothing is running; nothing to stop.
        return 0
    fi

    $MYSQL_STOP
    sleep 10

    # Look the process up again: the PID captured before the stop may have
    # exited (or been reused by an unrelated process) during the 10 s wait,
    # and SIGKILL must only go to a mysqld that is still alive.
    mysqlpid=$(pidof mysqld)
    if [ -n "$mysqlpid" ]; then
        log_info_msg "Mysql is Running. Kill the process: $mysqlpid"
        # Unquoted on purpose: pidof prints several PIDs separated by spaces.
        # Run in the foreground so the kill is finished before the caller
        # starts mysql again.
        kill -9 $mysqlpid
    fi
}
# Start the local mysqld as part of the galera cluster, retrying for up to
# RETRY_TIMEOUT / RETRY_INTERVAL iterations.
# Globals read: MYIDFILE, SST_FILE, GRA_FILE, GTID_FILE, LOGFILE,
#               RETRY_TIMEOUT, RETRY_INTERVAL, galera_state_file,
#               cluster_state_file
# Exits the whole script with status 0 when $MYIDFILE does not exist.
# NOTE(review): the if/fi pairs inside the retry loop do not balance as the
# text stands (the `if` on the empty-PID test and the `if` on the myid == 1
# test are never closed before `done`), and mysql_pid is assigned twice in a
# row. This looks like two revisions of the loop merged together - reconcile
# against the repository version before changing any logic here.
bootstrap()
{
# Get the myid of the galera node
if [ -e $MYIDFILE ]; then
myid=$(cat $MYIDFILE)
log_info_msg "Galera node ID: $myid"
else
# No node id: drop the saved state files (in background subshells) and stop.
(exec rm -rf "$galera_state_file")&
(exec rm -rf "$cluster_state_file")&
log_error_msg "Galera node ID not set in $MYIDFILE exiting bootstrap..."
exit 0
fi
# "y"/"n" answers printed by the verify_* helpers.
wsrepstate_run=$(verify_wsrepstate)
clusterstatus_run=$(verify_clusterstatus)
# Bootstrap galera cluster
checkNKill
log_info_msg "Bootstraping galera cluster."
retry_flag=0
# Remove the SST marker and grastate file, each in a background subshell.
if [ -f $SST_FILE ]; then
(exec rm -rf "$SST_FILE")&
fi
if [ -f $GRA_FILE ]; then
(exec rm -rf "$GRA_FILE")&
fi
bootstrap_retry_count=$(($RETRY_TIMEOUT / RETRY_INTERVAL))
for i in $(eval echo {1..$bootstrap_retry_count}); do
# The second assignment overrides the first; only pidof's answer is used.
mysql_pid=$(verify_mysql)
mysql_pid=$(pidof mysqld)
if [ "$mysql_pid" == '' ]; then
log_warn_msg "Mysql stopped, trying to start...."
# `||` and `&&` have equal precedence in a shell list and associate left to
# right, so this reads as (wsrep is "n" OR cluster is "n") AND myid is 1.
if [[ $wsrepstate_run == "n" ]] || [[ $clusterstatus_run == "n" ]] && [[ $myid == 1 ]]; then
checkNKill
cleanup_state
# An empty gcomm:// address starts mysql as a new cluster.
cmd="service mysql start --wsrep_cluster_address=gcomm://"
log_info_msg "Starting mysql : $cmd"
$cmd >> $LOGFILE
else
if [ $retry_flag == 1 ]; then
cleanup_state
log_warn_msg "Mysql stopped on local, trying to start...."
# galera_check's stdout is expected to be "y" or "n" here.
galchk=$(galera_check)
if [[ $galchk == "n" ]]; then
cmd="service mysql start --wsrep_recover"
log_info_msg "Starting mysql recovery: $cmd"
setsid $cmd >> $LOGFILE
# Extract the recovered sequence number from the mysql error log and save it
# to $GTID_FILE; when grastate.dat exists the match is narrowed to its uuid.
if [ -f $GRA_FILE ]; then
uuid=$(cat $GRA_FILE | grep uuid | awk '{print $2}')
gtid=$(grep "Recovered position: $uuid" /var/log/mysql/error.log | awk '{print $7}' | cut -d ":" -f 2 | tail -1)
echo $gtid > $GTID_FILE
else
log_info_msg "$GRA_FILE not found. Recover mysql without grastate"
gtid=$(grep "Recovered position: " /var/log/mysql/error.log | awk '{print $7}' | cut -d ":" -f 2 | tail -1)
echo $gtid > $GTID_FILE
fi
cmd="service mysql start"
log_info_msg "Starting mysql : $cmd"
$cmd >> $LOGFILE
fi
retry_flag=1
sleep 5
if [[ $galchk == "y" ]]; then
log_info_msg "One of the galera cluster node is up. Cluster monitor will initialize the galera cluster."
fi
else
log_info_msg "Galera cluster is up and running."
log_info_msg "Galera bootstrap completed."
fi
done
}
# Entry point. Refuses to run when `lock` fails, then either starts this node
# as the cluster DONOR (first argument equals $DONOR) or runs bootstrap, and
# finally removes leftover state files.
# Globals read: PROGNAME, boot, DONOR, LOGFILE, VIP, CMON_STOP,
#               CMON_MON_START, SST_FILE, GRA_FILE, galera_state_file,
#               cluster_state_file
# NOTE(review): VIP, CMON_STOP and CMON_MON_START are not defined in the
# visible part of the file - confirm where they are set.
# NOTE(review): after the DONOR/else branch the body goes on to call
# galera_check, bootstrap and $CMON_MON_START again, so the non-DONOR path
# runs bootstrap twice. This looks like two revisions of main merged together -
# reconcile against the repository version.
main()
{
lock $PROGNAME \
|| eexit "Only one instance of $PROGNAME can run at one time."
if [[ $boot == $DONOR ]]; then
# DONOR path: make sure no mysqld is left, then start mysql with an empty
# gcomm:// address, detached from this session via setsid.
checkNKillMysql
log_info_msg "bootstrapping this instance of mysql as DONOR based on the GTID"
cmd="service mysql start --wsrep_cluster_address=gcomm://"
setsid $cmd >> $LOGFILE
# Run $CMON_STOP on $VIP over ssh, in the background.
(ssh -o StrictHostKeyChecking=no $VIP "$CMON_STOP")&
else
bootstrap
$CMON_MON_START
fi
# Remove the SST marker in a background subshell.
if [ -f $SST_FILE ]; then
(exec rm -rf "$SST_FILE")&
fi
galera_check
bootstrap
$CMON_MON_START
# Remove the saved state files and grastate.dat, each in a background subshell.
(exec rm -rf "$galera_state_file")&
(exec rm -rf "$cluster_state_file")&
if [ -f $GRA_FILE ]; then
(exec rm -rf "$GRA_FILE")&
fi
}
main

0 comments on commit 311b5c4

Please sign in to comment.