Skip to content

Commit

Permalink
Packaging changes to facilitate monitoring of cassandra:
Browse files Browse the repository at this point in the history
1. Addition of a wrapper init.d script - contrail-database that controls
   when cassandra can be started based on the time since last started
2. Move cassandra out of supervisord
Partial-Bug: #1484297

Conflicts:
	common/control_files/supervisord_database.conf

Change-Id: Ie3b550fb0e3d9ba966aeaac9727bfd01581443f7
(cherry picked from commit bbaf90f)
  • Loading branch information
Megh Bhatt committed Sep 22, 2015
1 parent f148314 commit e46317b
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 11 deletions.
159 changes: 155 additions & 4 deletions common/control_files/contrail-database.initd
@@ -1,6 +1,157 @@
#!/usr/bin/env bash
#! /bin/sh
### BEGIN INIT INFO
# Provides: contrail-database wrapper over cassandra
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: contrail database wrapper over cassandra
# Description: Determines the time since last cassandra shutdown:
# 1. If greater than gc_grace_seconds, then does not start
# cassandra
# 2. If less than gc_grace_seconds and greater than hinted
# handoff time, then starts cassandra and starts
# nodetool repair
# 3. If less than hinted handoff time, then starts
# cassandra
### END INIT INFO

# chkconfig: 2345 99 01
# description: Juniper Network Virtualization Collector
# Author: Megh Bhatt <meghb@juniper.net>

supervisorctl -s unix:///tmp/supervisord_database.sock ${1} `basename ${0}`
# Service wrapped by this script; the pidfile path and the service command
# below are both derived from it.
NAME="cassandra"
PIDFILE="/var/run/${NAME}.pid"
SERVICE="service ${NAME}"

# File whose modification time is read as the last moment cassandra was up.
CLUSTER_STATUS_UP_FILE="/var/log/cassandra/status-up"

# Time windows in seconds (864000 = 10 days, 10800 = 3 hours) from which the
# "refuse to start" and "repair needed" thresholds are derived.
DEFAULT_GC_GRACE_SECONDS="864000"
DEFAULT_HINTED_HANDOFF_SECONDS="10800"

# External tools used when cassandra is started inside the repair window.
REPAIR="/usr/bin/contrail-cassandra-repair"
NODETOOL="/usr/bin/nodetool"

#
# Function that determines secs since last stop of cassandra
#
seconds_since_last_stop()
{
    # Sets SECONDS_SINCE_LAST_STOP to the age, in seconds, of
    # $CLUSTER_STATUS_UP_FILE (current time minus its modification time).
    # The result is 0 when the file does not exist or its modification time
    # cannot be read. Always returns 0.
    local time_last_up now

    SECONDS_SINCE_LAST_STOP=0
    [ -f "$CLUSTER_STATUS_UP_FILE" ] || return 0

    now=$(date +%s)
    # Declaration and assignment are kept separate so that a failing stat is
    # detected instead of being masked by the exit status of "local".
    if ! time_last_up=$(stat -c %Y -- "$CLUSTER_STATUS_UP_FILE" 2>/dev/null) ||
        [ -z "$time_last_up" ]; then
        echo "Unable to read modification time of $CLUSTER_STATUS_UP_FILE" >&2
        return 0
    fi

    SECONDS_SINCE_LAST_STOP=$((now - time_last_up))
    return 0
}

#
# Function that determines the maximum allowed down seconds. By default
# this is 90% of DEFAULT_GC_GRACE_SECONDS
#
max_allowed_down_seconds()
{
    # Publish nine tenths of DEFAULT_GC_GRACE_SECONDS (integer arithmetic,
    # evaluated left to right) in MAX_ALLOWED_DOWN_SECONDS.
    MAX_ALLOWED_DOWN_SECONDS=$(( 9 * DEFAULT_GC_GRACE_SECONDS / 10 ))
}

#
# Function that determines the maximum down time allowed without
# the need to run repair in secs. By default this is 90% of
# DEFAULT_HINTED_HANDOFF_SECONDS
#
max_down_seconds_before_repair()
{
    # Publish nine tenths of DEFAULT_HINTED_HANDOFF_SECONDS (integer
    # arithmetic, evaluated left to right) in MAX_DOWN_SECONDS_BEFORE_REPAIR.
    MAX_DOWN_SECONDS_BEFORE_REPAIR=$(( 9 * DEFAULT_HINTED_HANDOFF_SECONDS / 10 ))
}

#
# Function that returns 0 if process is running, or nonzero if not.
#
# The nonzero value is 3 if the process is simply not running, and 1 if the
# process is not running but the pidfile exists (to match the exit codes for
# the "status" command; see LSB core spec 3.1, section 20.2)
#
# Extended regex that the command line of the process named in $PIDFILE must
# match for that process to be treated as the cassandra daemon.
CMD_PATT="-user.cassandra.+CassandraDaemon"
is_running()
{
    # Returns:
    #   0 if $PIDFILE exists and the command line of the pid stored in it
    #     matches $CMD_PATT
    #   1 if $PIDFILE exists but no such matching process is found
    #   3 if $PIDFILE does not exist
    local pid

    [ -f "$PIDFILE" ] || return 3

    pid=$(cat "$PIDFILE" 2>/dev/null)
    if [ -z "$pid" ] || [ ! -r "/proc/$pid/cmdline" ]; then
        return 1
    fi

    # /proc/<pid>/cmdline separates arguments with NUL bytes; turn them into
    # spaces so the "." in the pattern reliably matches the separators.
    # The pattern starts with "-", so it is passed with -e to keep grep from
    # parsing it as a string of options.
    if tr '\0' ' ' < "/proc/$pid/cmdline" 2>/dev/null |
        grep -Eq -e "$CMD_PATT"; then
        return 0
    fi
    return 1
}

#
# Function that starts the daemon/service
#
do_start()
{
    # Starts cassandra through "$SERVICE start", gated on how long it has
    # been down (SECONDS_SINCE_LAST_STOP):
    #   - below MAX_DOWN_SECONDS_BEFORE_REPAIR: plain start
    #   - from there up to MAX_ALLOWED_DOWN_SECONDS: start, wait for the node
    #     to initialize and join the cluster, then launch $REPAIR in the
    #     background
    #   - otherwise: refuse to start
    #
    # Return
    #   0 if daemon has been started (repair may have been deferred, in
    #     which case a message tells the operator to run it later)
    #   1 if daemon was already running
    #   2 if daemon was not started because it has been down too long
    #   otherwise the non-zero status of "$SERVICE start"
    local stat istatus icounter nodeid ncounter joined

    is_running && return 1

    # These helpers publish MAX_ALLOWED_DOWN_SECONDS,
    # MAX_DOWN_SECONDS_BEFORE_REPAIR and SECONDS_SINCE_LAST_STOP.
    max_allowed_down_seconds
    max_down_seconds_before_repair
    seconds_since_last_stop

    if [ "$SECONDS_SINCE_LAST_STOP" -lt "$MAX_DOWN_SECONDS_BEFORE_REPAIR" ]; then
        # Now call the cassandra init script
        $SERVICE start
        stat=$?
        return "$stat"
    elif [ "$SECONDS_SINCE_LAST_STOP" -ge "$MAX_DOWN_SECONDS_BEFORE_REPAIR" ] &&
        [ "$SECONDS_SINCE_LAST_STOP" -lt "$MAX_ALLOWED_DOWN_SECONDS" ]; then
        # Now call the cassandra init script
        $SERVICE start
        stat=$?
        if [ "$stat" -ne 0 ]; then
            # Report the failure instead of falling through with status 0.
            return "$stat"
        fi

        # Wait for cassandra to startup and join the cluster
        echo "Waiting for cassandra initialization to complete..."
        $NODETOOL info > /dev/null 2>&1
        istatus=$?
        icounter=0
        while [ "$istatus" -ne 0 ] && [ "$icounter" -lt 10 ]; do
            sleep 1
            $NODETOOL info > /dev/null 2>&1
            istatus=$?
            icounter=$((icounter + 1))
        done
        if [ "$istatus" -ne 0 ]; then
            echo "Cassandra initialization not yet complete, please run cassandra repair after initialization is done"
            return 0
        fi

        # Extract node ID to check cluster joining status
        echo "Waiting for cassandra to join cluster..."
        nodeid=$($NODETOOL info | awk '/ID/ {print $3}')
        joined=no
        ncounter=0
        while :; do
            if _do_start_node_is_up "$nodeid"; then
                joined=yes
                break
            fi
            [ "$ncounter" -lt 10 ] || break
            sleep 1
            ncounter=$((ncounter + 1))
        done
        # Decide on the observed status rather than on the retry counter, so
        # a node that comes up on the last retry still gets repaired.
        if [ "$joined" != yes ]; then
            echo "Cassandra not yet joined cluster, please run cassandra repair after cluster formation"
            return 0
        fi

        # Start nodetool repair
        echo "Starting cassandra repair, check /var/log/cassandra/repair-*.log for progress"
        # Portable redirection: "&>" is not understood by every /bin/sh.
        $REPAIR --log-file /var/log/cassandra/repair.log > /dev/null 2>&1 &
        return 0
    else
        echo "Cassandra has been down for at least $MAX_ALLOWED_DOWN_SECONDS seconds, not starting"
        return 2
    fi
}

# Helper for do_start: returns 0 when the "$NODETOOL status" line containing
# node ID $1 has a first column that includes "U", 1 otherwise (also when $1
# is empty).
_do_start_node_is_up()
{
    local nstatus

    [ -n "$1" ] || return 1
    nstatus=$($NODETOOL status | grep -F -e "$1" | awk '{print $1}')
    case "$nstatus" in
        *U*) return 0 ;;
    esac
    return 1
}

# Dispatch on the requested action. Only "start" is handled by this wrapper;
# every other action is handed unchanged to the underlying service script.
case "$1" in
    start)
        do_start
        stat=$?
        # do_start returns 0 when cassandra was started and 1 when it was
        # already running; both are a successful "start". Any other value
        # (e.g. 2, refused because it was down too long) is propagated so
        # that the caller can see the start did not happen.
        case "$stat" in
            0|1)
                exit 0
                ;;
            *)
                exit "$stat"
                ;;
        esac
        ;;
    *)
        $SERVICE "$1"
        stat=$?
        exit "$stat"
        ;;
esac

# vi:ai sw=4 ts=4 tw=0 et
11 changes: 5 additions & 6 deletions common/control_files/supervisord_database.conf
Expand Up @@ -21,7 +21,6 @@ logfile_backups=5 ; (num of main logfile rotation backups;default 10)
loglevel=info ; (log level;default info; others: debug,warn,trace)
pidfile=/var/run/supervisord_contrail_database.pid ; (supervisord pidfile;default supervisord.pid)
nodaemon=false ; (start in foreground if true;default false)
minfds=100000 ; (min. avail startup file descriptors;default 1024)
minprocs=200 ; (min. avail process descriptors;default 200)
;umask=022 ; (process file creation umask;default 022)
;user=chrism ; (default is current user, required if root)
Expand Down Expand Up @@ -56,22 +55,22 @@ autostart=true ; start at supervisord start (default: true)
killasgroup=false ; SIGKILL the UNIX process group (def false)
environment=LOG_DIR=/var/log/kafka

[program:contrail-database]
command=cassandra -f
;[program:theprogramname]
;command=/bin/cat
;process_name=%(program_name)s ; process_name expr (default %(program_name)s)
;numprocs=1 ; number of processes copies to start (def 1)
;directory=/tmp ; directory to cwd to before exec (def no cwd)
;umask=022 ; umask for process (default None)
;priority=300 ; the relative start priority (default 999)
autostart=true ; start at supervisord start (default: true)
;autostart=true ; start at supervisord start (default: true)
;autorestart=unexpected ; whether/when to restart (default: unexpected)
;startsecs=1 ; number of secs prog must stay running (def. 1)
;startretries=3 ; max # of serial start failures (default 3)
;exitcodes=0,2 ; 'expected' exit codes for process (default 0,2)
stopsignal=KILL ; signal used to kill process (default TERM)
;stopsignal=KILL ; signal used to kill process (default TERM)
;stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10)
;stopasgroup=false ; send stop signal to the UNIX process group (default false)
killasgroup=false ; SIGKILL the UNIX process group (def false)
;killasgroup=false ; SIGKILL the UNIX process group (def false)
;user=chrism ; setuid to this UNIX account to run the program
;redirect_stderr=true ; redirect proc stderr to stdout (default false)
;stdout_logfile=/a/path ; stdout log path, NONE for none; default AUTO
Expand Down
Expand Up @@ -4,6 +4,7 @@ if [ "$1" = "configure" -a -z "$2" ]; then
echo "Running Postinst for contrail-openstack-database install.."
sudo service cassandra stop
sudo rm -rf /var/lib/cassandra
sudo rm -rf /etc/init.d/cassandra
sudo update-rc.d cassandra disable
sudo update-rc.d contrail-database defaults
fi
echo "Postinst for contrail-openstack-database done"
2 changes: 2 additions & 0 deletions common/rpm/contrail-database.spec
Expand Up @@ -134,6 +134,8 @@ done
popd

%post
chkconfig cassandra off
chkconfig contrail-database on
# this is upgrade from 1.02 release to newer i.e cassandra 1.1.7 to 1.2.11
if [ -f /usr/share/cassandra/conf/cassandra.yaml.rpmsave ]; then
CASSANDRA_CONF_OLD=/usr/share/cassandra/conf/cassandra.yaml.rpmsave
Expand Down

0 comments on commit e46317b

Please sign in to comment.