Packaging changes to facilitate monitoring of cassandra:

1. Addition of a wrapper init.d script, contrail-database, that controls whether cassandra can be started, based on the time since it was last started.
2. Move cassandra out of supervisord.

Partial-Bug: #1484297

Conflicts: common/control_files/supervisord_database.conf

Change-Id: Ie3b550fb0e3d9ba966aeaac9727bfd01581443f7
(cherry picked from commit bbaf90f)
Juniper · Sep 22, 2015 · e46317b · e46317b
1 parent f148314
commit e46317b
Show file tree

Hide file tree

Showing 4 changed files with 164 additions and 11 deletions.
diff --git a/common/control_files/contrail-database.initd b/common/control_files/contrail-database.initd
@@ -1,6 +1,157 @@
-#!/usr/bin/env bash
+#! /bin/sh
+### BEGIN INIT INFO
+# Provides:          contrail-database wrapper over cassandra
+# Default-Start:     2 3 4 5
+# Default-Stop:      0 1 6
+# Short-Description: contrail database wrapper over cassandra
+# Description:       Determines the time since last cassandra shutdown:
+#                    1. If greater than gc_grace_seconds, then does not start
+#                       cassandra
+#                    2. If less than gc_grace_seconds and greater than hinted
+#                       handoff time, then starts cassandra and starts
+#                       nodetool repair
+#                    3. If less than hinted handoff time, then starts
+#                       cassandra
+### END INIT INFO
 
-# chkconfig: 2345 99 01
-# description: Juniper Network Virtualization Collector
+# Author: Megh Bhatt <meghb@juniper.net>
 
-supervisorctl -s unix:///tmp/supervisord_database.sock ${1} `basename ${0}`
+NAME=cassandra
+PIDFILE=/var/run/$NAME.pid
+SERVICE="service $NAME"
+CLUSTER_STATUS_UP_FILE=/var/log/cassandra/status-up
+DEFAULT_GC_GRACE_SECONDS=864000
+DEFAULT_HINTED_HANDOFF_SECONDS=10800
+REPAIR=/usr/bin/contrail-cassandra-repair
+NODETOOL=/usr/bin/nodetool
+
+#
+# Function that determines secs since last stop of cassandra
+#
+seconds_since_last_stop()
+{
+    if [ -f $CLUSTER_STATUS_UP_FILE ]; then
+        local time_last_up=`stat -c %Y $CLUSTER_STATUS_UP_FILE`
+        local now=`date +%s`
+        SECONDS_SINCE_LAST_STOP=$((now - time_last_up))
+    else
+        SECONDS_SINCE_LAST_STOP=0
+    fi
+}
+
+#
+# Function that determines the maximum allowed down seconds. By default
+# this is 90% of DEFAULT_GC_GRACE_SECONDS
+#
+max_allowed_down_seconds()
+{
+    MAX_ALLOWED_DOWN_SECONDS=$((DEFAULT_GC_GRACE_SECONDS * 9 / 10))
+}
+
+#
+# Function that determines the maximum down time allowed without
+# the need to run repair in secs, By default this is %90 of
+# DEFAULT_HINTED_HANDOFF_SECONDS
+#
+max_down_seconds_before_repair()
+{
+    MAX_DOWN_SECONDS_BEFORE_REPAIR=$((DEFAULT_HINTED_HANDOFF_SECONDS * 9 / 10))
+}
+
+#
+# Function that returns 0 if process is running, or nonzero if not.
+#
+# The nonzero value is 3 if the process is simply not running, and 1 if the
+# process is not running but the pidfile exists (to match the exit codes for
+# the "status" command; see LSB core spec 3.1, section 20.2)
+#
+# The pid read from $PIDFILE is only accepted when /proc/<pid>/cmdline
+# matches CMD_PATT, so a stale pidfile does not count as running.
+#
+# NOTE(review): CMD_PATT starts with "-" and is passed to grep without
+# -e/--; confirm grep treats it as the pattern rather than as options.
+#
+CMD_PATT="-user.cassandra.+CassandraDaemon"
+is_running()
+{
+    # Kept local so the pid does not leak into the rest of the script
+    local pid
+
+    [ -f "$PIDFILE" ] || return 3
+
+    pid=$(cat "$PIDFILE")
+    if grep -Eq "$CMD_PATT" "/proc/$pid/cmdline" 2>/dev/null; then
+        return 0
+    fi
+    return 1
+}
+
+#
+# Function that starts the daemon/service
+#
+do_start()
+{
+    # Return
+    #   0 if daemon has been started
+    #   1 if daemon was already running
+    #   2 if daemon could not be started
+    is_running && return 1
+
+    max_allowed_down_seconds
+    max_down_seconds_before_repair
+    seconds_since_last_stop
+
+    if [ "$SECONDS_SINCE_LAST_STOP" -lt "$MAX_DOWN_SECONDS_BEFORE_REPAIR" ]; then
+        # Now call the cassandra init script
+        $SERVICE start
+        stat=$?
+        return "$stat"
+    elif [ "$SECONDS_SINCE_LAST_STOP" -ge "$MAX_DOWN_SECONDS_BEFORE_REPAIR" ] &&
+         [ "$SECONDS_SINCE_LAST_STOP" -lt "$MAX_ALLOWED_DOWN_SECONDS" ]; then
+        # Now call the cassandra init script
+        $SERVICE start
+        stat=$?
+        if [ "$stat" -eq 0 ]; then
+            # Wait for cassandra to startup and join the cluster
+            echo "Waiting for cassandra initialization to complete..."
+            $NODETOOL info > /dev/null 2>&1
+            istatus=$?
+            icounter=0
+            while [ "$istatus" -ne 0 ] && [ "$icounter" -lt 10 ]; do
+                sleep 1
+                $NODETOOL info > /dev/null 2>&1
+                istatus=$?
+                icounter=$(($icounter + 1))
+            done
+            if [ "$istatus" -ne 0 ]; then
+                echo "Cassandra initialization not yet complete, please run cassandra repair after initialization is done"
+                return 0
+            fi
+            # Extract node ID to check cluster joining status
+            echo "Waiting for cassandra to join cluster..."
+            nodeid=$($NODETOOL info | grep ID | awk '{print $3}')
+            nstatus=$($NODETOOL status | grep $nodeid | awk '{print $1}')
+            ncounter=0
+            while [ -z $(echo $nstatus | grep U) ] && [ "$ncounter" -lt 10 ]; do
+                sleep 1
+                nstatus=$($NODETOOL status | grep $nodeid | awk '{print $1}')
+                ncounter=$(($ncounter + 1))
+            done
+            if [ "$ncounter" -ge 10 ]; then
+                echo "Cassandra not yet joined cluster, please run cassandra repair after cluster formation"
+                return 0
+            fi
+            # Start nodetool repair
+            echo "Starting cassandra repair, check /var/log/cassandra/repair-*.log for progress"
+            $REPAIR --log-file /var/log/cassandra/repair.log &> /dev/null &
+        fi
+    else
+        echo "Cassandra has been down for at least $MAX_ALLOWED_DOWN_SECONDS seconds, not starting"
+        return 2
+    fi
+}
+
+case "$1" in
+  start)
+	do_start
+	;;
+  *)
+	$SERVICE "$1"
+        stat=$?
+        exit "$stat"
+	;;
+esac
+
+:
+
+# vi:ai sw=4 ts=4 tw=0 et
diff --git a/common/control_files/supervisord_database.conf b/common/control_files/supervisord_database.conf
@@ -21,7 +21,6 @@ logfile_backups=5           ; (num of main logfile rotation backups;default 10)
 loglevel=info                ; (log level;default info; others: debug,warn,trace)
 pidfile=/var/run/supervisord_contrail_database.pid  ; (supervisord pidfile;default supervisord.pid)
 nodaemon=false               ; (start in foreground if true;default false)
-minfds=100000                ; (min. avail startup file descriptors;default 1024)
 minprocs=200                 ; (min. avail process descriptors;default 200)
 ;umask=022                   ; (process file creation umask;default 022)
 ;user=chrism                 ; (default is current user, required if root)
@@ -56,22 +55,22 @@ autostart=true                ; start at supervisord start (default: true)
 killasgroup=false             ; SIGKILL the UNIX process group (def false)
 environment=LOG_DIR=/var/log/kafka
 
-[program:contrail-database]
-command=cassandra -f
+;[program:theprogramname]
+;command=/bin/cat
 ;process_name=%(program_name)s ; process_name expr (default %(program_name)s)
 ;numprocs=1                    ; number of processes copies to start (def 1)
 ;directory=/tmp                ; directory to cwd to before exec (def no cwd)
 ;umask=022                     ; umask for process (default None)
 ;priority=300                  ; the relative start priority (default 999)
-autostart=true                ; start at supervisord start (default: true)
+;autostart=true                ; start at supervisord start (default: true)
 ;autorestart=unexpected        ; whether/when to restart (default: unexpected)
 ;startsecs=1                   ; number of secs prog must stay running (def. 1)
 ;startretries=3                ; max # of serial start failures (default 3)
 ;exitcodes=0,2                 ; 'expected' exit codes for process (default 0,2)
-stopsignal=KILL               ; signal used to kill process (default TERM)
+;stopsignal=KILL               ; signal used to kill process (default TERM)
 ;stopwaitsecs=10               ; max num secs to wait b4 SIGKILL (default 10)
 ;stopasgroup=false             ; send stop signal to the UNIX process group (default false)
-killasgroup=false             ; SIGKILL the UNIX process group (def false)
+;killasgroup=false             ; SIGKILL the UNIX process group (def false)
 ;user=chrism                   ; setuid to this UNIX account to run the program
 ;redirect_stderr=true          ; redirect proc stderr to stdout (default false)
 ;stdout_logfile=/a/path        ; stdout log path, NONE for none; default AUTO

diff --git a/common/debian/contrail-openstack-database/debian/contrail-openstack-database.postinst b/common/debian/contrail-openstack-database/debian/contrail-openstack-database.postinst
@@ -4,6 +4,7 @@ if [ "$1" = "configure" -a -z "$2" ]; then
     echo "Running Postinst for contrail-openstack-database install.."
     sudo service cassandra stop
     sudo rm -rf /var/lib/cassandra
-    sudo rm -rf /etc/init.d/cassandra
+    sudo update-rc.d cassandra disable
+    sudo update-rc.d contrail-database defaults
 fi
 echo "Postinst for contrail-openstack-database done"
diff --git a/common/rpm/contrail-database.spec b/common/rpm/contrail-database.spec
@@ -134,6 +134,8 @@ done
 popd
 
 %post
+chkconfig cassandra off
+chkconfig contrail-database on
 # this is upgrade from 1.02 release to newer i.e cassandra 1.1.7 to 1.2.11
 if [ -f /usr/share/cassandra/conf/cassandra.yaml.rpmsave ]; then
     CASSANDRA_CONF_OLD=/usr/share/cassandra/conf/cassandra.yaml.rpmsave