From e46317b8965099ded353d0c5dbb8b2522885446c Mon Sep 17 00:00:00 2001
From: Megh Bhatt
Date: Mon, 31 Aug 2015 16:02:12 -0700
Subject: [PATCH] Packaging changes to facilitate monitoring of cassandra: 1.
 Addition of a wrapper init.d script - contrail-database that controls when
 cassandra can be started based on the time since last started 2. Move
 cassandra out of supervisord

Partial-Bug: #1484297

Conflicts:
	common/control_files/supervisord_database.conf

Change-Id: Ie3b550fb0e3d9ba966aeaac9727bfd01581443f7
(cherry picked from commit bbaf90fb3230d11fa7d8c04ca48b288c3413ac47)
---
 common/control_files/contrail-database.initd  | 159 +++++++++++++++++-
 .../control_files/supervisord_database.conf   |  11 +-
 .../contrail-openstack-database.postinst      |   3 +-
 common/rpm/contrail-database.spec             |   2 +
 4 files changed, 164 insertions(+), 11 deletions(-)

diff --git a/common/control_files/contrail-database.initd b/common/control_files/contrail-database.initd
index be97cd3c3..0657dd36a 100755
--- a/common/control_files/contrail-database.initd
+++ b/common/control_files/contrail-database.initd
@@ -1,6 +1,157 @@
-#!/usr/bin/env bash
+#! /bin/sh
+### BEGIN INIT INFO
+# Provides:          contrail-database
+# Default-Start:     2 3 4 5
+# Default-Stop:      0 1 6
+# Short-Description: contrail database wrapper over cassandra
+# Description:       Determines the time since last cassandra shutdown:
+#                    1. If greater than gc_grace_seconds, then does not start
+#                       cassandra
+#                    2. If less than gc_grace_seconds and greater than hinted
+#                       handoff time, then starts cassandra and starts
+#                       nodetool repair
+#                    3. If less than hinted handoff time, then starts
+#                       cassandra
+### END INIT INFO

-# chkconfig: 2345 99 01
-# description: Juniper Network Virtualization Collector
+# Author: Megh Bhatt

-supervisorctl -s unix:///tmp/supervisord_database.sock ${1} `basename ${0}`
+NAME=cassandra
+PIDFILE=/var/run/$NAME.pid
+SERVICE="service $NAME"   # expanded unquoted on purpose: command plus argument
+CLUSTER_STATUS_UP_FILE=/var/log/cassandra/status-up
+DEFAULT_GC_GRACE_SECONDS=864000        # 10 days, in seconds
+DEFAULT_HINTED_HANDOFF_SECONDS=10800   # 3 hours, in seconds
+REPAIR=/usr/bin/contrail-cassandra-repair
+NODETOOL=/usr/bin/nodetool
+
+#
+# Sets SECONDS_SINCE_LAST_STOP to the age of $CLUSTER_STATUS_UP_FILE's mtime in seconds (0 if the file is absent)
+#
+seconds_since_last_stop()
+{
+    if [ -f "$CLUSTER_STATUS_UP_FILE" ]; then
+        local time_last_up="$(stat -c %Y "$CLUSTER_STATUS_UP_FILE")"
+        local now="$(date +%s)"
+        SECONDS_SINCE_LAST_STOP=$((now - time_last_up))
+    else
+        SECONDS_SINCE_LAST_STOP=0
+    fi
+}
+
+#
+# Sets MAX_ALLOWED_DOWN_SECONDS, the down time beyond which do_start refuses
+# to start cassandra. This is 90% of DEFAULT_GC_GRACE_SECONDS
+#
+max_allowed_down_seconds()
+{
+    MAX_ALLOWED_DOWN_SECONDS=$((DEFAULT_GC_GRACE_SECONDS * 9 / 10))
+}
+
+#
+# Sets MAX_DOWN_SECONDS_BEFORE_REPAIR, the maximum down time in seconds for
+# which do_start starts cassandra without running repair. This is 90% of
+# DEFAULT_HINTED_HANDOFF_SECONDS
+#
+max_down_seconds_before_repair()
+{
+    MAX_DOWN_SECONDS_BEFORE_REPAIR=$((DEFAULT_HINTED_HANDOFF_SECONDS * 9 / 10))
+}
+
+#
+# Function that returns 0 if process is running, or nonzero if not.
+#
+# The nonzero value is 3 if the process is simply not running, and 1 if the
+# process is not running but the pidfile exists (to match the exit codes for
+# the "status" command; see LSB core spec 3.1, section 20.2)
+#
+CMD_PATT="-user.cassandra.+CassandraDaemon"
+is_running()
+{
+    if [ -f "$PIDFILE" ]; then
+        pid=$(cat "$PIDFILE")
+        grep -Eq "$CMD_PATT" "/proc/$pid/cmdline" 2>/dev/null && return 0
+        return 1
+    fi
+    return 3
+}
+
+#
+# Starts cassandra through $SERVICE unless it has been down too long; after a longer outage also launches a background repair
+#
+do_start()
+{
+    # Return
+    #   0 if daemon has been started
+    #   1 if daemon was already running
+    #   2 if daemon could not be started
+    is_running && return 1
+
+    max_allowed_down_seconds
+    max_down_seconds_before_repair
+    seconds_since_last_stop
+
+    if [ "$SECONDS_SINCE_LAST_STOP" -lt "$MAX_DOWN_SECONDS_BEFORE_REPAIR" ]; then
+        # Down time below the repair threshold: only call the cassandra init script
+        $SERVICE start
+        stat=$?
+        return "$stat"
+    elif [ "$SECONDS_SINCE_LAST_STOP" -ge "$MAX_DOWN_SECONDS_BEFORE_REPAIR" ] &&
+         [ "$SECONDS_SINCE_LAST_STOP" -lt "$MAX_ALLOWED_DOWN_SECONDS" ]; then
+        # Down time within the allowed window: call the cassandra init script, then repair
+        $SERVICE start
+        stat=$?
+        [ "$stat" -eq 0 ] || return "$stat"   # propagate a failed start, as the branch above does
+        # Poll nodetool info (up to 10 retries, 1 second apart) until it succeeds
+        echo "Waiting for cassandra initialization to complete..."
+        $NODETOOL info > /dev/null 2>&1
+        istatus=$?
+        icounter=0
+        while [ "$istatus" -ne 0 ] && [ "$icounter" -lt 10 ]; do
+            sleep 1
+            $NODETOOL info > /dev/null 2>&1
+            istatus=$?
+            icounter=$((icounter + 1))
+        done
+        if [ "$istatus" -ne 0 ]; then
+            echo "Cassandra initialization not yet complete, please run cassandra repair after initialization is done"
+            return 0
+        fi
+        # Extract node ID to check cluster joining status
+        echo "Waiting for cassandra to join cluster..."
+        nodeid=$($NODETOOL info | awk '/ID/ {print $3}')
+        nstatus=$($NODETOOL status | awk -v id="$nodeid" 'id != "" && index($0, id) {print $1}')
+        ncounter=0
+        while ! echo "$nstatus" | grep -q U && [ "$ncounter" -lt 10 ]; do
+            sleep 1
+            nstatus=$($NODETOOL status | awk -v id="$nodeid" 'id != "" && index($0, id) {print $1}')
+            ncounter=$((ncounter + 1))
+        done
+        if ! echo "$nstatus" | grep -q U; then
+            echo "Cassandra not yet joined cluster, please run cassandra repair after cluster formation"
+            return 0
+        fi
+        # Start nodetool repair in the background; "&>" is not a redirection in POSIX sh
+        echo "Starting cassandra repair, check /var/log/cassandra/repair-*.log for progress"
+        "$REPAIR" --log-file /var/log/cassandra/repair.log > /dev/null 2>&1 &
+        return 0
+    else
+        echo "Cassandra has been down for at least $MAX_ALLOWED_DOWN_SECONDS seconds, not starting"
+        return 2
+    fi
+}
+
+case "$1" in
+    start)
+        do_start
+        ;;
+    *)
+        $SERVICE "$1"
+        stat=$?
+        exit "$stat"
+        ;;
+esac
+
+:   # reached only for "start": the script exits 0 whatever do_start returned
+
+# vi:ai sw=4 ts=4 tw=0 et
diff --git a/common/control_files/supervisord_database.conf b/common/control_files/supervisord_database.conf
index 7f2a83d6e..46abd7969 100644
--- a/common/control_files/supervisord_database.conf
+++ b/common/control_files/supervisord_database.conf
@@ -21,7 +21,6 @@ logfile_backups=5 ; (num of main logfile rotation backups;default 10)
 loglevel=info ; (log level;default info; others: debug,warn,trace)
 pidfile=/var/run/supervisord_contrail_database.pid ; (supervisord pidfile;default supervisord.pid)
 nodaemon=false ; (start in foreground if true;default false)
-minfds=100000 ; (min. avail startup file descriptors;default 1024)
 minprocs=200 ; (min. avail process descriptors;default 200)
 ;umask=022 ; (process file creation umask;default 022)
 ;user=chrism ; (default is current user, required if root)
@@ -56,22 +55,22 @@ autostart=true ; start at supervisord start (default: true)
 killasgroup=false ; SIGKILL the UNIX process group (def false)
 environment=LOG_DIR=/var/log/kafka

-[program:contrail-database]
-command=cassandra -f
+;[program:theprogramname]
+;command=/bin/cat
 ;process_name=%(program_name)s ; process_name expr (default %(program_name)s)
 ;numprocs=1 ; number of processes copies to start (def 1)
 ;directory=/tmp ; directory to cwd to before exec (def no cwd)
 ;umask=022 ; umask for process (default None)
 ;priority=300 ; the relative start priority (default 999)
-autostart=true ; start at supervisord start (default: true)
+;autostart=true ; start at supervisord start (default: true)
 ;autorestart=unexpected ; whether/when to restart (default: unexpected)
 ;startsecs=1 ; number of secs prog must stay running (def. 1)
 ;startretries=3 ; max # of serial start failures (default 3)
 ;exitcodes=0,2 ; 'expected' exit codes for process (default 0,2)
-stopsignal=KILL ; signal used to kill process (default TERM)
+;stopsignal=KILL ; signal used to kill process (default TERM)
 ;stopwaitsecs=10 ; max num secs to wait b4 SIGKILL (default 10)
 ;stopasgroup=false ; send stop signal to the UNIX process group (default false)
-killasgroup=false ; SIGKILL the UNIX process group (def false)
+;killasgroup=false ; SIGKILL the UNIX process group (def false)
 ;user=chrism ; setuid to this UNIX account to run the program
 ;redirect_stderr=true ; redirect proc stderr to stdout (default false)
 ;stdout_logfile=/a/path ; stdout log path, NONE for none; default AUTO
diff --git a/common/debian/contrail-openstack-database/debian/contrail-openstack-database.postinst b/common/debian/contrail-openstack-database/debian/contrail-openstack-database.postinst
index 29763c383..1d4a18646 100644
--- a/common/debian/contrail-openstack-database/debian/contrail-openstack-database.postinst
+++ b/common/debian/contrail-openstack-database/debian/contrail-openstack-database.postinst
@@ -4,6 +4,7 @@ if [ "$1" = "configure" -a -z "$2" ]; then
     echo "Running Postinst for contrail-openstack-database install.."
     sudo service cassandra stop
     sudo rm -rf /var/lib/cassandra
-    sudo rm -rf /etc/init.d/cassandra
+    sudo update-rc.d cassandra disable
+    sudo update-rc.d contrail-database defaults
 fi
 echo "Postinst for contrail-openstack-database done"
diff --git a/common/rpm/contrail-database.spec b/common/rpm/contrail-database.spec
index 0fa12c644..6603a5a8d 100644
--- a/common/rpm/contrail-database.spec
+++ b/common/rpm/contrail-database.spec
@@ -134,6 +134,8 @@ done
 popd

 %post
+chkconfig cassandra off
+chkconfig contrail-database on
 # this is upgrade from 1.02 release to newer i.e cassandra 1.1.7 to 1.2.11
 if [ -f /usr/share/cassandra/conf/cassandra.yaml.rpmsave ]; then
     CASSANDRA_CONF_OLD=/usr/share/cassandra/conf/cassandra.yaml.rpmsave