%%information
This is a custom script and may not be on your system.
%%

!Primary Purpose
This script checks whether various ports are "listening" and, if a port is not, attempts to restart the service associated with that port.

!Deployed Location
Located at /usr/bin/ndscheck.sh

The [LDAPBuild Process|LDAPBuildProcess] distributes the <buildhome>[Directory-Info.com]/config/ndscheck.sh file to /usr/bin/ndscheck.sh

!Method of Execution
The script is run via [cron.tab|NDSCron.tab]

!Logging
The script creates a log file [/var/nds/ndscheck.log|ConfigFilesNdscheck.log]

!Alerting Methods
Upon an alert, an email message is sent to the defined email groups, depending on the level of the alert.

!Sending Test Message
Calling /usr/bin/ndscheck.sh and passing "msgtest" on the command line will send a test message to the [EMAIL_NOTIFY] group.

!Watch Out for These Items
Currently the following items are "hard-coded" within ndscheck.sh.
* Ports that are checked by the script. See Line: checkPort=(.524.*LISTEN .636.*LISTEN .8389.*LISTEN .8636.*LISTEN)
* Maintenance time window: sMaintTime and eMaintTime values are from the [Script Variables|Shared Script Variables]

A typical script, as implemented, is shown below.
{{{
#!/bin/bash
#:ndscheck.sh
# NOTE: All shell values cleared at this point
# Modification history:
# 2/18/2003-- Created
# 3/10/2003-- Fixed use of mailx
# 3/13/2003-- Tweaked logging and notification
# 3/14/2003-- Tweaked notification some more
# 3/21/2003-- Handle and alert on recurring restarts
#             Fixed logging on some types of alerts
#             Combined NOTIFY and URGENT messages into a single mailx command
#             Check for previous run of ndscheck.sh and send URGENT alert
#             if found
#             Finalized logic on notification
# 4/4/2003 -- Changed checkAction for eDirectory to use eDirAutoStart
# 8/11/2003-- Commented out check for '.dsbackup' file
# 11/13/2003--Modified for 8.7.1 -
#             Added ports for SSL on HTTPSTACK
#             Commented out start of ndsimonitor and pki as these are loaded via the
#             /usr/lib/nds-modules/ndsmodules.conf file jim@willeke.com
# 12/03/2003--Modified for changes in ndsenv and ndsenv_functions
# 1/9/2004 -- Changed reference to function 'eDirAutoStart' to 'f_edirautostart'
#             Fixed load location for .ndsenv*
# 2005/6/7 -- msgtest Pass as a parameter and we will send a test message and exit
#
# netstat -n -a | grep LISTEN
# Look for LISTEN ports for the following services:
# NDS:524:'/etc/init.d/nds stop; /etc/init.d/nds start'
# LDAP:636:'/etc/init.d/nds stop; /etc/init.d/nds start'
# iMonitor:8389:'$bindir/ndsimonitor -u; $bindir/ndsimonitor -l'
# DXMLDriver:16384:'dxmlconfig command -t $TREENAME `cat /var/nds/.dsbackup` -d cn=B1ENTLoopback.cn=$SERVERNAME-driverset.ou=DirXML.$BaseDNdot -c start cn=$SERVERNAME-DSbackup.ou=Administration.$BaseDNdot'
#
# dxmlconfig command -t $TREENAME `cat /var/nds/.dsbackup` -d cn=B1ENTtoCCSAD.cn=$SERVERNAME-driverset.ou=DirXML.$BaseDNdot -c start cn=$SERVERNAME-DSbackup.ou=Administration.$BaseDNdot
#
# To pull driver names and their Authentication Context info (auto-detection of DirXML driver info):
# ice -v -o -SLDAP -L/var/nds/$TREENAME.der -dcn=$SERVERNAME-DSbackup,ou=Administration,$BaseDN -w`cat /var/nds/.dsbackup|awk -F" " '{print $2}'` -bou=DirXML,$BaseDN -F"objectClass=DirXML-Driver" -DDELIM -f /tmp/tmp.dxmlcheck.csv -tdn,cn,DirXML-ShimAuthServer
#
# To get just the Publisher port from the attribute
# echo "ndstest1.security.[Directory-Info.com].net:9192:ino0s701.svr.[Directory-Info.com].net:9292"|awk -F":" '{print $4}'

# Read eDirectory installation variables and subroutines
# NOTE(review): only .ndsenv is tested for existence; .ndsenv_functions is
# sourced without its own check.
if [ -f /var/nds/.ndsenv ]
then
  . /var/nds/.ndsenv_functions
  . /var/nds/.ndsenv
else
  printf "\nMissing /var/nds/.ndsenv -- cannot run!\n"
  exit 1
fi

# Read variables specific to bash shell; defines what processes to check
if [ -f /var/nds/.ndsenv.bash ]
then
  . /var/nds/.ndsenv.bash
else
  printf "\nMissing /var/nds/.ndsenv.bash -- cannot run!\n"
  exit 1
fi

# Because automatic detection of DirXML drivers is not implemented, this is not needed:
# ! -f /var/nds/$TREENAME.der -o
#if [ ! -f /var/nds/.dsbackup ]; then
#  printf "\nSystem not PREPared for ndsbackup.sh.\n"
#  exit 1
#fi

######################################################################
# _ndscheck_record_alert COUNT
#   Persist the current alert state between polls.
#   Globals read: MESSAGE, URGENT, RECURRENT, ndscheckAlertFile,
#                 ndsfailCountFile
#   Arguments:    $1 - value to store in the failure-count file
#   MESSAGE carries literal "\n" separators, so it is expanded with %b;
#   passing it as a %b argument (not as the format string) keeps any '%'
#   in captured restart output from being read as a format directive.
######################################################################
_ndscheck_record_alert() {
  printf '%b\nURGENT=%s\nRECURRENT=%s' "${MESSAGE}" "${URGENT}" "${RECURRENT}" > "$ndscheckAlertFile"
  echo "$1" > "$ndsfailCountFile"
}

######################################################################
# _ndscheck_send_alert LOG_PREFIX LOG_SUFFIX
#   Build the mail body, log "<LOG_PREFIX><URGENT|NOTIFY><LOG_SUFFIX>",
#   mail the alert, then record the alert state with the failure counter
#   reset to 1.
#   Globals read: MESSAGE, DATE, URGENT, SUBJECT, EMAIL_BODY, EMAIL_URGENT,
#                 EMAIL_NOTIFY, bindir
#   Arguments:    $1 - log text placed before the alert level
#                 $2 - log text placed after the alert level
######################################################################
_ndscheck_send_alert() {
  local level="NOTIFY"
  # The original tests here were written [ "$URGENT"="YES" ]: a single
  # non-empty word, which is always true. Spaces around '=' make it a
  # real string comparison.
  if [ "$URGENT" = "YES" ]; then
    level="URGENT"
  fi

  printf '\n%s\n' "$(date)" > "$EMAIL_BODY"
  printf '\n%b' "${MESSAGE}" >> "$EMAIL_BODY"

  f_write_and_log "${MESSAGE}${DATE:0:4},${DATE:4:2},${DATE:6:2},$(date '+%H:%M:%S'),$1${level}$2"

  # Recipient variables are deliberately unquoted (as in the original
  # commands) -- presumably they hold space-separated address lists;
  # verify against .ndsenv.
  if [ "$level" = "URGENT" ]; then
    $bindir/mailx -s"URGENT -- ${SUBJECT}" $EMAIL_URGENT $EMAIL_NOTIFY < "$EMAIL_BODY"
  else
    $bindir/mailx -s"NOTIFY -- ${SUBJECT}" $EMAIL_NOTIFY < "$EMAIL_BODY"
  fi

  rm -f "$EMAIL_BODY"
  _ndscheck_record_alert "1"
}

######################################################################
# handleMESSAGE
#   Decide whether the accumulated MESSAGE should be mailed, and update
#   the alert-state and failure-count files.
#   Globals read: MESSAGE, NEWALERT, STATUS_COUNT, ndscheckMAX_STATUS_COUNT,
#                 ndscheckAlertFile (plus those used by the helpers above)
#
#   Alert file present, NEWALERT=NO, STATUS_COUNT <= max : log only,
#       store STATUS_COUNT as the failure count
#   Alert file present, NEWALERT=NO, STATUS_COUNT >  max : resend alert
#   Alert file present, NEWALERT=YES                     : send new alert
#   Alert file absent                                    : send first alert
######################################################################
handleMESSAGE() {
  if test -f "$ndscheckAlertFile"; then
    if [ "$NEWALERT" = "NO" ] && [ "$STATUS_COUNT" -le "$ndscheckMAX_STATUS_COUNT" ]; then
      f_write_and_log "${MESSAGE}${DATE:0:4},${DATE:4:2},${DATE:6:2},$(date '+%H:%M:%S'),Alert already sent; new alert will not be sent.#STATUS_COUNT=$STATUS_COUNT"
      _ndscheck_record_alert "$STATUS_COUNT"
    elif [ "$NEWALERT" = "NO" ] && [ "$STATUS_COUNT" -gt "$ndscheckMAX_STATUS_COUNT" ]; then
      _ndscheck_send_alert "Wait expired; resending " " alert."
    fi

    if [ "$NEWALERT" = "YES" ]; then
      #printf "\n---- Original message ----\n"
      #printf "`cat $ndscheckAlertFile`"
      #printf "\n---- New message ----\n"
      #printf "${MESSAGE}"
      _ndscheck_send_alert "Alert already sent but new " " alert situation detected."
    fi
  fi

  # Every branch above rewrites the alert file, so this only fires when
  # the file did not exist on entry.
  if test ! -f "$ndscheckAlertFile"; then
    _ndscheck_send_alert "Alert message " " being sent."
  fi
}

##############################################################
# Override ndsenv variables
##############################################################
# The following variables should be set in /var/nds/.ndsenv; if these aren't set
# for some reason, initialize them to some good default values
#
# Define maintenance window (start/end time); ndscheck.sh will not process
# during this window. In the form of "hhmmss". Defined in ndsenv
sMaintTime=${sMaintTime:="010000"}
eMaintTime=${eMaintTime:="013000"}

# File location to set flag -- ignores down condition during maintenance. Defined in ndsenv
ndscheckMaintFlag=${ndscheckMaintFlag:=/tmp/nondscheck}

# Maximum polling intervals to ignore a down condition. Defined in ndsenv
ndscheckMAX_STATUS_COUNT=${ndscheckMAX_STATUS_COUNT:=6}

# Define local environment variables
EMAIL_BODY=${EMAIL_BODY:=/tmp/ndscheckMail.tmp}                       # Temp file to hold email message
ndsfailCountFile=${ndsfailCountFile:=/tmp/ndsfailCount.tmp}           # Temp file to hold counter for successive failures detected
ndsrestartCountFile=${ndsrestartCountFile:=/tmp/ndsrestartCount.tmp}  # Temp file to hold counter for successive restarts
ndscheckAlertFile=${ndscheckAlertFile:=/tmp/ndscheckAlert.tmp}        # Temp file to keep track of what services are down from poll to poll

STATUS_COUNT=0
NEWALERT="NO"
MESSAGE=''
OLDMESSAGE=''
SUBJECT="Subject: ${SERVERNAME} eDirectory status alert"

# "msgtest" on the command line: send a test message and stop
i_test="$1"
if [ "$i_test" = "msgtest" ]
then
  f_messagesend
  exit 0
fi

PATH=$PATH:/bin:/usr/local/bin

# Permanent log to keep 90 days worth of ndscheck status messages
log_file=/var/nds/ndscheck.log

# Set notification flags
NOTIFY="NO"
if test -f "$ndscheckAlertFile"; then
  # Restore URGENT/RECURRENT as written by the previous poll.
  # NOTE(review): this evals lines taken from a file under /tmp, and the
  # pattern is not anchored, so any stored message line containing
  # "URGENT=" is evaluated too -- consider parsing instead of eval.
  eval `grep "URGENT=" "$ndscheckAlertFile"`
  eval `grep "RECURRENT=" "$ndscheckAlertFile"`
fi
URGENT=${URGENT:="NO"}
RECURRENT=${RECURRENT:="NO"}

# PID file used to detect a previous run that is still active
ndscheck_pid_file=/tmp/ndscheck.pid

# _ndscheck_stamp
#   Print the log-line prefix "YYYY,MM,DD,HH:MM:SS" -- date fields are
#   sliced from $DATE, the time is read fresh on every call.
_ndscheck_stamp() {
  printf '%s,%s,%s,%s' "${DATE:0:4}" "${DATE:4:2}" "${DATE:6:2}" "$(date '+%H:%M:%S')"
}

# _ndscheck_port_listening PATTERN
#   Succeed when netstat output contains a line matching PATTERN.
#   The pattern is quoted so the shell never glob-expands it.
_ndscheck_port_listening() {
  netstat -n -a | grep "$1" >/dev/null 2>&1
}

# Pick up the running count of consecutive failing polls and count this one
if test -f "$ndsfailCountFile"; then
  STATUS_COUNT=$(cat "$ndsfailCountFile")
fi
STATUS_COUNT=$(( STATUS_COUNT + 1 ))

# Check for a previous run of this script before continuing
if [ -f "$ndscheck_pid_file" ]; then
  pid=$(cat "$ndscheck_pid_file")
  # ps prints a header containing "PPID"; anything else means the PID is alive
  if [ -n "$(ps -fp $pid | grep -v "PPID")" ]; then
    MESSAGE="${MESSAGE}$(_ndscheck_stamp),Previous $0 still running; please check server#Current PID=$$#Old PID=$(cat "$ndscheck_pid_file")\n"
    NOTIFY="YES"; URGENT="YES"
    if [ -f "$ndscheckAlertFile" ] && ! grep -q "Previous $0 still running" "$ndscheckAlertFile"; then
      NEWALERT="YES"
    fi
    handleMESSAGE
    # The PID file belongs to the run that is still active; leave it alone
    exit 0
  fi
fi
echo "$$" > "$ndscheck_pid_file"

# Scheduled maintenance window: string comparison of "hhmm00" against the
# "hhmmss" bounds
if [[ ${DATE:8}00 > ${sMaintTime} && ${DATE:8}00 < ${eMaintTime} ]]; then
  f_write_and_log "$(_ndscheck_stamp),Scheduled maintenance window; exiting without action."
  # Clear PID file
  rm -f "$ndscheck_pid_file"
  exit 0
fi

# Check for maintenance flag (ignore down condition temporarily).
# The flag file holds the number of polling cycles still to be skipped.
if [ -f "$ndscheckMaintFlag" ]; then
  checkDelay=$(cat "$ndscheckMaintFlag")
  checkDelay=$(( checkDelay - 1 ))
  if [ "$checkDelay" -le 0 ]; then
    rm -f "$ndscheckMaintFlag"
    f_write_and_log "$(_ndscheck_stamp),manual maintenance in progress (0 left); exiting."
    # Clear PID file (this exit path previously left it behind)
    rm -f "$ndscheck_pid_file"
    exit 0
  else
    # Never allow services to be down for more than 6 polling cycles (~1 hour)
    if [ "$checkDelay" -gt "$ndscheckMAX_STATUS_COUNT" ]; then
      checkDelay=$ndscheckMAX_STATUS_COUNT
    fi
    echo "$checkDelay" > "$ndscheckMaintFlag"
    f_write_and_log "$(_ndscheck_stamp),manual maintenance in progress ($checkDelay left); exiting."
    # Clear PID file
    rm -f "$ndscheck_pid_file"
    exit 0
  fi
fi

# Define a default set of arrays if they haven't been setup for this server
if [ -z "$checkName" ]; then
  # NOTE(review): checkName has three entries but checkPort has four, and
  # the loop below is driven by the size of checkName -- so the 8636
  # pattern is never probed. Confirm whether a fourth name is missing.
  checkName=(NDS LDAPS iMonitor)
  # checkPort=(.524.*LISTEN .636.*LISTEN .8389.*LISTEN) JSW
  # Patterns are quoted so they stay regexes and are never glob-expanded
  checkPort=(".524.*LISTEN" ".636.*LISTEN" ".8389.*LISTEN" ".8636.*LISTEN")
  checkAction=(
    "$SUDO /etc/init.d/nds stop; sleep 30; f_edirautostart; $bindir/ndsstat"
    # These services are restarted by 8.7.1 in /usr/lib/nds-modules/ndsmodules.conf
    # "$SUDO $sbindir/npki -u; sleep 10; $SUDO $sbindir/npki -l"
    # "$SUDO $bindir/ndsimonitor -u; sleep 10; $SUDO $bindir/ndsimonitor -l"
  )
fi

# Run full check of all monitored ports
checkCount=${#checkName[@]}
index=0

# Loop through all services defined in the array
while [ "$index" -lt "$checkCount" ]; do
  _ndscheck_port_listening "${checkPort[$index]}"; es=$?
  if [ $es -ne 0 ]; then
    # Attempt corrective action and keep the output in a variable;
    # sed joins the output lines with '#' so it fits on one log line
    result="$( (eval ${checkAction[$index]}) 2>&1 | sed -e :a -e '$!N;s/\n/#/;ta' -e 'P;D')"
    MESSAGE="${MESSAGE}$(_ndscheck_stamp),${checkName[$index]},${checkPort[$index]},Restart message=\"${result}\"\n"

    # A service not mentioned in the existing alert file is a new situation
    if [ -f "$ndscheckAlertFile" ] && ! grep -q "${checkName[$index]},${checkPort[$index]}" "$ndscheckAlertFile"; then
      NEWALERT="YES"
    fi

    # Wait up to 10 additional seconds before giving up on restart attempt
    RETRY=10
    _ndscheck_port_listening "${checkPort[$index]}"; es=$?
    while [ $es -ne 0 ] && [ $RETRY -gt 0 ]; do
      sleep 1
      _ndscheck_port_listening "${checkPort[$index]}"; es=$?
      RETRY=$(( RETRY - 1 ))
    done

    # Check the result of the restart attempt
    if [ $es -ne 0 ]; then
      MESSAGE="${MESSAGE}$(_ndscheck_stamp),${checkName[$index]},${checkPort[$index]},Restart failed\n"
      NOTIFY="YES"; URGENT="YES"
    else
      # Service restarted OK
      MESSAGE="${MESSAGE}$(_ndscheck_stamp),${checkName[$index]},${checkPort[$index]},Restart OK\n"
      NOTIFY="YES"
    fi

    # Flag problem URGENT if service is having a chronic problem
    # egrep "All OK|B1ENTLoopback,.16384.*LISTEN,.*Restart message" $log_file | tail -6 | grep "Restart message" | wc -l | tr -d " "
    # Figure out the number of events that relate to this service within the last $ndscheckMAX_STATUS_COUNT polling attempts
    restartCount=$(grep -E "All OK|${checkName[$index]},${checkPort[$index]},.*Restart message" "$log_file" \
      | tail -n "$ndscheckMAX_STATUS_COUNT" \
      | grep -c "Restart message")

    # Logged restarts plus this one >= $ndscheckMAX_STATUS_COUNT/2: send an URGENT alert
    if [ $(( restartCount + 1 )) -ge $(( ndscheckMAX_STATUS_COUNT / 2 )) ]; then
      MESSAGE="${MESSAGE}$(_ndscheck_stamp),${checkName[$index]},${checkPort[$index]},Recurring problem detected#restartCount=$(( restartCount + 1 ))\n"
      URGENT="YES"
      if [ "$RECURRENT" = "NO" ]; then
        NEWALERT="YES"
      fi
      RECURRENT="YES"
    fi
  fi
  index=$(( index + 1 ))
done

# We've checked/restarted all services, now do something with the accumulated messages
if test "$MESSAGE"; then
  handleMESSAGE
  # Clear PID file
  rm -f "$ndscheck_pid_file"
  exit 0
fi

# If we made it this far, everything must be running
f_write_and_log "$(_ndscheck_stamp),All OK"

if test -f "$ndscheckAlertFile"; then
  # Send an all clear notification if previous message was sent
  f_write_and_log "$(_ndscheck_stamp),Alert message CORRECTED being sent."
  printf '\n%s\n' "$(date)" > "$EMAIL_BODY"
  printf '\n%s,All OK' "$(_ndscheck_stamp)" >> "$EMAIL_BODY"
  # The all-clear goes to the same audience as the alert it cancels
  if grep -q "URGENT=YES" "$ndscheckAlertFile"; then
    $bindir/mailx -s"CORRECTED -- ${SUBJECT}" $EMAIL_URGENT $EMAIL_NOTIFY < "$EMAIL_BODY"
  else
    $bindir/mailx -s"CORRECTED -- ${SUBJECT}" $EMAIL_NOTIFY < "$EMAIL_BODY"
  fi
  rm -f "$EMAIL_BODY"
  rm -f "$ndscheckAlertFile"
fi

rm -f "$ndsfailCountFile"

# Groom log file (keep up to 90 days worth of entries)
cp "$log_file" /tmp/ndscheck.log.$$
tail -n 1259 /tmp/ndscheck.log.$$ > "$log_file"
rm -f /tmp/ndscheck.log.$$

# Clear PID file
rm -f "$ndscheck_pid_file"
}}}