#!/bin/bash

# beegfs-ondemand-stoplocal
# This file contains helper functions to stop BeeOND services locally on one node.
# This is meant to be sourced from another script (i.e. beeond)


# Evaluates a return code and records a failure if it is not "0".
# On failure, a message naming the failed activity and the local host name is printed to stdout
# and the error flag is raised. Nothing happens for a return code of "0".
# Parameters:
#     * The return code to evaluate (typically "$?" of the command that was just run)
#     * A phrase describing what was being done; it is inserted into the error message after
#       "There was a problem".
# Modifies:
#     ERROR: Is set to "true" when the return code indicates a failure.
sl_checkerror()
{
   # the return code is compared as a string; only a literal "0" counts as success
   case "${1}" in
      0) return 0 ;;
   esac

   echo "ERROR: There was a problem ${2} on host $(hostname)"
   ERROR="true"
}

# Prints an informational message unless quiet mode is active.
# Parameter:
#     The message text. It is printed to stdout with an "INFO: " prefix.
# Checks:
#     QUIET: If "true", the message is suppressed.
sl_print_info()
{
   # quiet mode: nothing to do
   [ "${QUIET}" = "true" ] && return 0

   echo "INFO: ${1}"
}

# Unmounts the tmpfs mounts listed in the status file.
# The status file is read as comma-separated records; the second field is the service name and
# the third field is the mount point. Only records whose service is "tmpfs" are handled. For each
# of them, processes using the mount point are killed (fuser -k) and the mount point is lazily
# unmounted (umount -l).
# Checks:
#     STATUSFILE: The location of the status file.
#     CLEANUP:    If "true", stderr of fuser/umount is discarded and a failing unmount is not
#                 reported as an error.
# Modifies:
#     ERROR: Is set to "true" (through sl_checkerror) when an unmount fails and CLEANUP is not
#            "true".
sl_unmount_tmpfs()
{
   local SERVICE MOUNTPOINT _

   # IFS is set for "read" only. Setting it globally and unsetting it afterwards would destroy
   # whatever IFS value the sourcing script is using.
   while IFS=, read -r _ SERVICE MOUNTPOINT _ _
   do
      if [ "${SERVICE}" != "tmpfs" ]
      then
         continue
      fi

      sl_print_info "Unmounting tmpfs at ${MOUNTPOINT}"

      if [ "${CLEANUP}" != "true" ]
      then
         fuser -k "${MOUNTPOINT}" # no "sl_checkerror" after this; only the umount result counts
         umount -l "${MOUNTPOINT}"
         sl_checkerror "$?" "unmounting tmpfs"
      else
         # cleanup run: failures are expected and therefore silenced and ignored
         fuser -k "${MOUNTPOINT}" 2>/dev/null
         umount -l "${MOUNTPOINT}" 2>/dev/null
         true
      fi
   done < "${STATUSFILE}"

   # problems are reported through ERROR, not through the return code
   return 0
}

# Unmounts all local client mounts listed in the status file and tries to unload the client module.
# The status file is read as comma-separated records; the second field is the service name and
# the third field is the mount point. Only records whose service equals CLIENTSERVICE are handled.
# Checks:
#     STATUSFILE:    The location of the status file.
#     CLIENTSERVICE: The service name that marks a client mount in the status file.
#     CLEANUP:       If "true", stderr of fuser/umount is discarded and a failing unmount is not
#                    reported as an error.
# Modifies:
#     ERROR: Is set to "true" (through sl_checkerror) when an unmount fails and CLEANUP is not
#            "true".
sl_unmount_local_mounts()
{
   local SERVICE MOUNTPOINT _

   # IFS is set for "read" only. Setting it globally and unsetting it afterwards would destroy
   # whatever IFS value the sourcing script is using.
   while IFS=, read -r _ SERVICE MOUNTPOINT _ _
   do
      if [ "${SERVICE}" != "${CLIENTSERVICE}" ]
      then
         continue
      fi

      sl_print_info "Unmounting ${MOUNTPOINT}"
      if [ "${CLEANUP}" != "true" ]
      then
         fuser -k "${MOUNTPOINT}" # no "sl_checkerror" after this, because fuser also returns
                                  # non-zero when there are no processes accessing the file system
         umount -l "${MOUNTPOINT}"
         sl_checkerror "$?" "unmounting the ondemand file system"
      else
         # cleanup run: failures are expected and therefore silenced and ignored
         fuser -k "${MOUNTPOINT}" 2>/dev/null
         umount -l "${MOUNTPOINT}" 2>/dev/null
         true # reset error code before next invocation of sl_checkerror
      fi
   done < "${STATUSFILE}"

   # try to remove the client module - this is allowed to fail, because we might have a "normal"
   # beegfs mount somewhere in the system.
   rmmod beegfs 2>/dev/null || true
}

# Sends a SIGTERM to a process, then waits until the process is stopped or approximately 10 seconds
# have passed (100 polls with a 0.1 second pause in between).
# Parameter:
#     The PID of the process
# Returns:
#     0 if process was stopped within 10 seconds, 1 if it wasn't, 255 if initial kill returned an
#     error.
sl_kill_check()
{
   local PID=$1
   local i # loop counter; local so that a variable "i" of the sourcing script is not overwritten

   if ! kill "$PID"
   then
      return 255
   fi

   for ((i=0; i<100; i++))
   do
      # "kill -0" sends no signal, it only tests whether the process can still be signalled
      if kill -0 "$PID" 2>/dev/null
      then
         sleep 0.1
      else
         return 0
      fi
   done

   return 1
}

# Stops all services listed in the status file except for clients, then unmounts tmpfs mounts.
# The status file is read as comma-separated records; the second field is the service name, the
# third field the data path and the fifth field the PID file. Records whose PID file is "-" are
# skipped entirely. For every other record:
#     * the process named in the PID file is stopped through sl_kill_check,
#     * the data path is removed if DELETE_DATA is "true" and the data path is not "-",
#     * the preferredMds and preferredTarget files are removed.
# Checks:
#     STATUSFILE:            The location of the status file.
#     CLEANUP:               If "true", a missing PID file is not treated as an error.
#     DELETE_DATA:           If "true", the data paths of the services are deleted.
#     PREFERRED_MDS_FILE,
#     PREFERRED_TARGET_FILE: Files that are removed for every stopped service.
# Modifies:
#     ERROR: Is set to "true" when a service does not stop in time, a PID file is missing (outside
#            of a cleanup run) or a file/directory cannot be deleted.
sl_stop_services()
{
   local SERVICE DATAPATH PIDFILE _
   local PID RES # local, so that they do not leak into the sourcing script

   # IFS is set for "read" only, so this loop does not modify the IFS of the sourcing script.
   while IFS=, read -r _ SERVICE DATAPATH _ PIDFILE
   do
      # pidfile is "-" for beegfs-client and tmpfs, because they are not processes; nothing below
      # applies to such a record
      if [ "${PIDFILE}" = "-" ]
      then
         continue
      fi

      if [ -e "${PIDFILE}" ]
      then
         PID=$(cat "${PIDFILE}")
         sl_kill_check "${PID}"
         RES=$?
         if [ "${RES}" -eq 1 ]
         then
            echo "ERROR: ${SERVICE} did not stop within 10 seconds (PID ${PID})."
            ERROR="true"
         elif [ "${RES}" -eq 255 ]
         then
            # the initial kill failed; this is reported, but does not set the error flag
            echo "ERROR: ${SERVICE} does not seem to be running any more (PID ${PID})."
         fi
      elif [ "${CLEANUP}" != "true" ]
      then
         echo "ERROR: PID file ${PIDFILE} does not exist on host $(hostname)"
         ERROR="true"
      fi

      # delete data...
      if [ "${DELETE_DATA}" = "true" ] && [ "${DATAPATH}" != "-" ]
      then
         sl_print_info "Deleting stored data; Data path: ${DATAPATH}"
         rm -rf "${DATAPATH}"
         sl_checkerror "$?" "deleting ${DATAPATH}"
      fi

      # delete preferredMds and preferredTarget files
      rm -f "${PREFERRED_MDS_FILE}"
      sl_checkerror "$?" "deleting ${PREFERRED_MDS_FILE}"
      rm -f "${PREFERRED_TARGET_FILE}"
      sl_checkerror "$?" "deleting ${PREFERRED_TARGET_FILE}"
   done < "${STATUSFILE}"

   # unmount tmpfs if it was used
   sl_unmount_tmpfs
}

# Deletes the logfiles listed in the status file, unless ERROR is "true".
# If the directory of the last deleted log file is empty afterwards, it is also deleted.
# The status file is read as comma-separated records; the second field is the service name and
# the fourth field is the log file ("-" if the record has none).
# Checks:
#     STATUSFILE:       The location of the status file.
#     ERROR:            If "true", nothing is deleted.
#     ONLY_UNMOUNT:     If "true", only log files of CLIENTSERVICE records are deleted.
#     ONLY_STOP_SERVER: If "true", log files of CLIENTSERVICE records are kept.
#     CLIENTSERVICE:    The service name that marks a client record in the status file.
# Modifies:
#     ERROR: Is set to "true" (through sl_checkerror) when the empty log directory cannot be
#            removed.
sl_delete_logfiles()
{
   local SERVICE LOGFILE _
   # "read" resets its variables to empty strings when it hits the end of the file, so LOGFILE
   # itself can not be used after the loop; the last handled log file is remembered separately.
   local LAST_LOGFILE=""
   local LOG_DIR

   if [ "${ERROR}" = "true" ]
   then
      sl_print_info "Not deleting log files because of a previous error."
      return 0
   fi

   # delete log files
   # IFS is set for "read" only, so the IFS of the sourcing script is left untouched.
   while IFS=, read -r _ SERVICE _ LOGFILE _
   do
      if [ "${ONLY_UNMOUNT}" = "true" ] && [ "${SERVICE}" != "${CLIENTSERVICE}" ]
         then continue; fi
      if [ "${ONLY_STOP_SERVER}" = "true" ] && [ "${SERVICE}" = "${CLIENTSERVICE}" ]
         then continue; fi
      if [ "${LOGFILE}" = "-" ]
         then continue; fi

      sl_print_info "Deleting log file ${LOGFILE}"
      rm -f "${LOGFILE}" 2>/dev/null  # beegfs-client does not (always) generate a logfile.
                                      # in this case rm gives an error message, but we don't
                                      # want to see it. - for the same reason no sl_checkerror
                                      # here
      LAST_LOGFILE=${LOGFILE}
   done < "${STATUSFILE}"

   # delete log directory if empty
   if [ -z "${LAST_LOGFILE}" ]
   then
      return 0
   fi

   LOG_DIR=$(dirname -- "${LAST_LOGFILE}")
   # the -d test keeps a log directory that never existed from being reported as an error
   if [ "${LOG_DIR}" != "." ] && [ -d "${LOG_DIR}" ] && [ -z "$(ls -A "${LOG_DIR}")" ]
   then
      # goes through sl_print_info like all other progress messages, so that QUIET is honoured
      sl_print_info "Deleting log directory ${LOG_DIR}"
      rmdir "${LOG_DIR}"
      sl_checkerror "$?" "deleting ${LOG_DIR}"
   fi
}

# The "main" stoplocal function. It runs the shutdown steps in a fixed order:
#     1. unmount the client file systems   (sl_unmount_local_mounts)
#     2. stop the services                 (sl_stop_services)
#     3. delete the log files              (sl_delete_logfiles)
#     4. delete the status file
# Checks the following variables:
#     STATUSFILE        The location of the status file
#     ONLY_STOP_SERVER  If "true", step 1 is skipped, and the status file is not removed.
#     ONLY_UNMOUNT      If "true", step 2 is skipped, and the status file is not removed.
#     DELETE_LOGS       Step 3 is only run if this is "true" and ERROR is not "true" at that
#                       point.
# Modifies:
#     ERROR             Is set to "true" (and an error message is printed) if an error is
#                       encountered in any step.
stoplocal()
{
   sl_print_info "Using status file ${STATUSFILE}"

   # step 1: client mounts (not wanted when only the servers are to be stopped)
   if [[ "${ONLY_STOP_SERVER}" != "true" ]]; then
      sl_unmount_local_mounts
   fi

   # step 2: services (not wanted when only an unmount was requested)
   if [[ "${ONLY_UNMOUNT}" != "true" ]]; then
      sl_stop_services
   fi

   # step 3: log files, only after an error-free run and only on request
   if [[ "${ERROR}" != "true" && "${DELETE_LOGS}" == "true" ]]; then
      sl_delete_logfiles
   fi

   # step 4: the status file is kept after a partial shutdown
   if [[ "${ONLY_UNMOUNT}" == "true" || "${ONLY_STOP_SERVER}" == "true" ]]; then
      return 0
   fi

   rm -f "${STATUSFILE}"
   sl_checkerror $? "deleting the status file"
}

# Calls print_usage_and_exit if the sourcing script defines such a function; does nothing
# otherwise. Private helper of do_stoplocal.
sl_usage_if_available()
{
   if declare -f -F print_usage_and_exit >/dev/null
   then
      print_usage_and_exit
   fi
}

# the user interface / main entry point to stoplocal
# Options:
#   -i FILENAME => Status information filename
#                  (DEFAULT: ${DEFAULT_STATUSFILE})
#   -d          => Delete BeeGFS data on disks
#   -L          => Delete log files after successful shutdown
#   -q          => Suppress "INFO" messages, only print "ERROR"s
#   -c          => "Cleanup": Remove remaining processes and directories of a
#                  potentially unsuccessful shutdown of an earlier beeond
#                  instance. This switch silences the error message when a status
#                  information file is not found or an unmount command fails;
#                  instead, a message is printed (if "INFO" messages are not
#                  suppressed) when a status file DOES exist, because this means
#                  there actually was an instance before that is now being
#                  cleaned up.
#   -u          => ONLY unmount the file systems(*)
#   -s          => ONLY stop non-client services(*)
#
#                  (*) Options -u and -s are mutually exclusive
#                      If -u or -s are given, the status file is not deleted.
# Returns:
#     0 on success (also when the status file is missing during a cleanup run), 1 on invalid
#     options, a missing status file or when ERROR was set to "true" during the shutdown.
do_stoplocal()
{
   local DEFAULT_STATUSFILE=/tmp/beeond.tmp
   local CLIENTSERVICE=beegfs-client
   local DELETE_DATA="false"
   local DELETE_LOGS="false"
   local ONLY_UNMOUNT="false"
   local ONLY_STOP_SERVER="false"
   local PREFERRED_MDS_FILE=/tmp/preferredMds.fod
   local PREFERRED_TARGET_FILE=/tmp/preferredTarget.fod
   local QUIET="false"
   # A CLEANUP value set by the caller is still honoured, but "-c" only affects this invocation
   # instead of leaking into later calls made from the same shell.
   local CLEANUP="${CLEANUP:-false}"

   local ERROR="false"
   local STATUSFILE="${DEFAULT_STATUSFILE}"

   local opt
   local OPTIND=1
   local OPTARG=""
   while getopts ":i:dLusqc" opt "$@"
   do
      case "${opt}" in
         i)
            STATUSFILE=${OPTARG}
         ;;
         d)
            DELETE_DATA="true"
         ;;
         L)
            DELETE_LOGS="true"
         ;;
         u)
            if [ "${ONLY_STOP_SERVER}" = "true" ]
            then
               # OPTARG is empty for options without an argument, the option letter is in opt
               echo "ERROR: Options -s and -${opt} are mutually exclusive" >&2
               sl_usage_if_available
               return 1
            fi
            ONLY_UNMOUNT="true"
         ;;
         s)
            if [ "${ONLY_UNMOUNT}" = "true" ]
            then
               echo "ERROR: Options -u and -${opt} are mutually exclusive" >&2
               sl_usage_if_available
               return 1
            fi
            ONLY_STOP_SERVER="true"
         ;;
         q)
            QUIET="true"
         ;;
         c)
            CLEANUP="true"
         ;;
         \?)
            # with the leading ":" in the option string, OPTARG holds the offending option letter
            echo "ERROR: invalid option -${OPTARG}" >&2
            sl_usage_if_available
            return 1
         ;;
         :)
            echo "ERROR: Option -${OPTARG} requires an argument" >&2
            sl_usage_if_available
            return 1
         ;;
      esac
   done

   # if statusfile can't be found, print a message and exit.
   # (quoted, so that a path containing blanks does not break the test)
   if [ ! -f "${STATUSFILE}" ]
   then
      if [ "${CLEANUP}" = "true" ]
      then
         return 0 # return 0 if we're doing a cleanup so that pdsh doesn't complain
      fi

      # only print message when we're not doing a cleanup run.
      echo "ERROR: Status file ${STATUSFILE} not found." >&2

      # If the user has specified a status file, just give a brief error message and exit.
      # If the user has not specified a status file, give the full usage info - maybe the user
      # didn't know how to specify a status file.
      if [ "${STATUSFILE}" = "${DEFAULT_STATUSFILE}" ]
      then
         sl_usage_if_available
      fi

      return 1
   fi

   # if we're doing a cleanup run, inform the user that a status file was found.
   if [ "${CLEANUP}" = "true" ]
   then
      sl_print_info "Status file found."
   fi

   stoplocal

   if [ "${ERROR}" = "true" ]
   then
      return 1
   fi
   return 0
}
