#!/bin/bash
##############################################################################
#
# condor_mpirun
# $Id: condor_mpirun,v 1.2 2007/12/10 01:26:26 dbrown Exp $
#
# Script to execute mpich mpi jobs under condor and handle process
# eviction in a sane way. Communication between processes is performed
# via various lock files, so the jobs must be run from a directory on
# a shared file system. Hopefully there are no race conditions.
#
# Copyright (C) 2006 Duncan A. Brown
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
##############################################################################

##############################################################################
# path to the version of mpich we should use
MPI=${CONDOR_MPI_PATH:-"/opt/condor/openmpi-1.2.5"}

# amount of time we give the processes to exit after a term signal
KILL_TIMEOUT=$(( 55 * 60 ))

# signal to send an mpi process to tell it to checkpoint and evict
SIG_EVICT="SIGUSR1"

# location of the ssh executables to use
SSH_KEYGEN=/usr/bin/ssh-keygen
SSHD=/usr/sbin/sshd
CONDOR_SSH=/opt/condor/bin/condor_ssh
SSH_BASE_PORT=4000
SSH_MAX_PORT=4100

##############################################################################
# get the number of processes, this process number and the name of this script
_CONDOR_PROCNO=$_CONDOR_PROCNO
_CONDOR_NPROCS=$_CONDOR_NPROCS

# create the names of the lock files
JOBNAME=`basename ${_CONDOR_REMOTE_SPOOL_DIR}`
JOBNAME=`expr substr ${JOBNAME} 1 $((\`expr length ${JOBNAME}\` - 1))`
MASTER_LOCKFILE=${JOBNAME}0.lock
WORKER_LOCKFILE=${JOBNAME}${_CONDOR_PROCNO}.lock

##############################################################################
# replacement functions for echo
SCRIPTNAME=`basename ${0}`
HOSTNAME=`hostname`

error_echo() {
  echo -n "`date +'%D %H:%M:%S'` " >&2
  echo -n "${SCRIPTNAME}[${HOSTNAME}][${_CONDOR_PROCNO}]: ERROR: " >&2
  echo ${1} >&2
}

verbose_echo() {
  if [ ${VERBOSE} -eq 1 ] ; then
    echo -n "`date +'%D %H:%M:%S'` "
    echo -n "${SCRIPTNAME}[${HOSTNAME}][${_CONDOR_PROCNO}]: "
    echo ${1}
  fi
}

debug_echo() {
  if [ ${DEBUG} -eq 1 ] ; then
    echo -n "`date +'%D %H:%M:%S'` "
    echo -n "${SCRIPTNAME}[${HOSTNAME}][${_CONDOR_PROCNO}]: "
    echo ${1}
  fi
}

##############################################################################
# function called if we get a SIGTERM while in the initialization process
exit_on_term() {
  # make sure the workers are not hanging around waiting for a lockfile
  if [ ${_CONDOR_PROCNO} -eq 0 ]
  then
    echo ${HOSTNAME} > ${MASTER_LOCKFILE}
    sleep 5
    rm -f ${MASTER_LOCKFILE}
  fi
  error_echo "received SIGTERM before mpirun started, exiting with status 126"
  exit 126
}

##############################################################################
# if we get a SIGTERM before we've started mpirun give up
trap exit_on_term 15
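##############################################################################
# Overview of the coordination protocol implemented below (a summary only;
# the code that follows is the authoritative description of the behaviour):
#
#   1. every node writes "<hostname> <ssh port> <user> <cwd>" to its own
#      ${JOBNAME}<procno>.vm file once its sshd is listening
#   2. the master (proc 0) gathers the .vm files into a node file, builds an
#      mpirun machine file from it, and creates the master lock file to tell
#      the workers that mpirun is about to start
#   3. each worker answers by creating its own lock file and then sleeps
#      until the master lock file is removed
#   4. on eviction (SIGTERM) the first node to notice writes its hostname to
#      the evict file, sends ${SIG_EVICT} to the mpi process on that node,
#      and removes the master lock file so that everyone shuts down
#
# All of these files live in the job's working directory, which is why the
# job must run from a shared file system.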
##############################################################################
# create the temporary file names needed by this script
# these files must exist on a shared file system so that
# the master and workers can communicate with each other
PROCFILE=${JOBNAME}${_CONDOR_PROCNO}.vm
NODEFILE=${JOBNAME}0.vmfile
MACHINEFILE=${JOBNAME}0.machines
EVICTFILE=${JOBNAME}0.evict
EXECNAME=${JOBNAME}0.exe
SSHKEY=${JOBNAME}0.key
LOGDIR="${JOBNAME}0.`date +%Y%m%d%H%M%S`.log"
JOB_EVICTED=0

##############################################################################
# function to print command line usage of this script
usage() {
  echo "${SCRIPTNAME} [OPTIONS] EXECUTABLE [-- ARGUMENTS]"
  echo "Run mpi EXECUTABLE from within the Condor parallel universe with optional"
  echo "command-line arguments given by ARGUMENTS."
  echo ""
  echo "The following OPTIONS are passed to the condor_mpirun script:"
  echo ""
  echo "  -v, --verbose          print progress messages to stdout"
  echo "  -d, --debug            print debugging messages to stdout"
  echo "  -s, --stdout OUTFILE   redirect stdout of mpirun to OUTFILE"
  echo "  -e, --stderr ERRFILE   redirect stderr of mpirun to ERRFILE"
  echo "  -l, --ld-libs PATH     prepend PATH to LD_LIBRARY_PATH"
  echo "  -k, --keep-logs        try to preserve any log files on exit"
  echo "  -h, --help             print this message"
  echo ""
  echo "If the user's mpi program has command line options or arguments, follow"
  echo "the EXECUTABLE name with a double hyphen (--) and then add the options"
  echo "to the mpi program. All options and arguments after the double hyphen are"
  echo "ignored by condor_mpirun and passed to the user's program."
  echo ""
  echo "The command EXECUTABLE must be an MPI compiled executable suitable for"
  echo "running with mpirun with the ch_p4 device with tcp and shared memory IPC."
  echo ""
  echo "If either the --stdout or --stderr option is given, then the stdout"
  echo "and/or stderr of the mpirun command are appended to the given filenames."
  echo ""
  echo "If the keep-logs flag is set, any files ending in out or err are copied"
  echo "into a new directory called log.%Y%m%d%H%M%S at key intervals during"
  echo "execution and when the script exits."
  echo ""
}

##############################################################################
# parse the command line arguments

# default options
VERBOSE=0
DEBUG=0
MPI_STDOUT=""
MPI_STDERR=""
KEEP_LOGS=0

ARGS=`getopt --name ${0} --options vds:e:l:kh \
  --longoptions verbose,debug,stdout:,stderr:,ld-libs:,keep-logs,help -- "$@"`
if [ ${?} != 0 ]
then
  usage >&2
  error_echo "could not parse command line arguments"
  exit 126
fi
eval set -- "${ARGS}"

for ARG
do
  case "${ARG}" in
    -v|--verbose) VERBOSE=1; shift;;
    -d|--debug) DEBUG=1; shift;;
    -h|--help) usage ; exit 0;;
    -s|--stdout) MPI_STDOUT=${2}; shift; shift;;
    -e|--stderr) MPI_STDERR=${2}; shift; shift;;
    -l|--ld-libs) LDLIBS=${2}; shift; shift;;
    -k|--keep-logs) KEEP_LOGS=1; shift;;
    --) shift; break;;
  esac
done

# get the executable name and check that it exists
EXECUTABLE=${1} ; shift
if [ ! -r ${EXECUTABLE} ]
then
  error_echo "${EXECUTABLE} does not exist"
  exit 126
fi
if [ ! -x ${EXECUTABLE} ]
then
  error_echo "${EXECUTABLE} is not executable"
  exit 127
fi
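##############################################################################
# Example usage (illustrative only: the paths, program name and submit file
# settings below are hypothetical and depend on the local Condor setup).
# This script is intended to be the executable of a parallel universe
# submit description, for instance:
#
#   universe              = parallel
#   executable            = /opt/condor/bin/condor_mpirun
#   arguments             = --verbose --stdout mpi.out --stderr mpi.err ./my_mpi_app -- --input data.dat
#   machine_count         = 8
#   initialdir            = /shared/workdir
#   should_transfer_files = NO
#   log                   = condor.log
#   output                = node.out.$(NODE)
#   error                 = node.err.$(NODE)
#   queue
#
# Condor starts one copy of this script on each of the machine_count slots
# and sets _CONDOR_PROCNO, _CONDOR_NPROCS and _CONDOR_REMOTE_SPOOL_DIR in
# the job's environment.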
# set and check libraries
if [ -n "${LDLIBS}" ]
then
  LD_LIBRARY_PATH=${LDLIBS}:${LD_LIBRARY_PATH}
  export LD_LIBRARY_PATH
fi

# pass the rest of the arguments to the executable
EXECUTABLE_ARGS=$@

##############################################################################
# function to clean up any processes that may have been started by mpirun
cleanup_procs() {
  verbose_echo "cleanup_procs() called for ${EXECNAME}"

  # send a SIGTERM to all the processes and sleep for 5 seconds
  verbose_echo "sending all ${EXECNAME} processes the TERM signal"
  killall -TERM ${EXECNAME} 2>/dev/null
  sleep 5

  # send the processes a kill to make sure they are gone
  verbose_echo "sending all ${EXECNAME} processes the KILL signal"
  killall -KILL ${EXECNAME} 2>/dev/null

  return 0
}

##############################################################################
# function to safely evict mpi jobs on receiving a sigterm
evict_job() {
  # ignore any subsequent sigterms
  trap "" 15
  verbose_echo "uninstalled eviction signal handler"
  verbose_echo "caught SIGTERM, evict_job() called for ${EXECNAME}"

  # set evicted variable to tell calling script that there has been an eviction
  JOB_EVICTED=1

  # tell the master that we should continue to wait for mpirun to exit
  MPIRUN_DONE=0

  # sleep the number of seconds of the procnum to avoid an eviction race
  verbose_echo "sleeping for ${_CONDOR_PROCNO} seconds to avoid a race"
  sleep ${_CONDOR_PROCNO}

  verbose_echo "checking for ${EVICTFILE}: `cat ${EVICTFILE} 2>&1`"
  if cat ${EVICTFILE} &>/dev/null
  then
    # there is an eviction in progress so do nothing
    verbose_echo "ignoring SIGTERM, eviction in progress on `cat ${EVICTFILE}`"
    return 0
  fi

  # let master and workers know that we have evicted a job
  echo ${HOSTNAME} > ${EVICTFILE}
  verbose_echo "created ${EVICTFILE}"

  # figure out which pid is the child of ssh or mpirun to avoid killing
  # the mpich comm process, as this makes things go to hell
  for CLUSTER_PID in `pgrep -f ${EXECNAME}`
  do
    PARENT_NAME=`ps -p \`ps -p ${CLUSTER_PID} -o ppid=\` -o comm=`
    verbose_echo "${CLUSTER_PID} is child of ${PARENT_NAME}"
    if [ "${PARENT_NAME}" == "mpirun" ] || [ "${PARENT_NAME}" == "sshd" ] || [ "${PARENT_NAME}" == "orted" ]
    then
      EXEC_PID=${CLUSTER_PID}
    fi
  done

  # and send the lead process the eviction signal
  if [ -z "${EXEC_PID}" ]
  then
    verbose_echo "could not determine pid of ${EXECNAME}"
  else
    verbose_echo "sending ${SIG_EVICT} to ${EXECNAME} process id ${EXEC_PID}"
    kill -s ${SIG_EVICT} ${EXEC_PID}
  fi

  # release the workers from their obligations
  rm -f ${MASTER_LOCKFILE}

  return 0
}

##############################################################################
# function to start sshd server
start_sshd() {
  # start an ssh daemon on the next available port
  PORT=${SSH_BASE_PORT}
  SSHD_OUT=${JOBNAME}${_CONDOR_PROCNO}.sshd
  SSHD_RUNNING=0

  while [ ${SSHD_RUNNING} -eq 0 ] && [ ${PORT} -lt ${SSH_MAX_PORT} ]
  do
    verbose_echo "attempting to start ${SSHD} on port ${PORT}"
    ${SSHD} -p${PORT} -oAuthorizedKeysFile=`pwd`/${SSHKEY}.pub \
      -h`pwd`/${SSHKEY} -De -f/dev/null -oStrictModes=no \
      -oPidFile=/dev/null -oAcceptEnv=_CONDOR \
      > ${SSHD_OUT} 2>&1 &
    SSH_PID=${!}
    sleep 3
    if grep "Server listening" ${SSHD_OUT} > /dev/null 2>&1
    then
      verbose_echo "${SSHD} running on ${HOSTNAME}:${PORT}"
      SSHD_RUNNING=1
      rm -f ${SSHD_OUT}
    else
      # sshd seems to have gone missing. either it could not open
      # the port or something has gone horribly wrong.
      if grep "Cannot bind any address" ${SSHD_OUT} > /dev/null 2>&1
      then
        # increment the port number and try again
        verbose_echo "port ${PORT} is in use, cannot bind sshd"
        PORT=$(( ${PORT} + 1 ))
      else
        error_echo "could not start ${SSHD} on port ${PORT}"
        error_echo "killing ${SSHD} process ${SSH_PID}"
        kill -9 ${SSH_PID} >/dev/null 2>&1
      fi
    fi
  done
}

##############################################################################
#
# start of main script

verbose_echo '$Id: condor_mpirun,v 1.2 2007/12/10 01:26:26 dbrown Exp $ starting'

# output the names of all the files used for communication
debug_echo "JOBNAME is ${JOBNAME}"
debug_echo "PROCFILE is ${PROCFILE}"
debug_echo "NODEFILE is ${NODEFILE}"
debug_echo "MACHINEFILE is ${MACHINEFILE}"
debug_echo "MASTER_LOCKFILE is ${MASTER_LOCKFILE}"
debug_echo "WORKER_LOCKFILE is ${WORKER_LOCKFILE}"
debug_echo "EVICTFILE is ${EVICTFILE}"
debug_echo "EXECNAME is ${EXECNAME}"
debug_echo "VERBOSE = $VERBOSE"
debug_echo "DEBUG = $DEBUG"
debug_echo "MPI_STDOUT = $MPI_STDOUT"
debug_echo "MPI_STDERR = $MPI_STDERR"
debug_echo "LOGDIR = $LOGDIR"

if [ ${_CONDOR_PROCNO} -gt 0 ]
then
  ############################################################################
  # give the master a little time to get ready
  verbose_echo "sleeping for 10 seconds"
  sleep 10

  # wait until the master has created the ssh keys
  until cat ${SSHKEY} &>/dev/null && cat ${SSHKEY}.pub &>/dev/null
  do
    cat ${SSHKEY} ${SSHKEY}.pub >/dev/null 2>&1
    debug_echo "waiting for master to create ${SSHKEY}"
    sleep 1
  done

  start_sshd
  if [ ${SSHD_RUNNING} -eq 0 ]
  then
    WORKER_EXITCODE=126
    verbose_echo "exiting with code ${WORKER_EXITCODE}"
    exit ${WORKER_EXITCODE}
  fi

  # echo the hostname and ssh port into the procfile for this process
  # which tells the master that we are ready to go for mpirun
  echo ${HOSTNAME} ${PORT} `whoami` `pwd` > ${PROCFILE}

  # worker sleeps until the master lockfile is created
  until cat ${MASTER_LOCKFILE} &>/dev/null
  do
    debug_echo "waiting for master to create lockfile"
    sleep 1
  done

  # master is ready to run mpirun, so install the eviction signal handler
  trap evict_job 15
  verbose_echo "installed eviction signal handler"

  # create the worker lockfile to tell the master we are ready
  echo ${HOSTNAME} > ${WORKER_LOCKFILE}

  # worker sleeps until the master lockfile is deleted
  while [ -s ${MASTER_LOCKFILE} ]
  do
    debug_echo "master lock file exists"
    sleep 1
  done

  # ignore any subsequent sigterms as we are exiting
  trap "" 15
  verbose_echo "uninstalled eviction signal handler"
  verbose_echo "master lock file has been deleted"

  if [ ${JOB_EVICTED} -eq 1 ]
  then
    verbose_echo "exiting due to eviction"
    WORKER_EXITCODE=143
  else
    WORKER_EXITCODE=0
  fi

  # give the processes a chance to exit cleanly
  verbose_echo "waiting for ${EXECNAME} processes to exit"
  KILL_TIMER=${KILL_TIMEOUT}
  while pgrep -f ${EXECNAME} &>/dev/null && [ ${KILL_TIMER} -gt 0 ]
  do
    if [ $(( ${KILL_TIMER} % 5 )) -eq 0 ]
    then
      verbose_echo "giving ${EXECNAME} ${KILL_TIMER} more seconds to exit"
    fi
    sleep 1
    KILL_TIMER=$(( ${KILL_TIMER} - 1 ))
  done

  # stop the sshd process on this node
  verbose_echo "sending TERM signal to ${SSHD} pid ${SSH_PID}"
  kill -TERM ${SSH_PID}

  # delete the worker lockfile to tell the master we are done
  verbose_echo "deleting ${WORKER_LOCKFILE}"
  rm -f ${WORKER_LOCKFILE}

  # exit with code zero or the eviction code
  verbose_echo "exiting with code ${WORKER_EXITCODE}"
  exit ${WORKER_EXITCODE}

else
  ############################################################################
  # master is responsible for running the mpi job and reporting
  # success, failure or eviction to condor
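  #
  # Master sequence (summary of the code that follows): check for stale files
  # from a previous run, generate an ssh key and start sshd, wait for the
  # worker .vm files, build the mpirun machine file, run mpirun in the
  # background and wait for it (re-entering the wait after an eviction
  # SIGTERM), then remove the master lock file and wait for the workers to
  # clean up before exiting with mpirun's exit status.
  #
  # For reference, each ${JOBNAME}<procno>.vm line written by a node has the
  # form "<hostname> <ssh port> <username> <working directory>", and the
  # machine file handed to mpirun has one "<hostname> slots=<n> max-slots=<n>"
  # line per unique host. The hostnames and counts are whatever the workers
  # report at run time.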
  # if an eviction file, a node file, an executable or a machine file exists
  # at this stage, remove these files and exit with code 143 so that
  # the job gets re-run
  if cat ${NODEFILE} >/dev/null 2>&1 || \
    cat ${MACHINEFILE} >/dev/null 2>&1 || \
    cat ${EVICTFILE} >/dev/null 2>&1 || \
    cat ${EXECNAME} >/dev/null 2>&1
  then
    verbose_echo "node, machine, evict or exec files exist"
    # clean up all temporary files
    for file in ${MASTER_LOCKFILE} ${EXECNAME} ${MACHINEFILE} ${EVICTFILE} \
      ${JOBNAME}*.vm ${NODEFILE} ${SSHKEY} ${SSHKEY}.pub ${SSHKEY}.hosts
    do
      verbose_echo "cleaning up ${file}"
      rm -f ${file}
    done
    verbose_echo "exiting with status code 143"
    exit 143
  fi

  # make a directory to preserve the log files
  if [ ${KEEP_LOGS} -eq 1 ]
  then
    mkdir -p ${LOGDIR}
  fi

  # create an ssh key for communication betwixt the nodes
  ${SSH_KEYGEN} -q -f ${SSHKEY} -t rsa -N ''

  # start the ssh server
  start_sshd
  if [ ${SSHD_RUNNING} -eq 0 ]
  then
    rm -f ${JOBNAME}*.vm ${NODEFILE} ${JOBNAME}*.lock ${JOBNAME}*.evict \
      ${SSHKEY} ${SSHKEY}.pub
    error_echo "could not start sshd server, exiting with code 126"
    exit 126
  fi

  # echo the hostname into the procfile for this process
  echo ${HOSTNAME} ${PORT} `whoami` `pwd` > ${PROCFILE}

  # wait until all the workers have created procfiles or 120 seconds have passed
  verbose_echo "waiting for workers to report hostnames and ssh ports"
  done=0
  count=120
  while [ ${done} -eq 0 ] && [ ${count} -gt 0 ]
  do
    debug_echo "waiting ${count} more seconds for workers"
    sleep 1
    cat ${JOBNAME}*.vm > ${NODEFILE}
    lines=`wc -l ${NODEFILE} | awk '{print $1}'`
    if [ ${lines} -eq ${_CONDOR_NPROCS} ]
    then
      done=1
    else
      count=$(( ${count} - 1 ))
    fi
  done

  # if we haven't heard back from all the workers, give up
  if [ ${done} -eq 0 ]
  then
    rm -f ${JOBNAME}*.vm ${NODEFILE} ${JOBNAME}*.lock ${JOBNAME}*.evict \
      ${SSHKEY} ${SSHKEY}.pub
    error_echo "could not get hostnames from workers, exiting with code 126"
    exit 126
  fi

  # turn the vm file into a machinefile for mpirun
  verbose_echo "constructing mpirun machine file"
  for c in `awk '{print $1}' ${NODEFILE} | sort | uniq`
  do
    n=`grep "${c} " ${NODEFILE} | wc -l | awk '{print $1}'`
    echo "${c} slots=${n} max-slots=${n}" >> ${MACHINEFILE}
    # echo ${c}:${n} >> ${MACHINEFILE}
  done

  # turn off the glibc memory leak detection as it interferes with mpich
  MALLOC_CHECK_=0
  export MALLOC_CHECK_

  # copy the executable to a uniquely named program
  cp ${EXECUTABLE} ${EXECNAME}
  chmod u+x ${EXECNAME}

  # tell mpirun to use the special ssh that figures out what the port is
  verbose_echo "Setting P4_RSHCOMMAND to ${CONDOR_SSH}"
  P4_RSHCOMMAND=${CONDOR_SSH}
  OMPI_MCA_pls_rsh_agent=${CONDOR_SSH}
  P4_CONDOR_NODEFILE=`pwd`/${NODEFILE}
  P4_CONDOR_SSH_KEY=`pwd`/${SSHKEY}
  export P4_RSHCOMMAND OMPI_MCA_pls_rsh_agent
  export P4_CONDOR_NODEFILE P4_CONDOR_SSH_KEY

  # create the master lockfile to tell the workers
  # to install their eviction signal handlers
  echo ${HOSTNAME} > ${MASTER_LOCKFILE}
  sleep 5
  trap evict_job 15
  verbose_echo "installed eviction signal handler"

  if [ ${JOB_EVICTED} -eq 0 ]
  then
    # run the mpirun command for the specified command using the machinefile
    MPICMD="${MPI}/bin/mpirun --mca btl_tcp_if_include eth0 --universe ${JOBNAME} --no-daemonize --np ${_CONDOR_NPROCS} --machinefile ${MACHINEFILE} ${EXECNAME} ${EXECUTABLE_ARGS}"
    verbose_echo "executing ${MPICMD}"
    if [ ! -z "${MPI_STDOUT}" ] && [ ! -z "${MPI_STDERR}" ]
    then
      # redirect both stdout and stderr of mpirun
      verbose_echo "redirecting stdout to ${MPI_STDOUT} and stderr to ${MPI_STDERR}"
      eval ${MPICMD} 1>>${MPI_STDOUT} 2>>${MPI_STDERR} &
    elif [ ! -z "${MPI_STDOUT}" ] && [ -z "${MPI_STDERR}" ]
    then
      # redirect only stdout but not stderr of mpirun
      verbose_echo "redirecting stdout to ${MPI_STDOUT}"
      eval ${MPICMD} 1>>${MPI_STDOUT} &
    elif [ -z "${MPI_STDOUT}" ] && [ ! -z "${MPI_STDERR}" ]
    then
      # redirect only stderr but not stdout of mpirun
      verbose_echo "redirecting stderr to ${MPI_STDERR}"
      eval ${MPICMD} 2>>${MPI_STDERR} &
    else
      # default case is no redirection
      eval ${MPICMD} &
    fi
    MPIJOB=${!}

    # wait for mpirun to complete, making sure we go back to the wait if
    # there has been an eviction SIGTERM signal sent to the master
    MPIRUN_DONE=0
    while [ ${MPIRUN_DONE} -eq 0 ]
    do
      MPIRUN_DONE=1
      verbose_echo "waiting for mpirun pid ${MPIJOB} to complete"
      wait ${MPIJOB}
    done

    # save the mpirun exit code for return to condor
    MPIRUN_EXITCODE=${?}
    verbose_echo "mpirun ${EXECNAME} exited with status code ${MPIRUN_EXITCODE}"
  else
    error_echo "eviction handler called before mpirun, exiting with code 126"
    exit 126
  fi

  # don't let sigterm kick us into the evict function now we're exiting
  trap "" 15
  verbose_echo "uninstalled eviction signal handler"

  if [ ${MPIRUN_EXITCODE} -eq 143 ] && [ ${JOB_EVICTED} -eq 0 ]
  then
    # make sure we only return exit code 143 if there has been an eviction
    error_echo "overriding mpirun exit code 143 to prevent resubmission"
    MPIRUN_EXITCODE=1
  fi

  # sleep for 5 seconds, then check to see if a worker evicted this job
  sleep 5
  verbose_echo "checking for ${EVICTFILE}: `cat ${EVICTFILE} 2>&1`"
  if cat ${EVICTFILE} &>/dev/null
  then
    # exit with status 128 + 15 to tell condor that there was an eviction
    verbose_echo "exiting due to eviction on worker `cat ${EVICTFILE} 2>&1`"
    MASTER_EXITCODE=143
  else
    MASTER_EXITCODE=${MPIRUN_EXITCODE}
  fi

  # release the workers from their obligations and trigger them to cleanup
  rm -f ${MASTER_LOCKFILE}

  # clean up temporary files except for the eviction file
  for file in ${EXECNAME} ${MACHINEFILE} \
    ${JOBNAME}*.vm ${NODEFILE} ${SSHKEY} ${SSHKEY}.pub ${SSHKEY}.hosts
  do
    verbose_echo "cleaning up ${file}"
    rm -f ${file}
  done

  # the master should not exit until all the workers have exited to
  # make sure they have all had a chance to run cleanup_procs() and
  # kill any rogue mpi processes that may be hanging around. The master
  # waits 60 seconds longer than the kill timer on the workers, then
  # gives up and lets condor kill the condor_mpirun scripts on the
  # workers. There is not much we can do if there are still mpi
  # processes hanging around after this, as condor won't kill them.
  verbose_echo "waiting for workers to clean up"
  KILL_TIMER=$(( ${KILL_TIMEOUT} + 60 ))
  while find ${JOBNAME}*.lock &>/dev/null && [ ${KILL_TIMER} -gt 0 ]
  do
    if [ $(( ${KILL_TIMER} % 5 )) -eq 0 ]
    then
      verbose_echo "waiting ${KILL_TIMER} more seconds for workers to clean up"
    fi
    sleep 1
    KILL_TIMER=$(( ${KILL_TIMER} - 1 ))
  done

  # kill the ssh daemon on the master
  verbose_echo "sending TERM signal to ${SSHD} pid ${SSH_PID}"
  kill -TERM ${SSH_PID}

  # remove the eviction file
  verbose_echo "removing ${EVICTFILE}"
  rm -f ${EVICTFILE}

  # exit with whatever status code mpirun terminated with
  verbose_echo "exiting with status code ${MASTER_EXITCODE}"

  # make a copy of the current log files
  if [ ${KEEP_LOGS} -eq 1 ]
  then
    verbose_echo "backing up logfiles to ${LOGDIR}"
    cp *out *err ${LOGDIR}
  fi

  exit ${MASTER_EXITCODE}
fi