[Users] Fwd: need help running simulation on slurm
Maya Baireddy
maya.baireddy at gmail.com
Sat Nov 30 17:44:44 CST 2024
Hello Everyone,
I am new to ETK. I am working on my high school research project, trying to
run a BNS merger simulation on the Amarel supercomputer at my local
university.
Could you please help me start my simulation on SLURM? I have followed
steps 1-5 of the ETK gallery example for the BNS simulation, but I am not able to
proceed to successfully create a machine to run the simulation.
I ran the following steps:
/home/sb1554/BNS/simfactory/bin/sim create bns --parfile
/home/sb1554/BNS/bns.par --machine slurmbns
srun bns.sh -o slurm.bns.%N.%j.out
and got the error:
*** An error occurred in MPI_Init_thread
*** on a NULL communicator
*** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
*** and potentially your MPI job)
I am attaching my machine definition, submit script, run script, and log files.
I would appreciate any pointers from you, or if you could point me to the
right person.
I was trying to post this on the ETK forum, but I need one credit to post.
Thank you,
Maya
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.einsteintoolkit.org/pipermail/users/attachments/20241130/56593dd9/attachment.htm>
-------------- next part --------------
+ set -e
+ cd /home/sb1554/simulations/bns/output-0000-active
+ echo Checking:
+ pwd
+ hostname
+ date
+ echo Environment:
+ export CACTUS_NUM_PROCS=1
+ CACTUS_NUM_PROCS=1
+ export CACTUS_NUM_THREADS=8
+ CACTUS_NUM_THREADS=8
+ export GMON_OUT_PREFIX=gmon.out
+ GMON_OUT_PREFIX=gmon.out
+ export OMP_NUM_THREADS=8
+ OMP_NUM_THREADS=8
+ sort
+ env
+ echo Starting:
++ date +%s
+ export CACTUS_STARTTIME=1732921316
+ CACTUS_STARTTIME=1732921316
+ '[' 1 = 1 ']'
+ '[' 0 -eq 0 ']'
+ /home/sb1554/simulations/bns/SIMFACTORY/exe/cactus_sim -L 3 /home/sb1554/simulations/bns/output-0000/bns.par
*** An error occurred in MPI_Init_thread
*** on a NULL communicator
*** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
*** and potentially your MPI job)
[slepner085.amarel.rutgers.edu:07624] Local abort before MPI_INIT completed completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed!
-------------- next part --------------
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::Creating simulation bns
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::Simulation directory: /home/sb1554/simulations/bns
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::Simulation Properties:
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::[properties]
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::machine = slurmbns
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::simulationid = simulation-bns-slurmbns-amarel1.amarel.rutgers.edu-sb1554-2024.11.29-18.01.08-16276
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::sourcedir = /home/sb1554/BNS
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::configuration = sim
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::configid = config-sim-slepner088.amarel.rutgers.edu-cache-home-sb1554-BNS
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::buildid = build-sim-slepner088.amarel.rutgers.edu-sb1554-2024.11.15-02.32.38-2196
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::testsuite = False
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::executable = /home/sb1554/simulations/bns/SIMFACTORY/exe/cactus_sim
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::optionlist = /home/sb1554/simulations/bns/SIMFACTORY/cfg/OptionList
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::submitscript = /home/sb1554/simulations/bns/SIMFACTORY/run/SubmitScript
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::runscript = /home/sb1554/simulations/bns/SIMFACTORY/run/RunScript
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::parfile = /home/sb1554/simulations/bns/SIMFACTORY/par/bns.par
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::
[LOG:2024-11-29 18:01:08] restart.create(simulationName, parfile)::Simulation bns created
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::Creating new properties because this is an independant run, not a run following a submit
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::Determined the following properties
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::[properties]
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::machine = slurmbns
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::simulationid = simulation-bns-slurmbns-amarel1.amarel.rutgers.edu-sb1554-2024.11.29-18.01.08-16276
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::sourcedir = /home/sb1554/BNS
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::configuration = sim
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::configid = config-sim-slepner088.amarel.rutgers.edu-cache-home-sb1554-BNS
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::buildid = build-sim-slepner088.amarel.rutgers.edu-sb1554-2024.11.15-02.32.38-2196
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::testsuite = False
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::executable = /home/sb1554/simulations/bns/SIMFACTORY/exe/cactus_sim
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::optionlist = /home/sb1554/simulations/bns/SIMFACTORY/cfg/OptionList
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::submitscript = /home/sb1554/simulations/bns/SIMFACTORY/run/SubmitScript
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::runscript = /home/sb1554/simulations/bns/SIMFACTORY/run/RunScript
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::parfile = /home/sb1554/simulations/bns/SIMFACTORY/par/bns.par
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::nodes = 1
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::procsrequested = 8
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::ppn = 8
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::numprocs = 1
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::nodeprocs = 1
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::procs = 1
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::numthreads = 8
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::ppnused = 8
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::numsmt = 1
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::hostname = amarel1.amarel.rutgers.edu
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::user = sb1554
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::memory = 124000
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::cpufreq =
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::pbsSimulationName= bns-0000
[LOG:2024-11-29 18:01:56] restart.userRun(simulationName)::
[LOG:2024-11-29 18:01:56] self.makeActive()::Simulation bns with restart-id 0 has been made active
[LOG:2024-11-29 18:01:56] self.run(debug)::Prepping for execution/run
[LOG:2024-11-29 18:01:56] checkpointing = self.PrepareCheckpointing(recover_id)::PrepareCheckpointing: max_restart_id: -1
[LOG:2024-11-29 18:01:56] self.run(debug)::Defined substitution properties for execution/run
[LOG:2024-11-29 18:01:56] self.run(debug)::{'MACHINE': 'slurmbns', 'SOURCEDIR': '/home/sb1554/BNS', 'SIMULATION_NAME': 'bns', 'SHORT_SIMULATION_NAME': 'bns-0000', 'SIMULATION_ID': 'simulation-bns-slurmbns-amarel1.amarel.rutgers.edu-sb1554-2024.11.29-18.01.08-16276', 'RESTART_ID': 0, 'SCRIPTFILE': '/home/sb1554/simulations/bns/SIMFACTORY/run/SubmitScript', 'SUBMITSCRIPT': '/home/sb1554/simulations/bns/SIMFACTORY/run/SubmitScript', 'CONFIGURATION': 'sim', 'EXECUTABLE': '/home/sb1554/simulations/bns/SIMFACTORY/exe/cactus_sim', 'PARFILE': '/home/sb1554/simulations/bns/output-0000/bns.par', 'RUNDIR': '/home/sb1554/simulations/bns/output-0000', 'HOSTNAME': 'amarel1.amarel.rutgers.edu', 'USER': 'sb1554', 'ALLOCATION': 'NO_ALLOCATION', 'NODES': 1, 'PROCS_REQUESTED': 8, 'PPN': 8, 'NUM_PROCS': 1, 'NODE_PROCS': 1, 'PROCS': 1, 'NUM_THREADS': 8, 'PPN_USED': 8, 'NUM_SMT': 1, 'MEMORY': '124000', 'CPUFREQ': None, 'RUNDEBUG': 0}
[LOG:2024-11-29 18:01:56] self.run(debug)::Executing run command: /home/sb1554/simulations/bns/output-0000/SIMFACTORY/RunScript
[LOG:2024-11-29 18:05:43] restart.load(simulationName, active_id)::For simulation bns, loaded restart id 0, long restart id 0000
[LOG:2024-11-29 18:29:40] ret = restart.load(sim, restart_id)::For simulation bns, loaded restart id 0, long restart id 0000
[LOG:2024-11-29 18:29:40] ret = restart.load(sim, restart_id)::For simulation bns, loaded restart id 0, long restart id 0000
-------------- next part --------------
1 #! /bin/bash
2
3 export SIMFACTORY=/home/sb1554/Cactus/simafactory/bin
4 export SOURCE_DIR=/home/sb1554/Cactus
5 export CACTUS_PATH=/home/sb1554/BNS
6
7 #BATCH --partition=main # Partition (job queue)
8
9 #SBATCH --requeue # Return job to the queue if preempted
10
11 #SBATCH --job-name=bnsnew # Assign a short name to your job
12
13 #SBATCH --nodes=1 # Number of nodes you require
14
15 #SBATCH --ntasks=1 # Total # of tasks across all nodes
16
17 #SBATCH --ntasks-per-node=1
18
19 #SBATCH --cpus-per-task=1 # Cores per task (>1 if multithread tasks)
20
21 #SBATCH --mem=124000 # Real memory (RAM) required (MB)
22
23 #SBATCH --time=70:00:00 # Total run time limit (HH:MM:SS)
24
25 #SBATCH --output=slurm.bns.%N.%j.out # STDOUT output file
26
27 #SBATCH --error=slurm.bns.%N.%j.err # STDERR output file (optional)
28
29
30 module use /projects/community/modulefiles
31 #module load gcc/10.2.0/openmpi/4.0.5-bz186
32 module load gcc/11.2/openmpi/4.1.3-kholodvl
33 module load libnl/3.2.25-sb1554
34 module load rdma-core/54.0-sb1554
35 module load gsl/2.5-bd387
36
37
38
39 cd /home/sb1554/BNS
40 /home/sb1554/BNS/simfactory/mdb/runscripts/slurmbns.run
-------------- next part --------------
[slurmbns]
# This machine description file is used internally by simfactory as a template
# during the sim setup and sim setup-silent commands
# Edit at your own risk
# Machine description
nickname = slurmbns
name = slurmbns
location = LSU
description = CCT
status = production
# Access to this machine
hostname = amarel1.amarel.rutgers.edu
aliaspattern = ^\w+(\.amarel\.rutgers\.edu)?$
# Source tree management
sourcebasedir = /home/sb1554
optionlist = generic.cfg
submitscript = slurmbns.sub
runscript = slurmbns.run
make = make -j@MAKEJOBS@
basedir = /home/sb1554/simulations
ppn = 8
max-num-threads = 128
num-threads = 8
memory = 124000
nodes = 2
num-smt = 1
#procs = 16
submit = sbatch /home/sb1554/BNS/simfactory/mdb/runscripts/slurmbns.run
getstatus = squeue -j @JOB_ID@
# need to kill the whole set of processes descending from @JOB_ID@, not just the
# (simfactory) top-level process
stop = scancel @JOB_ID@
submitpattern = 'Submitted batch job (\d+)'
statuspattern = '@JOB_ID@ '
queuedpattern = ' PD '
queue = checkpt
runningpattern = ' (CF|CG|R|TO) '
holdingpattern = '\(JobHeldUser\)'
[sb1554@amarel1 machines]$
exechostpattern = (.*)
stdout = cat @SIMULATION_NAME@.out
stderr = cat @SIMULATION_NAME@.err
stdout-follow = sleep 10 ; sattach @JOB_ID@.0
# stdout-follow = while ! scontrol >/dev/null wait_job @JOB_ID@ ; do sleep 5 ; done ; tail -n 100 -f @SIMULATION_NAME@.out @SIMULATION_NAME@.err
maxwalltime = 72:00:00
disabled-thorns = CactusUtils/SystemTopology
[slurmbns]
# This machine description file is used internally by simfactory as a template
# during the sim setup and sim setup-silent commands
# Edit at your own risk
# Machine description
nickname = slurmbns
name = slurmbns
location = LSU
description = CCT
status = production
# Access to this machine
hostname = amarel1.amarel.rutgers.edu
aliaspattern = ^\w+(\.amarel\.rutgers\.edu)?$
# Source tree management
sourcebasedir = /home/sb1554
optionlist = generic.cfg
submitscript = slurmbns.sub
runscript = slurmbns.run
make = make -j@MAKEJOBS@
basedir = /home/sb1554/simulations
ppn = 8
max-num-threads = 128
num-threads = 8
memory = 124000
nodes = 33
submit = sbatch /home/sb1554/BNS/simfactory/mdb/runscripts/slurmbns.run
getstatus = squeue -j @JOB_ID@
# need to kill the whole set of processes descending from @JOB_ID@, not just the
# (simfactory) top-level process
stop = scancel @JOB_ID@
submitpattern = 'Submitted batch job (\d+)'
statuspattern = '@JOB_ID@ '
queuedpattern = ' PD '
queue = checkpt
runningpattern = ' (CF|CG|R|TO) '
holdingpattern = '\(JobHeldUser\)'
exechost = hostname -s
exechostpattern = (.*)
stdout = cat @SIMULATION_NAME@.out
stderr = cat @SIMULATION_NAME@.err
stdout-follow = sleep 10 ; sattach @JOB_ID@.0
# stdout-follow = while ! scontrol >/dev/null wait_job @JOB_ID@ ; do sleep 5 ; done ; tail -n 100 -f @SIMULATION_NAME@.out @SIMULATION_NAME@.err
maxwalltime = 72:00:00
disabled-thorns = CactusUtils/SystemTopology
-------------- next part --------------
1 #! /bin/bash
2
3 echo "Preparing:"
4 set -x # Output commands
5 set -e # Abort on errors
6
7 cd /home/sb1554/
8
9 echo "Checking:"
10 pwd
11 hostname
12 date
13
14 echo "Environment:"
15 export CACTUS_PATH=/home/sb1554/BNS
16 export CACTUS_NUM_PROCS=2
17 export CACTUS_NUM_THREADS=8
18 export GMON_OUT_PREFIX=gmon.out
19 export OMP_NUM_THREADS=8
20 export OMP_PLACES=cores # TODO: maybe use threads when smt is used?
21 # https://github.com/open-mpi/ompi/issues/4948
22 export OMPI_MCA_btl_vader_single_copy_mechanism=none
23 env | sort > /home/sb1554/BNS/simfactory/ENVIRONMENT
24
25 echo "Starting:"
26 export CACTUS_STARTTIME=$(date +%s)
27 #time srun -n ${CACTUS_NUM_PROCS} @EXECUTABLE@ -L 3 /home/sb1554/BNS/bns.par
28 time /home/sb1554/BNS/simfactory/bin/sim run bns --parfile /home/sb1554/BNS/bns. par --machine slurmbns
29 #time srun @EXECUTABLE@ -L 3 /home/sb1554/BNS/bns.par
30 echo "Stopping:"
31 date
32
33 echo "Done."
-------------- next part --------------
A non-text attachment was scrubbed...
Name: bns.sh
Type: application/x-sh
Size: 607 bytes
Desc: not available
URL: <http://lists.einsteintoolkit.org/pipermail/users/attachments/20241130/56593dd9/attachment.sh>
More information about the Users
mailing list