I parallelized three nested-loops with MPI. When I ran the code, an error popped up, saying 'srun: error: Unable to create step for job 20258899: More processors requested than permitted'
Here is the script that I used to submit job.
#!/bin/bash
#SBATCH --partition=workq
#SBATCH --job-name="code"
#SBATCH --nodes=2
#SBATCH --time=1:00:00
#SBATCH --exclusive
#SBATCH --err=std.err
#SBATCH --output=std.out
#---#
module switch PrgEnv-cray PrgEnv-intel
export OMP_NUM_THREADS=1
#---#
echo "The job "${SLURM_JOB_ID}" is running on "${SLURM_JOB_NODELIST}
#---#
srun --ntasks=1000 --cpus-per-task=${OMP_NUM_THREADS} --hint=nomultithread ./example_parallel
I paste my code below. Would anyone please tell me what problem is with my code? Is the MPI that I used wrong or not? Thank you very much.
PROGRAM THREEDIMENSION
USE MPI
IMPLICIT NONE
INTEGER, PARAMETER :: dp = SELECTED_REAL_KIND(p=15,r=14)
INTEGER :: i, j, k, le(3)
REAL (KIND=dp), ALLOCATABLE :: kp(:,:,:,:), kpt(:,:), col1(:), col2(:)
REAL (KIND=dp) :: su, co, tot
INTEGER :: world_size, world_rank, ierr
INTEGER :: world_comm_1st, world_comm_2nd, world_comm_3rd
INTEGER :: th3_dimension_size, th3_dimension_size_max, th3_dimension_rank
INTEGER :: th2_dimension_size, th2_dimension_size_max, th2_dimension_rank
INTEGER :: th1_dimension_size, th1_dimension_size_max, th1_dimension_rank
INTEGER :: proc_1st_dimension_len, proc_2nd_dimension_len, proc_3rd_last_len, proc_i, proc_j, proc_k
REAL (KIND=dp) :: t0, t1
CALL MPI_INIT(ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, world_size, ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, world_rank, ierr)
IF (world_rank == 0) THEN
t0 = MPI_WTIME()
END IF
le(1) = 1000
le(2) = 600
le(3) = 900
ALLOCATE (kp(le(1),le(2),le(3),3))
ALLOCATE (kpt(le(3),3))
ALLOCATE (col1(le(1)))
ALLOCATE (col2(le(2)))
DO i = 1, le(1), 1
DO j = 1, le(2), 1
DO k = 1, le(3), 1
kp(i,j,k,1) = DBLE(i+j+j+1)
kp(i,j,k,2) = DBLE(i+j+k+2)
kp(i,j,k,3) = DBLE(i+j+k+3)
END DO
END DO
END DO
proc_1st_dimension_len = (world_size - 1) / le(1) + 1
proc_2nd_dimension_len = (world_size - 1 / (le(1) + le(2))) + 1
proc_3rd_last_len = MOD(world_size - 1, le(1)+le(2)) + 1
IF (world_rank <= proc_3rd_last_len*proc_2nd_dimension_len*proc_1st_dimension_len) THEN
proc_i = MOD(world_rank,proc_1st_dimension_len)
proc_j = world_rank / proc_1st_dimension_len
proc_k = world_rank / (proc_1st_dimension_len*proc_2nd_dimension_len)
ELSE
proc_i = MOD(world_rank-proc_3rd_last_len,proc_1st_dimension_len-1)
proc_j = (world_rank-proc_3rd_last_len) / proc_1st_dimension_len-1
proc_k = (world_rank-proc_3rd_last_len) / (proc_2nd_dimension_len*proc_2nd_dimension_len-1)
END IF
CALL MPI_BARRIER(MPI_COMM_WORLD,ierr)
CALL MPI_COMM_SPLIT(MPI_COMM_WORLD,proc_i,world_rank,world_comm_1st,ierr)
CALL MPI_COMM_SIZE(world_comm_1st,th1_dimension_size,ierr)
CALL MPI_COMM_RANK(world_comm_1st,th1_dimension_rank,ierr)
CALL MPI_COMM_SPLIT(MPI_COMM_WORLD,proc_j,world_rank,world_comm_2nd,ierr)
CALL MPI_COMM_SIZE(world_comm_2nd,th2_dimension_size,ierr)
CALL MPI_COMM_RANK(world_comm_2nd,th2_dimension_rank,ierr)
CALL MPI_COMM_SPLIT(MPI_COMM_WORLD,proc_k,world_rank,world_comm_3rd,ierr)
CALL MPI_COMM_SIZE(world_comm_3rd,th3_dimension_size,ierr)
CALL MPI_COMM_RANK(world_comm_3rd,th3_dimension_rank,ierr)
CALL MPI_BARRIER(MPI_COMM_WORLD,ierr)
CALL MPI_ALLREDUCE(th1_dimension_size,th1_dimension_size_max,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD,ierr)
CALL MPI_ALLREDUCE(th2_dimension_size,th2_dimension_size_max,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD,ierr)
IF (world_rank == 0) THEN
OPEN (UNIT=3, FILE='out.dat', STATUS='UNKNOWN')
END IF
col1 = 0.0
DO i = 1, le(1), 1
IF (MOD(i-1,th1_dimension_size_max) /= th1_dimension_rank) CYCLE
col2 = 0.0
DO j = 1, le(2), 1
IF (MOD(j-1,th2_dimension_size_max) /= th2_dimension_rank) CYCLE
kpt = kp(i,j,:,:)
su = 0.0
DO k = 1, le(3), 1
IF(MOD(k-1,th1_dimension_size*th2_dimension_size) /= th3_dimension_rank) CYCLE
CALL CAL(kpt(k,3),co)
su = su + co
END DO
CALL MPI_BARRIER(world_comm_3rd,ierr)
CALL MPI_REDUCE(su,col2(j),1,MPI_DOUBLE,MPI_SUM,0,world_comm_3rd,ierr)
END DO
CALL MPI_BARRIER(world_comm_2nd,ierr)
CALL MPI_REDUCE(col2,col1(i),le(2),MPI_DOUBLE,MPI_SUM,0,world_comm_2nd,ierr)
END DO
CALL MPI_BARRIER(world_comm_1st,ierr)
tot = 0.0
IF (th1_dimension_rank == 0) THEN
CALL MPI_REDUCE(col1,tot,le(1),MPI_DOUBLE,MPI_SUM,0,world_comm_1st,ierr)
WRITE (UNIT=3, FMT=*) tot
CLOSE (UNIT=3)
END IF
DEALLOCATE (kp)
DEALLOCATE (kpt)
DEALLOCATE (col1)
DEALLOCATE (col2)
IF (world_rank == 0) THEN
t1 = MPI_WTIME()
WRITE (UNIT=3, FMT=*) 'Total time:', t1 - t0, 'seconds'
END IF
CALL MPI_FINALIZE (ierr)
STOP
END PROGRAM THREEDIMENSION
SUBROUTINE CAL(arr,co)
IMPLICIT NONE
INTEGER, PARAMETER :: dp=SELECTED_REAL_KIND(p=15,r=14)
INTEGER :: i
REAL (KIND=dp) :: arr(3), co
co = 0.0d0
co = co + (arr(1) ** 2 + arr(2) * 3.1d1) / (arr(3) + 5.0d-1)
RETURN
END SUBROUTINE CAL
With the #SBATCH directives in the header of the file, you request two nodes explicitly, and, as you do not specify --ntasks, you get the default of one task per node, so you implicitly request two tasks.
Then, when the job starts, your srun line tries to "use" 1000 tasks. You should have a line
#SBATCH --ntasks=1000
in the header as suggested per @Gilles. The srun command will inherit from that 1000 tasks by default so there is no need to specify it there in this case.
Also, if ${OMP_NUM_THREADS} were not 1, you would have to specify the --cpu-per-tasks in the header as a SBATCH directive otherwise you will face the same error.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With