subroutine cplrhs (nrwtop, nrwblk, nbloks, nrwbot, b, * nparts, work) c double precision b(1), work(nrwblk,nrwblk,1) integer nrwtop, nrwblk, nbloks, nrwbot, nparts c ***----------------------------------------------------------*** c * This subroutine is called after subroutine 'cpllhs' to * c * line up vector b with repacked arrays lftblk and rgtblk. * c * * c * Specifically, the sub-vector of b of length nrwbot * c * starting at position nrwtop+nbloks*nrwblk+1 is bubbled * c * up to position nrwtop+1. * c * * c * Note: The workspace is required to be larger than its * c * optimal size of nparts*nrwbot in order to avoid processor * c * contention for cached memory. * c ***----------------------------------------------------------*** integer j, minblk, remblk, kpart, base, top c ***----------------------------------------------------------*** c * Use non-partitioned coupling if requested number * c * of partitions would result in fewer than 2 blocks * c * per partition. * c ***----------------------------------------------------------*** if (nbloks .lt. 2*nparts) then nparts = 1 endif c ***----------------------------------------------------------*** c * Calculate minimum number of blocks per partition * c * Remaining blocks are distributed evenly among the * c * first partitions. * c ***----------------------------------------------------------*** minblk = nbloks/nparts remblk = nbloks - minblk*nparts c ***----------------------------------------------------------*** c * Make a copy of the bottom nrwbot slice of b in each * c * partition. (This could be done concurrently.) * c ***----------------------------------------------------------*** do 10 kpart = 1, nparts c ***----------------------------------------------------------*** c * temp_kpart <-- b(nrwtop+base*nrwblk-nrwbot+1.. * c * nrwtop+base*nrwblk) * c ***----------------------------------------------------------*** call partx(minblk,remblk,kpart,base,top) call DCOPY(nrwbot,b(nrwtop+base*nrwblk-nrwbot+1),1, * work(1,1,kpart),1) 10 continue c ***----------------------------------------------------------*** c * Each loop 30 iteration is independent and could * c * execute concurrently with the others. * c ***----------------------------------------------------------*** C$DOACROSS SHARE (nrwtop, nrwblk, nrwbot, b, C$& minblk, remblk, nparts, work), C$& LOCAL (j, kpart, base, top) do 30 kpart = 1, nparts call partx(minblk,remblk,kpart,base,top) c ***----------------------------------------------------------*** c * b(nrwtop+(top-1)*nrwblk+1..nrwtop+base*nrwblk-nrwbot), * c * contained entirely within this partition, is shifted * c * down nrwbot positions. * c ***----------------------------------------------------------*** do 20 j = nrwtop+base*nrwblk, top*nrwblk+1, -1 b(j) = b(j-nrwbot) 20 continue c ***----------------------------------------------------------*** c * In each partition except the first, the top nrwbot * c * slice of b is recovered from the copy of the bottom * c * nrwbot slice of b from the partition above. * c ***----------------------------------------------------------*** if (kpart .gt. 1) then call DCOPY(nrwbot,work(1,1,kpart-1),1, * b(nrwtop+(top-1)*nrwblk+1),1) end if 30 continue c ***----------------------------------------------------------*** c * b(nrwtop+1..nrwblk) * c * <-- b(nrwtop+nbloks*nrwblk+1..(nbloks+1)*nrwblk) * c ***----------------------------------------------------------*** call DCOPY(nrwbot,b(nrwtop+nbloks*nrwblk+1),1,b(nrwtop+1),1) c ***----------------------------------------------------------*** c * b(nrwtop+nbloks*nrwblk+1..(nbloks+1)*nrwblk * c * <-- temp_nparts * c ***----------------------------------------------------------*** call DCOPY(nrwbot,work(1,1,nparts),1, * b(nrwtop+nbloks*nrwblk+1),1) return end