diff --git a/MPI1/AMR/amr.c b/MPI1/AMR/amr.c index 1b9e2f124..098923fdb 100755 --- a/MPI1/AMR/amr.c +++ b/MPI1/AMR/amr.c @@ -72,14 +72,14 @@ HISTORY: - Written by Rob Van der Wijngaart, February September 2016. #define EPSILON 1.e-8 #define COEFX 1.0 #define COEFY 1.0 - #define FSTR "%7.4lf" + #define FSTR "%10.4lf" #else #define DTYPE float #define MPI_DTYPE MPI_FLOAT #define EPSILON 0.0001f #define COEFX 1.0f #define COEFY 1.0f - #define FSTR "%7.4f" + #define FSTR "%10.4f" #endif /* define shorthand for indexing multi-dimensional arrays */ @@ -112,16 +112,26 @@ void get_BG_data(int load_balance, DTYPE *in_bg, DTYPE *ing_r, int my_ID, long e long G_istart_r, long G_jstart_r, MPI_Comm comm_bg, MPI_Comm comm_r, long L_istart_r_gross, long L_iend_r_gross, long L_jstart_r_gross, long L_jend_r_gross, - long L_width_r_true_gross, long L_istart_r_true_gross, - long L_jstart_r_true_gross, int g) { + long L_width_r_true_gross, long L_istart_r_true_gross, long L_iend_r_true_gross, + long L_jstart_r_true_gross, long L_jend_r_true_gross, int g) { long send_vec[8], *recv_vec, offset, i, j, p, acc_send, acc_recv; int *recv_offset, *recv_count, *send_offset, *send_count, error=0; DTYPE *recv_buf, *send_buf; - /* in case of no_talk we just copy the in-rank data from BG to refinement */ if (load_balance == no_talk) { - + /* in case of no_talk we just copy the in-rank data from BG to refinement */ + if (comm_r != MPI_COMM_NULL) { + for (j=L_jstart_r_true_gross; j<=L_jend_r_true_gross; j++) + for (i=L_istart_r_true_gross; i<=L_iend_r_true_gross; i++) { + ING_R(i,j)=0.0; + } + for (j=L_jstart_r_gross; j<=L_jend_r_gross; j++) + for (i=L_istart_r_gross; i<=L_iend_r_gross; i++) { + int ir = i-G_istart_r, jr = j-G_jstart_r; + ING_R(ir*expand,jr*expand) = IN(i,j); + } + } } else { recv_vec = (long *) prk_malloc(sizeof(long)*Num_procs*8); @@ -200,11 +210,13 @@ void get_BG_data(int load_balance, DTYPE *in_bg, DTYPE *ing_r, int my_ID, long e } /* fill send buffer with BG data to all other ranks who need it */ offset = 0; - if (comm_bg != MPI_COMM_NULL) for (p=0; p0; Num_procs_bgx--) { if (!(Num_procs_bg%Num_procs_bgx)) { Num_procs_bgy = Num_procs_bg/Num_procs_bgx; break; } } - } - - - /* compute tiling of refinements */ - switch(load_balance) { - case no_talk: // this balancer does not use a process grid for refinements - break; - case fine_grain: // refinements are partitioned exactly like BG - for (g=0; g<4; g++) { - Num_procs_rx[g] = Num_procs_bgx; - Num_procs_ry[g] = Num_procs_bgy; - } - break; - case high_water: // refinements are partitioned independently, but similar to BG - for (g=0; g<4; g++) { - if (comm_r[g] != MPI_COMM_NULL) { - Num_procs_rx[g] = Num_procs_ry[g] = 0; - for (Num_procs_rx[g]=(int) (sqrt(Num_procs_r[g]+1)); - Num_procs_rx[g]>0; Num_procs_rx[g]--) { - if (!(Num_procs_r[g]%Num_procs_rx[g])) { - Num_procs_ry[g] = Num_procs_r[g]/Num_procs_rx[g]; - break; - } - } - } - } - break; - case amnesia: break; - } - - /* communication neighbors on BG are computed for all who own part of it */ - if (comm_bg != MPI_COMM_NULL) { + /* communication neighbors on BG are computed for all who own part of it */ my_ID_bgx = my_ID_bg%Num_procs_bgx; my_ID_bgy = my_ID_bg/Num_procs_bgx; /* compute neighbors; catch dropping off edges of grid */ @@ -616,62 +601,8 @@ int main(int argc, char ** argv) { if (my_ID_bgx > 0) left_nbr_bg = my_ID-1; if (my_ID_bgy < Num_procs_bgy-1) top_nbr_bg = my_ID+Num_procs_bgx; if (my_ID_bgy > 0) bottom_nbr_bg = my_ID-Num_procs_bgx; - } - - /* same for communication neighbors on refinements */ - for (g=0; g<4; g++) if (comm_r[g] != MPI_COMM_NULL) { - my_ID_rx[g] = my_ID_r[g]%Num_procs_rx[g]; - my_ID_ry[g] = my_ID_r[g]/Num_procs_rx[g]; - /* compute neighbors; catch dropping off edges of grid */ - right_nbr_r[g] = left_nbr_r[g] = top_nbr_r[g] = bottom_nbr_r[g] = -1; - if (my_ID_rx[g] < Num_procs_rx[g]-1) right_nbr_r[g] = my_ID_r[g]+1; - if (my_ID_rx[g] > 0) left_nbr_r[g] = my_ID_r[g]-1; - if (my_ID_ry[g] < Num_procs_ry[g]-1) top_nbr_r[g] = my_ID_r[g]+Num_procs_rx[g]; - if (my_ID_ry[g] > 0) bottom_nbr_r[g] = my_ID_r[g]-Num_procs_rx[g]; - } - - if (my_ID == root) { - printf("Number of ranks = %d\n", Num_procs); - printf("Background grid size = %ld\n", n); - printf("Radius of stencil = %d\n", RADIUS); - printf("Tiles in x/y-direction for BG = %d/%d\n", Num_procs_bgx, Num_procs_bgy); - printf("Tiles per refinement = %d, %d, %d, %d\n", - Num_procs_r[0], Num_procs_r[1], Num_procs_r[2], Num_procs_r[3]); - printf("Type of stencil = star\n"); -#if DOUBLE - printf("Data type = double precision\n"); -#else - printf("Data type = single precision\n"); -#endif -#if LOOPGEN - printf("Script used to expand stencil loop body\n"); -#else - printf("Compact representation of stencil loop body\n"); -#endif - printf("Number of iterations = %d\n", iterations); - printf("Load balancer = %s\n", c_load_balance); - printf("Refinements:\n"); - printf(" Background grid points = %ld\n", n_r); - printf(" Grid size = %ld\n", n_r_true); - printf(" Refinement level = %d\n", refine_level); - printf(" Period = %d\n", period); - printf(" Duration = %d\n", duration); - printf(" Sub-iterations = %d\n", sub_iterations); - } - - /* compute layout of refinements */ - G_istart_r[0] = G_istart_r[2] = 0; - G_iend_r[0] = G_iend_r[2] = n_r-1; - G_istart_r[1] = G_istart_r[3] = n-n_r; - G_iend_r[1] = G_iend_r[3] = n-1; - G_jstart_r[0] = G_jstart_r[3] = 0; - G_jend_r[0] = G_jend_r[3] = n_r-1; - G_jstart_r[1] = G_jstart_r[2] = n-n_r; - G_jend_r[1] = G_jend_r[2] = n-1; - - /* reserve space for background input/output fields */ - if (comm_bg != MPI_COMM_NULL) { + /* create decomposition and reserve space for BG input/output fields */ L_width_bg = n/Num_procs_bgx; leftover = n%Num_procs_bgx; @@ -709,7 +640,7 @@ int main(int argc, char ** argv) { error = 1; goto ENDOFBG; } - + if (L_width_bg < RADIUS || L_height_bg < RADIUS) { printf("ERROR: rank %d's BG work tile smaller than stencil radius: %d\n", my_ID, MIN(L_width_bg, L_height_bg)); @@ -730,15 +661,121 @@ int main(int argc, char ** argv) { } ENDOFBG:; } - else { + else { // bogus empty patch L_istart_bg = 0; L_iend_bg = -1; L_jstart_bg = 0;; L_jend_bg = -1; } - bail_out(error); + /* compute global layout of refinements */ + G_istart_r[0] = G_istart_r[2] = 0; + G_iend_r[0] = G_iend_r[2] = n_r-1; + G_istart_r[1] = G_istart_r[3] = n-n_r; + G_iend_r[1] = G_iend_r[3] = n-1; + G_jstart_r[0] = G_jstart_r[3] = 0; + G_jend_r[0] = G_jend_r[3] = n_r-1; + G_jstart_r[1] = G_jstart_r[2] = n-n_r; + G_jend_r[1] = G_jend_r[2] = n-1; + + /* compute tiling of refinements */ + switch(load_balance) { + case no_talk: // check if calling rank's BG patch overlaps with refinement*/ + for (g=0; g<4; g++) { + L_istart_r[g] = MAX(L_istart_bg,G_istart_r[g]); + L_iend_r[g] = MIN(L_iend_bg, G_iend_r[g]); + L_jstart_r[g] = MAX(L_jstart_bg,G_jstart_r[g]); + L_jend_r[g] = MIN(L_jend_bg, G_jend_r[g]); + if (L_istart_r[g]<=L_iend_r[g] && + L_jstart_r[g]<=L_jend_r[g]) color_r = 1; + else color_r = MPI_UNDEFINED; + MPI_Comm_split(MPI_COMM_WORLD, color_r, my_ID, &comm_r[g]); + if (comm_r[g] != MPI_COMM_NULL) { + MPI_Comm_size(comm_r[g], &Num_procs_r[g]); + MPI_Comm_rank(comm_r[g], &my_ID_r[g]); + // determine layout of subset + long ilow, ihigh, jlow, jhigh; + MPI_Allreduce(&my_ID_bgx,&ilow ,1,MPI_LONG,MPI_MIN,comm_r[g]); + MPI_Allreduce(&my_ID_bgx,&ihigh,1,MPI_LONG,MPI_MAX,comm_r[g]); + MPI_Allreduce(&my_ID_bgy,&jlow ,1,MPI_LONG,MPI_MIN,comm_r[g]); + MPI_Allreduce(&my_ID_bgy,&jhigh,1,MPI_LONG,MPI_MAX,comm_r[g]); + Num_procs_rx[g] = ihigh-ilow+1; + Num_procs_ry[g] = jhigh-jlow+1; + }; + } + break; + case fine_grain: // refinements are partitioned exactly like BG + for (g=0; g<4; g++) { + Num_procs_rx[g] = Num_procs_bgx; + Num_procs_ry[g] = Num_procs_bgy; + } + break; + case high_water: // refinements are partitioned independently, but similar to BG + for (g=0; g<4; g++) { + if (comm_r[g] != MPI_COMM_NULL) { + for (Num_procs_rx[g]=(int) (sqrt(Num_procs_r[g]+1)); + Num_procs_rx[g]>0; Num_procs_rx[g]--) { + if (!(Num_procs_r[g]%Num_procs_rx[g])) { + Num_procs_ry[g] = Num_procs_r[g]/Num_procs_rx[g]; + break; + } + } + } + } + break; + case amnesia: break; + } + + /* compute communication neighbors on refinements */ + for (g=0; g<4; g++) if (comm_r[g] != MPI_COMM_NULL) { + my_ID_rx[g] = my_ID_r[g]%Num_procs_rx[g]; + my_ID_ry[g] = my_ID_r[g]/Num_procs_rx[g]; + /* compute neighbors; catch dropping off edges of grid */ + right_nbr_r[g] = left_nbr_r[g] = top_nbr_r[g] = bottom_nbr_r[g] = -1; + if (my_ID_rx[g] < Num_procs_rx[g]-1) right_nbr_r[g] = my_ID_r[g]+1; + if (my_ID_rx[g] > 0) left_nbr_r[g] = my_ID_r[g]-1; + if (my_ID_ry[g] < Num_procs_ry[g]-1) top_nbr_r[g] = my_ID_r[g]+Num_procs_rx[g]; + if (my_ID_ry[g] > 0) bottom_nbr_r[g] = my_ID_r[g]-Num_procs_rx[g]; + } + + MPI_Barrier(MPI_COMM_WORLD); + if (my_ID == root) { + printf("Number of ranks = %d\n", Num_procs); + printf("Background grid size = %ld\n", n); + printf("Radius of stencil = %d\n", RADIUS); + printf("Tiles in x/y-direction on BG = %d/%d\n", Num_procs_bgx, Num_procs_bgy); + } + for (g=0; g<4; g++) { + MPI_Barrier(MPI_COMM_WORLD); + if ((comm_r[g] != MPI_COMM_NULL) && (my_ID_r[g]==root)) + printf("Tiles in x/y-direction on ref %d = %d/%d\n", + g, Num_procs_rx[g], Num_procs_ry[g]); usleep(1000); + } + MPI_Barrier(MPI_COMM_WORLD); + if (my_ID == root) { + printf("Type of stencil = star\n"); +#if DOUBLE + printf("Data type = double precision\n"); +#else + printf("Data type = single precision\n"); +#endif +#if LOOPGEN + printf("Script used to expand stencil loop body\n"); +#else + printf("Compact representation of stencil loop body\n"); +#endif + printf("Number of iterations = %d\n", iterations); + printf("Load balancer = %s\n", c_load_balance); + printf("Refinements:\n"); + printf(" Background grid points = %ld\n", n_r); + printf(" Grid size = %ld\n", n_r_true); + printf(" Refinement level = %d\n", refine_level); + printf(" Period = %d\n", period); + printf(" Duration = %d\n", duration); + printf(" Sub-iterations = %d\n", sub_iterations); + } + /* reserve space for refinement input/output fields; first compute extents */ /* we partition the refinement in terms of BG indices, so that we know @@ -797,46 +834,40 @@ int main(int argc, char ** argv) { L_jend_r_true[g] = L_jstart_r_true[g] + L_height_r_true[g] - 1; } - /* make sure that the gross boundaries of the patch coincide with BG points */ - L_istart_r_true_gross[g] = (L_istart_r_true[g]/expand)*expand; - L_iend_r_true_gross[g] = ((L_iend_r_true[g]+expand-1)/expand)*expand; - L_jstart_r_true_gross[g] = (L_jstart_r_true[g]/expand)*expand; - L_jend_r_true_gross[g] = ((L_jend_r_true[g]+expand-1)/expand)*expand; - L_istart_r_gross[g] = L_istart_r_true_gross[g]/expand; - L_iend_r_gross[g] = L_iend_r_true_gross[g]/expand; - L_jstart_r_gross[g] = L_jstart_r_true_gross[g]/expand; - L_jend_r_gross[g] = L_jend_r_true_gross[g]/expand; - /* shift refinement patch boundaries to BG coordinates */ L_istart_r[g] += G_istart_r[g]; L_iend_r[g] += G_istart_r[g]; L_jstart_r[g] += G_jstart_r[g]; L_jend_r[g] += G_jstart_r[g]; - /* shift unexpanded gross refinement patch boundaries to BG coordinates */ - L_istart_r_gross[g] += G_istart_r[g]; L_iend_r_gross[g] += G_istart_r[g]; - L_jstart_r_gross[g] += G_jstart_r[g]; L_jend_r_gross[g] += G_jstart_r[g]; - - /* we're not skipping this refinement by default */ + /* we're not skipping any refinement, in principle */ skip_r[g] = 0; } - else { // for NO_TALK we compute refinement partition boundaries using the BG - L_istart_r[g] = MAX(G_istart_r[g], L_istart_bg); - L_iend_r[g] = MIN(G_iend_r[g], L_iend_bg); - L_jstart_r[g] = MAX(G_jstart_r[g], L_jstart_bg); - L_jend_r[g] = MIN(G_jend_r[g], L_jend_bg); - if (L_istart_r[g]>L_iend_r[g] || L_jstart_r[g]>L_jend_r[g]) skip_r[g] = 1; - else skip_r[g] = 0; - } - - L_height_r[g] = L_jend_r[g] - L_jstart_r[g] + 1; - L_width_r[g] = L_iend_r[g] - L_istart_r[g] + 1; - - /* now we need to compute gross boundaries for allocation and interpolation */ - if (refine_level==-1) { - L_istart_r_gross[g] = L_istart_r_true[g] = L_istart_r_true_gross[g] = L_istart_r[g]; - L_iend_r_gross[g] = L_iend_r_true[g] = L_iend_r_true_gross[g] = L_iend_r[g]; - L_jstart_r_gross[g] = L_jstart_r_true[g] = L_jstart_r_true_gross[g] = L_jstart_r[g]; - L_jend_r_gross[g] = L_jend_r_true[g] = L_jend_r_true_gross[g] = L_jend_r[g]; + else if (load_balance == no_talk) { // already computed refinement partition boundaries + L_istart_r_true[g] = (L_istart_r[g] - G_istart_r[g])*expand; + if (my_ID_rx[g]>0) L_istart_r_true[g] -= expand/2; + L_iend_r_true[g] = (L_iend_r[g] - G_istart_r[g])*expand; + if (my_ID_rx[g] < Num_procs_rx[g]-1) L_iend_r_true[g] += (expand-1)/2; + L_jstart_r_true[g] = (L_jstart_r[g] - G_jstart_r[g])*expand; + if (my_ID_ry[g]>0) L_jstart_r_true[g] -= expand/2; + L_jend_r_true[g] = (L_jend_r[g] - G_jstart_r[g])*expand; + if (my_ID_ry[g] < Num_procs_ry[g]-1) L_jend_r_true[g] += (expand-1)/2; + skip_r[g] = 0; } + /* make sure that the gross boundaries of the patch coincide with BG points */ + L_istart_r_true_gross[g] = (L_istart_r_true[g]/expand)*expand; + L_iend_r_true_gross[g] = ((L_iend_r_true[g]+expand-1)/expand)*expand; + L_jstart_r_true_gross[g] = (L_jstart_r_true[g]/expand)*expand; + L_jend_r_true_gross[g] = ((L_jend_r_true[g]+expand-1)/expand)*expand; + L_istart_r_gross[g] = L_istart_r_true_gross[g]/expand; + L_iend_r_gross[g] = L_iend_r_true_gross[g]/expand; + L_jstart_r_gross[g] = L_jstart_r_true_gross[g]/expand; + L_jend_r_gross[g] = L_jend_r_true_gross[g]/expand; + + /* shift unexpanded gross refinement patch boundaries to global BG coordinates */ + L_istart_r_gross[g] += G_istart_r[g]; L_iend_r_gross[g] += G_istart_r[g]; + L_jstart_r_gross[g] += G_jstart_r[g]; L_jend_r_gross[g] += G_jstart_r[g]; + + L_height_r[g] = L_jend_r[g] - L_jstart_r[g] + 1; + L_width_r[g] = L_iend_r[g] - L_istart_r[g] + 1; L_height_r_gross[g] = L_jend_r_gross[g] - L_jstart_r_gross[g] + 1; L_width_r_gross[g] = L_iend_r_gross[g] - L_istart_r_gross[g] + 1; L_height_r_true_gross[g] = L_jend_r_true_gross[g] - L_jstart_r_true_gross[g] + 1; @@ -844,8 +875,8 @@ int main(int argc, char ** argv) { L_height_r_true[g] = L_jend_r_true[g] - L_jstart_r_true[g] + 1; L_width_r_true[g] = L_iend_r_true[g] - L_istart_r_true[g] + 1; - if ( skip_r[g] && (L_height_r[g] == 0 || L_width_r[g] == 0)) { - printf("WARNING: rank %d has no work to do\n", my_ID); + if ( !skip_r[g] && (L_height_r_true[g] == 0 || L_width_r_true[g] == 0)) { + printf("WARNING: rank %d has no work to do on refinement %d\n", my_ID, g); skip_r[g] = 1; } @@ -858,10 +889,9 @@ int main(int argc, char ** argv) { } if (!skip_r[g]) { - total_length_in_r[g] = (long) ((L_width_r_gross[g] -1)*expand+1+2*RADIUS)* - (long) ((L_height_r_gross[g]-1)*expand+1+2*RADIUS); - total_length_out_r[g] = (long) ((L_width_r_gross[g] -1)*expand+1)* - (long) ((L_height_r_gross[g]-1)*expand+1); + total_length_in_r[g] = (L_width_r_true_gross[g]+2*RADIUS)* + (L_height_r_true_gross[g]+2*RADIUS); + total_length_out_r[g] = L_width_r_true_gross[g] * L_height_r_true_gross[g]; in_r[g] = (DTYPE *) prk_malloc(sizeof(DTYPE)*total_length_in_r[g]); out_r[g] = (DTYPE *) prk_malloc(sizeof(DTYPE)*total_length_out_r[g]); if (!in_r[g] || !out_r[g]) { @@ -870,7 +900,7 @@ int main(int argc, char ** argv) { } } } - else {//bogus patch + else {//Bogus patch L_istart_r_gross[g] = 0; L_iend_r_gross[g] = -1; L_jstart_r_gross[g] = 0; @@ -913,14 +943,14 @@ int main(int argc, char ** argv) { bottom_buf_out_bg = top_buf_out_bg + 2*RADIUS*L_width_bg; bottom_buf_in_bg = top_buf_out_bg + 3*RADIUS*L_width_bg; - right_buf_out_bg = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*L_height_bg); + right_buf_out_bg = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*(L_height_bg+2)); if (!right_buf_out_bg) { printf("ERROR: Rank %d could not allocate comm buffers for x-direction\n", my_ID); error = 1; } - right_buf_in_bg = right_buf_out_bg + RADIUS*L_height_bg; - left_buf_out_bg = right_buf_out_bg + 2*RADIUS*L_height_bg; - left_buf_in_bg = right_buf_out_bg + 3*RADIUS*L_height_bg; + right_buf_in_bg = right_buf_out_bg + RADIUS*(L_height_bg+2); + left_buf_out_bg = right_buf_out_bg + 2*RADIUS*(L_height_bg+2); + left_buf_in_bg = right_buf_out_bg + 3*RADIUS*(L_height_bg+2); } bail_out(error); @@ -968,7 +998,6 @@ int main(int argc, char ** argv) { local_stencil_time = wtime(); } /* first complete communication on background grid to help no_talk balancer */ - if (comm_bg != MPI_COMM_NULL) { /* need to fetch ghost point data from neighbors in y-direction */ if (my_ID_bgy < Num_procs_bgy-1) { @@ -1008,31 +1037,32 @@ int main(int argc, char ** argv) { } } - /* need to fetch ghost point data from neighbors in x-direction */ + /* need to fetch ghost point data from neighbors in x-direction; take into account + the load balancer; NO_TALK needs wider copy */ if (my_ID_bgx < Num_procs_bgx-1) { - MPI_Irecv(right_buf_in_bg, RADIUS*L_height_bg, MPI_DTYPE, right_nbr_bg, 1010, + MPI_Irecv(right_buf_in_bg, RADIUS*(L_height_bg+2), MPI_DTYPE, right_nbr_bg, 1010, comm_bg, &(request_bg[1+4])); - for (int kk=0,j=L_jstart_bg; j<=L_jend_bg; j++) + for (int kk=0,j=L_jstart_bg-1; j<=L_jend_bg+1; j++) for (int i=L_iend_bg-RADIUS+1; i<=L_iend_bg; i++) { right_buf_out_bg[kk++]= IN(i,j); } - MPI_Isend(right_buf_out_bg, RADIUS*L_height_bg, MPI_DTYPE, right_nbr_bg, 990, + MPI_Isend(right_buf_out_bg, RADIUS*(L_height_bg+2), MPI_DTYPE, right_nbr_bg, 990, comm_bg, &(request_bg[0+4])); } if (my_ID_bgx > 0) { - MPI_Irecv(left_buf_in_bg, RADIUS*L_height_bg, MPI_DTYPE, left_nbr_bg, 990, + MPI_Irecv(left_buf_in_bg, RADIUS*(L_height_bg+2), MPI_DTYPE, left_nbr_bg, 990, comm_bg, &(request_bg[3+4])); - for (int kk=0,j=L_jstart_bg; j<=L_jend_bg; j++) + for (int kk=0,j=L_jstart_bg-1; j<=L_jend_bg+1; j++) for (int i=L_istart_bg; i<=L_istart_bg+RADIUS-1; i++) { left_buf_out_bg[kk++]= IN(i,j); } - MPI_Isend(left_buf_out_bg, RADIUS*L_height_bg, MPI_DTYPE, left_nbr_bg, 1010, + MPI_Isend(left_buf_out_bg, RADIUS*(L_height_bg+2), MPI_DTYPE, left_nbr_bg, 1010, comm_bg, &(request_bg[2+4])); } if (my_ID_bgx < Num_procs_bgx-1) { MPI_Wait(&(request_bg[0+4]), MPI_STATUS_IGNORE); MPI_Wait(&(request_bg[1+4]), MPI_STATUS_IGNORE); - for (int kk=0,j=L_jstart_bg; j<=L_jend_bg; j++) + for (int kk=0,j=L_jstart_bg-1; j<=L_jend_bg+1; j++) for (int i=L_iend_bg+1; i<=L_iend_bg+RADIUS; i++) { IN(i,j) = right_buf_in_bg[kk++]; } @@ -1040,7 +1070,7 @@ int main(int argc, char ** argv) { if (my_ID_bgx > 0) { MPI_Wait(&(request_bg[2+4]), MPI_STATUS_IGNORE); MPI_Wait(&(request_bg[3+4]), MPI_STATUS_IGNORE); - for (int kk=0,j=L_jstart_bg; j<=L_jend_bg; j++) + for (int kk=0,j=L_jstart_bg-1; j<=L_jend_bg+1; j++) for (int i=L_istart_bg-RADIUS; i<=L_istart_bg-1; i++) { IN(i,j) = left_buf_in_bg[kk++]; } @@ -1057,8 +1087,8 @@ int main(int argc, char ** argv) { G_istart_r[g], G_jstart_r[g], comm_bg, comm_r[g], L_istart_r_gross[g], L_iend_r_gross[g], L_jstart_r_gross[g], L_jend_r_gross[g], - L_width_r_true_gross[g], L_istart_r_true_gross[g], - L_jstart_r_true_gross[g], g); + L_width_r_true_gross[g], L_istart_r_true_gross[g], L_iend_r_true_gross[g], + L_jstart_r_true_gross[g], L_jend_r_true_gross[g], g); if (comm_r[g] != MPI_COMM_NULL) { @@ -1067,7 +1097,7 @@ int main(int argc, char ** argv) { L_jstart_r_true_gross[g], L_jend_r_true_gross[g], L_istart_r_true[g], L_iend_r_true[g], L_jstart_r_true[g], L_jend_r_true[g], - expand, h_r); + expand, h_r, g, Num_procs, my_ID); } /* even though this rank may not interpolate, some just did, so we keep track */ num_interpolations++; @@ -1179,7 +1209,7 @@ int main(int argc, char ** argv) { } /* add constant to solution to force refresh of neighbor data, if any */ - for (j=L_jstart_r_true[g]; j<=L_jend_r_true[g]; j++) + for (j=L_jstart_r_true[g]; j<=L_jend_r_true[g]; j++) for (i=L_istart_r_true[g]; i<=L_iend_r_true[g]; i++) IN_R(g,i,j)+= (DTYPE)1.0; } } diff --git a/MPI1/AMR/implementation_details.md b/MPI1/AMR/implementation_details.md index 8723e326d..04242d399 100644 --- a/MPI1/AMR/implementation_details.md +++ b/MPI1/AMR/implementation_details.md @@ -61,7 +61,10 @@ Steps: This is ambiguous for NO_TALK if TRUE_REF, because a small shift in patch boundaries would still not require communications. We use the following disambiguation. Partition BG points of the refinement patch - in the same way as the BG itself. Add to each refinement partition + in the same way as the BG itself. Place true refinement boundaries + in the middle between BG points. This creates the best load balance, + and also avoids as much as possible very small refinement patch slivers + that could create problems with ghost point acquisition. one point to the left and/or bottom, unless the partition already borders on the respective edge of the partition; this creates a one-point BG overlap between adjacent refinement partitions. All @@ -70,11 +73,11 @@ Steps: boundary, are assigned to the rank that owns the augmented partition. NO_TALK BG: | X X X X X | - a a b b b c c + assignment a a b b b c c refinement |.......x.......x.......x.......x.......x.......| - aaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbcccccccccccccccc - allocation aaaaaaaaa - bbbbbbbbbbbbbbbbbbbbbbbbb + assignment aaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbccccccccccccc + allocation aaaaaaaaaaaaaaaaa + bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb ccccccccccccccccc Legend: | = refinement patch boundary point @@ -91,7 +94,7 @@ Steps: FINE_GRAIN (assuming a total of 4 ranks in the horizontal direction) Sets {a,b,c} and {p,q,r,s} generally overlap. BG: | X X X X X | - a a b b b c c + assignment a a b b b c c refinement |.......x.......x.......x.......x.......x.......| assignment pppppppppppppqqqqqqqqqqqqrrrrrrrrrrrrssssssssssss allocation ppppppppppppppppp diff --git a/scripts/small/runmpi1 b/scripts/small/runmpi1 index 8f5e00741..64e5489ef 100755 --- a/scripts/small/runmpi1 +++ b/scripts/small/runmpi1 @@ -19,5 +19,6 @@ $MPIRUN -np $NUMPROCS MPI1/PIC-static/pic $NUMITERS 1000 1000000 1 2 GEOME $MPIRUN -np $NUMPROCS MPI1/PIC-static/pic $NUMITERS 1000 1000000 0 1 SINUSOIDAL; echo $SEPLINE $MPIRUN -np $NUMPROCS MPI1/PIC-static/pic $NUMITERS 1000 1000000 1 0 LINEAR 1.0 3.0; echo $SEPLINE $MPIRUN -np $NUMPROCS MPI1/PIC-static/pic $NUMITERS 1000 1000000 1 0 PATCH 0 200 100 200; echo $SEPLINE -$MPIRUN -np $NUMPROCS MPI1/AMR/amr $NUMITERS 1000 100 3 2 1 5 FINE_GRAIN; echo $SEPLINE -$MPIRUN -np $NUMPROCS MPI1/AMR/amr $NUMITERS 1000 100 3 2 1 5 HIGH_WATER; echo $SEPLINE +$MPIRUN -np $NUMPROCS MPI1/AMR/amr $NUMITERS 1000 100 3 2 1 5 FINE_GRAIN; echo $SEPLINE +$MPIRUN -np $NUMPROCS MPI1/AMR/amr $NUMITERS 1000 100 3 2 1 5 HIGH_WATER; echo $SEPLINE +$MPIRUN -np $NUMPROCS MPI1/AMR/amr $NUMITERS 1000 100 3 2 1 5 NO_TALK; echo $SEPLINE