program as05a
!Program that iteratively solves the stationary heat equation (i.e. the Laplace equation)
!in a uniform 3D block suspended between two media that are kept at the constant
!temperatures T_up and T_down
implicit none
real(8),parameter :: T_up=10.0_8    !boundary temperature of the upper half-block
real(8),parameter :: T_down=0.0_8   !boundary temperature of the lower half-block
real(8),parameter :: tol_T=0.001_8  !convergence threshold for the maximum temperature change
integer,parameter :: nx=301,ny=201,nz=101  !number of grid points along each direction
integer,parameter :: nxm=nx-1,nym=ny-1,nzm=nz-1
real(8),allocatable :: T(:,:,:),Q(:,:,:)   !3D arrays of temperatures at the current and previous iterations
integer iter                               !iteration counter
integer i,j,k,nzh,nyh
real(8) T_ave,maxdiff,p,a

allocate(T(1:nx,1:ny,1:nz),Q(1:nx,1:ny,1:nz))

!Boundary and initial values: the lower half of the block is held at T_down, the upper half
!at T_up, and the interior points start from the average temperature
nzh=nz/2
T_ave=(T_up+T_down)/2
T(:,:,1:nzh)=T_down
T(:,:,nzh+1:nz)=T_up
if (mod(nz,2)/=0) T(:,:,nzh+1)=T_ave
T(2:nxm,2:nym,2:nzm)=T_ave

iter=0
maxdiff=huge(maxdiff)
do while (maxdiff>tol_T)
   iter=iter+1

   !Parallel copying of an array
   !$omp parallel workshare
   Q=T
   !$omp end parallel workshare

   maxdiff=0.0_8
   !Here we do the averaging and find the maximum difference abs(T(:,:,:)-Q(:,:,:))
   !in the same loop. With reduction(max:maxdiff), OpenMP automatically initializes
   !each thread's private copy of maxdiff to the most negative representable number.
   !$omp parallel do private(i,j,k,p,a) reduction(max:maxdiff) shared(T,Q)
   do k=2,nzm
      do j=2,nym
         do i=2,nxm
            p=(1/6.0_8)*(Q(i+1,j,k)+Q(i-1,j,k)+Q(i,j+1,k)+Q(i,j-1,k)+Q(i,j,k+1)+Q(i,j,k-1))
            T(i,j,k)=p
            a=abs(p-Q(i,j,k))
            if (a>maxdiff) maxdiff=a
         enddo
      enddo
   enddo
   !$omp end parallel do

   write(*,'(1x,a,i6,a,d10.3)') 'iteration',iter,' maxdiff=',maxdiff
enddo

!Write the temperatures in the vertical mid-plane j=nyh to a file, one z-level per line
nyh=ny/2+1
open(1,file='t.dat')
do k=1,nz
   write(1,*) T(1:nx,nyh,k)
enddo
close(1)

deallocate(T,Q)

end program as05a

!Note: The above OpenMP parallelization gives only a very small speedup (a few percent) on
!a typical PC because the bottleneck in the calculation is not the number crunching (the
!evaluation of p in the loop) but slow memory access. No matter how many cores we have, those
!cores will be idle most of the time waiting for data (elements of Q) to arrive from memory.
!The total memory bandwidth is capped, and even a single thread can pretty much saturate it.
!Moral: when writing parallel programs (be it for CPUs or GPUs) it is very important to identify
!the actual bottlenecks and address them properly, which the straightforward implementation above
!does not do. One way to mitigate the memory bottleneck would be to split the arrays T and Q into
!smaller blocks that fit into the CPU cache (which is much faster than main RAM) and then process
!those blocks one by one rather than sweeping over the entire arrays at once; a sketch of this
!idea is given below.
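
!Below is a minimal sketch of the cache-blocking idea mentioned in the note above. It is not part
!of the assignment solution and is never called from the program: it only illustrates how the sweep
!over the interior points could be split into tiles of bs*bs*bs points, so that the Q values loaded
!for one tile are reused from cache by the six-point stencil instead of being re-read from RAM.
!The subroutine name jacobi_blocked and the tile size bs=32 are assumptions; the tile size would
!have to be tuned to the cache sizes of a given CPU.
subroutine jacobi_blocked(T,Q,nx,ny,nz,maxdiff)
implicit none
integer,intent(in) :: nx,ny,nz
real(8),intent(in) :: Q(nx,ny,nz)      !temperatures from the previous iteration
real(8),intent(inout) :: T(nx,ny,nz)   !updated temperatures (interior points only)
real(8),intent(out) :: maxdiff         !maximum temperature change over the whole sweep
integer,parameter :: bs=32             !tile size (assumed; tune to the cache)
integer ib,jb,kb,i,j,k
real(8) p
maxdiff=0.0_8
!collapse(2) merges the kb and jb tile loops so there are enough tiles to share among threads
!$omp parallel do private(ib,jb,kb,i,j,k,p) reduction(max:maxdiff) shared(T,Q) collapse(2)
do kb=2,nz-1,bs
   do jb=2,ny-1,bs
      do ib=2,nx-1,bs
         !update one tile; the Q neighbours of its points are likely still in cache when reused
         do k=kb,min(kb+bs-1,nz-1)
            do j=jb,min(jb+bs-1,ny-1)
               do i=ib,min(ib+bs-1,nx-1)
                  p=(1/6.0_8)*(Q(i+1,j,k)+Q(i-1,j,k)+Q(i,j+1,k)+Q(i,j-1,k)+Q(i,j,k+1)+Q(i,j,k-1))
                  T(i,j,k)=p
                  maxdiff=max(maxdiff,abs(p-Q(i,j,k)))
               enddo
            enddo
         enddo
      enddo
   enddo
enddo
!$omp end parallel do
end subroutine jacobi_blocked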