program as05a
!Program that iteratively solves the stationary heat equation (i.e. the Laplace equation)
!in a uniform 3D block suspended between two media that are kept at the constant
!temperatures T_up and T_down
implicit none
real(8),parameter :: T_up=10.0_8    !boundary temperature of the upper half-block
real(8),parameter :: T_down=0.0_8   !boundary temperature of the lower half-block
real(8),parameter :: tol_T=0.001_8  !convergence threshold for the maximum temperature change
integer,parameter :: nx=301,ny=201,nz=101  !number of grid points along each direction
integer,parameter :: nxm=nx-1,nym=ny-1,nzm=nz-1
real(8),allocatable :: T(:,:,:),Q(:,:,:)   !3D arrays of temperatures at the current and previous iterations
integer iter                               !iteration counter
integer i,j,k,nzh,nyh
real(8) T_ave,maxdiff,p,a

allocate(T(1:nx,1:ny,1:nz),Q(1:nx,1:ny,1:nz))

!Boundary and initial values: the lower half of the block is held at T_down, the upper half
!at T_up, and the interior points start from the average temperature
nzh=nz/2
T_ave=(T_up+T_down)/2
T(:,:,1:nzh)=T_down
T(:,:,nzh+1:nz)=T_up
if (mod(nz,2)/=0) T(:,:,nzh+1)=T_ave
T(2:nxm,2:nym,2:nzm)=T_ave

iter=0
maxdiff=huge(maxdiff)
do while (maxdiff>tol_T)
   iter=iter+1

   !Parallel copying of an array
   !$omp parallel workshare
   Q=T
   !$omp end parallel workshare

   maxdiff=0.0_8
   !Here we do the averaging and find the maximum difference abs(T(:,:,:)-Q(:,:,:))
   !in the same loop. With reduction(max:maxdiff), OpenMP automatically initializes
   !each thread's private copy of maxdiff to the most negative representable number.
   !$omp parallel do private(i,j,k,p,a) reduction(max:maxdiff) shared(T,Q)
   do k=2,nzm
      do j=2,nym
         do i=2,nxm
            p=(1/6.0_8)*(Q(i+1,j,k)+Q(i-1,j,k)+Q(i,j+1,k)+Q(i,j-1,k)+Q(i,j,k+1)+Q(i,j,k-1))
            T(i,j,k)=p
            a=abs(p-Q(i,j,k))
            if (a>maxdiff) maxdiff=a
         enddo
      enddo
   enddo
   !$omp end parallel do

   write(*,'(1x,a,i6,a,d10.3)') 'iteration',iter,' maxdiff=',maxdiff
enddo

!Write the temperatures in the vertical mid-plane j=nyh to a file, one z-level per line
nyh=ny/2+1
open(1,file='t.dat')
do k=1,nz
   write(1,*) T(1:nx,nyh,k)
enddo
close(1)

deallocate(T,Q)

end program as05a

!Note: The above OpenMP parallelization gives only a very small speedup (a few percent) on
!a typical PC because the bottleneck in the calculation is not the number crunching (the
!evaluation of p in the loop) but slow memory access. No matter how many cores we have, those
!cores will be idle most of the time waiting for data (elements of Q) to arrive from memory.
!The total memory bandwidth is capped, and even a single thread can pretty much saturate it.
!Moral: when writing parallel programs (be it for CPUs or GPUs) it is very important to identify
!the actual bottlenecks and address them properly, which the straightforward implementation above
!does not do. One way to mitigate the memory bottleneck would be to split the arrays T and Q into
!smaller blocks that fit into the CPU cache (which is much faster than main RAM) and then process
!those blocks one by one rather than sweeping over the entire arrays at once; a sketch of this
!idea is given below.
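
!Below is a minimal sketch of the cache-blocking idea mentioned in the note above. It is not part
!of the assignment solution and is never called from the program: it only illustrates how the sweep
!over the interior points could be split into tiles of bs*bs*bs points, so that the Q values loaded
!for one tile are reused from cache by the six-point stencil instead of being re-read from RAM.
!The subroutine name jacobi_blocked and the tile size bs=32 are assumptions; the tile size would
!have to be tuned to the cache sizes of a given CPU.
subroutine jacobi_blocked(T,Q,nx,ny,nz,maxdiff)
implicit none
integer,intent(in) :: nx,ny,nz
real(8),intent(in) :: Q(nx,ny,nz)      !temperatures from the previous iteration
real(8),intent(inout) :: T(nx,ny,nz)   !updated temperatures (interior points only)
real(8),intent(out) :: maxdiff         !maximum temperature change over the whole sweep
integer,parameter :: bs=32             !tile size (assumed; tune to the cache)
integer ib,jb,kb,i,j,k
real(8) p
maxdiff=0.0_8
!collapse(2) merges the kb and jb tile loops so there are enough tiles to share among threads
!$omp parallel do private(ib,jb,kb,i,j,k,p) reduction(max:maxdiff) shared(T,Q) collapse(2)
do kb=2,nz-1,bs
   do jb=2,ny-1,bs
      do ib=2,nx-1,bs
         !update one tile; the Q neighbours of its points are likely still in cache when reused
         do k=kb,min(kb+bs-1,nz-1)
            do j=jb,min(jb+bs-1,ny-1)
               do i=ib,min(ib+bs-1,nx-1)
                  p=(1/6.0_8)*(Q(i+1,j,k)+Q(i-1,j,k)+Q(i,j+1,k)+Q(i,j-1,k)+Q(i,j,k+1)+Q(i,j,k-1))
                  T(i,j,k)=p
                  maxdiff=max(maxdiff,abs(p-Q(i,j,k)))
               enddo
            enddo
         enddo
      enddo
   enddo
enddo
!$omp end parallel do
end subroutine jacobi_blocked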