!===============================================================================
!
! Routines:
!
! (1) diag main         Originally by MLT       Last Edited: 5/12/2008 (JRD)
!
!     For more details see the README_absorption file.
!
!     Calculates the real and imaginary parts of the macroscopic dielectric
!     function starting from interaction matrix elements calculated by
!     the Kernel code. It uses interpolation in the matrix elements and
!     direct diagonalization of the Bethe-Salpeter equation.
!     Spin-polarized case implemented.
!
!     For more details, see:
!     Rohlfing & Louie, PRB 62:(8), 4927 (2000)
!     G. Strinati, Rivista del Nuovo Cimento 11:(12), 1 (1988)
!
!     Please report bugs to: jdeslip@civet.berkeley.edu
!     
!================================================================================

#include "f_defs.h"

!> Driver for the direct-diagonalization absorption calculation.
!!
!! Stages, in the order they appear below:
!!  1. If flag%spec is 1, the root rank rebuilds the spectra from previously
!!     written eigenvalue files and the routine returns.
!!  2. Otherwise the fine-grid data are read (input, input_q), the
!!     coarse-to-fine transformation coefficients are built (intwfn), and the
!!     velocity or momentum matrix elements s1 are computed or read back.
!!  3. The root rank writes the non-interacting spectrum (absp0). If
!!     xct%iabsorp0 is 1 the routine stops here.
!!  4. The A block (and the B block when xct%tda is false) of the BSE
!!     Hamiltonian is filled: quasiparticle energy differences on the diagonal
!!     in this routine, the electron-hole part through intkernel.
!!  5. If the diagonalization algorithm is selected, the matrix is
!!     diagonalized, the transition matrix elements are formed, and the
!!     eigenvalues, spectra and (if flag%eig is nonzero) eigenvectors are written.
!!
!! Every exit path goes through the internal procedure diag_end.
!!
!! Arguments:
!!  eqp  - quasiparticle energy data. Its ecqp, evqp, eclda and evlda
!!         components are deallocated before returning.
!!  xct  - job parameters. Modified here: indexq_fi is allocated, ifmax is
!!         deallocated (ifmaxq too when input_q was called), and pol and lpol
!!         are overwritten when npol is not 1.
!!  flag - run flags. flag%vm is reset to 0 when it comes in as 2.
!!  neig - number of eigenvalues. With the diagonalization algorithm it is
!!         replaced by nmat when it is 0 under the TDA, and it is overwritten
!!         (nmat or 2*nmat depending on the solver) for full BSE runs.
!!  nmax - not referenced anywhere in this routine.
subroutine diag(eqp,xct,flag,neig,nmax)

  use global_m
  use absp_io_m
  use fullbz_m
  use genwf_m
  use intkernel_m
  use intwfn_m
  use misc_m
  use mtxel_optical_m
  use diagonalize_m
  implicit none

  type (eqpinfo), intent(inout) :: eqp
  type (xctinfo), intent(inout) :: xct
  type (flags), intent(inout) :: flag
  integer, intent(inout) :: neig
  integer, intent(in) :: nmax

  ! NOTE(review): the following locals are declared but never referenced in
  ! this routine: filename, iunit_c, iunit_v, suffix, kg0_temp, umk, kq, qq,
  ! delta, ik_sub, nsub_files, nk_sub, kg_sub_co. ikrq is assigned but never read.
  type (crystal) :: crys
  type (symmetry) :: syms
  type (gspace) :: gvec
  type (grid) :: kg_fi, kgq_fi,kg_co,kgq_co
  type (kpoints) :: kp_fi, kpq_fi
  type (wavefunction) :: wfnc_fi
  type (wavefunction) :: wfnvq_fi
  type (work_genwf) :: work, workq
  type (int_wavefunction) :: intwfnc
  type (int_wavefunction) :: intwfnv

  character :: tmpstr*128,filename*20
  integer :: ii,ipol,jj,ncount,nmat,ncvs_fi,pblock
  integer :: ikb, icb, ivb
  integer :: ik,ikq,ikt,iblock,ikrq,ikcvs,ikcvsd,ic,iv,is
  integer :: iunit_c,iunit_v
  real(DP) :: vol,omega_plasma,en_diff_max
  ! Timer values used only by the internal procedure diag_end.
  real(DP) :: tsec(2),tmin(2),tmax(2)
  
  ! Timer labels and print order, allocated and used only in diag_end.
  character*16, allocatable :: routnam(:)
  integer, allocatable :: routsrt(:)
  integer, allocatable :: fi2co_wfn(:,:),indexq_fi(:)
  real(DP), allocatable :: evals(:), kco(:,:), cs(:,:), cs0(:), rdum(:,:)

  SCALAR, allocatable :: &
    dcc(:,:,:,:,:),dvv(:,:,:,:,:),s0(:),s1(:,:),s1k(:,:,:,:),dummy(:,:), &
    hqpcc(:,:,:), hqpvv(:,:,:), rdum2(:,:)
  SCALAR, allocatable :: dipoles_l(:,:), dipoles_r(:,:), cs_full(:,:)
  !> FHJ: left/right eigenvectors of the BSE Hamiltonian. The left evecs
  !! are only used if we solve the full BSE, without the TDA.
  SCALAR, allocatable :: evecs_r(:,:), evecs_l(:,:)
  !> (kcvs, k`c`v`s`), "A" block of BSE Hamiltonian
  SCALAR, allocatable :: hbse_a(:,:)
  !> (kcvs, k`c`v`s`), "B" block of BSE Hamiltonian, only if tda=.false.
  SCALAR, allocatable :: hbse_b(:,:)
  real(DP), allocatable :: intp_coefs(:,:)
  character(len=2) :: suffix(3) = (/'b1', 'b2', 'b3'/)
  
  ! DYQ: Variables used in finite Q
  integer, allocatable :: kg0_temp(:,:)
  integer :: umk
  real(DP) :: kq(3),qq(3)
  real :: delta

  !DYQ: Variables used for clustered subsampling
  ! dvv_sub, dcc_sub and closepts_sub are only handed to intkernel (when
  ! xct%subsample_line is set); they are never allocated in this routine.
  integer ::ik_sub,nsub_files,nk_sub
  type(grid) :: kg_sub_co
  SCALAR, allocatable :: dvv_sub(:,:,:,:,:,:),dcc_sub(:,:,:,:,:,:)
  integer,allocatable :: closepts_sub(:,:)

  PUSH_SUB(diag)

! JRD: A Check for Finite Q

!      if (peinf%inode .eq. 0) then
!        write(6,*) 'nkpt_co = ', xct%nkpt_co
!      endif

  ! flag%vm equal to 2 is not handled by this code path: warn on the root rank
  ! and fall back to computing the matrix elements (flag%vm = 0).
  if(flag%vm.eq.2) then
    if (peinf%inode.eq.0) then
      write(0,*) 'WARNING: read_eps2_moments not supported in this diagonalization code. Ignoring keyword.'
    endif
    flag%vm=0
  endif
  
!--------------------------
! If eigenvalues.dat is available, read them and go straight to
! calculation of the absorption spectrum
! Only the root rank does any work in this branch; the other ranks go
! straight to diag_end. The arrays filled by the two read routines are
! released after each polarization (presumably the read routines allocate
! them on every call - confirm in absp_io_m).


  if (flag%spec.eq.1) then
    if (peinf%inode .eq. 0) then

      omega_plasma = 0.d0

      write(6,*) 'Create absorption_noeh.dat from eigenvalues_noeh.dat'
      do ipol=1,xct%npol
        call read_eigenvalues_noeh(xct,neig,vol,eqp,s0,ipol)
        call absp0(eqp,xct,s0,vol,omega_plasma,flag,ipol)
        SAFE_DEALLOCATE_P(eqp%evqp)
        SAFE_DEALLOCATE_P(eqp%ecqp)
        SAFE_DEALLOCATE_P(eqp%evlda)
        SAFE_DEALLOCATE_P(eqp%eclda)
        SAFE_DEALLOCATE(s0)
      enddo

      ! The interacting spectrum is rebuilt only when xct%iabsorp0 is 0.
      if (xct%iabsorp0 .eq. 0) then
        write(6,*) 'Create absorption_eh.dat from eigenvalues.dat'
        do ipol=1,xct%npol
          call read_eigenvalues(xct,neig,vol,evals,cs0,ipol)
          call absp(xct,neig,cs0,evals,vol,omega_plasma,flag,ipol)
          SAFE_DEALLOCATE(evals)
          SAFE_DEALLOCATE(cs0)
        enddo
      endif
    endif

    call diag_end()
    POP_SUB(diag)
    return
  endif

!--------------------------
! Read wavefunctions on the fine grid

  call logit('Calling input')
  call timacc(2,1)
  call input(crys,gvec,kg_fi,kp_fi,syms,eqp,xct,flag, &
    omega_plasma,.true.,intwfnc)
  
! If there is no specified number of eigenvalues, calculate
! all eigenvalues

  ! nmat is the dimension of one block of the BSE Hamiltonian:
  ! (fine k-points) x (conduction bands) x (valence bands) x (spins).
  nmat = xct%nkpt_fi*xct%ncb_fi*xct%nvb_fi*xct%nspin
  if (xct%algo==BSE_ALGO_DIAG) then
    if (xct%tda) then
      if (neig==0) neig = nmat
    else
      ! Full BSE: the requested neig is discarded and replaced by the solver
      ! dependent value chosen below.
      if (peinf%inode==0.and.neig/=0) then
        write(0,'(/,a,/)') 'WARNING: BSE calculations ignore the `number_eigenvalues` flag.'
      endif
#ifdef USESCALAPACK
      ! FHJ: Meiyue`s solver is structure preserving, so we only compute the positive evecs
      neig = nmat
#else
      ! FHJ: generic solver
      neig = 2*nmat
#endif
    endif
    ! Sanity check on the final neig: it must be positive, at most 2*nmat,
    ! and at most nmat under the TDA (or always, in the build with the
    ! structure-preserving solver).
    if ((neig<=0).or.(neig>2*nmat).or.(neig>nmat.and.xct%tda)&
#ifdef USESCALAPACK
      .or.(neig>nmat)&
#endif
    ) then
      write(tmpstr,'(a,2i6)') 'Incomprehensible request of eigenvalues : ', neig, nmat
      call die(tmpstr, only_root_writes = .true.)
    endif
  endif
  
  ! Total volume: number of k-points in the full zone times the cell volume.
  vol = xct%nktotal*crys%celvol
  if (peinf%inode.eq.0) then
    write(6,'(/1x,a)') 'More job parameters:'
    write(6,'(1x,a,es9.3e2)') '- Crystal volume (bohr): ', vol
    write(6,'(1x,a,f0.3)') '- Broadening (eV): ', xct%eta
    write(6,'(1x,a,i0)') '- Number of valence bands: ', xct%nvb_fi
    write(6,'(1x,a,i0)') '- Number of cond. bands: ', xct%ncb_fi
    write(6,'(1x,a,i0)') '- Number of spins: ', xct%nspin
    write(6,'(1x,a,i0)') '- Number of eigenvalues to be computed: ', neig
    write(6,'()')
  endif
  call timacc(2,2)
      
  ! Two k to k+q index maps are kept: the local indexq_fi is passed to input_q
  ! and intwfn and released after the matrix-element stage; xct%indexq_fi is
  ! read later when the diagonal of the Hamiltonian is built.
  SAFE_ALLOCATE(indexq_fi, (xct%nkpt_fi))
  SAFE_ALLOCATE(xct%indexq_fi, (xct%nkpt_fi))
  ! input_q is skipped only when the matrix elements are read back
  ! (flag%vm is 1) and flag%read_dtmat is set.
  if (flag%vm.ne.1.or. .not. flag%read_dtmat) then
    call timacc(3,1)
    call logit('Calling input_q')
    call input_q(kp_fi,crys,gvec,kg_fi,kgq_fi,kpq_fi,syms,xct,indexq_fi,eqp,flag,intwfnv)
    call timacc(3,2)
  endif

! JRD: Don`t do this next section if only want absorption_noeh.dat

  if (xct%iabsorp0 .eq. 0) then

!------------------------------
! Calculate the transformation matrices from coarse grid wavefunctions
! FHJ: These are the final transformation coefs that will be used to interpolate
! the kernel. However, we might use an unrestricted version of dvv/dcc to
! interpolate eqp if xct%unrestricted_transf==.true..
    SAFE_ALLOCATE(dvv, (xct%nvb_fi,xct%n1b_co,xct%nspin,xct%nkpt_fi,xct%npts_intp_kernel))
    SAFE_ALLOCATE(dcc, (xct%ncb_fi,xct%n2b_co,xct%nspin,xct%nkpt_fi,xct%npts_intp_kernel))
    SAFE_ALLOCATE(kco, (3,xct%nkpt_co))
    SAFE_ALLOCATE(fi2co_wfn, (xct%npts_intp_kernel,xct%nkpt_fi))
    SAFE_ALLOCATE(intp_coefs, (xct%npts_intp_kernel, xct%nkpt_fi))

    call logit('Calling intwfn')
    call timacc(4,1)
    call intwfn(kp_fi,crys,syms,xct,flag,gvec,kg_fi,kgq_fi,kg_co,kgq_co,dcc,dvv,&
      kco,fi2co_wfn,indexq_fi,eqp,intwfnv,intwfnc,intp_coefs)
    call timacc(4,2)


  endif

  SAFE_DEALLOCATE_P(xct%ifmax)
  if (flag%vm.ne.1.or. .not. flag%read_dtmat) then
    ! otherwise, we did not call input_q to allocate it
    SAFE_DEALLOCATE_P(xct%ifmaxq)
  endif
  
!------------ Calculate the velocity (or momentum) matrix elements -------------

! Each PE calculates a small number of them. At the end, share data
!
! If flag%vm.eq.1, skip this part and just read the matrix elements
! from "vmtxel".
!
! peinf%block_sz = size of a distributed column in hbse_a


  call logit('Calculating v/p matrix elememts')
  ncvs_fi = xct%ncb_fi*xct%nvb_fi*xct%nspin
  ! Block size follows the parallelization mode: whole k-point (ipar 1),
  ! one conduction band of a k-point (ipar 2), or a single (k,c,v) triple.
  if (xct%ipar .eq. 1) then
    peinf%block_sz = ncvs_fi
  else if (xct%ipar .eq. 2) then
    peinf%block_sz = xct%nvb_fi*xct%nspin
  else
    peinf%block_sz = xct%nspin
  endif
  nmat = xct%nkpt_fi*ncvs_fi
  ! s1 holds the matrix elements for all transitions and polarizations;
  ! s1k is the scratch array for a single k-point.
  SAFE_ALLOCATE(s1, (nmat,xct%npol))
  SAFE_ALLOCATE(s1k, (xct%ncb_fi,xct%nvb_fi,xct%nspin,xct%npol))
  s1 = ZERO
  s1k = ZERO

  call timacc(10,1)

  if (flag%vm.eq.0) then
    ! Loop over the k-points owned by this rank.
    do ikt=1, peinf%ikt(peinf%inode+1)
      ik = peinf%ik(peinf%inode+1,ikt)
      ! NOTE(review): ikq is assigned only when xct%qflag is 1, yet it is
      ! passed to genwf below unconditionally. Whether genwf reads it for other
      ! qflag values cannot be seen from this file - confirm it is never used
      ! while undefined.
      if (xct%qflag .eq. 1) then
        ikq = indexq_fi(ik)
      endif
      ikrq = kg_fi%indr(ik) ! ikrq is not read anywhere else in this routine
      
      ! Conduction states at k and valence states on the shifted grid.
      call genwf(crys,gvec,kg_fi,syms,wfnc_fi,xct,ik,ik,work,intwfnc,is_cond = .true.)
      call genwf(crys,gvec,kgq_fi,syms,wfnvq_fi,xct,ik,ikq,workq,intwfnv,is_cond = .false.)

      if (xct%npol==1) then
        ! NOTE(review): this branch selects mtxel_v when xct%qflag is not 1,
        ! while the multi-polarization branch below tests xct%qflag equal to 2.
        ! The two conditions differ for qflag 0 - confirm this is intended.
        if (flag%opr.eq.0 .or. xct%qflag.ne.1) then
          call mtxel_v(wfnc_fi,wfnvq_fi,gvec,xct%qshift,wfnc_fi%nband,wfnvq_fi%nband,s1k(:,:,:,1))
        elseif (flag%opr.eq.1) then
          call mtxel_m(crys,wfnc_fi,wfnvq_fi,gvec,eqp,xct,wfnc_fi%nband,wfnvq_fi%nband,s1k(:,:,:,1),ik,.true.)
        endif
      else
        ! One matrix element set per direction: xct%pol is set to the ipol-th
        ! unit vector and xct%lpol to its length in the crys%bdot metric.
        ! The loop bound is the literal 3 while the last extent of s1k is
        ! xct%npol - presumably npol is 3 whenever this branch runs; verify.
        do ipol=1,3
          xct%pol = ZERO
          xct%pol(ipol) = 1d0
          xct%lpol=sqrt(DOT_PRODUCT(xct%pol,MATMUL(crys%bdot,xct%pol)))
          if (flag%opr.eq.0 .or. xct%qflag.eq.2) then
            call mtxel_v(wfnc_fi,wfnvq_fi,gvec,xct%qshift,wfnc_fi%nband,wfnvq_fi%nband,s1k(:,:,:,ipol))
          else
            call mtxel_m(crys,wfnc_fi,wfnvq_fi,gvec,eqp,xct,wfnc_fi%nband,wfnvq_fi%nband,s1k(:,:,:,ipol),ik,.true.)
          endif
        enddo
      endif

      ! Scatter the per-k results into the global transition index.
      do is=1,xct%nspin
        do ic=1,xct%ncb_fi
          do iv=1,xct%nvb_fi
            s1(bse_index(ik, ic, iv, is, xct),:) = s1k(ic,iv,is,:)
          enddo
        enddo
      enddo
      SAFE_DEALLOCATE_P(wfnc_fi%cg)
      SAFE_DEALLOCATE_P(wfnc_fi%isort)
      SAFE_DEALLOCATE_P(wfnvq_fi%cg)
      SAFE_DEALLOCATE_P(wfnvq_fi%isort)
    enddo
    
    ! typedefs initializes all of these ikolds to 0
    if(work%ikold.ne.0) then
      SAFE_DEALLOCATE_P(work%cg)
      SAFE_DEALLOCATE_P(work%ph)
      SAFE_DEALLOCATE_P(work%ind)
      SAFE_DEALLOCATE_P(work%isort)
    endif
    if(workq%ikold.ne.0) then
      SAFE_DEALLOCATE_P(workq%cg)
      SAFE_DEALLOCATE_P(workq%ph)
      SAFE_DEALLOCATE_P(workq%ind)
      SAFE_DEALLOCATE_P(workq%isort)
    endif
! Share matrix elements
! s1 was zeroed above and each rank filled only the rows of its own k-points,
! so the sum over ranks assembles the complete array on every rank.

#ifdef MPI
    SAFE_ALLOCATE(dummy, (nmat,xct%npol))
    dummy = s1
    call MPI_ALLREDUCE(dummy(1,1), s1(1,1), size(dummy), MPI_SCALAR, MPI_SUM, &
      MPI_COMM_WORLD, mpierr)
    SAFE_DEALLOCATE(dummy)
#endif

    call write_vmtxel(xct,flag,nmat,s1)
    
  else ! ...if the matrix elements were calculated already
    call read_vmtxel(xct,flag,nmat,s1)
  endif

  call timacc(10,2)

  ! Same condition as the input_q call above: kgq_fi is released only on the
  ! paths where input_q was called.
  if (flag%vm.ne.1.or. .not. flag%read_dtmat) then
    call dealloc_grid(kgq_fi)
  endif

  if (flag%vm == 0 .and. .not. flag%read_dtmat) then
    SAFE_DEALLOCATE_P(intwfnc%cgk)
    SAFE_DEALLOCATE_P(intwfnv%cgk)
    SAFE_DEALLOCATE_P(intwfnc%isort)
    SAFE_DEALLOCATE_P(intwfnv%isort)
  endif

! End Calculating Matrix Elements
!-------------------------------------------------------------------------------

!----------------------------
! Calculate the non-interacting spectrum. Only one PE works

  call timacc(9,1)
  call logit('Calling absp0')
  if (peinf%inode.eq.0) then
    do ipol=1,xct%npol
      call absp0(eqp,xct,s1(:,ipol),vol,omega_plasma,flag,ipol)
    enddo
  endif
  call timacc(9,2)
  
  SAFE_DEALLOCATE_P(eqp%eclda)
  SAFE_DEALLOCATE_P(eqp%evlda)
  SAFE_DEALLOCATE(s1k)
  SAFE_DEALLOCATE(indexq_fi)

! JRD If we only want absorp0 - we finish here

  if (xct%iabsorp0 .eq. 1) then
    call diag_end()
    POP_SUB(diag)
    return
  endif

!------------ Build Hamiltonian Matrix --------------------------------------------

! Build Hamiltonian matrix. Diagonal part comes from the "non-interacting"
! quasiparticle Hamiltonians.  If the QP Greens function is diagonal,
! then these are just the quasiparticle energy differences on the
! diagonal.  The more general case is:
!
!       <cvk|H0|c'v'k'> = delta(k,k') *
!            [ <c|hqp|c'>*delta(v',v) - delta(c,c')*<v'|hqp|v> ]
!
! The rest of the Hamiltonian, which is the electron-hole interaction,
! comes from interpolation further below.

  call logit('Building non-interacting Hamiltonian')
  ! Rows span the full transition space; columns are the blocks held locally
  ! (peinf%nblocks blocks of peinf%block_sz columns each).
  SAFE_ALLOCATE(hbse_a, (xct%nkpt_fi*ncvs_fi, peinf%nblocks*peinf%block_sz))
  hbse_a(:,:) = 0.0d0
  if (.not.xct%tda) then
    SAFE_ALLOCATE(hbse_b, (xct%nkpt_fi*ncvs_fi, peinf%nblocks*peinf%block_sz))
    hbse_b(:,:) = 0.0d0
  endif

! iblock loop. This loop is over proc owned k if ipar = 1, (k,c)  if 
! ipar = 2 and (k,c,v) if ipar = 3 

  ! NOTE(review): en_diff_max is accumulated and reduced over ranks below but
  ! is not read again anywhere in this routine.
  en_diff_max = 0d0
  do iblock=1,peinf%ibt(peinf%inode+1)
    ik=peinf%ikb(iblock)
    
    if (ik .eq. 0) then
      write(0,*) "Illegal value for ik",peinf%inode, iblock, ik
      call die("internal error in diag, ik = 0")
    endif
    
! Build <c|hqp|c'> and <v|hqp|v'> for this kpoint

    SAFE_ALLOCATE(hqpcc, (xct%ncb_fi,xct%ncb_fi,xct%nspin))
    SAFE_ALLOCATE(hqpvv, (xct%nvb_fi,xct%nvb_fi,xct%nspin))
    hqpcc = 0.0d0
    hqpvv = 0.0d0
    
! Put QP energies on diagonals of hqpcc and hqpvv to start
    do is=1,xct%nspin
      do ic=1,xct%ncb_fi
        ! Skip if k+Q falls outside the patch
        if (xct%indexq_fi(ik).eq.0 .and. xct%patched_sampling) cycle
        hqpcc(ic,ic,is) = eqp%ecqp(ic,ik,is)
      enddo
      do iv=1,xct%nvb_fi
        if (xct%qflag.ne.2) then
          !DYQ: for qflag.eq.0, eqp%evqp(iv,ik,is) corresponds to the k-point kgq_fi%f(xct%indexq_fi(ik))
          hqpvv(iv,iv,is) = eqp%evqp(iv,ik,is)
        else
          ! Skip if k+Q falls outside the patch
          if (xct%indexq_fi(ik).eq.0 .and. xct%patched_sampling) cycle
          hqpvv(iv,iv,is) = eqp%evqp(iv,xct%indexq_fi(ik),is)
        endif
      enddo
    enddo
    
! Read possible offdiagonal QP elements from "hqp.<ik>" file
! if it exists.  JRD: This is broken for now.  Someone should fix
! it in the future if they want to use it

    !if (ik.lt.10) then
    !  write(tmpstr,'(a,i1)') 'hqp.',ik
    !else if (ik.lt.100) then
    !  write(tmpstr,'(a,i2)') 'hqp.',ik
    !else if (ik.lt.1000) then
    !  write(tmpstr,'(a,i3)') 'hqp.',ik
    !else if (ik.lt.100000) then
    !  write(tmpstr,'(a,i5)') 'hqp.',ik
        !else
    !  write(0,*) 'too many kpoints for reading hqp'
    !endif
    !call open_file(9,file=tmpstr,form='formatted',status='old')
    !if (is.eq.0) then
    !  if (peinf%inode.eq.0) then
    !    write(6,*) 'Reading offdiagonal hqp from file ',tmpstr
    !    write(6,*) 'All values in eV'
    !  endif
    !  do
    !    read(9,*,end=999) nocc,ii,jj,x,y
    
    ! if ii and jj both refer to valence, states, put
    ! matrix element into hqpvv
    
    !    if ((ii<=nocc).and.(ii>nocc-xct%nvb_fi).and. &
    !    (jj<=nocc).and.(jj>nocc-xct%nvb_fi)) then
    !      if (peinf%inode.eq.0) write(6,'(a,2i5,2f20.10)') ' hqp(v,vp) = ',ii,jj,x,y
    !      ii=nocc-ii+1
    !      jj=nocc-jj+1
    !      is = 1
#ifdef CPLX
    !      hqpvv(ii,jj,is) = CMPLX(x,y)/ryd
#else
    !      hqpvv(ii,jj,is) = x/ryd
#endif
    !    else if ((ii>nocc).and.(ii<=nocc+xct%ncb_fi).and. &
    !    (jj>nocc).and.(jj<=nocc+xct%ncb_fi)) then
    !      if (peinf%inode.eq.0) write(6,'(a,2i5,2f20.10)') ' hqp(c,cp) = ',ii,jj,x,y
    !      ii=ii-nocc
    !      jj=jj-nocc
    !      is = 1
#ifdef CPLX
    !      hqpcc(ii,jj,is) = CMPLX(x,y)/ryd
#else
    !      hqpcc(ii,jj,is) = x/ryd
#endif
    !    endif
    !  enddo
    !999      call close_file(9)
    !  write(6,*)
    !endif ! if hqp.<ik> was found
    
    ! Now build hamiltonian from hqcc and hqvv
    ! Consider only diagonal elements in k,v,c

    ! FHJ: Note: here, iv and ic are dummy indices, and the actual band indices
    ! are ivb/icb. When should we use the dummy or real band indices?
    ! - Use the dummy indices iv/ic to index a column of hbse_a (which is distributed)
    ! - Use the real indices ivb/icb to index a row of hbse_a (which is not distributed)
    ikb = ik
    do is=1,xct%nspin 
      do iv=1,peinf%nv_block ! 1 for ipar==3 (iv runs over real valence bands for ipar==1 and ipar==2)
        do ic=1,peinf%nc_block !1 for ipar==2 or ipar==3
          if (xct%ipar==1) then
            ivb=iv
            icb=ic
          else if (xct%ipar==2) then
            icb=peinf%icb(iblock)
            ivb=iv
          else if (xct%ipar==3) then
            ivb=peinf%ivb(iblock)
            icb=peinf%icb(iblock)
          endif
          ! ikcvs: global row index; ikcvsd: column index within the locally
          ! held blocks.
          ikcvs = bse_index(ikb, icb, ivb, is, xct)
          ikcvsd = bse_index(iblock, ic, iv, is, xct, ncband=peinf%nc_block, nvband=peinf%nv_block)
          en_diff_max = max(en_diff_max, dble(hqpcc(icb,icb,is) - hqpvv(ivb,ivb,is)))
          hbse_a(ikcvs,ikcvsd) = hqpcc(icb,icb,is) - hqpvv(ivb,ivb,is)
        enddo
      enddo
    enddo
    SAFE_DEALLOCATE(hqpcc)
    SAFE_DEALLOCATE(hqpvv)
  enddo ! loop on k-points on this processor
#ifdef MPI
  call MPI_Allreduce(MPI_IN_PLACE, en_diff_max, 1, MPI_DOUBLE_PRECISION, &
    MPI_MAX, MPI_COMM_WORLD, mpierr)
#endif

!----------------------------
! Define the mapping of eigenvectors: the ii-th column of the matrix
! evecs_r(:,:) stored in PE #ipe will contain the eigenvector of order
! peinf%peig(ipe,ii). The total number of eigenvectors stored in
! each processor is given by peinf%neig(1:peinf%npes).
! pblock >= maxval(peinf%neig(1:peinf%npes))

  ! FHJ: Note: pblock gets ~doubled automatically in full BSE calculations.
  ! In BLACS terms, the following lines would set up a 1d block-column
  ! distribution for the eigenvectors with:
  ! M=(2*)nmat, N=neig, MB=M, NB=peinf%block_sz, LLD=M
  ! We should really get rid of this manual distribution and use BLACS.
  ! pblock = ceiling(neig / (npes*block_sz)) * block_sz, i.e. the per-rank
  ! column capacity rounded up to a whole number of blocks.
  pblock = neig/(peinf%npes*peinf%block_sz)
  if (pblock*peinf%npes*peinf%block_sz.lt.neig) pblock = pblock + 1
  pblock = pblock*peinf%block_sz
  SAFE_ALLOCATE(peinf%neig, (peinf%npes))
  SAFE_ALLOCATE(peinf%peig, (peinf%npes,pblock))
  peinf%neig=0
  peinf%peig=0
  ! Deal the eigenvector indices out round-robin, block_sz consecutive
  ! indices at a time; ii is the rank (1-based) currently receiving.
  ii=1
  do jj=1,neig
    if (ii.eq.peinf%npes+1) ii=1
    peinf%neig(ii)=peinf%neig(ii)+1
    peinf%peig(ii,peinf%neig(ii))=jj
    if (mod(jj,peinf%block_sz).eq.0) ii=ii+1
  enddo

!-----------------------------
! Interpolation scheme in the Kernel
! The four calls below differ only in the optional arguments: hbse_b is passed
! when the TDA is off, and the subsampling arrays when xct%subsample_line is set.

  call logit('Calling intkernel')
  call timacc(5,1)
  if (xct%tda) then
    if (xct%subsample_line) then
      call intkernel(crys,kg_fi,kp_fi,syms,xct,hbse_a,dcc,dvv,kco,fi2co_wfn,flag,gvec,intp_coefs,&
        dcc_sub=dcc_sub,dvv_sub=dvv_sub,closepts_sub=closepts_sub)
    else
      call intkernel(crys,kg_fi,kp_fi,syms,xct,hbse_a,dcc,dvv,kco,fi2co_wfn,flag,gvec,intp_coefs)
    endif
  else
    if (xct%subsample_line) then
      call intkernel(crys,kg_fi,kp_fi,syms,xct,hbse_a,dcc,dvv,kco,fi2co_wfn,flag,gvec,intp_coefs,hbse_b=hbse_b,&
        dcc_sub=dcc_sub,dvv_sub=dvv_sub,closepts_sub=closepts_sub)
    else
      call intkernel(crys,kg_fi,kp_fi,syms,xct,hbse_a,dcc,dvv,kco,fi2co_wfn,flag,gvec,intp_coefs,hbse_b=hbse_b)
    endif
  endif
  call logit('Done intkernel')
  call timacc(5,2)
  SAFE_DEALLOCATE(fi2co_wfn)
  SAFE_DEALLOCATE(dcc)
  SAFE_DEALLOCATE(dvv)
  SAFE_DEALLOCATE(kco)


!--------------------------------
! Exact diagonalization


  if (xct%algo==BSE_ALGO_DIAG) then

    call logit('Calling diagonalize')
    call timacc(6,1)
    SAFE_ALLOCATE(evals, (neig))
    ! Eigenvector storage: nmat rows under the TDA, 2*nmat rows (and an extra
    ! array for the left eigenvectors) for the full BSE.
    if (xct%tda) then
      SAFE_ALLOCATE(evecs_r, (nmat,pblock))
      call diagonalize(xct, neig, nmat, hbse_a, evals, evecs_r)
    else
      SAFE_ALLOCATE(evecs_r, (2*nmat,pblock))
      SAFE_ALLOCATE(evecs_l, (2*nmat,pblock))
      call diagonalize(xct, neig, nmat, hbse_a, evals, evecs_r, hbse_b=hbse_b, evecs_l=evecs_l)
      if (peinf%inode==0) then
        write(6,'(/,1x,a,i0)') 'Number of positive eigenvalues: ', count(evals>0d0)
      endif
    endif
    call timacc(6,2)
  
    !--------------------------------
    ! Calculate transition matrix elements
    ! oscillator strength = 2 * cs / omega, as defined below

    call timacc(11,1)
    call logit('Computing transition matrix elements')
    ! All result arrays start at zero; each rank fills only the entries of the
    ! eigenvectors it holds, so the sums over ranks further down assemble the
    ! full arrays on the root rank.
    SAFE_ALLOCATE(cs, (neig,xct%npol))
    cs = 0d0
    SAFE_ALLOCATE(dipoles_r, (neig,xct%npol))
    dipoles_r = ZERO
    if (.not.xct%tda) then
      SAFE_ALLOCATE(dipoles_l, (neig,xct%npol))
      dipoles_l = ZERO
      SAFE_ALLOCATE(cs_full, (neig,xct%npol))
      cs_full = ZERO
    endif

    ! The factor of 1/sqrt(dble(xct%nspin)) below is required to obtain the same
    ! transition matrix elements for the singlet excitons for nspin = 1 and nspin = 2,
    ! See just after eq. (25) in Rohlfing and Louie, PRB 62, 4927 (2000)
    do ipol=1,xct%npol
      if (xct%tda) then
        ! ii is the local column, jj the global eigenvector index it holds.
        do ii=1,peinf%neig(peinf%inode+1)
          jj = peinf%peig(peinf%inode+1,ii)
          dipoles_r(jj,ipol) = sum(evecs_r(1:nmat,ii)*MYCONJG(s1(1:nmat,ipol))) / sqrt(dble(xct%nspin))
          cs(jj,ipol) = MYABS2(dipoles_r(jj,ipol))
        enddo
      else
        do ii=1,peinf%neig(peinf%inode+1)
          jj = peinf%peig(peinf%inode+1,ii)
          ! FHJ: contributions from positive transitions
          dipoles_l(jj,ipol) = sum(evecs_l(1:nmat,ii)*MYCONJG(s1(1:nmat,ipol))) / sqrt(dble(xct%nspin))
          dipoles_r(jj,ipol) = sum(evecs_r(1:nmat,ii)*MYCONJG(s1(1:nmat,ipol))) / sqrt(dble(xct%nspin))
          ! FHJ: contributions from negative transitions. Some notes:
          ! (1) For the velocity matrix elements: s_(c->v) = - s_(v->c)^* . Proof:
          !     1st-order expand the wfns |ck> -> |ck+q> and |vk> -> |vk+q> and project
          !     onto <vk| and <ck|. Note that there`s a sign flip in the energy denom.
          ! (2) There`s a negative sign in dipoles_r for (c->v) transitions, which
          !     originates from Fermi factors. See eqn 8 in PRL 80, 4510 (1998).
          dipoles_l(jj,ipol) = dipoles_l(jj,ipol) &
            - sum(evecs_l(nmat+1:2*nmat,ii)*(-s1(1:nmat,ipol))) / sqrt(dble(xct%nspin))
          dipoles_r(jj,ipol) = dipoles_r(jj,ipol) &
            + sum(evecs_r(nmat+1:2*nmat,ii)*(-s1(1:nmat,ipol))) / sqrt(dble(xct%nspin))
          cs_full(jj,ipol) = MYCONJG(dipoles_l(jj,ipol)) * dipoles_r(jj,ipol)
        enddo
      endif
    enddo

    ! Collect the results on rank 0 only (reduce, not allreduce): the arrays
    ! are complete on the root rank alone after this point.
#ifdef MPI
    SAFE_ALLOCATE(rdum, (neig,xct%npol))
    rdum = cs
    call MPI_REDUCE(rdum(1,1), cs(1,1), size(rdum), MPI_REAL_DP, MPI_SUM, 0, MPI_COMM_WORLD, mpierr)
    SAFE_DEALLOCATE(rdum)
    SAFE_ALLOCATE(rdum2, (neig,xct%npol))
    rdum2 = dipoles_r
    call MPI_REDUCE(rdum2(1,1), dipoles_r(1,1), size(rdum2), MPI_SCALAR, MPI_SUM, 0, MPI_COMM_WORLD, mpierr)
    if (.not.xct%tda) then
      rdum2 = dipoles_l
      call MPI_REDUCE(rdum2(1,1), dipoles_l(1,1), size(rdum2), MPI_SCALAR, MPI_SUM, 0, MPI_COMM_WORLD, mpierr)
      rdum2 = cs_full
      call MPI_REDUCE(rdum2(1,1), cs_full(1,1), size(rdum2), MPI_SCALAR, MPI_SUM, 0, MPI_COMM_WORLD, mpierr)
     endif
    SAFE_DEALLOCATE(rdum2)
#endif
    ! Full BSE: the real part of cs_full becomes the oscillator strength on
    ! the root rank; in the complex build the size of the discarded imaginary
    ! part is reported.
    ! NOTE(review): the relative measure divides by the real part of cs_full
    ! element by element, so an entry with zero real part yields Inf or NaN
    ! in the printed maximum.
    if (.not.xct%tda) then
      if (peinf%inode==0) then
        cs = dble(cs_full)
#ifdef CPLX
        write(6,'(/,1x,a)') 'Imaginary part of the oscillator strength:'
        write(6,'(1x,a,es9.3e2)') ' - Max. absolute value: ', maxval(abs(AIMAG(cs_full)))
        write(6,'(1x,a,es9.3e2)') ' - Max. absolute value relative to real part: ', &
          maxval(abs(AIMAG(cs_full)/dble(cs_full)))
        write(6,*)
#endif
      endif
    endif

    call timacc(11,2)

    ! Convert eigenvalues to eV and write them out
    evals(:) = evals(:)*ryd
    if (.not.xct%tda) then
      call write_eigenvalues(xct,flag,neig,vol,evals,cs,dipoles_r,dipoles_l=dipoles_l)
    else
      call write_eigenvalues(xct,flag,neig,vol,evals,cs,dipoles_r)
    endif
    
    SAFE_DEALLOCATE(dipoles_r)
    if (.not.xct%tda) then
      SAFE_DEALLOCATE(dipoles_l)
    endif

    !------------------------------
    ! Calculate the absorption and density of excitonic states

    call timacc(12,1)
    call logit('Calling absp')
    if (peinf%inode==0) then
      do ipol=1,xct%npol
        call absp(xct, neig, cs(:, ipol), evals, vol, omega_plasma, flag, ipol)
      enddo
    endif
    call timacc(12,2)

  !------------------------------
  ! Write out eigenvectors to file if needed

    call timacc(13,1)
    if (flag%eig/=0) then
      call logit('Calling write_eigenvectors')
      if (xct%tda) then
        call write_eigenvectors(xct,kg_fi,nmat,pblock,neig,evals,evecs_r,flag%eig)
      else
        call write_eigenvectors(xct,kg_fi,2*nmat,pblock,neig,evals,evecs_r,flag%eig,evecs_l=evecs_l)
      endif
    endif
    call timacc(13,2)

    SAFE_DEALLOCATE(cs)
    SAFE_DEALLOCATE(evals)
    SAFE_DEALLOCATE(evecs_r)
    if (.not.xct%tda) then
      SAFE_DEALLOCATE(evecs_l)
      SAFE_DEALLOCATE(cs_full)
    endif

  endif
  ! xct%algo==BSE_ALGO_DIAG

  SAFE_DEALLOCATE(s1)
  
!-------------------------------
! Deallocate stuff

  SAFE_DEALLOCATE_P(peinf%neig)
  SAFE_DEALLOCATE_P(peinf%peig)
  call dealloc_grid(kg_fi)

  ! The spline arrays are released only when the spline holds data (n > 0).
  if (eqp%spl_tck%n>0) then
    SAFE_DEALLOCATE_P(eqp%spl_tck%t)
    SAFE_DEALLOCATE_P(eqp%spl_tck%c)
  endif

  SAFE_DEALLOCATE(hbse_a)
  if (.not.xct%tda) then
    SAFE_DEALLOCATE(hbse_b)
  endif
  SAFE_DEALLOCATE_P(eqp%ecqp)
  SAFE_DEALLOCATE_P(eqp%evqp)
  ! intp_coefs was allocated only inside the xct%iabsorp0 == 0 branch above.
  ! NOTE(review): xct%indexq_fi, allocated earlier in this routine, is not
  ! released here - presumably a caller owns it afterwards; confirm.
  SAFE_DEALLOCATE(intp_coefs)

  if (xct%iwritecoul .eq. 1 .and. peinf%inode .eq. 0) then
    call close_file(19) ! file vcoul
  endif

  call diag_end()

  POP_SUB(diag)
  return

contains

  !> Common exit path of diag: synchronize the ranks (parallel build), stop
  !! the total timer and print, on the root rank, a table with the minimum,
  !! root-rank and maximum CPU and wall time of every timer listed in routsrt.
  !! Uses the host variables routnam, routsrt, tsec, tmin, tmax, ncount and ii.
  subroutine diag_end()

    ! Row layouts of the timing table: column header, then the three lines
    ! (min., PE 0, max.) written for every timer.
    character(len=*), parameter :: fmt_header = "(23x,a13,3x,a13,3x,a8)"
    character(len=*), parameter :: fmt_min = "(1x,a16,'(min.)',f13.3,3x,f13.3,3x,i8)"
    character(len=*), parameter :: fmt_pe0 = "(17x,'(PE 0)',f13.3,3x,f13.3)"
    character(len=*), parameter :: fmt_max = "(17x,'(max.)',f13.3,3x,f13.3)"

    PUSH_SUB(diag.diag_end)

#ifdef MPI
    call MPI_BARRIER(MPI_COMM_WORLD,mpierr)
#endif

!--------------------------------
! Time accounting

    ! Print order: timers 2-13, then 41-47, then 51-58, and the total (1) last.
    SAFE_ALLOCATE(routsrt, (28))
    routsrt = (/ (ii,ii=2,13), (ii,ii=41,47), (ii,ii=51,58), 1 /)

    ! Labels are indexed by timer number; only the slots named in routsrt
    ! are filled in.
    SAFE_ALLOCATE(routnam, (60))

    ! Timers 1-13: top-level stages of diag
    routnam(1)  = 'TOTAL:'
    routnam(2)  = 'INPUT:'
    routnam(3)  = 'INPUT_Q:'
    routnam(4)  = 'INTWFN:'
    routnam(5)  = 'INTKERNEL:'
    routnam(6)  = 'DIAGONALIZE:'
    routnam(7)  = 'EPSDIAG:'
    routnam(8)  = 'EPS COMM:'
    routnam(9)  = 'ABSORP0:'
    routnam(10) = 'VMTXEL:'
    routnam(11) = 'TRANS MTXEL:'
    routnam(12) = 'ABSORP:'
    routnam(13) = 'WRITE EIG:'

    ! Timers 41-47: labels carrying the IW prefix
    routnam(41) = 'IW Input_co:'
    routnam(42) = 'IW Interp:'
    routnam(43) = 'IW Genwf:'
    routnam(44) = 'IW Gwnwf_Co:'
    routnam(45) = 'IW Mtxel_t:'
    routnam(46) = 'IW Write:'
    routnam(47) = 'IW Reduce:'

    ! Timers 51-58: labels carrying the IK prefix
    routnam(51) = 'IK Setup:'
    routnam(52) = 'IK C-Check:'
    routnam(53) = 'IK Input:'
    routnam(54) = 'IK Inteps:'
    routnam(55) = 'IK Vcoul:'
    routnam(56) = 'IK Cache:'
    routnam(57) = 'IK Interp:'
    routnam(58) = 'IK Sum:'

    ! Stop the total timer before any timer is queried.
    call timacc(1,2)

    if (peinf%inode == 0) then
      write(6,*)
      write(6,fmt_header) 'CPU (s)','WALL (s)','#'
      write(6,*)
    endif

    do ii = 1, size(routsrt)
      call timacc(routsrt(ii),3,tsec,ncount)
#ifdef MPI
      call MPI_ALLREDUCE(tsec,tmin,2,MPI_REAL_DP,MPI_MIN,MPI_COMM_WORLD,mpierr)
      call MPI_ALLREDUCE(tsec,tmax,2,MPI_REAL_DP,MPI_MAX,MPI_COMM_WORLD,mpierr)
#else
      tmin = tsec
      tmax = tsec
#endif
      ! Every rank takes part in the reductions above; only the root prints.
      if (peinf%inode /= 0) cycle

      ! Blank line wherever the timer numbering jumps, i.e. between groups.
      ! The two tests stay nested so that routsrt(ii-1) is never evaluated
      ! for ii equal to 1.
      if (ii > 1) then
        if (routsrt(ii) /= routsrt(ii-1) + 1) write(6,*)
      endif

      write(6,fmt_min) routnam(routsrt(ii)),tmin(1),tmin(2),ncount
      write(6,fmt_pe0) tsec(1),tsec(2)
      write(6,fmt_max) tmax(1),tmax(2)
    enddo

    POP_SUB(diag.diag_end)
    return
  end subroutine diag_end

end subroutine diag
