/*
    Copyright (C) 1998  Dennis Roddeman
    email: dennis.roddeman@uibk.ac.at

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software Foundation 
    59 Temple Place, Suite 330, Boston, MA, 02111-1307, USA
*/

#include "tochnog.h"

long int *global_local, nlocal;
double *b, *x, *d1, *d2, *p, *Ad1_thread, *Ad2_thread, *p_thread, *residue_thread;

#define EPS_EIGEN 1.e-10
#define EPS_ERROR 1.e-8
#define EPS_CHECK_ERROR 1.e-12
#define EPS_dAd 1.e-16
#define EPS_TMP 1.e-16
#define EPS_P 1.e-10

void solve( long int task )

{
  integer n=0, kl=0, ku=0, nrhs=0, ldmat=0, ldmatlin=0, 
    ldmatss=0, ldb=0, lmat=0, lmatlin=0, lmatss=0, lz=0, 
    lw=0, lwork=0, info=0, kd=0, ldz=0;
  long int i=0, ii=0, jj=0, inod=0, jnod=0, nnod=0,
    max_node=0, ipuknwn=0, iuknwn=0, element=0, 
    max_element=0, swit=0, length=0, band=0, itmp=0,
    ilocal=0, jlocal=0, iglobal=0, jglobal=0, indx=0, lb=0,
    lglobal_local=0, ready=0, anything_ordered=0, 
    icontrol=0, ieigen=0, neigen=0, new_node=0, ldum=0, idum[1], control_eigen[2],
    node_bounded[MPUKNWN], dof_principal[MUKNWN], 
    node_node[DATA_ITEM_SIZE], node_tyings_unknowns[MUKNWN],
    *element_matrix_unknowns=NULL, *ipiv=NULL, 
    *ordered_global=NULL, *global_ordered=NULL, *ordering_has_been_done=NULL;
  double max=0., tmp=0., control_eigen_scale=0., ddum[1],
    node_eigen[DATA_ITEM_SIZE], *mat=NULL, *matlin=NULL, 
    *matss=NULL, *w=NULL, *z=NULL,
    *work=NULL, *node_lhside=NULL, *node_rhside=NULL, *element_matrix_values=NULL,  
    *element_matrix_stress_stiffness_values=NULL, *node_dof_new=NULL;
  char jobz[10], uplo[10];

  if ( task==-DIAGONAL ) return;

  swit = set_swit(-1,-1,"solve");
  if ( swit ) pri( "In routine SOLVE" );

  if ( swit ) pri( "task", task );
  db_highest_index( ELEMENT_MATRIX_UNKNOWNS, max_element, VERSION_NORMAL );
  if ( max_element<0 ) {
    cout << "Error: " << db_name(task) << " can only be used after the\n";
    cout << "       matrices have been set up with -matrix in control_solver.\n";
    exit(TN_EXIT_STATUS);
  }

  db( ICONTROL, 0, &icontrol, ddum, ldum, VERSION_NORMAL, GET );
  if ( task==-CONTROL_EIGEN ) {
    db( CONTROL_EIGEN, icontrol, control_eigen, ddum, length, VERSION_NORMAL, GET );
    if ( swit ) pri( "control_eigen", control_eigen, 2 );
    if ( !db( CONTROL_EIGEN_SCALE, icontrol, idum, &control_eigen_scale,
      ldum, VERSION_NORMAL, GET_IF_EXISTS ) ) control_eigen_scale = 1;
    db_highest_index( NODE_LHSIDE, max_node, VERSION_NORMAL );
    db_highest_index( ELEMENT_MATRIX_VALUES, max_element, VERSION_NORMAL );
    if ( max_node<0 && max_element<0 ) {
      cout << "\nError: CONTROL_EIGEN is only possible after at least ";
      cout << "once CONTROL_TIMESTEP\n";
    }
  }
  else
    array_set( control_eigen, 0, 2 );

  db_highest_index( ELEMENT, max_element, VERSION_NORMAL );
  db_highest_index( NODE, max_node, VERSION_NORMAL );
  db( DOF_PRINCIPAL, 0, dof_principal, ddum, ldum, VERSION_NORMAL, GET );

  length = (1+max_node)*nprinc;
  b = get_new_dbl( length );
  array_set( b, 0., length );

  lglobal_local = (1+max_node)*npuknwn*2;
  global_local = get_new_int( lglobal_local );
  array_set( global_local, -NO, lglobal_local );

  length = 1+max_node;
  global_ordered = get_new_int( length );
  ordered_global = get_new_int( length );
  ordering_has_been_done = get_new_int( length );
  array_set( global_ordered, -NO, length );
  array_set( ordered_global, -NO, length );
  array_set( ordering_has_been_done, 0, length );

    // order nodes numbers to minimize band width
  if ( task==-MATRIX_ITERATIVE ) {
    for ( inod=0; inod<=max_node; inod++ ) {
      if ( db_active_index( NODE, inod, VERSION_NORMAL ) ) {
        global_ordered[inod] = inod;
        ordered_global[inod] = inod;
      }
    }
  }
  else {
    new_node = 0;
    global_ordered[max_node] = new_node;
    ordered_global[new_node] = max_node;
    while ( !ready ) {
      anything_ordered = 0;
      for ( inod=0; inod<=max_node; inod++ ) {
        if ( global_ordered[inod]!=-NO && !ordering_has_been_done[inod] ) {
          anything_ordered = 1;
          ordering_has_been_done[inod] = 1;
          db( NODE_NODE, inod, node_node, ddum, nnod, VERSION_NORMAL, GET );
          for ( jj=0; jj<nnod; jj++ ) {
            jnod = node_node[jj];
            if ( global_ordered[jnod]==-NO ) {
              new_node++;
              global_ordered[jnod] = new_node;
              ordered_global[new_node] = jnod;
              itmp = ( 1 + scalar_iabs( global_ordered[inod] -
                global_ordered[jnod] ) ) * nprinc;
              if ( itmp>band ) band = itmp;
            }
          }
        }
      }
      if ( anything_ordered )
        ready = 0;
      else {
        ready = 1;
          // if needed, take new node if front has ended
        for ( inod=0; ready && inod<=max_node; inod++ ) {
          if ( db_active_index( NODE, inod, VERSION_NORMAL ) &&
	       global_ordered[inod]==-NO ) {
            new_node++;
	    global_ordered[inod] = new_node;
            ordered_global[new_node] = inod;
	    ready = 0;
          }
        }
      }
    }
  }
  if ( swit ) {
    pri( "global_ordered", global_ordered, 1+max_node );
    pri( "ordered_global", ordered_global, 1+max_node );
  }

    // fill global=TOCHNOG -> local=SOLVER=TOCHNOG-BOUNDED-TYINGS info
  nlocal = 0;
  for ( ii=0; ii<=max_node; ii++ ) {
    inod = ordered_global[ii];
    if ( inod>=0 ) {
      array_set( node_bounded, 0, npuknwn );
      array_set( node_tyings_unknowns, -NO, nuknwn );
      db( NODE_BOUNDED, inod, node_bounded, ddum, ldum, 
        VERSION_NORMAL, GET_IF_EXISTS );
      db( NODE_TYINGS_UNKNOWNS, inod, node_tyings_unknowns, ddum, ldum, 
        VERSION_NORMAL, GET_IF_EXISTS );
      node_rhside = db_dbl( NODE_RHSIDE, inod, VERSION_NORMAL );
      for ( ipuknwn=0; ipuknwn<npuknwn; ipuknwn++ ) {
        iuknwn = ipuknwn*nder;
        iglobal = inod*npuknwn + ipuknwn;
        if ( !node_bounded[ipuknwn] && node_tyings_unknowns[iuknwn]!=-YES &&
             dof_principal[iuknwn]>=0 ) {
          global_local[iglobal] = ilocal;
          b[ilocal] = node_rhside[ipuknwn];
          ilocal++;
          nlocal++;
        }
      }
    }
  }
  ldb = nlocal;
  lb = nlocal;
  if ( band>nlocal ) band = nlocal;
  if ( swit ) {
    pri( "nlocal", nlocal );
    pri( "band", band );
    pri( "right hand side", b, nlocal );
    pri( "global_local", global_local, lglobal_local );
  }

  if ( nlocal==0 ) goto skip;

  if ( task==-CONTROL_EIGEN ) {
    neigen = control_eigen[1];
    if ( swit ) pri( "neigen", neigen );
    if ( neigen>nlocal ) neigen = nlocal;
    if ( neigen>0 ) db_set_dbl( NODE_EIGEN, VERSION_NORMAL );
    if ( neigen*nuknwn>DATA_ITEM_SIZE ) {
      pri( "Error: DATA_ITEM_SIZE is too small for NODE_EIGEN." );
      pri( "It should become at least", neigen*nuknwn );
      pri( "Change it in tochnog.h at recompile." );
      exit(TN_EXIT_STATUS);
    }
  }

  if ( task==-MATRIX_DIRECT || task==-CONTROL_EIGEN ) {
    ipiv = get_new_int( nlocal );
    array_set( ipiv, 0, nlocal );
    n = nlocal;
    kl = band;
    ku = band;
    nrhs = 1;
    ldmat = 2*kl+ku+1;
    lmat = ldmat*nlocal;
    mat = get_new_dbl( lmat );
    array_set( mat, 0., lmat );
    if ( swit ) {
      pri( "ldmat", ldmat );
      pri( "kl", kl );
      pri( "ku", ku );
    }
  }
  if ( neigen>0 ) {
    lw = nlocal;
    w = get_new_dbl( lw );
    array_set( w, 0., lw );
    ldz = nlocal;
    lz = ldz*nlocal;
    z = get_new_dbl( lz );
    array_set( z, 0., lz );
    lwork = 3*nlocal;
    work = get_new_dbl( lwork );
    array_set( work, 0., lwork );
    ldmatlin = ku+1;
    lmatlin = ldmatlin*nlocal;
    matlin = get_new_dbl( lmatlin );
    array_set( matlin, 0., lmatlin );
    if ( swit ) pri( "ldmatlin", ldmatlin );
    if ( control_eigen[0]==-BUCKLING ) {
      ldmatss = ku+1;
      lmatss = ldmatss*nlocal;
      matss = get_new_dbl( lmatss );
      array_set( matss, 0., lmatss );
      if ( swit ) pri( "ldmatss", ldmatss );
    }
  }

    // fill matrices in band format
  if ( task==-MATRIX_DIRECT || task==-CONTROL_EIGEN ) {
    for ( element=0; element<=max_element; element++ ) {
      if ( db_active_index( ELEMENT_MATRIX_UNKNOWNS, element, VERSION_NORMAL ) ) {
        length = db_len( ELEMENT_MATRIX_VALUES, element, VERSION_NORMAL );
        element_matrix_unknowns = 
          db_int( ELEMENT_MATRIX_UNKNOWNS, element, VERSION_NORMAL );
        element_matrix_values = 
          db_dbl( ELEMENT_MATRIX_VALUES, element, VERSION_NORMAL );
        if ( db_active_index( ELEMENT_MATRIX_STRESS_STIFFNESS_VALUES, 
          element, VERSION_NORMAL ) ) element_matrix_stress_stiffness_values = 
          db_dbl( ELEMENT_MATRIX_STRESS_STIFFNESS_VALUES, element, VERSION_NORMAL );
        for ( i=0; i<length; i++ ) {
          iglobal = element_matrix_unknowns[i*2+0];
          jglobal = element_matrix_unknowns[i*2+1];
          ilocal = global_local[iglobal];
          jlocal = global_local[jglobal];
          if ( ilocal!=-NO && jlocal!=-NO ) {
            ii = kl + ku + 1 + (ilocal+1) - (jlocal+1);
            jj = jlocal + 1;
            indx = (jj-1)*ldmat + ii;
            mat[indx-1] += element_matrix_values[i];
            if ( neigen>0 && jlocal>=ilocal ) {
              ii = ku + 1 + (ilocal+1) - (jlocal+1);
              jj = jlocal + 1;
              indx = (jj-1)*ldmatlin + ii;
              matlin[indx-1] += element_matrix_values[i];
              if ( control_eigen[0]==-BUCKLING ) {
                matss[indx-1] -= element_matrix_stress_stiffness_values[i];
                  // prevent small negative diagonals due to limited numerical accuracy
                if ( jlocal==ilocal ) matss[indx-1] += EPS_EIGEN; 
              }
            }
          }
        }
      }
    }
    for ( inod=0; inod<=max_node; inod++ ) {
      if ( db_active_index( NODE_LHSIDE, inod, VERSION_NORMAL ) ) {
        node_lhside = db_dbl( NODE_LHSIDE, inod, VERSION_NORMAL );
        for ( ipuknwn=0; ipuknwn<npuknwn; ipuknwn++ ) {
          iglobal = inod*npuknwn + ipuknwn;
          ilocal = global_local[iglobal];
          jlocal = ilocal;
          if ( ilocal!=-NO ) {
            ii = kl + ku + 1 + (ilocal+1) - (jlocal+1);
            jj = jlocal + 1;
            indx = (jj-1)*ldmat + ii;
            mat[indx-1] += node_lhside[ipuknwn];
            if ( ilocal==jlocal && mat[indx-1]<0. ) {
              pri("Error: Negative diagonal in matrix detected", mat[indx-1] );
              pri( "Try one of the following:" );
              pri( "  Smaller time steps." );
              pri( "  Including CONTROL_MATERI_VELOCITY_RELAXATION or so." );
              pri( "  CONTROL_SOLVER index -DIAGONAL." );
              exit(TN_EXIT_STATUS);
            }
            if ( neigen>0 ) {
              ii = ku + 1 + (ilocal+1) - (jlocal+1);
              jj = jlocal + 1;
              indx = (jj-1)*ldmatlin + ii;
              matlin[indx-1] += node_lhside[ipuknwn];
            }
          }
        }
      }
    }
  }

    // call solver
  if ( swit ) pri( "assembled total matrices in band format", mat, ldmat, nlocal );
  if ( task==-MATRIX_DIRECT ) {
    mdgbsv_( &n, &kl, &ku, &nrhs, mat, &ldmat, ipiv, b, &ldb, &info );
    if ( info ) {
      pri( "Error with matrix solver." );
      pri( "Try one of the following:" );
      pri( "  Smaller time steps." );
      pri( "  Including CONTROL_MATERI_VELOCITY_RELAXATION or so." );
      pri( "  CONTROL_SOLVER index -DIAGONAL." );
      pri( "" );
      exit(TN_EXIT_STATUS);
    }
    if ( swit ) pri( "solution", b, lb );
  }
  else if ( task==-MATRIX_ITERATIVE ) 
    solve_iterative( );

  if ( neigen>0 ) {
    jobz[0] = 'V';
    uplo[0] = 'U';
    n = nlocal;
    kd = ku;
    if ( nlocal==1 ) {
      if ( control_eigen[0]==-MATRIX ) {
        w[0] = matlin[lmatlin-1];
        z[0] = 1;
      }
      else {
        assert( control_eigen[0]==-BUCKLING );
        w[0] = matlin[lmatlin-1]/matss[lmatss-1];
        z[0] = 1;
      }
    }
    else {
      if ( control_eigen[0]==-MATRIX )
        dsbev_( jobz, uplo, &n, &kd, matlin, &ldmatlin, 
          w, z, &ldz, work, &info);
      else {
        assert( control_eigen[0]==-BUCKLING );
        dsbgv_(jobz, uplo, &n, &kd, &kd, matlin, &ldmatlin, matss, 
          &ldmatss, w, z, &ldz, work, &info );
      }
      if ( info ) {
        pri( "Error with eigen solver." );
        pri( "Try one of the following:" );
        pri( "  Linear elements." );
        pri( "  Less elements." );
        exit(TN_EXIT_STATUS);
      }
      for ( jlocal=0; jlocal<nlocal; jlocal++ ) {
        max = 0.;
        for ( ilocal=0; ilocal<nlocal; ilocal++ ) {
          tmp = scalar_dabs(z[jlocal*nlocal+ilocal]);
          if ( tmp>max ) max = tmp;
        }
        if ( max>0. ) {
          for ( ilocal=0; ilocal<nlocal; ilocal++ )
            z[jlocal*nlocal+ilocal] *= control_eigen_scale/max;
        }
      }
    }
    if ( swit ) pri( "w", w, nlocal );
    db( CONTROL_EIGEN_VALUES, 0, idum, w, neigen, VERSION_NORMAL, PUT );
  }

      // fill dofs
  for ( inod=0; inod<=max_node; inod++ ) {
    if ( db_active_index( NODE, inod, VERSION_NORMAL ) ) {
      if ( task==-MATRIX_DIRECT || task==-MATRIX_ITERATIVE ) 
        node_dof_new = db_dbl( NODE_DOF, inod, VERSION_NEW );
      for ( i=0; i<neigen*nuknwn; i++ ) node_eigen[i] = 0.;
      for ( ipuknwn=0; ipuknwn<npuknwn; ipuknwn++ ) {
        iuknwn = ipuknwn*nder;
        iglobal = inod*npuknwn + ipuknwn;
        if ( global_local[iglobal]>=0 ) {
          ilocal = global_local[iglobal];
          if ( ilocal!=-NO ) {
            if ( task==-MATRIX_DIRECT || task==-MATRIX_ITERATIVE ) 
              node_dof_new[iuknwn] += b[ilocal];
            for ( ieigen=0; ieigen<neigen; ieigen++ )
              node_eigen[ieigen*nuknwn+iuknwn] = z[ieigen*nlocal+ilocal];
          }
        }
      }
      if ( neigen>0 ) {
        length = neigen*nuknwn;
        db( NODE_EIGEN, inod, idum, node_eigen, length, VERSION_NORMAL, PUT );
      }
    }
  }

  if ( task==-MATRIX_DIRECT || task==-CONTROL_EIGEN ) {
    delete[] mat;
    delete[] ipiv;
  }
  if ( neigen>0 ) {
    delete[] w;
    delete[] z;
    delete[] work;
    delete[] matlin;
    if ( control_eigen[0]==-BUCKLING ) delete[] matss;
  }

  skip:

  delete[] b;
  delete[] ordered_global;
  delete[] global_ordered;
  delete[] global_local;
  delete[] ordering_has_been_done;

  if ( swit ) pri( "Out routine SOLVE" );

}

void solve_iterative( void )


{
  long int iter=0, max_iter=0, ilocal=0, nthread=0, ready=0, ldum=0, swit=0;
  double error=0., alpha=0., beta=0., dAd=0., check_error=0., last_error=0.,
   size_r1r2=0., ddum[1], *r1=NULL, *r2=NULL, *Ad1=NULL, *Ad2=NULL, 
   *p_tmp=NULL, *residue=NULL;

  swit = set_swit(-1,-1,"solve_iterative");
  if ( swit ) pri( "In routine SOLVE_ITERATIVE" );

  db( OPTIONS_PROCESSORS, 0, &nthread, ddum, ldum, VERSION_NORMAL, GET );

    // solution vector
  x = get_new_dbl( nlocal );
  array_set( x, 0., nlocal );
    // r1, r2 vectors
  r1 = get_new_dbl( nlocal );
  r2 = get_new_dbl( nlocal );
    // residue vector
  residue = get_new_dbl ( nlocal );
    // search directions
  d1 = get_new_dbl( nlocal );
  d2 = get_new_dbl( nlocal );
  array_set( d1, 0., nlocal );
  array_set( d2, 0., nlocal );
    // preconditioner
  p = get_new_dbl( nlocal );
  array_set( p, 1., nlocal );
    // work array for inverse of preconditioner
  p_tmp = get_new_dbl( nlocal );
    // A*d
  Ad1 = get_new_dbl( nlocal );
  Ad2 = get_new_dbl( nlocal );
    // A*d vectors for all threads
  Ad1_thread = get_new_dbl( nthread*nlocal );
  Ad2_thread = get_new_dbl( nthread*nlocal );
    // preconditioner for all threads
  p_thread = get_new_dbl( nthread*nlocal );
    // residue vector for all threads
  residue_thread = get_new_dbl ( nthread*nlocal );

    // set diagonal preconditioner
  solve_iterative_sys( Ad1, Ad2, p_tmp, residue );
  for ( ilocal=0; ilocal<nlocal; ilocal++ ) p[ilocal] = 1./(sqrt(p_tmp[ilocal]));
  if ( swit ) pri( "p", p, nlocal );

    // initial error
  solve_iterative_sys( Ad1, Ad2, p_tmp, residue );
  error = array_inproduct( residue, residue, nlocal );
  check_error = EPS_ERROR * error;
  if ( check_error<EPS_CHECK_ERROR ) check_error = EPS_CHECK_ERROR;
  if ( swit ) pri( "check_error", check_error );

    // start values for iterative loop
  for ( ilocal=0; ilocal<nlocal; ilocal++ ) {
    r1[ilocal] = b[ilocal] * p[ilocal];
    r2[ilocal] = b[ilocal] * p[ilocal];
    d1[ilocal] = r1[ilocal];
    d2[ilocal] = r2[ilocal];
  }

    // iterative loop, preconditioned biconjugate gradients
  ready=0; max_iter = 10*nlocal; error = 1.e10;
  for ( iter=0; !ready; iter++ ) {
    if ( swit ) pri( "iterative solve iteration", iter );
    solve_iterative_sys( Ad1, Ad2, p_tmp, residue );
    dAd = array_inproduct( d2, Ad1, nlocal );
    size_r1r2 = array_inproduct( r1, r2, nlocal );
    if ( swit ) {
      pri( "r1", r1, nlocal );
      pri( "r2", r2, nlocal );
      pri( "dAd", dAd );
      pri( "size_r1r2", size_r1r2 );
    }
    if ( scalar_dabs(size_r1r2)<EPS_TMP || scalar_dabs(dAd)<EPS_dAd )
      ready = 1;
    else {
      last_error = error;
      error = array_inproduct( residue, residue, nlocal );
      alpha =  array_inproduct( r1, r2, nlocal ) / dAd;
      for ( ilocal=0; ilocal<nlocal; ilocal++ ) {
        x[ilocal] += alpha * d1[ilocal];
        r1[ilocal] -= alpha * Ad1[ilocal];
        r2[ilocal] -= alpha * Ad2[ilocal];
      }
      beta = array_inproduct( r1, r2, nlocal ) / size_r1r2;
      for ( ilocal=0; ilocal<nlocal; ilocal++ ) {
        d1[ilocal] = r1[ilocal] + beta * d1[ilocal];
        d2[ilocal] = r2[ilocal] + beta * d2[ilocal];
      }
      if ( swit ) {
        pri( "alpha", alpha );
        pri( "x", x, nlocal );
        pri( "error", error );
        pri( "beta", beta );
        pri( "d1", d1, nlocal );
        pri( "d2", d2, nlocal );
      }
    }
    if ( error<check_error )
      ready = 1;
    else if ( scalar_dabs(error-last_error)<1.e-1*check_error )
      ready = 1;
    else if ( iter==max_iter ) {
      pri( "" );
      pri( "Error: CONTROL_SOLVER index -MATRIX_ITERATIVE did not converge." );
      pri( "Try -MATRIX_DIRECT." );
      pri( "" );
      exit(TN_EXIT_STATUS);
    }
  }

    // fill solution vector
  for ( ilocal=0; ilocal<nlocal; ilocal++ )
    b[ilocal] = x[ilocal] * p[ilocal];

  delete[] x;
  delete[] r1;
  delete[] r2;
  delete[] d1;
  delete[] d2;
  delete[] p;
  delete[] p_tmp;
  delete[] Ad1;
  delete[] Ad2;
  delete[] Ad1_thread;
  delete[] Ad2_thread;
  delete[] p_thread;
  delete[] residue;
  delete[] residue_thread;

  if ( swit ) pri( "Out routine SOLVE_ITERATIVE" );
}

void solve_iterative_sys( double *Ad1, double *Ad2, double *p_tmp, double *residue )

{
  long int inod=0, max_node=0, ilocal=0, iglobal=0, ipuknwn=0,
    ithread=0, nthread=0, swit=0, ldum=0;
  double ddum[1], *node_lhside=NULL;

  swit = set_swit(-1,-1,"solve_iterative_sys");
  if ( swit ) pri( "In routine SOLVE_ITERATIVE" );

  db( OPTIONS_PROCESSORS, 0, &nthread, ddum, ldum, VERSION_NORMAL, GET );
  array_set( Ad1_thread, 0., nthread*nlocal );
  array_set( Ad2_thread, 0., nthread*nlocal );
  array_set( p_thread, 0., nthread*nlocal );
  array_set( residue_thread, 0., nthread*nlocal );
  parallel_sys_routine( &parallel_solve_iterative_element );

  array_set( Ad1, 0., nlocal );
  array_set( Ad2, 0., nlocal );
  array_set( p_tmp, 0., nlocal );
  array_set( residue, 0., nlocal );
  db_highest_index( NODE_LHSIDE, max_node, VERSION_NORMAL );
  for ( inod=0; inod<=max_node; inod++ ) {
    if ( db_active_index( NODE_LHSIDE, inod, VERSION_NORMAL ) ) {
      node_lhside = db_dbl( NODE_LHSIDE, inod, VERSION_NORMAL );
      for ( ipuknwn=0; ipuknwn<npuknwn; ipuknwn++ ) {
        iglobal = inod*npuknwn + ipuknwn;
        ilocal = global_local[iglobal];
        if ( ilocal!=-NO ) {
          Ad1[ilocal] += node_lhside[ipuknwn] * d1[ilocal] * p[ilocal] * p[ilocal];
          Ad2[ilocal] += node_lhside[ipuknwn] * d2[ilocal] * p[ilocal] * p[ilocal];
          residue[ilocal] += b[ilocal] * p[ilocal] - node_lhside[ipuknwn] * 
            x[ilocal] * p[ilocal] * p[ilocal];
          p_tmp[ilocal] += node_lhside[ipuknwn];
        }
      }
    }
  }
  for ( ilocal=0; ilocal<nlocal; ilocal++ ) {
    for ( ithread=0; ithread<nthread; ithread++ ) {
      Ad1[ilocal] += Ad1_thread[ithread*nlocal+ilocal];
      Ad2[ilocal] += Ad2_thread[ithread*nlocal+ilocal];
      p_tmp[ilocal] += p_thread[ithread*nlocal+ilocal];
      residue[ilocal] += residue_thread[ithread*nlocal+ilocal];
    }
    if ( scalar_dabs(p_tmp[ilocal])<EPS_P ) p_tmp[ilocal] = 1.;
  }
  if ( swit ) {
    pri( "Ad1", Ad1, nlocal );
    pri( "Ad2", Ad2, nlocal );
    pri( "p_tmp", p_tmp, nlocal );
    pri( "p_thread", p_thread, nthread, nlocal );
    pri( "residue_thread", residue_thread, nthread, nlocal );
  }

  if ( swit ) pri( "Out routine SOLVE_ITERATIVE_SYS" );
}

void parallel_solve_iterative_element( void )

{
  long int element=0, max_element=0, iloop=0, nloop=0, 
    ithread=0, *next_of_loop=NULL;

    // loop over elements
  db_max_index( ELEMENT, max_element, VERSION_NORMAL, GET );
  if ( max_element>=0 ) {
    next_of_loop = get_new_int(1+max_element);
    parallel_sys_next_of_loop( next_of_loop, max_element, nloop, ithread );
    for ( iloop=0; iloop<nloop; iloop++ ) {
      element = next_of_loop[iloop];
      if ( element>max_element )
        break;
      else if ( db_active_index( ELEMENT, element, VERSION_NORMAL ) )
        solve_iterative_element( element, ithread );
    }
    delete[] next_of_loop;
  }

}


void solve_iterative_element( long int element, long int ithread )

{
  long int i=0, length=0, iglobal=0, ilocal=0, jglobal=0, jlocal=0,
    indx1=0, indx2=0, *element_matrix_unknowns=NULL;
  double *element_matrix_values=NULL;

  length = db_len( ELEMENT_MATRIX_VALUES, element, VERSION_NORMAL );
  element_matrix_unknowns = 
    db_int( ELEMENT_MATRIX_UNKNOWNS, element, VERSION_NORMAL );
  element_matrix_values = 
    db_dbl( ELEMENT_MATRIX_VALUES, element, VERSION_NORMAL );
  for ( i=0; i<length; i++ ) {
    iglobal = element_matrix_unknowns[i*2+0];
    jglobal = element_matrix_unknowns[i*2+1];
    ilocal = global_local[iglobal];
    jlocal = global_local[jglobal];
    if ( ilocal!=-NO && jlocal!=-NO ) {
      indx1 = ithread*nlocal + ilocal;
      indx2 = ithread*nlocal + jlocal;
      Ad1_thread[indx1] += element_matrix_values[i] * d1[jlocal] * 
        ( p[ilocal] * p[jlocal] ) ;  
      Ad2_thread[indx2] += element_matrix_values[i] * d2[ilocal] * 
        ( p[ilocal] * p[jlocal] ) ;  
      residue_thread[indx2] -= element_matrix_values[i] * x[ilocal] * 
        ( p[ilocal] * p[jlocal] ) ;  
      if ( ilocal==jlocal ) p_thread[indx1] += element_matrix_values[i];
    }
  }
}
