/* QBLAS-DEMO.C
 *
 * an example of usage of QBLAS library
 * 
 * Last Modified     8/2009
 * Original Version  8/2009
 *
 * (c) Elegant Mathematics, Ltd. 2006-2008
 * Hanauer Muehle 2
 * 66564 Ottweiler-Fuerth
 * Germany
 * e-mail: info@elegant-mathematics.com
 * Tel: +49-163-7414473
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <complex.h>
#include <qblas.h>


#define ALPHA 6.0

int N;

 typedef void MultOp(void *, void *);


 void MultD(void *xx, void *yy)
 { int i;
   double *X=(double*)xx;
   double *Y=(double*)yy;
   Y[0]=X[0]-X[1];
   for(i=1; i<N-1; i++)
     Y[i]=ALPHA*X[i]-X[i-1]-X[i+1];
   Y[N-1]=X[N-1]-X[N-2];
   return;
 }


 void MultZ(void *xx, void *yy)
 { int i;
   double __complex__ *X=(__complex__ double*)xx;
   double __complex__ *Y=(__complex__ double*)yy;
   Y[0]=X[0]-X[1];
   for(i=1; i<N-1; i++)
     Y[i]=ALPHA*X[i]-X[i-1]-X[i+1];
   Y[N-1]=X[N-1]-X[N-2];
   return;
 }


 void MultQD(void *xx, void *yy)
 { FLOATDD *X=(FLOATDD*)xx;
   FLOATDD *Y=(FLOATDD*)yy;
   FLOATDD  A, B;
   A.Hi=ALPHA;
   A.Lo=0.;
   B.Hi=0.;
   B.Lo=0.;
   dqaxpby(N, A, X, 1, B, Y, 1);
   A.Hi=-1.;
   dqaxpy(N-1, A, X+1, 1, Y, 1);
   dqaxpy(N-1, A, X, 1, Y+1, 1);
   return;
 }


 void MultQZ(void *xx, void *yy)
 { zZAHL *X=(zZAHL*)xx;
   zZAHL *Y=(zZAHL*)yy;
   zZAHL  A, B;
   A.Re.Hi=ALPHA;
   A.Re.Lo=0.;
   A.Im.Hi=0.;
   A.Im.Lo=0.;
   B.Re.Hi=0.;
   B.Re.Lo=0.;
   B.Im.Hi=0.;
   B.Im.Lo=0.;
   zqaxpby(N, A, X, 1, B, Y, 1);
   A.Re.Hi=-1.;
   zqaxpy(N-1, A, X+1, 1, Y, 1);
   zqaxpy(N-1, A, X, 1, Y+1, 1);
   return;
 }


 void dcg(int N, MultOp *MultA, MultOp *MultP, double *X,
        int MaxIter, double Eps)
 { double    Alpha, Beta, cc, c1, c2, Nrm=0, Nrm0=0;
   double   *Y, *R, *P;
   double   *yy, *rr, *pp, *xx=X;
   int       i, iter;
   double    L1, L2, LI, LL1=0., LL2=0., LLI=0.;
   const int Nsw=N;

   yy=Y=malloc(sizeof(double)*N);
   rr=R=malloc(sizeof(double)*N);
   pp=P=malloc(sizeof(double)*N);
   for(i=0; i<Nsw; i++)
   { rr[i]=xx[i];
     xx[i]=0;
   }
   MultP(R, P);
   for(cc=0, i=0; i<Nsw; i++)
     cc+=rr[i]*pp[i];
   for(iter=0; iter<MaxIter; iter++)
   { MultA((void*)P, (void*)Y);
     for(c1=0, i=0; i<Nsw; i++)
       c1+=yy[i]*pp[i];
     Alpha=(c1==0.)?0.:cc/c1;
     for(i=0; i<Nsw; i++)
     { xx[i]+=Alpha*pp[i];
       rr[i]-=Alpha*yy[i];
     }
     MultP(R, Y);
     L1=L2=LI=0;
     for(c2=Nrm=0, i=0; i<Nsw; i++)
     { c2+=rr[i]*yy[i];
       Nrm+=rr[i]*rr[i];
       L2=fabs(rr[i]);
       L1+=L2;
       if(LI<L2)
         LI=L2;
     }
     L2=Nrm=sqrt(Nrm);
     if(iter==0)
     { Nrm0=Nrm;
       LL1=L1;
       LL2=L2;
       LLI=LI;
     }
     Beta=(cc==0.)?0.:c2/cc;
     cc=c2;
     for(i=0; i<Nsw; i++)
       pp[i]=pp[i]*Beta+yy[i];
     printf(" DCG[%d] Res: L1=%g, L2=%g, LInf=%g\n", iter, L1/LL1, L2/LL2, LI/LLI);
     if(Nrm<Nrm0*Eps)
       break;
   }
   free(P);
   free(R);
   free(Y);
   return;
 }


 void zcg(int N, MultOp *MultA, MultOp *MultP, double __complex__ *X,
        int MaxIter, double Eps)
 { double    Alpha, Beta, cc, c1, c2, Nrm=0, Nrm0=0;
   double __complex__ *Y, *R, *P;
   double   *yy, *rr, *pp, *xx=(double*)(void*)X;
   int       i, iter;
   double    L1, L2, LI, LL1=0., LL2=0., LLI=0.;
   const int Nsw=N*2;

   yy=(double*)(void*)(Y=malloc(sizeof(double __complex__)*N));
   rr=(double*)(void*)(R=malloc(sizeof(double __complex__)*N));
   pp=(double*)(void*)(P=malloc(sizeof(double __complex__)*N));
   for(i=0; i<Nsw; i++)
   { rr[i]=xx[i];
     xx[i]=0;
   }
   MultP(R, P);
   for(cc=0, i=0; i<Nsw; i++)
     cc+=rr[i]*pp[i];
   for(iter=0; iter<MaxIter; iter++)
   { MultA((void*)P, (void*)Y);
     for(c1=0, i=0; i<Nsw; i++)
       c1+=yy[i]*pp[i];
     Alpha=(c1==0.)?0.:cc/c1;
     for(i=0; i<Nsw; i++)
     { xx[i]+=Alpha*pp[i];
       rr[i]-=Alpha*yy[i];
     }
     MultP(R, Y);
     L1=L2=LI=0;
     for(c2=Nrm=0, i=0; i<Nsw; i++)
     { c2+=rr[i]*yy[i];
       Nrm+=rr[i]*rr[i];
       L2=fabs(rr[i]);
       L1+=L2;
       if(LI<L2)
         LI=L2;
     }
     L2=Nrm=sqrt(Nrm);
     if(iter==0)
     { Nrm0=Nrm;
       LL1=L1;
       LL2=L2;
       LLI=LI;
     }
     Beta=(cc==0.)?0.:c2/cc;
     cc=c2;
     for(i=0; i<Nsw; i++)
       pp[i]=pp[i]*Beta+yy[i];
     printf(" ZCG[%d] Res: L1=%g, L2=%g, LInf=%g\n", iter, L1/LL1, L2/LL2, LI/LLI);
     if(Nrm<Nrm0*Eps)
       break;
   }
   free(P);
   free(R);
   free(Y);
   return;
 }


 void dqcg(int N, MultOp *MultA, MultOp *MultP, FLOATDD *X,
        int MaxIter, double Eps)
 { FLOATDD   Alpha, Beta, cc, c1, c2, Nrm, Nrm0;
   FLOATDD  *Y, *R, *P;
   int       iter;
   FLOATDD   L1, L2, LI, LL1, LL2, LLI;

   Y=malloc(sizeof(FLOATDD)*N);
   R=malloc(sizeof(FLOATDD)*N);
   P=malloc(sizeof(FLOATDD)*N);
   dqcopy(N, X, 1, R, 1);
   MultP(R, P);
   cc=dqdot(N, R, 1, P, 1);
   Nrm0=LL1=LL2=LLI=dZERO;
   for(iter=0; iter<MaxIter; iter++)
   { MultA((void*)P, (void*)Y);
     c1=dqdot(N, Y, 1, P, 1);
     Alpha=(iszero_DD(c1))?c1:div_DD_DD(cc, c1);
     dqaxpy(N, Alpha, P, 1, X, 1);
     dqaxpy(N, sign_DD(Alpha), Y, 1, R, 1);
     MultP(R, Y);
     L1=L2=LI=dZERO;
     c2=dqdot(N, R, 1, Y, 1);
     L1=dqasum(N, R, 1);
     Nrm=L2=dqnrm2(N, R, 1);
     LI=abs_DD(R[dqamax(N, R, 1)]);
     if(iter==0)
     { Nrm0=Nrm;
       LL1=L1;
       LL2=L2;
       LLI=LI;
     }
     Beta=(iszero_DD(cc))?cc:div_DD_DD(c2, cc);
     cc=c2;
     dqaxpby(N, dONE, Y, 1, Beta, P, 1);
     printf("DQCG[%d] Res: L1=%g, L2=%g, LInf=%g\n", iter, L1.Hi/LL1.Hi, L2.Hi/LL2.Hi, LI.Hi/LLI.Hi);
     if(Nrm.Hi<Nrm0.Hi*Eps)
       break;
   }
   free(P);
   free(R);
   free(Y);
   return;
 }


 void zqcg(int N, MultOp *MultA, MultOp *MultP, zZAHL *X,
        int MaxIter, double Eps)
 { FLOATDD   Alpha, Beta, cc, c1, c2, Nrm, Nrm0;
   zZAHL  *Y, *R, *P;
   int       iter;
   FLOATDD   L1, L2, LI, LL1, LL2, LLI;

   Y=malloc(sizeof(zZAHL)*N);
   R=malloc(sizeof(zZAHL)*N);
   P=malloc(sizeof(zZAHL)*N);
   zqcopy(N, X, 1, R, 1);
   MultP(R, P);
   cc=abs_ZZ(zqdot(N, R, 1, P, 1));
   Nrm0=LL1=LL2=LLI=dZERO;
   for(iter=0; iter<MaxIter; iter++)
   { MultA((void*)P, (void*)Y);
     c1=abs_ZZ(zqdot(N, Y, 1, P, 1));
     Alpha=(iszero_DD(c1))?c1:div_DD_DD(cc, c1);
     zqaxpy(N, cmplx_ZZ(Alpha, dZERO), P, 1, X, 1);
     zqaxpy(N, cmplx_ZZ(sign_DD(Alpha), dZERO), Y, 1, R, 1);
     MultP(R, Y);
     L1=L2=LI=dZERO;
     c2=abs_ZZ(zqdot(N, R, 1, Y, 1));
     L1=zqasum(N, R, 1);
     Nrm=L2=zqnrm2(N, R, 1);
     LI=abs_ZZ(R[zqamax(N, R, 1)]);
     if(iter==0)
     { Nrm0=Nrm;
       LL1=L1;
       LL2=L2;
       LLI=LI;
     }
     Beta=(iszero_DD(cc))?cc:div_DD_DD(c2, cc);
     cc=c2;
     zqaxpby(N, zONE, Y, 1, cmplx_ZZ(Beta, dZERO), P, 1);
     printf("ZQCG[%d] Res: L1=%g, L2=%g, LInf=%g\n", iter, L1.Hi/LL1.Hi, L2.Hi/LL2.Hi, LI.Hi/LLI.Hi);
     if(Nrm.Hi<Nrm0.Hi*Eps)
       break;
   }
   free(P);
   free(R);
   free(Y);
   return;
 }


 int main(int nArg, char **ppArg)
 { int     i;
   double *X;
   int     MaxIter=20;
   double  Eps=1.0e-10;

   N=100000;

   X=malloc(sizeof(double)*4*N);
   for(i=0; i<N; i++)
     X[i]=1./(double)(i+1);
   dcg(N, MultD, MultD, (double*)(void*)X, MaxIter, Eps);
   for(i=0; i<N*2; i++)
     X[i]=1./(double)(i+1);
   zcg(N, MultZ, MultZ, (double __complex__*)(void*)X, MaxIter, Eps);
   for(i=0; i<N*2; i++)
     X[i]=1./(double)(i+1);
   Eps=Eps*Eps;
   MaxIter=100;
   dqcg(N, MultQD, MultQD, (FLOATDD*)(void*)X, MaxIter, Eps);
   for(i=0; i<N*4; i++)
     X[i]=1./(double)(i+1);
   zqcg(N, MultQZ, MultQZ, (zZAHL*)(void*)X, MaxIter, Eps);
   free(X);
   return 0;
 }


