00001 #include "solvergpu_agm.h"
00002 #include <vector>
00003 #define NOMPI
00004 #include <string.h>
00005 #include <toolbox.h>
00006 #include "exception.h"
00007 #include "numericmethods.h"
00008
00009 template<class T, class S> class schur_complement : public generic_operator<T, S>
00010 {
00011 private:
00012 inverse_operator<T, S> &_A;
00013 const generic_operator<T, S> &_P;
00014 const generic_operator<T, S> &_R;
00015 vector<S> &_s;
00016 vector<S> &_t;
00017 public:
00018 schur_complement(inverse_operator<T, S> &_A, const generic_operator<T, S> &_P, const generic_operator<T, S> &_R, vector<S> &_s, vector<S> &_t)
00019 : _A(_A), _P(_P), _R(_R), _s(_s), _t(_t)
00020 {
00021 }
00022 void operator()(const vector<S> &_u, vector<S> &_v) const
00023 {
00024 _P(_u, _s);
00025 _A(_s, _t);
00026 _R(_t, _v);
00027 }
00028 };
00029
00030
00031 SolverGPU_AGM::SolverGPU_AGM(double tol,unsigned nIt)
00032 :m_tol(tol),m_nIt(nIt)
00033 {
00034
00035 }
00036
00042 void SolverGPU_AGM::solve(const SparseMatrix<double> &A,VecDouble &sol,const VecDouble &vRHS,double tol,unsigned nIt)
00043 {
00044
00045 std::vector<double> vElem;
00046 std::vector<int> vCols;
00047 std::vector<int> vCnt;
00048
00049 convertSparseMatrix(A,vCnt,vCols,vElem);
00050
00051
00052 std::vector<double> b(A.m());
00053 std::vector<double> x(A.m());
00054
00055 for (unsigned i=0;i<b.size();i++)
00056 b[i]=vRHS(i);
00057 gpu_agm(vCnt,vCols,vElem,b,x,tol,nIt);
00058
00059 for (unsigned i=0;i<x.size();i++)
00060 sol(i) = x[i];
00061
00062
00063
00064 }
00065
00066
00067
00075 void SolverGPU_AGM::gpu_agm(std::vector<int> &cnt,std::vector<int> &col, std::vector<double> &ele, std::vector<double> &_b, std::vector<double> &_x,double tol,unsigned nIt)
00076 {
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096 double gta, gtb, gtc, ta, tb;
00097 network::time(gta);
00098 int size, rank;
00099 network::size(size);
00100 network::rank(rank);
00101
00102 std::vector<int> nod(cnt.size());
00103 for(int i = 0; i < cnt.size(); i++) nod[i] = i;
00104
00106 network::barrier();
00107 network::time(ta);
00108 communicator<int, double> com(nod);
00109 network::time(tb);
00110 network::barrier();
00111
00112
00113 linear_operator<int, double> lin(cnt, col, ele);
00114 vector_product<int, double> sca(com);
00115
00116
00117 network::barrier();
00118 network::time(ta);
00119 double epsilon = 0.0, omega = 1;
00120 int max_level = 16;
00121 int min_nodes = 1;
00122 bool gauss_seidel = false;
00123 static amg_solver<int, double> amg(nod, cnt, col, ele, max_level, min_nodes, epsilon, omega, gauss_seidel);
00124 network::time(tb);
00125 network::barrier();
00126
00127
00128
00129 int n = 0;
00130 {
00131 double s = 0.0;
00132 vector<double> t(_b);
00133 lin(_x, t);
00134 sub_scale(t, _b, 1.0);
00135 vector<double> tt(t);
00136 com.accumulate(tt);
00137 scalar_product(tt, t, s);
00138 com.collect(s);
00139
00140 }
00141 network::barrier();
00142 network::time(ta);
00143 conjugate_gradient<int, double> cg(lin, amg, sca, tol, nIt, false);
00144 cg(_b, _x);
00145 n = cg.iterations();
00146 network::time(tb);
00147 network::barrier();
00148 cout << "Rank: " << rank << " ITERATIONS: " << n << " SOLVE Time: " << tb - ta << " (" << (tb - ta) / n << ") sec" << endl;
00149
00150
00151 {
00152 double s = 0.0;
00153 vector<double> t(_b);
00154 lin(_x, t);
00155 sub_scale(t, _b, 1.0);
00156 vector<double> tt(t);
00157 com.accumulate(tt);
00158 scalar_product(tt, t, s);
00159 com.collect(s);
00160
00161 }
00162
00163 network::time(gtc);
00164 if(rank == 0) cout << "Rank: " << rank << " GLOBAL Time: " << gtc - gta << " (" << gtc - gtb << ") sec" << endl;
00165
00166
00167 if (n >= nIt)
00168 {
00169
00170 throw new Exception("AGM did not converged %d %d",n,nIt);
00171 }
00172 }
00173
00174
00175 void SolverGPU_AGM::gpu_agmSchur(std::vector<int> &acnt,std::vector<int> &acol, std::vector<double> &aele,
00176 std::vector<int> &bcnt,std::vector<int> &bcol, std::vector<double> &bele,
00177 std::vector<int> &tcnt,std::vector<int> &tcol, std::vector<double> &tele,
00178 std::vector<double> &_f, std::vector<double> &_g,std::vector<double> &_u, std::vector<double> &_p)
00179 {
00180
00181
00182 double gta, gtb, gtc, ta, tb;
00183 network::time(gta);
00184 int size, rank;
00185 network::size(size);
00186 network::rank(rank);
00187
00188 std::vector<int> anod(acnt.size());
00189 for(int i = 0; i < acnt.size(); i++) anod[i] = i;
00190 std::vector<int> snod(tcnt.size());
00191 for(int i = 0; i < tcnt.size(); i++) snod[i] = i;
00192 cout << "AN: " << acnt.size() << " BN: " << tcnt.size() << endl;
00193
00194
00195
00197 network::barrier();
00198 network::time(ta);
00199 communicator<int, double> acom(anod);
00200 communicator<int, double> scom(snod);
00201 network::time(tb);
00202 network::barrier();
00203
00204
00205 linear_operator<int, double> lin(acnt, acol, aele);
00206 vector_product<int, double> sca(acom);
00207
00208 network::barrier();
00209 network::time(ta);
00210 double epsilon = 0.0, omega = 1.0;
00211 int max_level = 16;
00212 int min_nodes = 1;
00213 bool gauss_seidel = false;
00214 amg_solver<int, double> amg(anod, acnt, acol, aele, max_level, min_nodes, epsilon, omega, gauss_seidel);
00215 network::time(tb);
00216 network::barrier();
00217
00218
00219 linear_operator<int, double> linp(bcnt, bcol, bele);
00220 linear_operator<int, double> linr(tcnt, tcol, tele);
00221
00222 conjugate_gradient<int, double> cg(lin, amg, sca, 1.0e-8, 64, false);
00223
00224 vector<double> _s(acnt.size());
00225 vector<double> _t(acnt.size());
00226 schur_complement<int, double> schur(cg, linp, linr, _s, _t);
00227
00228 vector<double> _h(tcnt.size());
00229 cg(_f, _u);
00230 linr(_u, _h);
00231 sub_scale(_h, _g, 1.0);
00232
00233 #if 0
00234
00235
00236
00237 simple_triple_product<int, double> triple_product;
00238 vector<int> pnod(tcnt.size(), 0);
00239 for(int i = 0; i < tcnt.size(); i++) pnod[i] = i;
00240 vector<int> pcnt;
00241 vector<int> pcol;
00242 vector<double> pele;
00243 triple_product(acnt, acol, aele, bcnt, bcol, bele, pcnt, pcol, pele);
00244 linear_operator<int, double> plin(pcnt, pcol, pele);
00245
00246 network::barrier();
00247 network::time(ta);
00248 epsilon = 0.0, omega = 1.0;
00249 max_level = 16;
00250 min_nodes = 1;
00251 gauss_seidel = true;
00252 amg_solver<int, double> pcg(pnod, pcnt, pcol, pele, max_level, min_nodes, epsilon, omega, gauss_seidel);
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267 network::time(tb);
00268 network::barrier();
00269 if(rank == 0) cout << "Rank: " << rank << " PAMG Setup Time: " << tb - ta << " sec" << endl;
00270
00271 #else
00272 vector<double> mdia(acnt.size(), 0.0);
00273 for(int k = 0, i = 0; i < acnt.size(); i++)
00274 {
00275 for(int j = 0; j < acnt[i]; j++, k++)
00276 {
00277 if(i == acol[k]) mdia[i] = aele[k];
00278 }
00279 }
00280 acom.accumulate(mdia);
00281 for(int i = 0; i < acnt.size(); i++) mdia[i] = 1.0 / mdia[i];
00282 diagonal_solver<int, double> mass(mdia, acom);
00283 schur_complement<int, double> prec(mass, linp, linr, _s, _t);
00284 vector<double> pdia(tcnt.size(), 0.0);
00285 for(int k = 0, i = 0; i < tcnt.size(); i++)
00286 {
00287 double sum = 0.0;
00288 for(int j = 0; j < tcnt[i]; j++, k++)
00289 {
00290 sum += tele[k] * tele[k] * mdia[tcol[k]];
00291 }
00292 pdia[i] = 1.0 / sum;
00293
00294 }
00295 diagonal_solver<int, double> pinv(pdia, acom);
00296 conjugate_gradient<int, double> pcg(prec, pinv, sca, 1.0e-6, 32, false);
00297 #endif
00298
00299 int n = 0;
00300 double s0, s1;
00301 {
00302 double s = 0.0;
00303 vector<double> t(_h);
00304 schur(_p, t);
00305 sub_scale(t, _h, 1.0);
00306 vector<double> tt(t);
00307 scom.accumulate(tt);
00308 scalar_product(tt, t, s);
00309 scom.collect(s);
00310
00311 s0 = s;
00312 }
00313 network::time(ta);
00314 vector_product<int, double> scal(scom);
00315 conjugate_gradient<int, double> scg(schur, pcg, scal, 1.0e-6, 256, true);
00316 scg(_h, _p);
00317 n = scg.iterations();
00318 network::time(tb);
00319 cout << "Rank: " << rank << " ITERATIONS: " << n << " SOLVE Time: " << tb - ta << " (" << (tb - ta) / n << ") sec" << endl;
00320 cout << "Rank: " << rank << " MFLOPS: " << 2.0 * (aele.size() + 2 * bele.size()) * n / ((tb - ta) * 1000000.0) << endl;
00321 {
00322 double s = 0.0;
00323 vector<double> t(_h);
00324 schur(_p, t);
00325 sub_scale(t, _h, 1.0);
00326 vector<double> tt(t);
00327 scom.accumulate(tt);
00328 scalar_product(tt, t, s);
00329 scom.collect(s);
00330
00331 s1 = s;
00332 }
00333
00334
00335 linp(_p, _u);
00336 _t.assign(_f.begin(), _f.end());
00337 sub_scale(_t, _u, 1.0);
00338 cg(_t, _u);
00339
00340 network::time(gtc);
00341
00342
00343 }
00344
00345
00346
00347
00348 void SolverGPU_AGM::solveAgain(const SparseMatrix<double> &M,VecDouble &sol,const VecDouble &rhs)
00349 {
00350 solve(M,sol,rhs);
00351 }
00352
00353
00354
00355
00356
00357 void SolverGPU_AGM::solve(const SparseMatrix<double> &M,VecDouble &sol,const VecDouble &rhs)
00358 {
00359 try {
00360 solve(M,sol,rhs,m_tol,m_nIt);
00361 }
00362 catch (Exception *e)
00363 {
00364 std::cout << "Printing Matrix\n";
00365 std::cout << "IsSymetric: " << NumericMethods::IsSymetric(M,1e-10) << std::endl;
00366
00367
00368 throw e;
00369 }
00370
00371 }
00372
00373
00374 void SolverGPU_AGM::add_diagonal(std::vector<int> &cnt,std::vector<int> &col, std::vector<double> &ele,const VecDouble &addElem)
00375 {
00376 assert(cnt.size() == addElem.size());
00377 unsigned indexFirst=0;
00378 unsigned indexSent;
00379 unsigned j;
00380 for (unsigned i=0;i<cnt.size();i++)
00381 {
00382 indexSent=indexFirst+cnt[i];
00383 for (j=indexFirst;j<indexSent;j++)
00384 {
00385 if (col[j] == i)
00386 {
00387 ele[j]+=addElem(j);
00388 break;
00389 }
00390 }
00391 assert(j < indexSent);
00392 indexFirst=indexSent;
00393 }
00394 }
00395
00396
00397 void SolverGPU_AGM::gpu_agmII(std::vector<int> &cnt,std::vector<int> &col, std::vector<double> &ele_amg,std::vector<double> &ele_cg, std::vector<double> &_b, std::vector<double> &_x,double tol,unsigned nIt)
00398 {
00399
00400
00401
00402
00403 double gta, gtb, gtc, ta, tb;
00404 network::time(gta);
00405 int size, rank;
00406 network::size(size);
00407 network::rank(rank);
00408
00409 std::vector<int> nod(cnt.size());
00410 for(int i = 0; i < cnt.size(); i++) nod[i] = i;
00411
00413 network::barrier();
00414 network::time(ta);
00415 communicator<int, double> com(nod);
00416 network::time(tb);
00417 network::barrier();
00418
00419
00420 linear_operator<int, double> lin(cnt, col, ele_cg);
00421 vector_product<int, double> sca(com);
00422
00423
00424 network::barrier();
00425 network::time(ta);
00426 double epsilon = 0.0, omega = 1;
00427 int max_level = 16;
00428 int min_nodes = 1;
00429 bool gauss_seidel = false;
00430 static amg_solver<int, double> amg(nod, cnt, col, ele_amg, max_level, min_nodes, epsilon, omega, gauss_seidel);
00431 network::time(tb);
00432 network::barrier();
00433
00434
00435
00436 int n = 0;
00437 {
00438 double s = 0.0;
00439 vector<double> t(_b);
00440 lin(_x, t);
00441 sub_scale(t, _b, 1.0);
00442 vector<double> tt(t);
00443 com.accumulate(tt);
00444 scalar_product(tt, t, s);
00445 com.collect(s);
00446
00447 }
00448 network::barrier();
00449 network::time(ta);
00450 conjugate_gradient<int, double> cg(lin, amg, sca, tol, nIt, false);
00451 cg(_b, _x);
00452 n = cg.iterations();
00453 network::time(tb);
00454 network::barrier();
00455 cout << "Rank: " << rank << " ITERATIONS: " << n << " SOLVE Time: " << tb - ta << " (" << (tb - ta) / n << ") sec" << endl;
00456
00457
00458 {
00459 double s = 0.0;
00460 vector<double> t(_b);
00461 lin(_x, t);
00462 sub_scale(t, _b, 1.0);
00463 vector<double> tt(t);
00464 com.accumulate(tt);
00465 scalar_product(tt, t, s);
00466 com.collect(s);
00467
00468 }
00469
00470 network::time(gtc);
00471 if(rank == 0) cout << "Rank: " << rank << " GLOBAL Time: " << gtc - gta << " (" << gtc - gtb << ") sec" << endl;
00472
00473
00474 if (n >= nIt)
00475 {
00476
00477 throw new Exception("AGM did not converged %d %d",n,nIt);
00478 }
00479
00480 }
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
00504
00505
00506
00507
00508
00509
00510
00511
00512
00513 void SolverGPU_AGM::convertSparseMatrix(const SparseMatrix<double> &A,std::vector<int> &vCnt,std::vector<int> &vCols, std::vector<double> &vElem)
00514 {
00515 vElem.resize(A.n_nonzero_elements());
00516 vCols.resize(A.n_nonzero_elements());
00517 vCnt.resize(A.m());
00518
00519
00520 const SparsityPattern &pattern = A.get_sparsity_pattern();
00521 for (unsigned i=0;i<vCnt.size();i++)
00522 {
00523 vCnt[i] = pattern.row_length(i);
00524 }
00525
00526 std::vector<double>::iterator itElem = vElem.begin();
00527 std::vector<int>::iterator itCols = vCols.begin();
00528 for (SparseMatrix<double>::const_iterator it = A.begin();it!=A.end();it++,itElem++,itCols++)
00529 {
00530 *itElem = it->value();
00531 *itCols = it->column();
00532 }
00533
00534
00535 if (pattern.optimize_diagonal())
00536 {
00537 itElem = vElem.begin();
00538 itCols = vCols.begin();
00539 unsigned nRows = A.m();
00540
00541 for (unsigned i=0;i<nRows;i++)
00542 {
00543
00544 int nNonDiagColsPerRow = pattern.row_length(i) - 1;
00545 assert(nNonDiagColsPerRow >= 0);
00546 std::vector<double>::iterator itElemDiag = itElem++;
00547 std::vector<int>::iterator itColDiag = itCols++;
00548 for (int j=0;j<nNonDiagColsPerRow;j++)
00549 {
00550 if (*itColDiag > *itCols)
00551 {
00552 std::swap(*itColDiag++,*itCols++);
00553 std::swap(*itElemDiag++,*itElem++);
00554 }
00555 else
00556 {
00557 itCols++;
00558 itElem++;
00559 }
00560 }
00561 }
00562 }
00563
00564 }