00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 #ifndef _LA_MAT_H_
00022 #define _LA_MAT_H_
00023 #include "la_traits.h"
00024 
00025 namespace LA {
00026 
00027 
00031 template <typename T>
00032 class NRMat{
00033 protected:
00034         int nn;
00035         int mm;
00036 #ifdef MATPTR
00037         T **v;
00038 #else
00039         T *v;
00040 #endif
00041         int *count;
00042 #ifdef CUDALA
00043         GPUID location;
00044 #endif
00045 public:
00046         friend class NRVec<T>;
00047         friend class NRSMat<T>;
00048         
00050         ~NRMat();
00051 
00052         
00055         inline NRMat() : nn(0), mm(0), v(0), count(0){
00056                 #ifdef CUDALA
00057                 location = DEFAULT_LOC;
00058                 #endif
00059         };
00060 
00061         
00067         inline NRMat(const int n, const int m, const GPUID loc = undefined);
00068 
00070         inline NRMat(const T &a, const int n, const int m, const GPUID loc);
00071 
00073         inline NRMat(const T &a, const int n, const int m);
00074 
00076         NRMat(const T *a, const int n, const int m);
00077 
00079         inline NRMat(const NRMat &rhs);
00080 
00082         NRMat(const typename LA_traits_complex<T>::NRMat_Noncomplex_type &rhs, bool imagpart = false);
00083 
00085         explicit NRMat(const NRSMat<T> &rhs);
00086 
00088 #ifdef MATPTR
00089         explicit NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset = 0):NRMat(&rhs[0][0] + offset , n, m){
00090                 if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
00091         };
00092 #else
00093         explicit NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset = 0);
00094 #endif
00095 
00096 #ifdef MATPTR
00097         const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v[0],rhs.v[0],nn*mm);} 
00098 #else
00099         const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v,rhs.v,nn*mm);} 
00100 #endif
00101 
00102         const bool operator==(const NRMat &rhs) const {return !(*this != rhs);};
00103 
00105         inline int getcount() const {return count?*count:0;}
00106 
00108         void copyonwrite();
00109 
00110         
00115 #ifdef CUDALA
00116         inline GPUID getlocation() const {return location;}
00117         void moveto(const GPUID dest);
00118 #else
00119         inline GPUID getlocation() const {return cpu;}
00120         void moveto(const GPUID dest) {};
00121 #endif
00122 
00124         void randomize(const typename  LA_traits<T>::normtype &x);
00125 
00127         NRMat & operator=(const NRMat &rhs);
00129         NRMat & operator|=(const NRMat &rhs);
00130 
00132         NRMat & operator=(const T &a);
00134         NRMat & operator+=(const T &a);
00136         NRMat & operator-=(const T &a);
00137 
00139         NRMat & operator*=(const T &a);
00140 
00142         NRMat & operator+=(const NRMat &rhs);
00144         NRMat & operator-=(const NRMat &rhs);
00146         NRMat & operator^=(const NRMat<T> &rhs);
00147 
00149         NRMat & operator+=(const NRSMat<T> &rhs);
00151         NRMat & operator-=(const NRSMat<T> &rhs);
00152 
00154         const NRMat operator-() const;
00155 
00157         inline const NRMat operator+(const T &a) const;
00159         inline const NRMat operator-(const T &a) const;
00161         inline const NRMat operator*(const T &a) const;
00162 
00164         inline const NRMat operator+(const NRMat &rhs) const;
00166         inline const NRMat operator+(const NRSMat<T> &rhs) const;
00167         
00169         inline const NRMat operator-(const NRMat &rhs) const;
00171         inline const NRMat operator-(const NRSMat<T> &rhs) const;
00172         
00174         const NRMat operator*(const NRMat &rhs) const;
00176         const NRMat operator*(const NRSMat<T> &rhs) const; 
00177 
00179         const NRMat operator&(const NRMat &rhs) const;
00181         const NRMat operator|(const NRMat<T> &rhs) const;
00182 
00184         const NRVec<T> operator*(const NRVec<T> &rhs) const {
00185                 NRVec<T> result(nn, rhs.getlocation());
00186                 result.gemv((T)0, *this, 'n', (T)1, rhs);
00187                 return result;
00188         };
00190         const NRVec<complex<T> > operator*(const NRVec<complex<T> > &rhs) const {
00191                 NRVec<complex<T> > result(nn, rhs.getlocation());
00192                 result.gemv((T)0, *this, 'n', (T)1, rhs);
00193                 return result;
00194         };
00195 
00197         const T dot(const NRMat &rhs) const;
00198 
00200         const NRMat oplus(const NRMat &rhs) const;
00202         const NRMat otimes(const NRMat &rhs, bool reversecolumns = false) const;
00203 
00205         void diagmultl(const NRVec<T> &rhs);
00207         void diagmultr(const NRVec<T> &rhs);
00208 
00210         const NRSMat<T> transposedtimes() const;
00212         const NRSMat<T> timestransposed() const;
00213 
00215         const NRVec<T> rsum() const;
00217         const NRVec<T> csum() const;
00218 
00220         void orthonormalize(const bool rowcol, const NRSMat<T> *metric = NULL);
00221 
00223         const NRVec<T> row(const int i, int l = -1) const;
00224 
00226         const NRVec<T> column(const int j, int l = -1) const {
00227                 NOT_GPU(*this);
00228                 if(l < 0) l = nn;
00229                 NRVec<T> r(l);
00230                 for(register int i=0; i<l; ++i) r[i] = (*this)(i,j);
00231                 return r;
00232         };
00233 
00235         const T* diagonalof(NRVec<T> &, const bool divide = 0, bool cache = false) const;
00237         void diagonalset(const NRVec<T> &);
00238 
00240         void gemv(const T beta, NRVec<T> &r, const char trans, const T alpha, const NRVec<T> &x) const { r.gemv(beta, *this, trans, alpha, x); };
00242         void gemv(const T beta, NRVec<complex<T> > &r, const char trans, const T alpha, const NRVec<complex<T> > &x) const { r.gemv(beta, *this, trans, alpha, x); };
00243 
00245         inline T* operator[](const int i);
00247         inline const T* operator[](const int i) const;
00248 
00250         inline T& operator()(const int i, const int j);
00252         inline const T& operator()(const int i, const int j) const;
00254         inline const T get_ij(const int i, const int j) const;
00255 
00257         inline int nrows() const;
00259         inline int ncols() const;
00261         inline int size() const;
00262 
00264         void get(int fd, bool dimensions = 1, bool transposed = false);
00266         void put(int fd, bool dimensions = 1, bool transposed = false) const;
00268         void fprintf(FILE *f, const char *format, const int modulo) const;
00270         void fscanf(FILE *f, const char *format);
00271 
00273         void clear(){
00274                 if(nn&&mm){
00275                         copyonwrite();
00276                         LA_traits<T>::clear((*this)[0], nn*mm);
00277                 }
00278         };
00279 
00281         void resize(int n, int m);
00282 
00284         inline operator T*();
00286         inline operator const T*() const;
00287 
00289         NRMat& transposeme(const int n = 0);
00291         NRMat& conjugateme();
00292 
00294         const NRMat transpose(bool conj = false) const;
00296         const NRMat conjugate() const;
00297 
00299         const NRMat submatrix(const int fromrow, const int torow, const int fromcol, const int tocol) const;
00300 
00302         void storesubmatrix(const int fromrow, const int fromcol, const NRMat &rhs);
00303 
00305         void gemm(const T &beta, const NRMat &a, const char transa, const NRMat &b, const char transb, const T &alpha);
00306 
00308         const typename LA_traits<T>::normtype norm(const T scalar = (T)0) const;
00309 
00311         void axpy(const T alpha, const NRMat &x);
00312 
00314         inline const T amax() const;
00316         inline const T amin() const;
00317 
00319         const T trace() const;
00320 
00322         NRMat & swap_rows();
00324         NRMat & swap_cols();
00326         NRMat & swap_rows_cols();
00327 
00329         SparseSMat<T> operator*(const SparseSMat<T> &rhs) const;
00330 
00332         explicit NRMat(const SparseMat<T> &rhs);                
00334         explicit NRMat(const SparseSMat<T> &rhs);
00335 
00337         NRMat & operator+=(const SparseMat<T> &rhs);
00339         NRMat & operator-=(const SparseMat<T> &rhs);
00340 
00342         void gemm(const T &beta, const SparseMat<T> &a, const char transa, const NRMat &b, const char transb, const T &alpha);
00343 
00344         inline void simplify() {};
00345         bool issymmetric() const { return 0; };
00346 
00347 #ifndef NO_STRASSEN
00349         void strassen(const T beta, const NRMat &a, const char transa, const NRMat &b, const char transb, const T alpha);
00350         void s_cutoff(const int,const int,const int,const int) const;
00351 #endif
00352 };
00353 
00354 }
00355 
00356 
00357 #include "vec.h"
00358 #include "smat.h"
00359 #include "sparsemat.h"
00360 #include "sparsesmat.h"
00361 
00362 namespace LA {
00363 
00364 
00371 template <typename T>
00372 NRMat<T>::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) {
00373         T* p;
00374         *count = 1;
00375         const int nm = n*m;
00376 #ifdef CUDALA
00377         location = (loc==undefined?DEFAULT_LOC:loc);
00378         if(location == cpu) {
00379 #endif
00380         #ifdef MATPTR
00381                 v = new T*[n];
00382                 p = v[0] = new T[nm];
00383                 for (int i=1; i<n; i++) v[i] = v[i-1] + m;
00384         #else
00385                 p = v = new T[nm];
00386         #endif
00387 #ifdef CUDALA
00388         }else{
00389                 const T val = 0;
00390                 v = (T*) gpualloc(nm*sizeof(T));
00391         }
00392 #endif
00393 }
00394 
00395 
00402 template <typename T>
00403 NRMat<T>::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) {
00404         const int nm = n*m;
00405         T *p;
00406         *count = 1;
00407 
00408 #ifdef CUDALA
00409         location = (loc==undefined?DEFAULT_LOC:loc);
00410         if(location==cpu){
00411 #endif
00412         #ifdef MATPTR
00413                 v = new T*[n];
00414                 p = v[0] = new T[nm];
00415                 for (register int i=1; i<n; i++) v[i] = v[i-1] + m;
00416         #else
00417                 p = v = new T[nm];
00418         #endif
00419                 if (a != (T)0){
00420                         for (register int i=0; i<nm; i++) *p++ = a;
00421                 }else{
00422                         memset(p, 0, nm*sizeof(T));
00423                 }
00424 #ifdef CUDALA
00425         }else{
00426                 if(sizeof(T)%sizeof(float) != 0)laerror("memory alignment error");
00427 
00428                 v = (T*)gpualloc(nm*sizeof(T));
00429                 smart_gpu_set(nm, a, v);
00430         }
00431 #endif
00432 }
00433 
00434 
00441 template <typename T>
00442 NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int) {
00443         const int nm = n*m;
00444         T *p;
00445         *count = 1;
00446 
00447 #ifdef CUDALA
00448         location = DEFAULT_LOC;
00449         if(location==cpu){
00450 #endif
00451         #ifdef MATPTR
00452                 v = new T*[n];
00453                 p = v[0] = new T[nm];
00454                 for (register int i=1; i<n; i++) v[i] = v[i-1] + m;
00455         #else
00456                 p = v = new T[m*n];
00457         #endif
00458         if (a != (T)0)
00459                 for (register int i=0; i<nm; i++) *p++ = a;
00460         else
00461                 memset(p, 0, nm*sizeof(T));
00462 #ifdef CUDALA
00463         }else{
00464                 v = (T*)gpualloc(nm*sizeof(T));
00465                 smart_gpu_set(nm, a, v);
00466         }
00467 #endif
00468 }
00469 
00470 
00477 template <typename T>
00478 NRMat<T>::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int) {
00479         const int nm = n*m;
00480 #ifdef CUDALA
00481         location = DEFAULT_LOC;
00482 #endif
00483 
00484         *count = 1;
00485 #ifdef CUDALA
00486         if(location==cpu){
00487 #endif
00488         #ifdef MATPTR
00489                 v = new T*[n];
00490                 v[0] = new T[nm];
00491                 for (register int i=1; i<n; i++) v[i] = v[i-1] + m;
00492                 memcpy(v[0], a, nm*sizeof(T));
00493         #else
00494                 v = new T[nm];
00495                 memcpy(v, a, nm*sizeof(T));
00496         #endif
00497 #ifdef CUDALA
00498         }else{
00499                 v = (T*) gpualloc(nm*sizeof(T));
00500                 cublasSetVector(nm, sizeof(T), a, 1, v, 1);
00501         }
00502 #endif
00503 
00504 }
00505 
00506 
00511 template <typename T>
00512 NRMat<T>::NRMat(const NRMat &rhs) {
00513 #ifdef CUDALA
00514         location = rhs.location;
00515 #endif
00516         nn = rhs.nn;
00517         mm = rhs.mm;
00518         count = rhs.count;
00519         v = rhs.v;
00520         if (count) ++(*count);
00521 }
00522 
00523 
00528 template <typename T>
00529 NRMat<T>::NRMat(const NRSMat<T> &rhs) {
00530         NOT_GPU(rhs); 
00531 
00532 #ifdef CUDALA
00533         location = rhs.location;
00534 #endif
00535 
00536         int i(0), j(0), k(0);
00537         nn = mm = rhs.nrows();
00538          count = new int;
00539         *count = 1;
00540 #ifdef MATPTR
00541         v = new T*[nn];
00542         v[0] = new T[mm*nn];
00543         for (int i=1; i<nn; i++) v[i] = v[i-1] + mm;
00544 #else
00545         v = new T[mm*nn];
00546 #endif
00547 
00548 #ifdef MATPTR
00549         for (i=0; i<nn; i++){
00550                 for (j=0; j<=i; j++){
00551                         v[i][j] = v[j][i] = rhs[k++];
00552                 }
00553         }
00554 #else
00555         for (i=0; i<nn; i++){
00556                 for (j=0; j<=i; j++){
00557                         v[i*nn + j] = v[j*nn + i] = rhs[k++];
00558                 }
00559         }
00560 #endif
00561 }
00562  
00563 
00570 #ifndef MATPTR
00571 template <typename T>
00572 NRMat<T>::NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset)
00573 {
00574         if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
00575 
00576 #ifdef CUDALA
00577         location=rhs.location;
00578 #endif
00579 
00580         nn = n;
00581         mm = m;
00582         count = rhs.count;
00583         v = rhs.v + offset;
00584         (*count)++;
00585 }
00586 #endif
00587 
00588 
00594 template <typename T>
00595 inline const NRMat<T> NRMat<T>::operator+(const NRSMat<T> &rhs) const {
00596         return NRMat<T>(*this) += rhs;
00597 }
00598 
00599 
00605 template <typename T>
00606 inline const NRMat<T> NRMat<T>::operator-(const NRSMat<T> &rhs) const {
00607         return NRMat<T>(*this) -= rhs;
00608 }
00609 
00610 
00614 template <typename T>
00615 inline T* NRMat<T>::operator[](const int i) {
00616 #ifdef DEBUG
00617         if (_LA_count_check && *count != 1) laerror("matrix with *count>1 used as l-value");
00618         if (i < 0 || i >= nn) laerror("Mat [] out of range");
00619         if (!v) laerror("unallocated matrix");
00620 #endif
00621         NOT_GPU(*this);
00622         #ifdef MATPTR
00623                 return v[i];
00624         #else
00625                 return v + i*mm;
00626         #endif
00627 }
00628 
00629 
00633 template <typename T>
00634 inline const T* NRMat<T>::operator[](const int i) const {
00635 #ifdef DEBUG
00636         if (i < 0 || i >= nn) laerror("index out of range");
00637         if (!v) laerror("unallocated matrix");
00638 #endif
00639         NOT_GPU(*this);
00640         #ifdef MATPTR
00641                 return v[i];
00642         #else
00643                 return v + i*mm;
00644         #endif
00645 }
00646 
00647 
00654 template <typename T>
00655 inline T& NRMat<T>::operator()(const int i, const int j){
00656 #ifdef DEBUG
00657         if (_LA_count_check && *count != 1) laerror("NRMat::operator(,) used as l-value for a matrix with count > 1");
00658         if (i < 0 || i >= nn && nn > 0 || j < 0 || j >= mm && mm > 0) laerror("index out of range");
00659         if (!v) laerror("unallocated matrix");
00660 #endif
00661         NOT_GPU(*this);
00662         #ifdef MATPTR
00663                 return v[i][j];
00664         #else
00665                 return v[i*mm + j];
00666         #endif
00667 }
00668 
00669 
00675 template <typename T>
00676 inline const T& NRMat<T>::operator()(const int i, const int j) const{
00677         T ret;
00678 #ifdef DEBUG
00679         if (i<0 || i>=nn && nn>0 || j<0 || j>=mm && mm>0) laerror("index out of range");
00680         if (!v) laerror("unallocated matrix");
00681 #endif
00682         NOT_GPU(*this);
00683         #ifdef MATPTR
00684                 return v[i][j];
00685         #else
00686                 return v[i*mm + j];
00687         #endif
00688 }
00689 
00690 
00696 template <typename T>
00697 inline const T NRMat<T>::get_ij(const int i, const int j) const{
00698         T ret;
00699 #ifdef DEBUG
00700         if (i<0 || i>=nn || j<0 || j>=mm) laerror("index out of range");
00701         if (!v) laerror("unallocated matrix");
00702 #endif
00703 #ifdef CUDALA
00704         if(location == cpu){
00705 #endif
00706         #ifdef MATPTR
00707                 return v[i][j];
00708         #else
00709                 return v[i*mm + j];
00710         #endif
00711 #ifdef CUDALA
00712         }else{
00713                 const int pozice = i*mm + j;
00714                 gpuget(1, sizeof(T), v + pozice, &ret);
00715                 return ret;
00716         }
00717 #endif
00718 }
00719 
00720 
00723 template <typename T>
00724 inline int NRMat<T>::nrows() const{
00725         return nn;
00726 }
00727 
00728 
00731 template <typename T>
00732 inline int NRMat<T>::ncols() const{
00733         return mm;
00734 }
00735 
00736 
00739 template <typename T>
00740 inline int NRMat<T>::size() const{
00741         return nn*mm;
00742 }
00743 
00744 
00747 template <typename T>
00748 inline NRMat<T>::operator T*(){
00749         #ifdef DEBUG
00750                 if (!v) laerror("unallocated matrix");
00751         #endif
00752         #ifdef MATPTR
00753                 return v[0];
00754         #else
00755                 return v;
00756         #endif
00757 }
00758 
00759 
00762 template <typename T>
00763 inline NRMat<T>::operator const T*() const{
00764         #ifdef DEBUG
00765                 if (!v) laerror("unallocated matrix");
00766         #endif
00767         #ifdef MATPTR
00768                 return v[0];
00769         #else
00770                 return v;
00771         #endif
00772 }
00773 
00774 
00779 template<>
00780 inline const double NRMat<double>::amax() const{
00781 #ifdef CUDALA
00782         if(location == cpu){
00783 #endif
00784         #ifdef MATPTR
00785                 return v[0][cblas_idamax(nn*mm, v[0], 1) - 1];
00786         #else
00787                 return v[cblas_idamax(nn*mm, v, 1) - 1];
00788         #endif
00789 #ifdef CUDALA
00790         }else{
00791                 double ret(0.0);
00792                 const int pozice = cublasIdamax(nn*mm, v, 1) - 1;
00793                 TEST_CUBLAS("cublasIdamax");
00794                 gpuget(1, sizeof(double), v + pozice, &ret);
00795                 return ret;
00796         }
00797 #endif
00798 }
00799 
00800 
00805 template<>
00806 inline const double NRMat<double>::amin() const{
00807         double ret(0.0);
00808 #ifdef CUDALA
00809         if(location == cpu){
00810 #endif
00811                 
00812                 const int nm = nn*mm;
00813                 double val(0.0);
00814                 int index(-1);
00815                 ret = std::numeric_limits<double>::max();
00816                 for(register int i=0; i < nm; i++){
00817                 #ifdef MATPTR                   
00818                         val = std::abs(v[0][i]);
00819                 #else
00820                         val = std::abs(v[i]);
00821                 #endif
00822                         if(val < ret){ index = i; ret = val; }
00823                 }
00824                 #ifdef MATPTR
00825                         ret = v[0][index];
00826                 #else
00827                         ret = v[index];
00828                 #endif  
00829 #ifdef CUDALA
00830         }else{
00831                 const int pozice = cublasIdamin(nn*mm, v, 1) - 1;
00832                 TEST_CUBLAS("cublasIdamin");
00833                 gpuget(1, sizeof(double), v + pozice, &ret);
00834         }
00835 #endif
00836         return ret;
00837 }
00838 
00839 
00844 template<>
00845 inline const complex<double> NRMat<complex<double> >::amax() const{
00846 #ifdef CUDALA
00847         if(location == cpu){
00848 #endif
00849         #ifdef MATPTR
00850                 return v[0][cblas_izamax(nn*mm, v[0], 1) - 1];
00851         #else
00852                 return v[cblas_izamax(nn*mm, v, 1) - 1];
00853         #endif
00854 #ifdef CUDALA
00855         }else{
00856                 complex<double> ret(0.0, 0.0);
00857                 const int pozice = cublasIzamax(nn*mm, (cuDoubleComplex*)v, 1) - 1;
00858                 TEST_CUBLAS("cublasIzamax");
00859                 gpuget(1, sizeof(complex<double>), v + pozice, &ret);
00860                 return ret;
00861         }
00862 #endif
00863 
00864 }
00865 
00866 
00871 template<>
00872 inline const complex<double> NRMat<complex<double> >::amin() const{
00873         complex<double> ret(0.0, 0.0);
00874 #ifdef CUDALA
00875         if(location == cpu){
00876 #endif
00877                 
00878                 const int nm = nn*mm;
00879                 int index(-1);
00880                 double val(0.0), min_val(0.0);
00881                 complex<double> z_val(0.0, 0.0);
00882 
00883                 min_val = std::numeric_limits<double>::max();
00884                 for(register int i=0; i < nm; i++){
00885                 #ifdef MATPTR                   
00886                         z_val = v[0][i];
00887                 #else
00888                         z_val = v[i];
00889                 #endif
00890                         val = std::abs(z_val.real()) + std::abs(z_val.imag());
00891                         if(val < min_val){ index = i; min_val = val; }
00892                 }
00893                 #ifdef MATPTR
00894                         ret = v[0][index];
00895                 #else
00896                         ret = v[index];
00897                 #endif  
00898 #ifdef CUDALA
00899         }else{
00900                 const int pozice = cublasIzamin(nn*mm, (cuDoubleComplex*)v, 1) - 1;
00901                 TEST_CUBLAS("cublasIzamin");
00902                 gpuget(1, sizeof(complex<double>), v + pozice, &ret);
00903         }
00904 #endif
00905         return ret;
00906 }
00907 
00908 
00912 template <typename T>
00913 NRMat<T>::~NRMat() {
00914         if(!count) return;
00915         if(--(*count) <= 0) {
00916                 if (v){
00917 #ifdef CUDALA
00918                         if(location == cpu){
00919 #endif
00920                         #ifdef MATPTR
00921                                 delete[] (v[0]);
00922                         #endif
00923                                 delete[] v;
00924 #ifdef CUDALA
00925                         }else{
00926                                 gpufree(v);
00927                         }
00928 #endif
00929                 }
00930                 delete count;
00931         }
00932 }
00933 
00934 
00939 template <typename T>
00940 NRMat<T> & NRMat<T>::operator=(const NRMat<T> &rhs) {
00941         if (this != &rhs){
00942                 if (count){
00943                         if (--(*count) ==0 ){
00944 #ifdef CUDALA
00945                                 if(location == cpu){
00946 #endif
00947                                 #ifdef MATPTR
00948                                         delete[] (v[0]);
00949                                 #endif
00950                                         delete[] v;
00951 #ifdef CUDALA
00952                                 }else{ gpufree(v); }
00953 #endif
00954                                 delete count;
00955                         }
00956                 }
00957                 v = rhs.v;
00958 #ifdef CUDALA
00959                 location = rhs.location;
00960 #endif
00961                 nn = rhs.nn;
00962                 mm = rhs.mm;
00963                 count = rhs.count;
00964                 if(count) (*count)++;
00965         }
00966         return *this;
00967 }
00968 
00969 
00970 
00975 template <typename T>
00976 NRMat<T> & NRMat<T>::operator|=(const NRMat<T> &rhs) {
00977         if(this == &rhs) return *this; 
00978         *this = rhs;
00979         this->copyonwrite();
00980         return *this;
00981 }
00982 
00983 
00987 template <typename T>
00988 void NRMat<T>::copyonwrite() {
00989         if(!count) laerror("attempt to call copyonwrite() for a matrix with count == 0");
00990         if(*count > 1){
00991                 (*count)--;
00992                 count = new int;
00993                 *count = 1;
00994 #ifdef CUDALA
00995                 if(location == cpu){ 
00996 #endif
00997                 #ifdef MATPTR
00998                         T **newv = new T*[nn];
00999                         newv[0] = new T[mm*nn];
01000                         memcpy(newv[0], v[0], mm*nn*sizeof(T));
01001                         v = newv;
01002                         for(register int i=1; i<nn; i++) v[i] = v[i-1] + mm;
01003                 #else
01004                         T *newv = new T[mm*nn];
01005                         memcpy(newv, v, mm*nn*sizeof(T));
01006                         v = newv;
01007                 #endif
01008 #ifdef CUDALA
01009                 }else{ 
01010                         T *newv = (T *) gpualloc(mm*nn*sizeof(T));
01011                         if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem");
01012                         cublasScopy(nn*mm*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
01013                         TEST_CUBLAS("cublasScopy");
01014                         v = newv;
01015                 }
01016 #endif
01017         }
01018 }
01019 
01020 
01027 template <typename T>
01028 void NRMat<T>::resize(int n, int m) {
01029 #ifdef DEBUG
01030         if (n<0 || m<0) laerror("illegal dimensions");
01031 #endif
01032 
01033         
01034         if(n == 0 || m == 0) m = n =0;
01035 
01036         if(count){
01037                 if(n==0 && m==0){
01038                         if(--(*count) <= 0){
01039 #ifdef CUDALA
01040                                 if(location==cpu){
01041 #endif
01042                                 #ifdef MATPTR
01043                                         if(v) delete[] (v[0]);
01044                                 #endif
01045                                         if(v) delete[] v;
01046 #ifdef CUDALA
01047                                 }
01048                                 else { gpufree(v); }
01049 #endif
01050                                 delete count;
01051                         }
01052                         count = 0;
01053                         nn = mm = 0;
01054                         v = 0;
01055                         return;
01056                 }
01057                 
01058 
01059 
01060 
01061                 if(*count > 1){
01062                         (*count)--;
01063                         count = 0;
01064                         nn = mm = 0;
01065                         v = 0;
01066                 }
01067         }
01068 
01069         if(!count){
01070                  count = new int;
01071                 *count = 1;
01072                 nn = n;
01073                 mm = m;
01074 #ifdef CUDALA
01075                 if(location==cpu){
01076 #endif
01077                 #ifdef MATPTR
01078                         v = new T*[nn];
01079                         v[0] = new T[m*n];
01080                         for (register int i=1; i< n; i++) v[i] = v[i-1] + m;
01081                 #else
01082                         v = new T[m*n];
01083                 #endif
01084 #ifdef CUDALA
01085                 }else{
01086                         v = (T *) gpualloc(n*m*sizeof(T));
01087                 }
01088 #endif
01089                 return;
01090         }
01091 
01092         
01093         if (n != nn || m != mm) {
01094                 nn = n;
01095                 mm = m;
01096 #ifdef CUDALA
01097                 if(location==cpu){
01098 #endif
01099                 #ifdef MATPTR
01100                         delete[] (v[0]);
01101                 #endif
01102                         delete[] v;
01103                 #ifdef MATPTR
01104                         v = new T*[nn];
01105                         v[0] = new T[m*n];
01106                         for (int i=1; i< n; i++) v[i] = v[i-1] + m;
01107                 #else
01108                         v = new T[m*n];
01109                 #endif
01110 #ifdef CUDALA
01111                 }else{
01112                         gpufree(v);
01113                         v=(T *) gpualloc(n*m*sizeof(T));
01114                 }
01115 #endif
01116         }
01117 }
01118 
01119 
01120 
01125 template<typename T>
01126 NRMat<complex<T> > complexify(const NRMat<T> &rhs) {
01127         NOT_GPU(rhs);
01128 
01129         NRMat<complex<T> > r(rhs.nrows(), rhs.ncols(), rhs.getlocation());
01130         for(register int i=0; i<rhs.nrows(); ++i){ 
01131                 for(register int j=0; j<rhs.ncols(); ++j) r(i,j) = rhs(i,j);
01132         }
01133         return r;
01134 }
01135 
01136 
01142 template <typename T>
01143 std::ostream& operator<<(std::ostream &s, const NRMat<T> &x) {
01144 #ifdef CUDALA
01145         if(x.getlocation() == cpu){
01146 #endif
01147                 int i(0),j(0);
01148                 int n(x.nrows()), m(x.ncols());
01149                 s << n << ' ' << m << '\n';
01150                 for(i=0; i<n; i++){
01151                         for(j=0; j<m; j++){
01152                                 
01153                                 s << (typename LA_traits_io<T>::IOtype) x[i][j] << (j==m-1 ? '\n' : ' ');
01154                         }
01155                 }
01156                 return s;
01157 #ifdef CUDALA
01158         }else{
01159                 NRMat<T> tmp = x;
01160                 tmp.moveto(cpu);
01161                 return s << tmp;
01162         }
01163 #endif
01164 }
01165 
01166 
01172 template <typename T>
01173 std::istream& operator>>(std::istream  &s, NRMat<T> &x)
01174 {
01175 #ifdef CUDALA
01176         if(x.getlocation() == cpu){
01177 #endif
01178                 int i(0), j(0), n(0), m(0);
01179                 s >> n >> m;
01180                 x.resize(n, m);
01181                 typename LA_traits_io<T>::IOtype tmp;
01182                 for(i=0;i<n;i++){
01183                         for(j=0; j<m;j++){
01184                                 s >> tmp;
01185                                 x[i][j] = tmp;
01186                         }
01187                 }
01188                 return s;
01189 #ifdef CUDALA
01190         }else{
01191                 NRMat<T> tmp;
01192                 tmp.moveto(cpu);
01193                 s >> tmp;
01194                 tmp.moveto(x.getlocation());
01195                 x = tmp;
01196                 return s;
01197         }
01198 #endif
01199 }
01200 
01201 
01206 template<typename T>
01207 class NRMat_from1 : public NRMat<T> {
01208 public:
01209         NRMat_from1(): NRMat<T>() {};
01210         explicit NRMat_from1(const int n): NRMat<T>(n) {};
01211         NRMat_from1(const NRMat<T> &rhs): NRMat<T>(rhs) {};
01212         NRMat_from1(const int n, const int m): NRMat<T>(n, m) {};
01213         NRMat_from1(const T &a, const int n, const int m): NRMat<T>(a, n, m) {};
01214         NRMat_from1(const T *a, const int n, const int m): NRMat<T>(a, n, m) {};
01215 
01216         inline const T& operator() (const int i, const int j) const {
01217         #ifdef DEBUG
01218                 if (i<1 || i>NRMat<T>::nn || j<1 || j>NRMat<T>::mm) laerror("index out of range");
01219                 if (!NRMat<T>::v) laerror("unallocated matrix");
01220         #endif
01221                 NOT_GPU(*this);
01222         #ifdef MATPTR
01223                 return NRMat<T>::v[i - 1][j - 1];
01224         #else
01225                 return NRMat<T>::v[(i-1)*NRMat<T>::mm+j-1];
01226         #endif
01227         }
01228 
01229         inline  T& operator() (const int i, const int j) {
01230         #ifdef DEBUG
01231                 if (_LA_count_check && *NRMat<T>::count != 1) laerror("matrix with *count > 1 used as l-value");
01232                 if (i<1 || i>NRMat<T>::nn || j<1 || j>NRMat<T>::mm) laerror("index out of range");
01233                 if (!NRMat<T>::v) laerror("unallocated matrix");
01234         #endif
01235                 NOT_GPU(*this);
01236         #ifdef MATPTR
01237                 return NRMat<T>::v[i-1][j-1];
01238         #else
01239                 return NRMat<T>::v[(i-1)*NRMat<T>::mm+j-1];
01240         #endif
01241         }
01242 
01243         inline const T get_ij(const int i, const int j) const {
01244                 T ret;
01245         #ifdef DEBUG
01246                 if (i<1 || i>NRMat<T>::nn || j<1 || j>NRMat<T>::mm) laerror("index out of range");
01247                 if (!NRMat<T>::v) laerror("unallocated matrix");
01248         #endif
01249         #ifdef CUDALA
01250                 if(NRMat<T>::location == cpu){
01251         #endif
01252         #ifdef MATPTR
01253                         return NRMat<T>::v[i - 1][j - 1];
01254         #else
01255                         return NRMat<T>::v[(i-1)*NRMat<T>::mm + (j-1)];
01256         #endif
01257         #ifdef CUDALA
01258                 }else{
01259                         const int pozice = (i-1)*NRMat<T>::mm + (j-1);
01260                         gpuget(1, sizeof(T), NRMat<T>::v + pozice, &ret);
01261                         return ret;
01262                 }
01263         #endif
01264         }
01265 };
01266 
01267 
01273 template<typename T>
01274 NRMat<T>& NRMat<T>::operator^=(const NRMat<T> &rhs){
01275 #ifdef DEBUG
01276         if (nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices");
01277 #endif
01278         SAME_LOC(*this, rhs);
01279         NOT_GPU(*this);
01280 
01281         copyonwrite();
01282 #ifdef MATPTR
01283         for (register int i=0; i< nn*mm; i++) v[0][i] *= rhs.v[0][i];
01284 #else
01285         const int Dim = nn*mm;
01286         for(register int i=0; i<Dim; i++) v[i] *= rhs.v[i];
01287 #endif
01288         return *this;
01289 }
01290 
01291 
01292 
01297 #ifdef CUDALA
01298 template<typename T>
01299 void NRMat<T>::moveto(const GPUID dest) {
01300         if(location == dest) return;
01301         
01302 
01303 
01304 
01305         CPU_GPU(location, dest);
01306         location = dest;
01307 
01308         if(v && !count) laerror("internal inconsistency of reference counting 1");
01309         if (!count) return;
01310 
01311         if(v && *count==0) laerror("internal inconsistency of reference counting 2");
01312         if(!v) return;
01313 
01314         T *vold = v;
01315 
01316         if(dest == cpu){ 
01317                 v = new T[nn*mm];
01318                 gpuget(nn*mm, sizeof(T), vold, v);
01319                 if(*count == 1){ gpufree(vold); }
01320                 else{ --(*count); count = new int(1); }
01321 
01322         }else{ 
01323                 v = (T *) gpualloc(nn*mm*sizeof(T));
01324                 gpuput(nn*mm, sizeof(T), vold, v);
01325                 if(*count == 1) delete[] vold;
01326                 else{ --(*count); count = new int(1);}
01327         }
01328 }
01329 #endif
01330 
01331 
01335 NRVECMAT_OPER(Mat, +)
01336 NRVECMAT_OPER(Mat, -)
01337 NRVECMAT_OPER(Mat, *)
01338 
01339 
01343 NRVECMAT_OPER2(Mat, +)
01344 NRVECMAT_OPER2(Mat, -)
01345 
01346 }
01347 #endif