00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #ifndef _LA_MAT_H_
00022 #define _LA_MAT_H_
00023 #include "la_traits.h"
00024
00025 namespace LA {
00026
00027
00031 template <typename T>
00032 class NRMat{
00033 protected:
00034 int nn;
00035 int mm;
00036 #ifdef MATPTR
00037 T **v;
00038 #else
00039 T *v;
00040 #endif
00041 int *count;
00042 #ifdef CUDALA
00043 GPUID location;
00044 #endif
00045 public:
00046 friend class NRVec<T>;
00047 friend class NRSMat<T>;
00048
00050 ~NRMat();
00051
00052
00055 inline NRMat() : nn(0), mm(0), v(0), count(0){
00056 #ifdef CUDALA
00057 location = DEFAULT_LOC;
00058 #endif
00059 };
00060
00061
00067 inline NRMat(const int n, const int m, const GPUID loc = undefined);
00068
00070 inline NRMat(const T &a, const int n, const int m, const GPUID loc);
00071
00073 inline NRMat(const T &a, const int n, const int m);
00074
00076 NRMat(const T *a, const int n, const int m);
00077
00079 inline NRMat(const NRMat &rhs);
00080
00082 NRMat(const typename LA_traits_complex<T>::NRMat_Noncomplex_type &rhs, bool imagpart = false);
00083
00085 explicit NRMat(const NRSMat<T> &rhs);
00086
00088 #ifdef MATPTR
00089 explicit NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset = 0):NRMat(&rhs[0][0] + offset , n, m){
00090 if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
00091 };
00092 #else
00093 explicit NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset = 0);
00094 #endif
00095
00096 #ifdef MATPTR
00097 const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v[0],rhs.v[0],nn*mm);}
00098 #else
00099 const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v,rhs.v,nn*mm);}
00100 #endif
00101
00102 const bool operator==(const NRMat &rhs) const {return !(*this != rhs);};
00103
00105 inline int getcount() const {return count?*count:0;}
00106
00108 void copyonwrite();
00109
00110
00115 #ifdef CUDALA
00116 inline GPUID getlocation() const {return location;}
00117 void moveto(const GPUID dest);
00118 #else
00119 inline GPUID getlocation() const {return cpu;}
00120 void moveto(const GPUID dest) {};
00121 #endif
00122
00124 void randomize(const typename LA_traits<T>::normtype &x);
00125
00127 NRMat & operator=(const NRMat &rhs);
00129 NRMat & operator|=(const NRMat &rhs);
00130
00132 NRMat & operator=(const T &a);
00134 NRMat & operator+=(const T &a);
00136 NRMat & operator-=(const T &a);
00137
00139 NRMat & operator*=(const T &a);
00140
00142 NRMat & operator+=(const NRMat &rhs);
00144 NRMat & operator-=(const NRMat &rhs);
00146 NRMat & operator^=(const NRMat<T> &rhs);
00147
00149 NRMat & operator+=(const NRSMat<T> &rhs);
00151 NRMat & operator-=(const NRSMat<T> &rhs);
00152
00154 const NRMat operator-() const;
00155
00157 inline const NRMat operator+(const T &a) const;
00159 inline const NRMat operator-(const T &a) const;
00161 inline const NRMat operator*(const T &a) const;
00162
00164 inline const NRMat operator+(const NRMat &rhs) const;
00166 inline const NRMat operator+(const NRSMat<T> &rhs) const;
00167
00169 inline const NRMat operator-(const NRMat &rhs) const;
00171 inline const NRMat operator-(const NRSMat<T> &rhs) const;
00172
00174 const NRMat operator*(const NRMat &rhs) const;
00176 const NRMat operator*(const NRSMat<T> &rhs) const;
00177
00179 const NRMat operator&(const NRMat &rhs) const;
00181 const NRMat operator|(const NRMat<T> &rhs) const;
00182
00184 const NRVec<T> operator*(const NRVec<T> &rhs) const {
00185 NRVec<T> result(nn, rhs.getlocation());
00186 result.gemv((T)0, *this, 'n', (T)1, rhs);
00187 return result;
00188 };
00190 const NRVec<complex<T> > operator*(const NRVec<complex<T> > &rhs) const {
00191 NRVec<complex<T> > result(nn, rhs.getlocation());
00192 result.gemv((T)0, *this, 'n', (T)1, rhs);
00193 return result;
00194 };
00195
00197 const T dot(const NRMat &rhs) const;
00198
00200 const NRMat oplus(const NRMat &rhs) const;
00202 const NRMat otimes(const NRMat &rhs, bool reversecolumns = false) const;
00203
00205 void diagmultl(const NRVec<T> &rhs);
00207 void diagmultr(const NRVec<T> &rhs);
00208
00210 const NRSMat<T> transposedtimes() const;
00212 const NRSMat<T> timestransposed() const;
00213
00215 const NRVec<T> rsum() const;
00217 const NRVec<T> csum() const;
00218
00220 void orthonormalize(const bool rowcol, const NRSMat<T> *metric = NULL);
00221
00223 const NRVec<T> row(const int i, int l = -1) const;
00224
00226 const NRVec<T> column(const int j, int l = -1) const {
00227 NOT_GPU(*this);
00228 if(l < 0) l = nn;
00229 NRVec<T> r(l);
00230 for(register int i=0; i<l; ++i) r[i] = (*this)(i,j);
00231 return r;
00232 };
00233
00235 const T* diagonalof(NRVec<T> &, const bool divide = 0, bool cache = false) const;
00237 void diagonalset(const NRVec<T> &);
00238
00240 void gemv(const T beta, NRVec<T> &r, const char trans, const T alpha, const NRVec<T> &x) const { r.gemv(beta, *this, trans, alpha, x); };
00242 void gemv(const T beta, NRVec<complex<T> > &r, const char trans, const T alpha, const NRVec<complex<T> > &x) const { r.gemv(beta, *this, trans, alpha, x); };
00243
00245 inline T* operator[](const int i);
00247 inline const T* operator[](const int i) const;
00248
00250 inline T& operator()(const int i, const int j);
00252 inline const T& operator()(const int i, const int j) const;
00254 inline const T get_ij(const int i, const int j) const;
00255
00257 inline int nrows() const;
00259 inline int ncols() const;
00261 inline int size() const;
00262
00264 void get(int fd, bool dimensions = 1, bool transposed = false);
00266 void put(int fd, bool dimensions = 1, bool transposed = false) const;
00268 void fprintf(FILE *f, const char *format, const int modulo) const;
00270 void fscanf(FILE *f, const char *format);
00271
00273 void clear(){
00274 if(nn&&mm){
00275 copyonwrite();
00276 LA_traits<T>::clear((*this)[0], nn*mm);
00277 }
00278 };
00279
00281 void resize(int n, int m);
00282
00284 inline operator T*();
00286 inline operator const T*() const;
00287
00289 NRMat& transposeme(const int n = 0);
00291 NRMat& conjugateme();
00292
00294 const NRMat transpose(bool conj = false) const;
00296 const NRMat conjugate() const;
00297
00299 const NRMat submatrix(const int fromrow, const int torow, const int fromcol, const int tocol) const;
00300
00302 void storesubmatrix(const int fromrow, const int fromcol, const NRMat &rhs);
00303
00305 void gemm(const T &beta, const NRMat &a, const char transa, const NRMat &b, const char transb, const T &alpha);
00306
00308 const typename LA_traits<T>::normtype norm(const T scalar = (T)0) const;
00309
00311 void axpy(const T alpha, const NRMat &x);
00312
00314 inline const T amax() const;
00316 inline const T amin() const;
00317
00319 const T trace() const;
00320
00322 NRMat & swap_rows();
00324 NRMat & swap_cols();
00326 NRMat & swap_rows_cols();
00327
00329 SparseSMat<T> operator*(const SparseSMat<T> &rhs) const;
00330
00332 explicit NRMat(const SparseMat<T> &rhs);
00334 explicit NRMat(const SparseSMat<T> &rhs);
00335
00337 NRMat & operator+=(const SparseMat<T> &rhs);
00339 NRMat & operator-=(const SparseMat<T> &rhs);
00340
00342 void gemm(const T &beta, const SparseMat<T> &a, const char transa, const NRMat &b, const char transb, const T &alpha);
00343
00344 inline void simplify() {};
00345 bool issymmetric() const { return 0; };
00346
00347 #ifndef NO_STRASSEN
00349 void strassen(const T beta, const NRMat &a, const char transa, const NRMat &b, const char transb, const T alpha);
00350 void s_cutoff(const int,const int,const int,const int) const;
00351 #endif
00352 };
00353
00354 }
00355
00356
00357 #include "vec.h"
00358 #include "smat.h"
00359 #include "sparsemat.h"
00360 #include "sparsesmat.h"
00361
00362 namespace LA {
00363
00364
00371 template <typename T>
00372 NRMat<T>::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) {
00373 T* p;
00374 *count = 1;
00375 const int nm = n*m;
00376 #ifdef CUDALA
00377 location = (loc==undefined?DEFAULT_LOC:loc);
00378 if(location == cpu) {
00379 #endif
00380 #ifdef MATPTR
00381 v = new T*[n];
00382 p = v[0] = new T[nm];
00383 for (int i=1; i<n; i++) v[i] = v[i-1] + m;
00384 #else
00385 p = v = new T[nm];
00386 #endif
00387 #ifdef CUDALA
00388 }else{
00389 const T val = 0;
00390 v = (T*) gpualloc(nm*sizeof(T));
00391 }
00392 #endif
00393 }
00394
00395
00402 template <typename T>
00403 NRMat<T>::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) {
00404 const int nm = n*m;
00405 T *p;
00406 *count = 1;
00407
00408 #ifdef CUDALA
00409 location = (loc==undefined?DEFAULT_LOC:loc);
00410 if(location==cpu){
00411 #endif
00412 #ifdef MATPTR
00413 v = new T*[n];
00414 p = v[0] = new T[nm];
00415 for (register int i=1; i<n; i++) v[i] = v[i-1] + m;
00416 #else
00417 p = v = new T[nm];
00418 #endif
00419 if (a != (T)0){
00420 for (register int i=0; i<nm; i++) *p++ = a;
00421 }else{
00422 memset(p, 0, nm*sizeof(T));
00423 }
00424 #ifdef CUDALA
00425 }else{
00426 if(sizeof(T)%sizeof(float) != 0)laerror("memory alignment error");
00427
00428 v = (T*)gpualloc(nm*sizeof(T));
00429 smart_gpu_set(nm, a, v);
00430 }
00431 #endif
00432 }
00433
00434
00441 template <typename T>
00442 NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int) {
00443 const int nm = n*m;
00444 T *p;
00445 *count = 1;
00446
00447 #ifdef CUDALA
00448 location = DEFAULT_LOC;
00449 if(location==cpu){
00450 #endif
00451 #ifdef MATPTR
00452 v = new T*[n];
00453 p = v[0] = new T[nm];
00454 for (register int i=1; i<n; i++) v[i] = v[i-1] + m;
00455 #else
00456 p = v = new T[m*n];
00457 #endif
00458 if (a != (T)0)
00459 for (register int i=0; i<nm; i++) *p++ = a;
00460 else
00461 memset(p, 0, nm*sizeof(T));
00462 #ifdef CUDALA
00463 }else{
00464 v = (T*)gpualloc(nm*sizeof(T));
00465 smart_gpu_set(nm, a, v);
00466 }
00467 #endif
00468 }
00469
00470
00477 template <typename T>
00478 NRMat<T>::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int) {
00479 const int nm = n*m;
00480 #ifdef CUDALA
00481 location = DEFAULT_LOC;
00482 #endif
00483
00484 *count = 1;
00485 #ifdef CUDALA
00486 if(location==cpu){
00487 #endif
00488 #ifdef MATPTR
00489 v = new T*[n];
00490 v[0] = new T[nm];
00491 for (register int i=1; i<n; i++) v[i] = v[i-1] + m;
00492 memcpy(v[0], a, nm*sizeof(T));
00493 #else
00494 v = new T[nm];
00495 memcpy(v, a, nm*sizeof(T));
00496 #endif
00497 #ifdef CUDALA
00498 }else{
00499 v = (T*) gpualloc(nm*sizeof(T));
00500 cublasSetVector(nm, sizeof(T), a, 1, v, 1);
00501 }
00502 #endif
00503
00504 }
00505
00506
00511 template <typename T>
00512 NRMat<T>::NRMat(const NRMat &rhs) {
00513 #ifdef CUDALA
00514 location = rhs.location;
00515 #endif
00516 nn = rhs.nn;
00517 mm = rhs.mm;
00518 count = rhs.count;
00519 v = rhs.v;
00520 if (count) ++(*count);
00521 }
00522
00523
00528 template <typename T>
00529 NRMat<T>::NRMat(const NRSMat<T> &rhs) {
00530 NOT_GPU(rhs);
00531
00532 #ifdef CUDALA
00533 location = rhs.location;
00534 #endif
00535
00536 int i(0), j(0), k(0);
00537 nn = mm = rhs.nrows();
00538 count = new int;
00539 *count = 1;
00540 #ifdef MATPTR
00541 v = new T*[nn];
00542 v[0] = new T[mm*nn];
00543 for (int i=1; i<nn; i++) v[i] = v[i-1] + mm;
00544 #else
00545 v = new T[mm*nn];
00546 #endif
00547
00548 #ifdef MATPTR
00549 for (i=0; i<nn; i++){
00550 for (j=0; j<=i; j++){
00551 v[i][j] = v[j][i] = rhs[k++];
00552 }
00553 }
00554 #else
00555 for (i=0; i<nn; i++){
00556 for (j=0; j<=i; j++){
00557 v[i*nn + j] = v[j*nn + i] = rhs[k++];
00558 }
00559 }
00560 #endif
00561 }
00562
00563
00570 #ifndef MATPTR
00571 template <typename T>
00572 NRMat<T>::NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset)
00573 {
00574 if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
00575
00576 #ifdef CUDALA
00577 location=rhs.location;
00578 #endif
00579
00580 nn = n;
00581 mm = m;
00582 count = rhs.count;
00583 v = rhs.v + offset;
00584 (*count)++;
00585 }
00586 #endif
00587
00588
00594 template <typename T>
00595 inline const NRMat<T> NRMat<T>::operator+(const NRSMat<T> &rhs) const {
00596 return NRMat<T>(*this) += rhs;
00597 }
00598
00599
00605 template <typename T>
00606 inline const NRMat<T> NRMat<T>::operator-(const NRSMat<T> &rhs) const {
00607 return NRMat<T>(*this) -= rhs;
00608 }
00609
00610
00614 template <typename T>
00615 inline T* NRMat<T>::operator[](const int i) {
00616 #ifdef DEBUG
00617 if (_LA_count_check && *count != 1) laerror("matrix with *count>1 used as l-value");
00618 if (i < 0 || i >= nn) laerror("Mat [] out of range");
00619 if (!v) laerror("unallocated matrix");
00620 #endif
00621 NOT_GPU(*this);
00622 #ifdef MATPTR
00623 return v[i];
00624 #else
00625 return v + i*mm;
00626 #endif
00627 }
00628
00629
00633 template <typename T>
00634 inline const T* NRMat<T>::operator[](const int i) const {
00635 #ifdef DEBUG
00636 if (i < 0 || i >= nn) laerror("index out of range");
00637 if (!v) laerror("unallocated matrix");
00638 #endif
00639 NOT_GPU(*this);
00640 #ifdef MATPTR
00641 return v[i];
00642 #else
00643 return v + i*mm;
00644 #endif
00645 }
00646
00647
00654 template <typename T>
00655 inline T& NRMat<T>::operator()(const int i, const int j){
00656 #ifdef DEBUG
00657 if (_LA_count_check && *count != 1) laerror("NRMat::operator(,) used as l-value for a matrix with count > 1");
00658 if (i < 0 || i >= nn && nn > 0 || j < 0 || j >= mm && mm > 0) laerror("index out of range");
00659 if (!v) laerror("unallocated matrix");
00660 #endif
00661 NOT_GPU(*this);
00662 #ifdef MATPTR
00663 return v[i][j];
00664 #else
00665 return v[i*mm + j];
00666 #endif
00667 }
00668
00669
00675 template <typename T>
00676 inline const T& NRMat<T>::operator()(const int i, const int j) const{
00677 T ret;
00678 #ifdef DEBUG
00679 if (i<0 || i>=nn && nn>0 || j<0 || j>=mm && mm>0) laerror("index out of range");
00680 if (!v) laerror("unallocated matrix");
00681 #endif
00682 NOT_GPU(*this);
00683 #ifdef MATPTR
00684 return v[i][j];
00685 #else
00686 return v[i*mm + j];
00687 #endif
00688 }
00689
00690
00696 template <typename T>
00697 inline const T NRMat<T>::get_ij(const int i, const int j) const{
00698 T ret;
00699 #ifdef DEBUG
00700 if (i<0 || i>=nn || j<0 || j>=mm) laerror("index out of range");
00701 if (!v) laerror("unallocated matrix");
00702 #endif
00703 #ifdef CUDALA
00704 if(location == cpu){
00705 #endif
00706 #ifdef MATPTR
00707 return v[i][j];
00708 #else
00709 return v[i*mm + j];
00710 #endif
00711 #ifdef CUDALA
00712 }else{
00713 const int pozice = i*mm + j;
00714 gpuget(1, sizeof(T), v + pozice, &ret);
00715 return ret;
00716 }
00717 #endif
00718 }
00719
00720
00723 template <typename T>
00724 inline int NRMat<T>::nrows() const{
00725 return nn;
00726 }
00727
00728
00731 template <typename T>
00732 inline int NRMat<T>::ncols() const{
00733 return mm;
00734 }
00735
00736
00739 template <typename T>
00740 inline int NRMat<T>::size() const{
00741 return nn*mm;
00742 }
00743
00744
00747 template <typename T>
00748 inline NRMat<T>::operator T*(){
00749 #ifdef DEBUG
00750 if (!v) laerror("unallocated matrix");
00751 #endif
00752 #ifdef MATPTR
00753 return v[0];
00754 #else
00755 return v;
00756 #endif
00757 }
00758
00759
00762 template <typename T>
00763 inline NRMat<T>::operator const T*() const{
00764 #ifdef DEBUG
00765 if (!v) laerror("unallocated matrix");
00766 #endif
00767 #ifdef MATPTR
00768 return v[0];
00769 #else
00770 return v;
00771 #endif
00772 }
00773
00774
00779 template<>
00780 inline const double NRMat<double>::amax() const{
00781 #ifdef CUDALA
00782 if(location == cpu){
00783 #endif
00784 #ifdef MATPTR
00785 return v[0][cblas_idamax(nn*mm, v[0], 1) - 1];
00786 #else
00787 return v[cblas_idamax(nn*mm, v, 1) - 1];
00788 #endif
00789 #ifdef CUDALA
00790 }else{
00791 double ret(0.0);
00792 const int pozice = cublasIdamax(nn*mm, v, 1) - 1;
00793 TEST_CUBLAS("cublasIdamax");
00794 gpuget(1, sizeof(double), v + pozice, &ret);
00795 return ret;
00796 }
00797 #endif
00798 }
00799
00800
00805 template<>
00806 inline const double NRMat<double>::amin() const{
00807 double ret(0.0);
00808 #ifdef CUDALA
00809 if(location == cpu){
00810 #endif
00811
00812 const int nm = nn*mm;
00813 double val(0.0);
00814 int index(-1);
00815 ret = std::numeric_limits<double>::max();
00816 for(register int i=0; i < nm; i++){
00817 #ifdef MATPTR
00818 val = std::abs(v[0][i]);
00819 #else
00820 val = std::abs(v[i]);
00821 #endif
00822 if(val < ret){ index = i; ret = val; }
00823 }
00824 #ifdef MATPTR
00825 ret = v[0][index];
00826 #else
00827 ret = v[index];
00828 #endif
00829 #ifdef CUDALA
00830 }else{
00831 const int pozice = cublasIdamin(nn*mm, v, 1) - 1;
00832 TEST_CUBLAS("cublasIdamin");
00833 gpuget(1, sizeof(double), v + pozice, &ret);
00834 }
00835 #endif
00836 return ret;
00837 }
00838
00839
00844 template<>
00845 inline const complex<double> NRMat<complex<double> >::amax() const{
00846 #ifdef CUDALA
00847 if(location == cpu){
00848 #endif
00849 #ifdef MATPTR
00850 return v[0][cblas_izamax(nn*mm, v[0], 1) - 1];
00851 #else
00852 return v[cblas_izamax(nn*mm, v, 1) - 1];
00853 #endif
00854 #ifdef CUDALA
00855 }else{
00856 complex<double> ret(0.0, 0.0);
00857 const int pozice = cublasIzamax(nn*mm, (cuDoubleComplex*)v, 1) - 1;
00858 TEST_CUBLAS("cublasIzamax");
00859 gpuget(1, sizeof(complex<double>), v + pozice, &ret);
00860 return ret;
00861 }
00862 #endif
00863
00864 }
00865
00866
00871 template<>
00872 inline const complex<double> NRMat<complex<double> >::amin() const{
00873 complex<double> ret(0.0, 0.0);
00874 #ifdef CUDALA
00875 if(location == cpu){
00876 #endif
00877
00878 const int nm = nn*mm;
00879 int index(-1);
00880 double val(0.0), min_val(0.0);
00881 complex<double> z_val(0.0, 0.0);
00882
00883 min_val = std::numeric_limits<double>::max();
00884 for(register int i=0; i < nm; i++){
00885 #ifdef MATPTR
00886 z_val = v[0][i];
00887 #else
00888 z_val = v[i];
00889 #endif
00890 val = std::abs(z_val.real()) + std::abs(z_val.imag());
00891 if(val < min_val){ index = i; min_val = val; }
00892 }
00893 #ifdef MATPTR
00894 ret = v[0][index];
00895 #else
00896 ret = v[index];
00897 #endif
00898 #ifdef CUDALA
00899 }else{
00900 const int pozice = cublasIzamin(nn*mm, (cuDoubleComplex*)v, 1) - 1;
00901 TEST_CUBLAS("cublasIzamin");
00902 gpuget(1, sizeof(complex<double>), v + pozice, &ret);
00903 }
00904 #endif
00905 return ret;
00906 }
00907
00908
00912 template <typename T>
00913 NRMat<T>::~NRMat() {
00914 if(!count) return;
00915 if(--(*count) <= 0) {
00916 if (v){
00917 #ifdef CUDALA
00918 if(location == cpu){
00919 #endif
00920 #ifdef MATPTR
00921 delete[] (v[0]);
00922 #endif
00923 delete[] v;
00924 #ifdef CUDALA
00925 }else{
00926 gpufree(v);
00927 }
00928 #endif
00929 }
00930 delete count;
00931 }
00932 }
00933
00934
00939 template <typename T>
00940 NRMat<T> & NRMat<T>::operator=(const NRMat<T> &rhs) {
00941 if (this != &rhs){
00942 if (count){
00943 if (--(*count) ==0 ){
00944 #ifdef CUDALA
00945 if(location == cpu){
00946 #endif
00947 #ifdef MATPTR
00948 delete[] (v[0]);
00949 #endif
00950 delete[] v;
00951 #ifdef CUDALA
00952 }else{ gpufree(v); }
00953 #endif
00954 delete count;
00955 }
00956 }
00957 v = rhs.v;
00958 #ifdef CUDALA
00959 location = rhs.location;
00960 #endif
00961 nn = rhs.nn;
00962 mm = rhs.mm;
00963 count = rhs.count;
00964 if(count) (*count)++;
00965 }
00966 return *this;
00967 }
00968
00969
00970
00975 template <typename T>
00976 NRMat<T> & NRMat<T>::operator|=(const NRMat<T> &rhs) {
00977 if(this == &rhs) return *this;
00978 *this = rhs;
00979 this->copyonwrite();
00980 return *this;
00981 }
00982
00983
00987 template <typename T>
00988 void NRMat<T>::copyonwrite() {
00989 if(!count) laerror("attempt to call copyonwrite() for a matrix with count == 0");
00990 if(*count > 1){
00991 (*count)--;
00992 count = new int;
00993 *count = 1;
00994 #ifdef CUDALA
00995 if(location == cpu){
00996 #endif
00997 #ifdef MATPTR
00998 T **newv = new T*[nn];
00999 newv[0] = new T[mm*nn];
01000 memcpy(newv[0], v[0], mm*nn*sizeof(T));
01001 v = newv;
01002 for(register int i=1; i<nn; i++) v[i] = v[i-1] + mm;
01003 #else
01004 T *newv = new T[mm*nn];
01005 memcpy(newv, v, mm*nn*sizeof(T));
01006 v = newv;
01007 #endif
01008 #ifdef CUDALA
01009 }else{
01010 T *newv = (T *) gpualloc(mm*nn*sizeof(T));
01011 if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem");
01012 cublasScopy(nn*mm*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
01013 TEST_CUBLAS("cublasScopy");
01014 v = newv;
01015 }
01016 #endif
01017 }
01018 }
01019
01020
01027 template <typename T>
01028 void NRMat<T>::resize(int n, int m) {
01029 #ifdef DEBUG
01030 if (n<0 || m<0) laerror("illegal dimensions");
01031 #endif
01032
01033
01034 if(n == 0 || m == 0) m = n =0;
01035
01036 if(count){
01037 if(n==0 && m==0){
01038 if(--(*count) <= 0){
01039 #ifdef CUDALA
01040 if(location==cpu){
01041 #endif
01042 #ifdef MATPTR
01043 if(v) delete[] (v[0]);
01044 #endif
01045 if(v) delete[] v;
01046 #ifdef CUDALA
01047 }
01048 else { gpufree(v); }
01049 #endif
01050 delete count;
01051 }
01052 count = 0;
01053 nn = mm = 0;
01054 v = 0;
01055 return;
01056 }
01057
01058
01059
01060
01061 if(*count > 1){
01062 (*count)--;
01063 count = 0;
01064 nn = mm = 0;
01065 v = 0;
01066 }
01067 }
01068
01069 if(!count){
01070 count = new int;
01071 *count = 1;
01072 nn = n;
01073 mm = m;
01074 #ifdef CUDALA
01075 if(location==cpu){
01076 #endif
01077 #ifdef MATPTR
01078 v = new T*[nn];
01079 v[0] = new T[m*n];
01080 for (register int i=1; i< n; i++) v[i] = v[i-1] + m;
01081 #else
01082 v = new T[m*n];
01083 #endif
01084 #ifdef CUDALA
01085 }else{
01086 v = (T *) gpualloc(n*m*sizeof(T));
01087 }
01088 #endif
01089 return;
01090 }
01091
01092
01093 if (n != nn || m != mm) {
01094 nn = n;
01095 mm = m;
01096 #ifdef CUDALA
01097 if(location==cpu){
01098 #endif
01099 #ifdef MATPTR
01100 delete[] (v[0]);
01101 #endif
01102 delete[] v;
01103 #ifdef MATPTR
01104 v = new T*[nn];
01105 v[0] = new T[m*n];
01106 for (int i=1; i< n; i++) v[i] = v[i-1] + m;
01107 #else
01108 v = new T[m*n];
01109 #endif
01110 #ifdef CUDALA
01111 }else{
01112 gpufree(v);
01113 v=(T *) gpualloc(n*m*sizeof(T));
01114 }
01115 #endif
01116 }
01117 }
01118
01119
01120
01125 template<typename T>
01126 NRMat<complex<T> > complexify(const NRMat<T> &rhs) {
01127 NOT_GPU(rhs);
01128
01129 NRMat<complex<T> > r(rhs.nrows(), rhs.ncols(), rhs.getlocation());
01130 for(register int i=0; i<rhs.nrows(); ++i){
01131 for(register int j=0; j<rhs.ncols(); ++j) r(i,j) = rhs(i,j);
01132 }
01133 return r;
01134 }
01135
01136
01142 template <typename T>
01143 std::ostream& operator<<(std::ostream &s, const NRMat<T> &x) {
01144 #ifdef CUDALA
01145 if(x.getlocation() == cpu){
01146 #endif
01147 int i(0),j(0);
01148 int n(x.nrows()), m(x.ncols());
01149 s << n << ' ' << m << '\n';
01150 for(i=0; i<n; i++){
01151 for(j=0; j<m; j++){
01152
01153 s << (typename LA_traits_io<T>::IOtype) x[i][j] << (j==m-1 ? '\n' : ' ');
01154 }
01155 }
01156 return s;
01157 #ifdef CUDALA
01158 }else{
01159 NRMat<T> tmp = x;
01160 tmp.moveto(cpu);
01161 return s << tmp;
01162 }
01163 #endif
01164 }
01165
01166
01172 template <typename T>
01173 std::istream& operator>>(std::istream &s, NRMat<T> &x)
01174 {
01175 #ifdef CUDALA
01176 if(x.getlocation() == cpu){
01177 #endif
01178 int i(0), j(0), n(0), m(0);
01179 s >> n >> m;
01180 x.resize(n, m);
01181 typename LA_traits_io<T>::IOtype tmp;
01182 for(i=0;i<n;i++){
01183 for(j=0; j<m;j++){
01184 s >> tmp;
01185 x[i][j] = tmp;
01186 }
01187 }
01188 return s;
01189 #ifdef CUDALA
01190 }else{
01191 NRMat<T> tmp;
01192 tmp.moveto(cpu);
01193 s >> tmp;
01194 tmp.moveto(x.getlocation());
01195 x = tmp;
01196 return s;
01197 }
01198 #endif
01199 }
01200
01201
01206 template<typename T>
01207 class NRMat_from1 : public NRMat<T> {
01208 public:
01209 NRMat_from1(): NRMat<T>() {};
01210 explicit NRMat_from1(const int n): NRMat<T>(n) {};
01211 NRMat_from1(const NRMat<T> &rhs): NRMat<T>(rhs) {};
01212 NRMat_from1(const int n, const int m): NRMat<T>(n, m) {};
01213 NRMat_from1(const T &a, const int n, const int m): NRMat<T>(a, n, m) {};
01214 NRMat_from1(const T *a, const int n, const int m): NRMat<T>(a, n, m) {};
01215
01216 inline const T& operator() (const int i, const int j) const {
01217 #ifdef DEBUG
01218 if (i<1 || i>NRMat<T>::nn || j<1 || j>NRMat<T>::mm) laerror("index out of range");
01219 if (!NRMat<T>::v) laerror("unallocated matrix");
01220 #endif
01221 NOT_GPU(*this);
01222 #ifdef MATPTR
01223 return NRMat<T>::v[i - 1][j - 1];
01224 #else
01225 return NRMat<T>::v[(i-1)*NRMat<T>::mm+j-1];
01226 #endif
01227 }
01228
01229 inline T& operator() (const int i, const int j) {
01230 #ifdef DEBUG
01231 if (_LA_count_check && *NRMat<T>::count != 1) laerror("matrix with *count > 1 used as l-value");
01232 if (i<1 || i>NRMat<T>::nn || j<1 || j>NRMat<T>::mm) laerror("index out of range");
01233 if (!NRMat<T>::v) laerror("unallocated matrix");
01234 #endif
01235 NOT_GPU(*this);
01236 #ifdef MATPTR
01237 return NRMat<T>::v[i-1][j-1];
01238 #else
01239 return NRMat<T>::v[(i-1)*NRMat<T>::mm+j-1];
01240 #endif
01241 }
01242
01243 inline const T get_ij(const int i, const int j) const {
01244 T ret;
01245 #ifdef DEBUG
01246 if (i<1 || i>NRMat<T>::nn || j<1 || j>NRMat<T>::mm) laerror("index out of range");
01247 if (!NRMat<T>::v) laerror("unallocated matrix");
01248 #endif
01249 #ifdef CUDALA
01250 if(NRMat<T>::location == cpu){
01251 #endif
01252 #ifdef MATPTR
01253 return NRMat<T>::v[i - 1][j - 1];
01254 #else
01255 return NRMat<T>::v[(i-1)*NRMat<T>::mm + (j-1)];
01256 #endif
01257 #ifdef CUDALA
01258 }else{
01259 const int pozice = (i-1)*NRMat<T>::mm + (j-1);
01260 gpuget(1, sizeof(T), NRMat<T>::v + pozice, &ret);
01261 return ret;
01262 }
01263 #endif
01264 }
01265 };
01266
01267
01273 template<typename T>
01274 NRMat<T>& NRMat<T>::operator^=(const NRMat<T> &rhs){
01275 #ifdef DEBUG
01276 if (nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices");
01277 #endif
01278 SAME_LOC(*this, rhs);
01279 NOT_GPU(*this);
01280
01281 copyonwrite();
01282 #ifdef MATPTR
01283 for (register int i=0; i< nn*mm; i++) v[0][i] *= rhs.v[0][i];
01284 #else
01285 const int Dim = nn*mm;
01286 for(register int i=0; i<Dim; i++) v[i] *= rhs.v[i];
01287 #endif
01288 return *this;
01289 }
01290
01291
01292
01297 #ifdef CUDALA
01298 template<typename T>
01299 void NRMat<T>::moveto(const GPUID dest) {
01300 if(location == dest) return;
01301
01302
01303
01304
01305 CPU_GPU(location, dest);
01306 location = dest;
01307
01308 if(v && !count) laerror("internal inconsistency of reference counting 1");
01309 if (!count) return;
01310
01311 if(v && *count==0) laerror("internal inconsistency of reference counting 2");
01312 if(!v) return;
01313
01314 T *vold = v;
01315
01316 if(dest == cpu){
01317 v = new T[nn*mm];
01318 gpuget(nn*mm, sizeof(T), vold, v);
01319 if(*count == 1){ gpufree(vold); }
01320 else{ --(*count); count = new int(1); }
01321
01322 }else{
01323 v = (T *) gpualloc(nn*mm*sizeof(T));
01324 gpuput(nn*mm, sizeof(T), vold, v);
01325 if(*count == 1) delete[] vold;
01326 else{ --(*count); count = new int(1);}
01327 }
01328 }
01329 #endif
01330
01331
01335 NRVECMAT_OPER(Mat, +)
01336 NRVECMAT_OPER(Mat, -)
01337 NRVECMAT_OPER(Mat, *)
01338
01339
01343 NRVECMAT_OPER2(Mat, +)
01344 NRVECMAT_OPER2(Mat, -)
01345
01346 }
01347 #endif