LA library: cuda_la.h Source File

00001 //------------------------------------------------------------------------------
00002 /* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */
00003 //------------------------------------------------------------------------------
00004 #ifndef _CUDA_LA_H
00005 #define _CUDA_LA_H
00006 
00007 #include <errno.h>
00008 #ifdef CUDALA
00009 #undef MATPTR
00010 #include "cublas.h"
00011 #endif
00012 
00013 #include "la_traits.h"
00014 
namespace LA {

//------------------------------------------------------------------------------
// Operand-location guards.
//
// Each macro expands to a braced block that calls laerror() when the stated
// location requirement is violated.  CPU_GPU compares its arguments directly
// against `cpu`; the other macros call .getlocation() on their arguments.
//------------------------------------------------------------------------------
#ifdef CUDALA
// fails when neither location equals `cpu`
#define CPU_GPU(x,y) {if((x)!=cpu && (y)!=cpu) laerror("one operand must be in CPU memory");}
// fails when the operand is not located on `cpu`
#define NOT_GPU(x) {if((x).getlocation()!=cpu) laerror("Operation not implemented on GPU (yet). Use .moveto(0) first.");}
// fails when the operand is located on `cpu`
#define NOT_CPU(x) {if((x).getlocation()==cpu) laerror("Operation not implemented on CPU (yet). Use .moveto(>0) first.");}
// fails when the two operands report different locations
#define SAME_LOC(x,y) {if((x).getlocation()!=(y).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
// fails unless all three operands report the same location
#define SAME_LOC3(x,y,z) {if((x).getlocation()!=(y).getlocation() || (x).getlocation()!=(z).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
#else
// Without CUDALA the guards do not inspect their arguments at all.
#define CPU_GPU(x,y) {}
#define NOT_GPU(x) {}
// NOT_GPU is unconditionally satisfied in this branch, i.e. every operand is
// treated as residing on the CPU; by the same reasoning NOT_CPU can never be
// satisfied, so it reports unconditionally.  (It used to be left undefined
// here, which made any use outside CUDALA builds a compile error.)
#define NOT_CPU(x) {laerror("Operation not implemented on CPU (yet). Use .moveto(>0) first.");}
#define SAME_LOC(x,y) {}
#define SAME_LOC3(x,y,z) {}
#endif

//------------------------------------------------------------------------------
// TEST_CUBLAS(X): in DEBUG builds, poll cublasGetError() and hand the
// stringified argument together with the enclosing function name to
// laerror2() when the status is not CUBLAS_STATUS_SUCCESS.  In non-DEBUG
// builds it expands to an empty block.
//
// Notes:
//  - X is stringified with #X, so a call such as TEST_CUBLAS("cublasScopy")
//    passes the text including its quotation marks.
//  - Per the legacy CUBLAS documentation, cublasGetError() tracks the CUBLAS
//    core routines only; helper routines (cublasAlloc, cublasSetVector,
//    cublasFree, ...) return their status directly and must be checked via
//    their return value instead.
//------------------------------------------------------------------------------
#ifdef DEBUG
        #ifdef __GNUG__
                #define TEST_CUBLAS(X) { if(cublasGetError() != CUBLAS_STATUS_SUCCESS){ laerror2(#X, __PRETTY_FUNCTION__); } }
        #else
                #define TEST_CUBLAS(X) { if(cublasGetError() != CUBLAS_STATUS_SUCCESS){ laerror2(#X, __func__); } }
        #endif
#else
        #define TEST_CUBLAS(X) {}
#endif

// Location tag compared by the guard macros above.  `cpu` has the value 0 and
// the guard messages refer to .moveto(0) for the CPU and .moveto(>0) for a GPU.
// NOTE(review): gpu1..gpu4 presumably identify individual GPU devices - confirm
// against the implementation of moveto()/gpualloc().
typedef enum {
        undefined = -1,
        cpu       = 0,
        gpu1      = 1,
        gpu2      = 2,
        gpu3      = 3,
        gpu4      = 4
} GPUID;

#ifdef CUDALA

// A global static instance of this class provides automatic init/shutdown of
// the GPU: the constructor calls cublasInit(), the destructor cublasShutdown().
// Failures of either call are reported through laerror().
class GPU_START {
public:
        GPU_START(void)
                {
                if (cublasInit() != CUBLAS_STATUS_SUCCESS) { laerror("Cannot init GPU for CUBLAS"); }
                // errno is cleared once initialisation is done.
                // NOTE(review): presumably cublasInit() may leave errno set
                // even on success - confirm.
                errno = 0;
                }
        ~GPU_START(void)
                {
                if (cublasShutdown() != CUBLAS_STATUS_SUCCESS) { laerror("Cannot cleanly shutdown GPU"); }
                }
};

// GPU memory helpers; they are only declared here, the definitions live in
// another translation unit.
// NOTE(review): the descriptions below are inferred from the names and
// signatures only - confirm against the definitions.
extern void *gpualloc(size_t size);                                      // presumably allocates `size` bytes on the GPU
extern void gpufree(void *ptr);                                          // presumably releases memory obtained from gpualloc()
extern void gpuget(size_t n, size_t elsize, const void *from, void *to); // presumably copies n elements of elsize bytes GPU -> host
extern void gpuput(size_t n, size_t elsize, const void *from, void *to); // presumably copies n elements of elsize bytes host -> GPU
extern double *gpuputdouble(const double &x);                            // presumably stages one double on the GPU and returns its address
extern complex<double> *gpuputcomplex(const complex<double> &x);         // complex counterpart of gpuputdouble()

void set_default_loc(const GPUID loc);

// Fill n elements of type T in GPU memory with the value `val`.
//
//   n       number of elements to write (nothing is done for n == 0)
//   val     host-side value to replicate
//   gpu_to  device address of the first destination element
//   _step   distance between consecutive destination elements, in units of T
//
// The value is first staged in a one-element device buffer and then
// replicated with a CUBLAS copy whose source increment is 0, so that every
// copy reads the same staged element.  Element sizes equal to float, double
// or complex<double> use a single S/D/Z copy; any other size that is a
// multiple of sizeof(float) is written one float-sized word at a time.
//
// Errors are reported through laerror():
//   - sizeof(T) is not a multiple of sizeof(float)
//   - cublasAlloc / cublasSetVector / cublasFree return a failure status
//     (these helper routines return their status directly, so it is checked
//     in every build rather than through TEST_CUBLAS)
// Failures of the copy routines themselves are checked by TEST_CUBLAS, i.e.
// in DEBUG builds only.
//
// NOTE(review): the legacy CUBLAS copy routines take `int` counts and
// increments, so n and the stride are narrowed to int; callers must keep
// them within the int range.
template <typename T>
void smart_gpu_set(size_t n, const T& val, void *gpu_to, size_t _step = 1){
        // number of float-sized words per element
        const size_t words = sizeof(T)/sizeof(float);

        if(sizeof(T)%sizeof(float) != 0){
                laerror("memory alignment error");
                return; // only reached if laerror() returns
        }
        if(n == 0){ return; } // nothing to write; skip the staging buffer

        // stage the value in a one-element device buffer
        void *ptr(NULL);
        if(cublasAlloc(1, sizeof(T), &ptr) != CUBLAS_STATUS_SUCCESS || ptr == NULL){
                laerror("cublasAlloc failed in smart_gpu_set");
                return;
        }
        if(cublasSetVector(1, sizeof(T), &val, 1, ptr, 1) != CUBLAS_STATUS_SUCCESS){
                cublasFree(ptr); // release the buffer before reporting
                laerror("cublasSetVector failed in smart_gpu_set");
                return;
        }

        const int count  = static_cast<int>(n);
        const int stride = static_cast<int>(_step);

        if(sizeof(T) == sizeof(float)){
                cublasScopy(count, (float*)ptr, 0, ((float*)gpu_to), stride);
                TEST_CUBLAS("cublasScopy");

        }else if(sizeof(T) == sizeof(double)){
                cublasDcopy(count, (double*)ptr, 0, ((double*)gpu_to), stride);
                TEST_CUBLAS("cublasDcopy");

        }else if(sizeof(T) == sizeof(complex<double>)){
                cublasZcopy(count, (cuDoubleComplex*)ptr, 0, (cuDoubleComplex*)gpu_to, stride);
                TEST_CUBLAS("cublasZcopy");

        }else{
                // generic element size: replicate the i-th word of the staged
                // element into the i-th word of every destination element
                const int word_stride = static_cast<int>(words*_step);
                for(size_t i=0; i<words; i++){
                        cublasScopy(count, (float*)ptr + i, 0, ((float*)gpu_to) + i, word_stride);
                        TEST_CUBLAS("cublasScopy");
                }
        }

        if(cublasFree(ptr) != CUBLAS_STATUS_SUCCESS){
                laerror("cublasFree failed in smart_gpu_set");
        }
}

// Declared here, defined in another translation unit.
// NOTE(review): presumably the location used for newly created objects and
// changed through set_default_loc() - confirm.
extern GPUID DEFAULT_LOC;

#endif
}
00107 #endif