emdata_cuda.cpp

Go to the documentation of this file.
00001 /*
00002  * Author: Steven Ludtke, 04/10/2003 (sludtke@bcm.edu)
00003  * Copyright (c) 2000-2006 Baylor College of Medicine
00004  *
00005  * This software is issued under a joint BSD/GNU license. You may use the
00006  * source code in this file under either license. However, note that the
00007  * complete EMAN2 and SPARX software packages have some GPL dependencies,
00008  * so you are responsible for compliance with the licenses of these packages
00009  * if you opt to use BSD licensing. The warranty disclaimer below holds
00010  * in either instance.
00011  *
00012  * This complete copyright notice must be included in any revised version of the
00013  * source code. Additional authorship citations may be added, but existing
00014  * author citations must be preserved.
00015  *
00016  * This program is free software; you can redistribute it and/or modify
00017  * it under the terms of the GNU General Public License as published by
00018  * the Free Software Foundation; either version 2 of the License, or
00019  * (at your option) any later version.
00020  *
00021  * This program is distributed in the hope that it will be useful,
00022  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00023  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
00024  * GNU General Public License for more details.
00025  *
00026  * You should have received a copy of the GNU General Public License
00027  * along with this program; if not, write to the Free Software
00028  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00029  *
00030  * */
00031 
00032 /*
00033  * This code is for a CUDA memory managment scheme. EMData rdata arrays are copied to CUDA DDRAM memory via
00034  * copy_to_cuda() and to texture memory via copy_to_cudaro(). EMData is copied back using copy_from_device(), 
00035  * and DDRAM data can be freed via rw_free() and ro_free(). When data is copied to CUDA DDRAM, memory is managed
00036  * via a doubly linked list. When copy_tocuda*() is called there is first a check to ensure that there is enough
00037  * memory available. If so , the copy is made and a call to addlist() is made, adding this EMData item to the
00038  * doubly linked list. If there is not enough memory, then the function, freeup_devicemem(), is called and the
00039  * last item on the linked list is removed. If there is still not enough room, then the next last item is removed, etc, etc
00040  * If there is still no room after the last item is removed, then no copy is made. Items are removed from the list via:
00041  * reomvefromlist(). Used in this maner the meory managment algorithm is a FILO(first in last out), HOWEVER, when CUDA is
00042  * used in applications a call to elementaccessed() can be made, which moves the item to the top of the list. When this
00043  * scheme is used, the memory management algorithm becomes, LRU(least recently used), which should give better results in 
00044  * almost all cases. As a side note, to actutally use texture memory, a call to bindcudaarray?() should be made, when needed
00045  * A corresponding call to unbindcudaarray?() needs to be made after texture memory is not needed. These operations do not actually
00046  * move data around, just bind it to a Texture object, which are very limited resources!!!. Note that elementacessed is called 
00047  * every time getcudarwdata(), getcudarodata() or isroongpu() called. Hence LRU is used by default, and you are forced to useage
00048  * these getter function b/c cudarwdata and cudarodata are private. You could get arround this in EMData functions, and most
00049  * functions in the EMData class just grab the cudarwdata pointer rather than calling the getcudarwdata() function. This is done to
00050  * avod unnecessary pointer arithmatic, but may cause issues. I haven't decided what the best call is....
00051  * Note that possible concurrency issues can arise, because when datra is copied bewteen the Host and GPU, there are two copies. 
00052  * To accout for this possible problem, CUDA functions need to call setdirtybit() which will copy back fron GPU to host whenever
00053  * get_data() is called(This function is a getter for EMData's rdata) 
00054 */
00055 
00056 #ifdef EMAN2_USING_CUDA
00057 
00058 #include "emdata.h"
00059 #include "exception.h"
00060 #include <cuda_runtime_api.h>
00061 #include <driver_functions.h>
00062 #include <cuda.h>
00063 #include <cuda/cuda_util.h>
00064 #include <cuda/cuda_emfft.h>
00065 
00066 using namespace EMAN;
00067 
00068 // Static init
00069 const EMData* EMData::firstinlist = 0;
00070 const EMData* EMData::lastinlist = 0;
00071 int EMData::memused = 0;
00072 int EMData::fudgemem = 1.024E8; //let's leave 10 MB of 'fudge' memory on the device
00073 int EMData::cudadevicenum = -1;
00074 bool EMData::usecuda = 0;
00075 bool EMData::nocudainit = (getenv("NOCUDAINIT") == NULL) ? 0 : bool(atoi(getenv("NOCUDAINIT")));
00076 
00077 bool EMData::copy_to_cuda() const
00078 {
00079         //cout << "copying from host to device RW" << " " << num_bytes << endl;
00080         if(rw_alloc()) {
00081                 memused += num_bytes;
00082                 cudaError_t error = cudaMemcpy(cudarwdata,rdata,num_bytes,cudaMemcpyHostToDevice);
00083                 if ( error != cudaSuccess) {
00084                         //cout << rdata << " " << cudarwdata << endl;
00085                         throw UnexpectedBehaviorException("CudaMemcpy (host to device) failed:" + string(cudaGetErrorString(error)));
00086                 }
00087         }else{return false;}
00088         //setdirtybit() //uncomment this line if you want to ensure that only one effective copy exists on either the host or GPU
00089         
00090         return true;
00091 }
00092 
00093 bool EMData::copy_to_cudaro() const
00094 {
00095         
00096         //cout << "copying from host to device RO" << " " << num_bytes << endl;
00097         if(ro_alloc()) {
00098                 memused += num_bytes;
00099                 copy_to_array(rdata, cudarodata, nx, ny, nz, cudaMemcpyHostToDevice);
00100         }else{return false;}
00101         //setdirtybit() //uncomment this line if you want to ensure that only one effective copy exists on either the host or GPU
00102         
00103         return true;
00104 }
00105 
00106 bool EMData::rw_alloc() const
00107 {
00108         if(cudarwdata){return true;} // already exists
00109         num_bytes = nxyz*sizeof(float);
00110         if(!freeup_devicemem(num_bytes)){return false;}
00111         cudaError_t error = cudaMalloc((void**)&cudarwdata,num_bytes);
00112         if ( error != cudaSuccess){return false;}
00113         if(!cudarodata){
00114                 addtolist();
00115         }else{
00116                 elementaccessed();
00117         }
00118         return true;
00119 }
00120 
00121 bool EMData::ro_alloc() const
00122 {
00123         if(cudarodata){return true;} // already exists
00124         num_bytes = nxyz*sizeof(float);
00125         if(!freeup_devicemem(num_bytes)){return false;}
00126         cudarodata = get_cuda_array(nx, ny, nz);
00127         if(cudarodata == 0) throw UnexpectedBehaviorException("Bad Array alloc");
00128         if(!cudarwdata){
00129                 addtolist();
00130         }else{
00131                 elementaccessed();
00132         }
00133 
00134         return true;
00135         
00136 }
00137 
00138 void EMData::bindcudaarrayA(const bool intp_mode) const
00139 {
00140         if(cudarodata == 0){throw UnexpectedBehaviorException( "Cuda Array not allocated!!");}
00141         if(nz > 1){
00142                 bind_cuda_array_to_textureA(cudarodata, 3, intp_mode);
00143         }else{
00144                 bind_cuda_array_to_textureA(cudarodata, 2, intp_mode);
00145         }
00146         
00147 }
00148 
00149 void EMData::unbindcudaarryA() const
00150 {
00151         
00152         if(nz > 1){
00153                 unbind_cuda_textureA(3);
00154         }else{
00155                 unbind_cuda_textureA(2);
00156         }
00157         
00158 }
00159 
00160 void EMData::bindcudaarrayB(const bool intp_mode) const
00161 {
00162         if(cudarodata == 0){throw UnexpectedBehaviorException( "Cuda Array not allocated!!");}
00163         if(nz > 1){
00164                 bind_cuda_array_to_textureB(cudarodata, 3, intp_mode);
00165         }else{
00166                 bind_cuda_array_to_textureB(cudarodata, 2, intp_mode);
00167         }
00168         
00169 }
00170 
00171 void EMData::unbindcudaarryB() const
00172 {
00173         
00174         if(nz > 1){
00175                 unbind_cuda_textureB(3);
00176         }else{
00177                 unbind_cuda_textureB(2);
00178         }
00179         
00180 }
00181 
00182 bool EMData::copy_from_device(const bool rocpy)
00183 {
00184         if(cudarwdata && !rocpy){
00185                 if(rdata == 0){rdata = (float*)malloc(num_bytes);} //allocate space if needed, assumes size hasn't changed(Which is hasn't so far)
00186                 cudaError_t error = cudaMemcpy(rdata,cudarwdata,num_bytes,cudaMemcpyDeviceToHost);
00187                 if ( error != cudaSuccess) throw UnexpectedBehaviorException( "CudaMemcpy (device to host) failed:" + string(cudaGetErrorString(error)));
00188                 rw_free(); //we have the data on either the host or device, not both (prevents concurrency issues)
00189                 if(cudarodata) ro_free(); // clear any RO data, for call safety
00190         } else if (cudarodata && rocpy) {
00191                 if(rdata == 0){rdata = (float*)malloc(num_bytes);} //allocate space if needed
00192                 if (nz > 1){
00193                         cudaExtent extent;
00194                         extent.width  = nx;
00195                         extent.height = ny;
00196                         extent.depth  = nz;
00197                         cudaMemcpy3DParms copyParams = {0};
00198                         copyParams.srcArray = cudarodata;
00199                         copyParams.dstPtr = make_cudaPitchedPtr((void*)rdata, extent.width*sizeof(float), extent.width, extent.height);
00200                         copyParams.extent   = extent;
00201                         copyParams.kind     = cudaMemcpyDeviceToHost;
00202                         cudaError_t error = cudaMemcpy3D(&copyParams);
00203                         if ( error != cudaSuccess) throw UnexpectedBehaviorException( "RO CudaMemcpy (device to host) failed:" + string(cudaGetErrorString(error)));
00204                 } else{
00205                         cudaError_t error = cudaMemcpyFromArray(rdata,cudarodata,0,0,num_bytes,cudaMemcpyDeviceToHost);
00206                         if ( error != cudaSuccess) throw UnexpectedBehaviorException( "RO CudaMemcpy (device to host) failed:" + string(cudaGetErrorString(error)));
00207                 }       
00208                 ro_free(); //we have the data on either the host or device, not both (prevents concurrency issues)
00209                 if(cudarwdata) rw_free(); // clear any RW data, for call safety
00210         } else {
00211                 return false;
00212         }
00213 
00214         update(); 
00215         return true;
00216 }
00217 
00218 bool EMData::copy_rw_to_ro() const
00219 {
00220         
00221         if(cudarwdata == 0){return false;}
00222         
00223         if(cudarodata == 0){
00224                 if(!freeup_devicemem(num_bytes)){return false;}
00225                 cudarodata = get_cuda_array(nx, ny, nz);
00226                 if(cudarodata == 0) throw UnexpectedBehaviorException("Bad Array alloc");
00227                 memused += num_bytes;
00228         }
00229         //this will copy over any prexisting data (saves a malloc)....(but sometimes not a safe call.....)
00230         copy_to_array(cudarwdata, cudarodata, nx, ny, nz, cudaMemcpyDeviceToDevice);
00231         roneedsupdate = 0; //just copied, so no longer need an update
00232         elementaccessed(); //To move the image to the top of the stack, prevents deletion before useage(If the image is at the stack bottom, and then anoth image is moved on....)
00233         return true;
00234         
00235 }
00236 
00237 // The policy here is that when an EMData object is created, cudarwdata is set to 0. and no mem is allocated. It is
00238 //only when cudarwdata points to allocated data does the EMData object go on the list. cudarwdata should NEVER be set 
00239 void EMData::runcuda(float * results) const
00240 {
00241         
00242         if(results == 0){throw UnexpectedBehaviorException( "Cuda failed!!!");}
00243         if(cudarwdata != 0){
00244                 //rw_free();} //delete the old data, why not jus overwrite!! (save a cudaFree)
00245         } else {
00246                 addtolist(); // now that we are using memory add to the list
00247         }
00248         cudarwdata = results;
00249         
00250 }
00251 
00252 void EMData::rw_free() const
00253 {
00254         cudaError_t error = cudaFree(cudarwdata);
00255         if ( error != cudaSuccess){
00256                 cout << rdata << " " << cudarwdata << endl;
00257                 throw UnexpectedBehaviorException( "CudaFree failed:" + string(cudaGetErrorString(error)));
00258         }
00259         cudarwdata = 0;
00260         memused -= num_bytes;
00261         if(!cudarodata){removefromlist();}
00262         
00263 }
00264 
00265 void EMData::ro_free() const
00266 {
00267         cudaError_t error = cudaFreeArray(cudarodata);
00268         if ( error != cudaSuccess) throw UnexpectedBehaviorException( "CudaFreeArray failed:" + string(cudaGetErrorString(error)));
00269         cudarodata = 0;
00270         memused -= num_bytes;
00271         if(!cudarwdata){removefromlist();}
00272         
00273 }
00274 
00275 bool EMData::isrodataongpu() const
00276 {
00277         if(cudarodata != 0 && !roneedsupdate){
00278                 if(cudarodata !=0) elementaccessed();
00279                 return true;
00280         }
00281         if(cudarwdata != 0){
00282                 if(copy_rw_to_ro()){;
00283                         return true;
00284                 } else {
00285                         return false;
00286                 }
00287         }else{
00288                 return false;
00289         }
00290         
00291 }
00292 
00293 bool EMData::freeup_devicemem(const int& num_bytes) const
00294 {
00295         size_t freemem=0, totalmem=0; //initialize to prevent undefined behaviour
00296         cudaMemGetInfo(&freemem, &totalmem);
00297         //cout  << "memusage" << " " << freemem << " " << totalmem << endl;
00298         if ((ptrdiff_t(freemem) - ptrdiff_t(fudgemem)) > ptrdiff_t(num_bytes)){
00299                 return true;
00300         }else{  
00301                 //keep on removing stuff until enough memory is available
00302                 while(lastinlist != 0){
00303                         if(lastinlist->cudarwdata){
00304                                 //screw the constness, always copy from GPU to host rather than throwing stuff away!!!
00305                                 const_cast<EMData*>(lastinlist)->copy_from_device();
00306                         }
00307                         if(lastinlist->cudarodata){
00308                                 const_cast<EMData*>(lastinlist)->copy_from_device(1);
00309                         }
00310                         cudaMemGetInfo(&freemem, &totalmem); //update free memory
00311                         if((ptrdiff_t(freemem) - ptrdiff_t(fudgemem)) > ptrdiff_t(num_bytes)){return true;}     //this should break the loop....
00312                 }
00313         }       
00314         
00315         return false;   //if we failed :(
00316 }
00317 
00318 void EMData::setdirtybit() const
00319 {
00320         cudadirtybit = 1;
00321 }
00322 
00323 void EMData::addtolist() const
00324 {
00325         //Adds item to top of list
00326         if(firstinlist == 0){ //if this is the first item in the list (first object in list), then make a new list
00327                 firstinlist = this;
00328                 lastinlist = this;
00329                 nextlistitem = 0;
00330                 prevlistitem = 0;
00331         }else{
00332                 //we add to top of list
00333                 firstinlist->nextlistitem = this;
00334                 prevlistitem = firstinlist;
00335                 nextlistitem = 0;
00336                 firstinlist = this;
00337         }
00338         
00339 }
00340 
00341 void EMData::elementaccessed() const
00342 {
00343         //DO not move item to top of list if already at top of list
00344         if(firstinlist == this){return;}
00345         removefromlist();
00346         addtolist();
00347 }
00348 
00349 void EMData::removefromlist() const
00350 {
00351 
00352         //remove from list
00353         if(firstinlist == lastinlist){ //last item in list....
00354                 firstinlist = 0;
00355                 lastinlist = 0;
00356                 nextlistitem = 0;
00357                 prevlistitem = 0;
00358                 return;
00359         }
00360         if(nextlistitem !=0){
00361                 nextlistitem->prevlistitem = prevlistitem;      //this object is not first in the list
00362         }else{
00363                 firstinlist = prevlistitem;
00364         }
00365         if(prevlistitem !=0){
00366                 prevlistitem->nextlistitem = nextlistitem;      //this item is not last in the list
00367         }else{
00368                 lastinlist = nextlistitem;
00369         }
00370         nextlistitem = 0;
00371         prevlistitem = 0;
00372         
00373 }
00374 
00375 void EMData::switchoncuda()
00376 {
00377         EMData::usecuda = 1;    
00378 }
00379 
00380 void EMData::switchoffcuda()
00381 {
00382         EMData::usecuda = 0;    
00383 }
00384 
00385 void EMData::cuda_cleanup() 
00386 {
00387         do_cuda_fft_cache_destroy();
00388         //Cleanup any object mess.... CUDA has OCD
00389         while(lastinlist){
00390                 if(lastinlist->cudarwdata) lastinlist->rw_free();
00391                 if(lastinlist && lastinlist->cudarodata) lastinlist->ro_free();
00392         }
00393         //Exit CUDA threads
00394         cudaThreadExit();
00395         //Free the CUDA device lock
00396         if(EMData::cudadevicenum >= 0)
00397         {
00398                 char filename[16];
00399                 sprintf(filename,"%s%d",cudalockfile,EMData::cudadevicenum); //Only works for Linux
00400                 remove(filename);
00401         }
00402 
00403 }
00404 
00405 bool EMData::cuda_initialize()
00406 {
00407         if(EMData::nocudainit) return 0;
00408         int device = device_init();
00409 
00410         if(device != -1)
00411         {
00412                 EMData::cudadevicenum = device;
00413                 switchoncuda();
00414                 return 1;
00415         } else {
00416                 switchoffcuda();
00417                 return 0;
00418         }
00419 }
00420 
00421 const char* EMData::getcudalock()
00422 {
00423         return cudalockfile;
00424 }
00425 
00426 #endif //EMAN2_USING_CUDA

Generated on Thu May 3 10:06:24 2012 for EMAN2 by  doxygen 1.4.7