
emdata_cuda.cpp

00001 /*
00002  * Author: Steven Ludtke, 04/10/2003 (sludtke@bcm.edu)
00003  * Copyright (c) 2000-2006 Baylor College of Medicine
00004  *
00005  * This software is issued under a joint BSD/GNU license. You may use the
00006  * source code in this file under either license. However, note that the
00007  * complete EMAN2 and SPARX software packages have some GPL dependencies,
00008  * so you are responsible for compliance with the licenses of these packages
00009  * if you opt to use BSD licensing. The warranty disclaimer below holds
00010  * in either instance.
00011  *
00012  * This complete copyright notice must be included in any revised version of the
00013  * source code. Additional authorship citations may be added, but existing
00014  * author citations must be preserved.
00015  *
00016  * This program is free software; you can redistribute it and/or modify
00017  * it under the terms of the GNU General Public License as published by
00018  * the Free Software Foundation; either version 2 of the License, or
00019  * (at your option) any later version.
00020  *
00021  * This program is distributed in the hope that it will be useful,
00022  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00023  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
00024  * GNU General Public License for more details.
00025  *
00026  * You should have received a copy of the GNU General Public License
00027  * along with this program; if not, write to the Free Software
00028  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00029  *
00030  * */
00031 
00032 
00033 
00034 #ifdef EMAN2_USING_CUDA
00035 
00036 #include "emdata.h"
00037 #include "exception.h"
00038 #include <cuda_runtime_api.h>
00039 #include <driver_functions.h>
00040 #include <cuda.h>
00041 #include "cuda/cuda_util.h"
00042 #include "cuda/cuda_processor.h"
00043 #include "cuda/cuda_emfft.h"
00044 
00045 using namespace EMAN;
00046 // Static init
00047 EMData::CudaCache EMData::cuda_cache(100);
00048 
00049 float* EMData::get_cuda_data() const {
00050 //      cout << cuda_cache_handle << endl;
00051         if (get_size() == 0 ) throw UnexpectedBehaviorException("The size of the data is 0?");
00052         if (cuda_cache_handle==-1 || EMDATA_GPU_NEEDS_UPDATE & flags) {
00053 //              cout << "needs an update or handle=-1" << endl;
00054                 if (cuda_cache_handle != -1 && gpu_ro_is_current() ) {
00055                         cuda_cache.copy_ro_to_rw(cuda_cache_handle);
00056                 } else {
00057                         if (cuda_cache_handle !=-1 ) {
00058                                 cuda_cache.clear_item(cuda_cache_handle);
00059                         }
00060                         cuda_cache_handle = cuda_cache.cache_rw_data(this,rdata,nx,ny,nz);
00061                         if (cuda_cache_handle == -1) throw UnexpectedBehaviorException("Caching the RW data on the GPU failed");
00062                 }
00063                 flags &= ~EMDATA_GPU_NEEDS_UPDATE;
00064         }
00065         return cuda_cache.get_rw_data(cuda_cache_handle);
00066 }
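
// Illustrative usage sketch (added for this listing, not part of the original
// emdata_cuda.cpp): the typical call pattern around get_cuda_data(). It assumes
// these EMData members are publicly accessible; the kernel launch is omitted
// and the helper name example_gpu_rw_access is hypothetical.
static void example_gpu_rw_access(EMData& img)
{
        float* device_ptr = img.get_cuda_data(); // copies the CPU data to the GPU cache if the GPU copy is stale
        img.cuda_lock();                         // pin the cache slot so ensure_slot_space() cannot evict it
        // ... launch a CUDA kernel that reads or writes device_ptr ...
        (void)device_ptr;
        img.cuda_unlock();
        img.gpu_update();                        // flag the other copies as needing an update, as the methods below do after GPU writes
}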
00067 
00068 bool EMData::gpu_rw_is_current() const {
00069         if (cuda_cache_handle !=-1 && !(EMDATA_GPU_NEEDS_UPDATE & flags)) return cuda_cache.has_rw_data(cuda_cache_handle);
00070         else return false;
00071 }
00072 
00073 bool EMData::cpu_rw_is_current() const {
00074         if      (!(EMDATA_CPU_NEEDS_UPDATE & flags) && rdata != 0) return true;
00075         return false;
00076 }
00077 
00078 bool EMData::gpu_ro_is_current() const {
00079         if (cuda_cache_handle !=-1 && !(EMDATA_GPU_RO_NEEDS_UPDATE & flags)) return cuda_cache.has_ro_data(cuda_cache_handle);
00080         else return false;
00081 }
00082 
00083 void EMData::bind_cuda_texture(const bool interp_mode) const {
00084         check_cuda_array_update();
00085         cuda_cache.lock(cuda_cache_handle);
00086         bind_cuda_array_to_texture(cuda_cache.get_ro_data(cuda_cache_handle),cuda_cache.get_ndim(cuda_cache_handle),interp_mode);
00087 }
00088 
00089 void EMData::unbind_cuda_texture() const {
00090         ::unbind_cuda_texture(cuda_cache.get_ndim(cuda_cache_handle));
00091         cuda_cache.unlock(cuda_cache_handle);
00092 }
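
// Illustrative sketch (added for this listing, not part of the original file):
// bind_cuda_texture() locks the cache slot and binds the RO cudaArray to a
// texture, and unbind_cuda_texture() releases both, so the two calls must be
// paired around any texture-based kernel. The helper name is hypothetical.
static void example_texture_binding(const EMData& img)
{
        img.bind_cuda_texture(true);   // interp_mode is forwarded to bind_cuda_array_to_texture()
        // ... launch a kernel that samples the bound texture ...
        img.unbind_cuda_texture();
}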
00093 
00094 cudaArray* EMData::get_cuda_array() const {
00095         if (get_size() == 0 ) throw UnexpectedBehaviorException("The size of the data is 0?");
00096         check_cuda_array_update();
00097         return cuda_cache.get_ro_data(cuda_cache_handle);
00098 }
00099 
00100 void EMData::check_cuda_array_update() const {
00101         if (cuda_cache_handle==-1 || EMDATA_GPU_RO_NEEDS_UPDATE & flags) {
00102                 if (cuda_cache_handle != -1 && gpu_rw_is_current() )  {
00103                         cuda_cache.copy_rw_to_ro(cuda_cache_handle);
00104                 } else {
00105                         if (cuda_cache_handle !=-1 ) cuda_cache.clear_item(cuda_cache_handle);
00106                         cuda_cache_handle = cuda_cache.cache_ro_data(this,rdata,nx,ny,nz);
00108                         if (cuda_cache_handle == -1) throw UnexpectedBehaviorException("Caching the RO data on the GPU failed");
00109                 }
00110                 flags &= ~EMDATA_GPU_RO_NEEDS_UPDATE;
00111         }
00112 }
00113 
00114 void EMData::cuda_cache_lost_imminently() const {
00115         //cout << "In cache lost " << cuda_cache_handle << " " << nx << " " << ny << " " << nz << endl;
00116         get_data(); // This causes cuda memory to be copied to cpu memory
00117         flags |=  EMDATA_GPU_NEEDS_UPDATE| EMDATA_GPU_RO_NEEDS_UPDATE;
00118         cuda_cache_handle = -1;
00119 }
00120 void EMData::cuda_lock() const {
00121         if (cuda_cache_handle == -1) throw UnexpectedBehaviorException("No cuda handle, can't lock");
00122         cuda_cache.lock(cuda_cache_handle);
00123         //cuda_cache.debug_print();
00124 }
00125 void EMData::cuda_unlock() const {
00126         //cout << " " << cuda_cache_handle << endl;
00127         //cuda_cache.debug_print();
00128         if (cuda_cache_handle == -1) throw UnexpectedBehaviorException("No cuda handle, can't unlock");
00129         cuda_cache.unlock(cuda_cache_handle);
00130 }
00131 EMDataForCuda EMData::get_data_struct_for_cuda() const {
00132         EMDataForCuda tmp = {get_cuda_data(),nx,ny,nz};
00133         return tmp;
00134 }
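
// Illustrative sketch (added for this listing, not part of the original file):
// EMDataForCuda bundles the raw device pointer with the image dimensions so it
// can be handed to the C entry points from cuda_processor.h, mirroring the
// non-texturing branch of calc_ccf_cuda() below. It assumes a and b already
// hold Fourier data of matching size; the helper name is hypothetical.
static void example_struct_for_cuda(EMData& a, EMData& b)
{
        EMDataForCuda left  = a.get_data_struct_for_cuda();
        EMDataForCuda right = b.get_data_struct_for_cuda();
        a.cuda_lock(); b.cuda_lock();    // keep both cache slots alive while the raw pointers are in use
        emdata_processor_correlation(&left, &right, false);
        a.cuda_unlock(); b.cuda_unlock();
        a.gpu_update();                  // the correlation is written into a's device data
}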
00135 
00136 bool EMData::gpu_operation_preferred() const {
00137         bool cpu = cpu_rw_is_current();
00138         bool gpu = gpu_rw_is_current();
00139         if ( !cpu && !gpu ) {
00140                 // This is what happens when set_size doesn't allocate
00141                 return false;
00142 //              cout << (!(EMDATA_CPU_NEEDS_UPDATE & flags) && rdata != 0) << " " << (cuda_cache_handle !=-1 && !(EMDATA_GPU_NEEDS_UPDATE & flags) && cuda_cache.has_rw_data(cuda_cache_handle)) << endl;
00143 //              cout << "GPU flag " << !(EMDATA_GPU_NEEDS_UPDATE & flags) << endl;
00144 //              cout << "CPU flag " << !(EMDATA_CPU_NEEDS_UPDATE & flags) << endl;
00145 //              cout << "Rdata " << rdata << endl;
00146 //              cout << "Cuda handle " << cuda_cache_handle << endl;
00147 //              throw UnexpectedBehaviorException("Neither the CPU or GPU data are current");
00148         }
00149         if (gpu) return true;
00150         return false;
00151 }
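
// Illustrative sketch (added for this listing, not part of the original file):
// gpu_operation_preferred() lets a caller choose the CUDA path only when the
// GPU copy is already current, avoiding a needless host-to-device upload. The
// helper name is hypothetical and calc_ccf is assumed to be the CPU-side
// counterpart declared in emdata.h.
static EMData* example_ccf_dispatch(EMData& a, EMData& b)
{
        if (a.gpu_operation_preferred()) {
                return a.calc_ccf_cuda(&b, false, true);
        }
        return a.calc_ccf(&b); // assumed CPU fallback
}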
00152 
00153 EMData* EMData::calc_ccf_cuda( EMData*  image, bool use_texturing,bool center ) const {
00154         EMData* tmp;
00155         if (is_complex()) {
00156 //              cout << "Tmp is a copy of this" << endl;
00157                 tmp = new EMData(*this);
00158         } else {
00159 //              cout << "Tmp is this fftd" << endl;
00160                 tmp = do_fft_cuda();
00161         }
00162 
00163         Dict d;
00164         EMData* with = 0;
00165         if (image == this) {
00166                 d["with"] = (EMData*) tmp;
00167         } else {
00168                 if (!image->is_complex()) {
00169                         int wnx = image->get_xsize(); int wny = image->get_ysize(); int wnz = image->get_zsize();
00170                         if ( wnx != nx || wny != ny || wnz != nz ) {
00171 
00172                                 Region r;
00173                                 if (nz > 1) {
00174                                         r = Region((wnx-nx)/2, (wny-ny)/2, (wnz-nz)/2,nx,ny,nz);
00175                                 }
00176                                 else if (ny > 1) {
00177                                         r = Region((wnx-nx)/2, (wny-ny)/2,nx,ny);
00178                                 }
00179                                 else throw UnexpectedBehaviorException("Calc_ccf_cuda doesn't work on 1D images");
00180                                 EMData* clipped = image->get_clip(r); // distinct name so it does not shadow the outer tmp
00181                                 with = clipped->do_fft_cuda();
00182                                 delete clipped;
00183                         }else {
00184                                 with = image->do_fft_cuda();
00185                         }
00186                         d["with"] = (EMData*) with;
00187                 } else {
00188         //              cout << "With is the input image" << endl;
00189                         d["with"] = (EMData*)image;
00190                 }
00191         }
00192 
00193 
00194         EMDataForCuda left = tmp->get_data_struct_for_cuda();
00195         CudaDataLock lock(tmp);
00196         if (use_texturing) {
00197                 ((EMData*)d["with"])->bind_cuda_texture(false);
00198                 emdata_processor_correlation_texture(&left,center);
00199                 ((EMData*)d["with"])->unbind_cuda_texture();
00200         } else {
00201                 EMDataForCuda right = ((EMData*)d["with"])->get_data_struct_for_cuda();
00202                 CudaDataLock lock2((EMData*)d["with"]);
00203                 emdata_processor_correlation(&left,&right,center);
00204         }
00205         tmp->gpu_update();
00206 
00207 //      tmp->process_inplace("cuda.correlate",d);
00208 //      return tmp;
00209         if (with != 0 && image != this) {
00210                 delete with;
00211                 with = 0;
00212         }
00213 
00214         EMData* soln = tmp->do_ift_cuda(false);
00215         soln->gpu_update();
00216         delete tmp;
00217         tmp = 0;
00218 
00219         return soln;
00220 }
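
// Illustrative sketch (added for this listing, not part of the original file):
// calc_ccf_cuda() special-cases the argument being the image itself, so a GPU
// autocorrelation needs no extra FFT of the second operand. The helper name is
// hypothetical; the caller owns the returned pointer.
static EMData* example_autocorrelation_cuda(EMData& img)
{
        return img.calc_ccf_cuda(&img, false, true); // centered autocorrelation, no texturing
}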
00221 
00222 EMData *EMData::make_rotational_footprint_cuda( bool unwrap)
00223 {
00224         ENTERFUNC;
00225 //
00226 //      update_stat();
00227 //      float edge_mean = get_edge_mean();
00228         float edge_mean = 0;
00229         CudaDataLock lock(this); // a named lock - an unnamed temporary would be destroyed (and unlock) immediately
00230         if ( rot_fp != 0 && unwrap == true) {
00231                 return new EMData(*rot_fp);
00232         }
00233 //
00234 //      //static EMData obj_filt;
00235 //      //EMData* filt = &obj_filt;
00236 //      //filt->set_complex(true);
00238 //
00242 //
00243         int cs = (((nx * 7 / 4) & 0xfffff8) - nx) / 2; // pads the image to 7/4 of its size, with the padded dimension rounded down to a multiple of 8
00244 
00245         static EMData big_clip;
00246         int big_x = nx+2*cs;
00247         int big_y = ny+2*cs;
00248         int big_z = 1;
00249         if ( nz != 1 ) {
00250                 big_z = nz+2*cs;
00251         }
00252 
00253 
00254         if ( big_clip.get_xsize() != big_x || big_clip.get_ysize() != big_y || big_clip.get_zsize() != big_z ) {
00255                 big_clip.set_size_cuda(big_x,big_y,big_z);
00256                 big_clip.get_cuda_data();
00257                 big_clip.cuda_lock(); // Just lock for the entire duration of the program, it's static anyway...
00258         }
00259         big_clip.to_value(edge_mean);
00260 
00261         if (nz != 1) {
00262                 big_clip.insert_clip(this,IntPoint(cs,cs,cs));
00263         } else  {
00264                 big_clip.insert_clip(this,IntPoint(cs,cs,0));
00265         }
00266 //      // The filter object is nothing more than a cached high pass filter
00267 //      // Ultimately it is used an argument to the EMData::mult(EMData,prevent_complex_multiplication (bool))
00268 //      // function in calc_mutual_correlation. Note that in the function the prevent_complex_multiplication
00269 //      // set to true, which is used for speed reasons.
00270 //      if (filt->get_xsize() != clipped->get_xsize() +2-(clipped->get_xsize()%2) || filt->get_ysize() != clipped->get_ysize() ||
00271 //                 filt->get_zsize() != clipped->get_zsize()) {
00272 //              filt->set_size(clipped->get_xsize() + 2-(clipped->get_xsize()%2), clipped->get_ysize(), clipped->get_zsize());
00273 //              filt->to_one();
00274 //              filt->process_inplace("filter.highpass.gauss", Dict("cutoff_abs", 1.5f/nx));
00275 //      }
00276 //
00277         EMData *mc = big_clip.calc_ccf_cuda(&big_clip,false,true);
00278         mc->sub(mc->get_edge_mean());
00279 
00280         static EMData sml_clip;
00281         int sml_x = nx * 3 / 2;
00282         int sml_y = ny * 3 / 2;
00283         int sml_z = 1;
00284         if ( nz != 1 ) {
00285                 sml_z = nz * 3 / 2;
00286         }
00287 
00288         if ( sml_clip.get_xsize() != sml_x || sml_clip.get_ysize() != sml_y || sml_clip.get_zsize() != sml_z ) {
00289                 sml_clip.set_size_cuda(sml_x,sml_y,sml_z);
00290                 sml_clip.get_cuda_data();
00291                 sml_clip.cuda_lock(); // Just lock for the entire duration of the program, it's static anyway...
00292         }
00293         if (nz != 1) {
00294                 sml_clip.insert_clip(mc,IntPoint(-cs+nx/4,-cs+ny/4,-cs+nz/4));
00295         } else {
00296                 sml_clip.insert_clip(mc,IntPoint(-cs+nx/4,-cs+ny/4,0));
00297         }
00298 
00299         delete mc; mc = 0;
00300         EMData * result = NULL;
00301 
00302         if (!unwrap || nz != 1) {
00303                 //clipped_mc->process_inplace("mask.sharp", Dict("outer_radius", -1, "value", 0));
00304                 result = new EMData(sml_clip);
00305         }
00306         else {
00307                 result = sml_clip.unwrap();
00308         }
00309 
00310         result->gpu_update();
00311 
00312         if ( unwrap == true)
00313         { // this if statement reflects a strict policy of caching in only one scenario; see the comments at the beginning of the function block
00314 
00315                 // Note that the early return at the beginning of this function guarantees that rot_fp is NULL at this
00316                 // point, so there is no need to throw any exception
00317                 // if ( rot_fp != 0 ) throw UnexpectedBehaviorException("The rotational foot print is only expected to be cached if it is not NULL");
00318 
00319                 // Here is where the caching occurs - rot_fp takes ownership of the pointer, and a deep copied EMData object is returned.
00320                 // The deep copy incurs a cost in terms of CPU cycles and memory, but avoids the need for complicated memory management (reference counting)
00321                 rot_fp = result;
00322                 return new EMData(*rot_fp);
00323         }
00324         else return result;
00325 }
00326 
00327 EMData* EMData::calc_ccfx_cuda( EMData * const with, int y0, int y1, bool no_sum)
00328 {
00329         ENTERFUNC;
00330 //      cout << "calc_ccfx cuda" << endl;
00331         if (!with) {
00332                 LOGERR("NULL 'with' image. ");
00333                 throw NullPointerException("NULL input image");
00334         }
00335 
00336         if (!EMUtil::is_same_size(this, with)) {
00337                 LOGERR("images not same size: (%d,%d,%d) != (%d,%d,%d)",
00338                            nx, ny, nz,
00339                            with->get_xsize(), with->get_ysize(), with->get_zsize());
00340                 throw ImageFormatException("images not same size");
00341         }
00342         if (get_ndim() > 2) {
00343                 LOGERR("2D images only");
00344                 throw ImageDimensionException("2D images only");
00345         }
00346 
00347         if (y1 <= y0) {
00348                 y1 = ny;
00349         }
00350 
00351         if (y0 >= y1) {
00352                 y0 = 0;
00353         }
00354 
00355         if (y0 < 0) {
00356                 y0 = 0;
00357         }
00358 
00359         if (y1 > ny) {
00360                 y1 = ny;
00361         }
00362 
00363         static int nx_device_fft = 0;
00364         static int ny_device_fft = 0;
00365         static EMData f1;
00366         static EMData f2;
00367         static EMData rslt;
00368 
00369         int height = y1-y0;
00370         int width = (nx+2-(nx%2));
00371         if (width != nx_device_fft || height != ny_device_fft ) {
00372                 f1.set_size_cuda(width,height);
00373                 f2.set_size_cuda(width,height);
00374                 rslt.set_size_cuda(nx,height);
00375                 nx_device_fft = width;
00376                 ny_device_fft = height;
00377         }
00378 
00379         {// Make a local scope so that the locks are destructed
00380                 float * cd = get_cuda_data();
00381                 CudaDataLock lock(this);
00382                 float * f1cd = f1.get_cuda_data();
00383                 CudaDataLock lock2(&f1);
00384                 cuda_dd_fft_real_to_complex_1d(cd,f1cd,nx,height);
00385         }
00386         {// Make a local scope so that the locks are destructed
00387                 float * wcd = with->get_cuda_data();
00388                 CudaDataLock lock(with); // lock the image whose device data is actually in use here
00389                 float * f2cd = f2.get_cuda_data();
00390                 CudaDataLock lock2(&f2);
00391                 cuda_dd_fft_real_to_complex_1d(wcd,f2cd,nx,height);
00392         }
00393 
00394         EMDataForCuda left = f1.get_data_struct_for_cuda();
00395         CudaDataLock lock(&f1);
00396 
00397         bool use_texturing = false;
00398         bool center = false;
00399         if (use_texturing) {
00400                 f2.bind_cuda_texture(false);
00401                 emdata_processor_correlation_texture(&left,center);
00402                 f2.unbind_cuda_texture();
00403         } else {
00404                 EMDataForCuda right = f2.get_data_struct_for_cuda();
00405                 CudaDataLock lock2(&f2);
00406                 emdata_processor_correlation(&left,&right,center);
00407         }
00408 
00409         {// Make a local scope so that the locks are destructed
00410                 float* rcd = rslt.get_cuda_data();
00411                 CudaDataLock rlock(&rslt);
00412                 float * f1cd = f1.get_cuda_data();
00413                 CudaDataLock lock2(&f1);
00414                 cuda_dd_fft_complex_to_real_1d(f1cd,rcd,nx,height);
00415         }
00416 
00417         if (no_sum) {
00418                 rslt.gpu_update();
00419                 EXITFUNC;
00420                 return new EMData(rslt);
00421         }
00422         else {
00423                 EXITFUNC;
00424                 return rslt.column_sum_cuda();
00425         }
00426 
00427 }
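
// Illustrative sketch (added for this listing, not part of the original file):
// calc_ccfx_cuda() correlates corresponding rows of two same-sized 2D images
// over the band [y0, y1); with no_sum == false the per-row results are reduced
// to a single nx-by-1 image via column_sum_cuda(). The helper name is
// hypothetical.
static EMData* example_row_correlation_cuda(EMData& a, EMData& b)
{
        const int y0 = 0;
        const int y1 = a.get_ysize() / 2;            // correlate only the lower half of the rows
        return a.calc_ccfx_cuda(&b, y0, y1, false);  // summed 1D result; caller owns the pointer
}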
00428 
00429 EMData* EMData::column_sum_cuda() const {
00430         ENTERFUNC;
00431         if (get_ndim() != 2) throw ImageDimensionException("Column sum cuda has been programmed to work exclusively with 2D data.");
00432         EMData *cf = new EMData();
00433         cf->set_size_cuda(nx, 1, 1);
00434         EMDataForCuda left = cf->get_data_struct_for_cuda();
00435         CudaDataLock llock(cf);
00436         bind_cuda_texture(false);
00437         emdata_column_sum(&left,ny);
00438         unbind_cuda_texture();
00439         cf->gpu_update();
00440         EXITFUNC;
00441         return cf;
00442 }
00443 
00444 void EMData::set_gpu_rw_data(float* data, const int x, const int y, const int z) {
00445         nx = x; ny = y; nz = z;
00446         nxy = nx*ny;
00447         nxyz = nx*ny*nz;
00448         if (cuda_cache_handle!=-1) {
00449                 cuda_cache.replace_gpu_rw(cuda_cache_handle,data);
00450         } else {
00451                 cuda_cache_handle = cuda_cache.store_rw_data(this,data);
00452         }
00453         gpu_update();
00454 }
00455 
00456 void EMData::free_cuda_memory() const {
00457 //      cout << "Death comes to " << this << " " << cuda_cache_handle << endl;
00458         if (cuda_cache_handle!=-1) {
00459                 cuda_cache.clear_item(cuda_cache_handle);
00460                 cuda_cache_handle = -1;
00461         }
00462 }
00463 
00465 void EMData::copy_gpu_rw_to_cpu() {
00466         get_data();
00467 }
00468 
00469 void EMData::copy_cpu_to_gpu_rw() {
00470         get_cuda_data();
00471 }
00472 
00473 void EMData::copy_cpu_to_gpu_ro() {
00474         get_cuda_array();
00475 }
00476 
00477 void EMData::copy_gpu_rw_to_gpu_ro() {
00478         cuda_cache.copy_rw_to_ro(cuda_cache_handle);
00479 }
00480 
00481 void EMData::copy_gpu_ro_to_gpu_rw() const {
00482         cuda_cache.copy_ro_to_rw(cuda_cache_handle);
00483 }
00484 
00485 void EMData::copy_gpu_ro_to_cpu() const {
00486         cuda_cache.copy_ro_to_cpu(cuda_cache_handle,rdata);
00487 }
00488 
00489 
00490 EMData::CudaCache::CudaCache(const int size) : cache_size(size), current_insert_idx(0), mem_allocated(0), locked(size,0)
00491 {
00492         device_init();
00493         rw_cache = new float *[cache_size];
00494         caller_cache = new const EMData*[cache_size];
00495         ro_cache = new cudaArray *[cache_size];
00496 
00497         for(int i = 0; i < cache_size; ++ i ) {
00498                 rw_cache[i] = 0;
00499                 caller_cache[i] = 0;
00500                 ro_cache[i] = 0;
00501         }
00502 }
00503 
00504 EMData::CudaCache::~CudaCache()
00505 {
00506         for (int i = 0; i < cache_size; i++) {
00507                 clear_item(i);
00508         }
00509 
00510         if( rw_cache )
00511         {
00512                 delete[]rw_cache;
00513                 rw_cache = 0;
00514         }
00515 
00516         // No deletion responsibility for the caller_cache
00517         if( caller_cache )
00518         {
00519                 delete[]caller_cache;
00520                 caller_cache = 0;
00521         }
00522         // This might need some thinking
00523         cleanup_cuda_fft_dd_plan_cache();
00524 }
00525 
00526 void EMData::CudaCache::lock(const int idx) {
00527         if (idx < 0 || idx >= cache_size) throw InvalidValueException(idx,"The idx is beyond the cache size");
00528         locked[idx] += 1;
00529 //      debug_print();
00530 }
00531 void EMData::CudaCache::unlock(const int idx) {
00532         if (idx < 0 || idx >= cache_size) throw InvalidValueException(idx,"The idx is beyond the cache size");
00533         if (locked[idx] == 0) {
00534 // //           cout << "Warning - unlocked something that didn't need it" << endl;
00535                 return;
00536 
00537 //               throw UnexpectedBehaviorException("Can't unlock, it wasn't locked!");
00538         }
00539         locked[idx] -=1;
00540 }
00541 
00542 int EMData::CudaCache::cache_rw_data(const EMData* const emdata, const float* const data,const int nx, const int ny, const int nz)
00543 {
00544         ensure_slot_space();
00545 
00546         float* cuda_rw_data = alloc_rw_data(nx,ny,nz);
00547 
00548         if (data != 0 ) { // If rdata is zero it means we're working exclusively on the GPU
00549                 size_t num_bytes = nx*ny*nz*sizeof(float);
00550                 cudaError_t error = cudaMemcpy(cuda_rw_data,data,num_bytes,cudaMemcpyHostToDevice);
00551                 if ( error != cudaSuccess) throw UnexpectedBehaviorException( "CudaMemcpy (host to device) error:" + string(cudaGetErrorString(error)));
00552         }
00553 
00554         return blind_store_rw_data(emdata,cuda_rw_data);
00555 }
00556 
00557 int EMData::CudaCache::blind_store_rw_data(const EMData* const emdata, float*  cuda_rw_data)
00558 {
00559 //      debug_print();
00560         rw_cache[current_insert_idx] = cuda_rw_data;
00561         caller_cache[current_insert_idx] = emdata;
00562         ro_cache[current_insert_idx] = 0;
00563 
00564         int ret = current_insert_idx;
00565         current_insert_idx += 1;
00566         current_insert_idx %= cache_size; // Potentially inefficient to do this every time, the alternative is an if statement. Which is faster?
00568 //      cout << "Inserted at " << ret  << " inc to " << current_insert_idx << " size " << get_emdata_bytes(ret)/sizeof(float) << endl;
00569         return ret;
00570 }
00571 
00572 int EMData::CudaCache::store_rw_data(const EMData* const emdata, float* cuda_rw_data)
00573 {
00574         ensure_slot_space();
00575 
00576         int nx = emdata->get_xsize();
00577         int ny = emdata->get_ysize();
00578         int nz = emdata->get_zsize();
00579         size_t num_bytes = nx*ny*nz*sizeof(float);
00580         mem_allocated += num_bytes;
00581 
00582         return blind_store_rw_data(emdata, cuda_rw_data);
00583 }
00584 
00585 void EMData::CudaCache::debug_print() const {
00586         cout << "Cuda device cache debug. Total mem allocated: " << static_cast<float>(mem_allocated)/1000000.0 << "MB" << endl;
00587         for(int i = 0; i < cache_size; ++i) {
00588                 int handle = -1;
00589                 int nx = 0;
00590                 int ny = 0;
00591                 int nz = 0;
00592                 if (caller_cache[i] != 0) {
00593                         handle = caller_cache[i]->cuda_cache_handle;
00594                         nx = caller_cache[i]->get_xsize();
00595                         ny = caller_cache[i]->get_ysize();
00596                         nz = caller_cache[i]->get_zsize();
00597                 }
00598                 cout << i << ": " << handle << " " << caller_cache[i] << " dims: " << nx << " " << ny << " " << nz << " locked: " << locked[i] << " rw " << rw_cache[i] << " ro " << ro_cache[i] << endl;
00599 //              }
00600         }
00601 }
00602 
00603 void EMData::CudaCache::replace_gpu_rw(const int idx,float* cuda_rw_data)
00604 {
00605         //clear_item(idx); // The ro data goes out of date anyway
00606         if  ( rw_cache[idx] != 0) {
00607                 mem_allocated -= get_emdata_bytes(idx);
00608                 cudaError_t error = cudaFree(rw_cache[idx]);
00609                 if ( error != cudaSuccess)
00610                         throw UnexpectedBehaviorException( "CudaFree error : " + string(cudaGetErrorString(error)));
00611         }
00612         rw_cache[idx] = 0;
00613 
00614         const EMData* d = caller_cache[idx];
00615         int nx = d->get_xsize();
00616         int ny = d->get_ysize();
00617         int nz = d->get_zsize();
00618         size_t num_bytes = nx*ny*nz*sizeof(float);
00619         mem_allocated += num_bytes;
00620 
00621         rw_cache[idx] = cuda_rw_data;
00622 }
00623 
00624 void EMData::CudaCache::ensure_slot_space() {
00625 
00626         int checked_entries = 0;
00627         while ( checked_entries < cache_size) {
00628                 const EMData* previous = caller_cache[current_insert_idx];
00629                 if (previous != 0 ) {
00630                         if ( locked[current_insert_idx] == 0 ) {
00631 //                              cout << "Sending imminent lost sig " << current_insert_idx  << endl;
00632                                 previous->cuda_cache_lost_imminently();
00633 //                              cout << "Clear..." << endl;
00634                                 clear_item(current_insert_idx);
00635                                 break;
00636                         } else {
00637 //                              cout <<  "Lucky it was locked! " << current_insert_idx << endl;
00638                                 current_insert_idx++;
00639                                 current_insert_idx %= cache_size;
00640 //                              cout <<  "Incremented to " << current_insert_idx << endl;
00641                                 checked_entries++;
00642                         }
00643                 } else break; // There IS space!
00644         }
00645 
00646         if (checked_entries == cache_size) {
00647                 throw UnexpectedBehaviorException("All of the data objects in the cuda cache are locked! There is no space.");
00648         }
00649 }
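
// Simplified model (added for this listing, not part of the original file) of
// the eviction policy above: the cache is a fixed-size ring, and the walk skips
// locked slots, stopping at the first slot that is either empty or unlocked
// (the latter is evicted after its owner is notified). The helper below only
// models the index arithmetic with plain arrays; names are hypothetical.
static int example_find_evictable_slot(const bool* occupied, const int* lock_counts,
                                        const int cache_size, const int start_idx)
{
        for (int checked = 0; checked < cache_size; ++checked) {
                const int idx = (start_idx + checked) % cache_size;
                if (!occupied[idx] || lock_counts[idx] == 0) return idx; // free or evictable
        }
        return -1; // every slot is locked; the real code throws UnexpectedBehaviorException
}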
00650 
00651 float* EMData::CudaCache::alloc_rw_data(const int nx, const int ny, const int nz) {
00652         float* cuda_rw_data;
00653         size_t num_bytes = nx*ny*nz*sizeof(float);
00654 
00655         cudaError_t error = cudaMalloc((void**)&cuda_rw_data,num_bytes);
00656         if ( error != cudaSuccess) {
00657                 debug_print();
00658                 throw BadAllocException( "cudaMalloc error :" + string(cudaGetErrorString(error)));
00659         }
00660 
00661 
00662 //      float* testing;
00663 //      size_t pitch;
00664 //      cudaMallocPitch( (void**)&testing, &pitch, nx*sizeof(float), ny*nz);
00665 //      cout << "The pitch of that malloc as " << pitch << endl;
00666 //      cudaFree(testing);
00667 
00668         mem_allocated += num_bytes;
00669 //      cout << "Allocation went up, it is currently " << (float)mem_allocated/1000000.0f << " MB " << endl;
00670         return cuda_rw_data;
00671 
00672 }
00673 
00674 int EMData::CudaCache::cache_ro_data(const EMData* const emdata, const float* const data,const int nx, const int ny, const int nz) {
00675         ensure_slot_space();
00676 
00677         cudaArray *array = get_cuda_array_host(data,nx,ny,nz);
00678         if (array != 0) {
00679                 mem_allocated += nx*ny*nz*sizeof(float);
00680 //              cout << "Allocation went up, it is currently " << (float)mem_allocated/1000000.0f << " MB " << endl;
00681                 rw_cache[current_insert_idx] = 0;
00682                 caller_cache[current_insert_idx] = emdata;
00683                 ro_cache[current_insert_idx] = array;
00684 
00685                 int ret = current_insert_idx;
00686                 current_insert_idx += 1;
00687                 current_insert_idx %= cache_size; // Potentially inefficient to do this every time, the alternative is an if statement. Which is faster?
00688 //              cout << "Inserted at " << ret  << " inc to " << current_insert_idx << " size " << nx*ny*nz << endl;
00689                 return ret;
00690         }
00691         else {
00692                 throw BadAllocException("The allocation of the CUDA array failed");
00693         }
00694 }
00695 
00696 
00697 void  EMData::CudaCache::copy_rw_to_ro(const int idx) {
00698 //      cout << "Copy rw to ro " << idx << endl;
00699         if (rw_cache[idx] == 0) throw UnexpectedBehaviorException("Can not update RO CUDA data: RW data is null.");
00700 
00701         if (ro_cache[idx] != 0)  {
00702                 cudaError_t error = cudaFreeArray(ro_cache[idx]);
00703                 if ( error != cudaSuccess) throw UnexpectedBehaviorException( "CudaFreeArray error " + string(cudaGetErrorString(error)));
00704                 ro_cache[idx] = 0;
00705         }
00706 
00707         const EMData* d = caller_cache[idx];
00708         int nx = d->get_xsize();
00709         int ny = d->get_ysize();
00710         int nz = d->get_zsize();
00711 
00712         cudaArray *array = get_cuda_array_device(rw_cache[idx],nx,ny,nz);
00713         if (array == 0) throw BadAllocException("The allocation of the CUDA array failed");
00714         ro_cache[idx] = array;
00715 }
00716 
00717 void  EMData::CudaCache::copy_ro_to_rw(const int idx) {
00718 //      cout << "Copy ro to rw " << idx << endl;
00719         if (ro_cache[idx] == 0) throw UnexpectedBehaviorException("Can not update RW CUDA data: RO data is null.");
00720 
00721         if (rw_cache[idx] != 0)  {
00722                 cudaError_t error = cudaFree(rw_cache[idx]);
00723                 if ( error != cudaSuccess)
00724                         throw UnexpectedBehaviorException( "CudaFree error " + string(cudaGetErrorString(error)));
00725                 rw_cache[idx] = 0;
00726         }
00727 
00728         const EMData* d = caller_cache[idx];
00729         int nx = d->get_xsize();
00730         int ny = d->get_ysize();
00731         int nz = d->get_zsize();
00732         size_t num_bytes = nx*ny*nz*sizeof(float);
00733 
00734         float* cuda_rw_data = alloc_rw_data(nx,ny,nz);
00735 
00736         if (nz > 1) {
00737                 cudaExtent extent;
00738                 extent.width  = nx;
00739                 extent.height = ny;
00740                 extent.depth  = nz;
00741                 cudaMemcpy3DParms copyParams = {0};
00742                 copyParams.srcArray   = ro_cache[idx];
00743                 copyParams.dstPtr = make_cudaPitchedPtr((void*)cuda_rw_data, extent.width*sizeof(float), extent.width, extent.height);
00744                 copyParams.extent   = extent;
00745                 copyParams.kind     = cudaMemcpyDeviceToDevice;
00746                 cudaError_t error = cudaMemcpy3D(&copyParams);
00747                 if ( error != cudaSuccess)
00748                         throw UnexpectedBehaviorException( "Copying device array to device pointer - CudaMemcpy3D error : " + string(cudaGetErrorString(error)));
00749 
00750         } else if ( ny > 1 ) {
00751                 cudaError_t error = cudaMemcpyFromArray(cuda_rw_data,ro_cache[idx],0,0,num_bytes,cudaMemcpyDeviceToDevice);
00752                 if ( error != cudaSuccess)
00753                         throw UnexpectedBehaviorException( "Copying device array to device pointer - cudaMemcpyFromArray error : " + string(cudaGetErrorString(error)));
00754         } else throw UnexpectedBehaviorException("Cuda infrastructure has not been designed to work on 1D data");
00755 
00756         rw_cache[idx] = cuda_rw_data;
00757 }
00758 
00759 
00760 void  EMData::CudaCache::copy_ro_to_cpu(const int idx,float* data) {
00761         if (ro_cache[idx] == 0) throw UnexpectedBehaviorException("Can not update RW CUDA data: RO data is null.");
00762         if (data == 0) throw NullPointerException("The cpu data pointer is NULL in copy_ro_to_cpu");
00763 
00764         const EMData* d = caller_cache[idx];
00765         int nx = d->get_xsize();
00766         int ny = d->get_ysize();
00767         int nz = d->get_zsize();
00768         size_t num_bytes = nx*ny*nz*sizeof(float);
00769 
00770         if (nz > 1) {
00771                 cudaExtent extent;
00772                 extent.width  = nx;
00773                 extent.height = ny;
00774                 extent.depth  = nz;
00775                 cudaMemcpy3DParms copyParams = {0};
00776                 copyParams.srcArray   = ro_cache[idx];
00777                 copyParams.dstPtr = make_cudaPitchedPtr((void*)data, extent.width*sizeof(float), extent.width, extent.height);
00778                 copyParams.extent   = extent;
00779                 copyParams.kind     = cudaMemcpyDeviceToHost;
00780                 cudaError_t error = cudaMemcpy3D(&copyParams);
00781                 if ( error != cudaSuccess)
00782                         throw UnexpectedBehaviorException( "Copying device array to host pointer - CudaMemcpy3D error : " + string(cudaGetErrorString(error)));
00783 
00784         } else if ( ny > 1 ) {
00785                 cudaError_t error = cudaMemcpyFromArray(data,ro_cache[idx],0,0,num_bytes,cudaMemcpyDeviceToHost);
00786                 if ( error != cudaSuccess)
00787                         throw UnexpectedBehaviorException( "Copying device array to host pointer - cudaMemcpyFromArray error : " + string(cudaGetErrorString(error)));
00788         } else throw UnexpectedBehaviorException("Cuda infrastructure has not been designed to work on 1D data");
00789 
00790 }
00791 void EMData::CudaCache::clear_item(const int idx) {
00792 //      debug_print();
00793         if  ( rw_cache[idx] != 0) {
00794                 mem_allocated -= get_emdata_bytes(idx);
00795                 cudaError_t error = cudaFree(rw_cache[idx]);
00796                 if ( error != cudaSuccess)
00797                         throw UnexpectedBehaviorException( "CudaFree error : " + string(cudaGetErrorString(error)));
00798         }
00799         rw_cache[idx] = 0;
00800 
00801         if  ( ro_cache[idx] != 0) {
00802                 mem_allocated -= get_emdata_bytes(idx);
00803                 cudaError_t error = cudaFreeArray(ro_cache[idx]);
00804                 if ( error != cudaSuccess) throw UnexpectedBehaviorException( "CudaFreeArray error : " + string(cudaGetErrorString(error)));
00805 
00806         }
00807         ro_cache[idx] = 0;
00808 
00809         caller_cache[idx] = 0;
00810 
00811         locked[idx] = 0;
00812 }
00813 
00814 
00815 EMData::CudaDataLock::CudaDataLock(const EMData* const emdata) : data_cuda_handle(-1)
00816 {
00817         emdata->set_gpu_rw_current();
00818         data_cuda_handle = emdata->cuda_cache_handle;
00819         EMData::cuda_cache.lock(data_cuda_handle);
00820 }
00821 
00822 EMData::CudaDataLock::~CudaDataLock() {
00823         EMData::cuda_cache.unlock(data_cuda_handle);
00824 }
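
// Illustrative note (added for this listing, not part of the original file):
// CudaDataLock is an RAII guard around CudaCache::lock()/unlock(), so inside
// EMData member code a scope is enough to keep a cache slot pinned for the
// lifetime of a raw device pointer, as the local scopes in calc_ccfx_cuda() do:
//
//      float* device_ptr = get_cuda_data();
//      CudaDataLock lock(this);   // pins the slot
//      // ... kernel launches using device_ptr ...
//                                 // ~CudaDataLock() unlocks when the scope ends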
00825 
00826 
00827 #endif //EMAN2_USING_CUDA
