
emdata_cuda.cpp

/*
 * Author: Steven Ludtke, 04/10/2003 (sludtke@bcm.edu)
 * Copyright (c) 2000-2006 Baylor College of Medicine
 *
 * This software is issued under a joint BSD/GNU license. You may use the
 * source code in this file under either license. However, note that the
 * complete EMAN2 and SPARX software packages have some GPL dependencies,
 * so you are responsible for compliance with the licenses of these packages
 * if you opt to use BSD licensing. The warranty disclaimer below holds
 * in either instance.
 *
 * This complete copyright notice must be included in any revised version of the
 * source code. Additional authorship citations may be added, but existing
 * author citations must be preserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * */

#ifdef EMAN2_USING_CUDA

#include "emdata.h"
#include "exception.h"
#include <cuda_runtime_api.h>
#include <driver_functions.h>
#include <cuda.h>
#include "cuda/cuda_util.h"
#include "cuda/cuda_processor.h"
#include "cuda/cuda_emfft.h"

using namespace EMAN;
// Static init
EMData::CudaCache EMData::cuda_cache(100);

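// get_cuda_data():
// Returns the device ("rw") pointer for this image, copying host data to the
// GPU on demand. If no cache slot exists yet, or the GPU copy is stale
// (EMDATA_GPU_NEEDS_UPDATE), the data is (re)cached; if only the read-only
// cudaArray copy is current, it is converted to a rw copy on the device
// instead of going back through host memory.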
float* EMData::get_cuda_data() const {
	if (get_size() == 0) throw UnexpectedBehaviorException("The size of the data is 0?");
	if (cuda_cache_handle == -1 || EMDATA_GPU_NEEDS_UPDATE & flags) {
		if (cuda_cache_handle != -1 && gpu_ro_is_current()) {
			cuda_cache.copy_ro_to_rw(cuda_cache_handle);
		} else {
			if (cuda_cache_handle != -1) {
				cuda_cache.clear_item(cuda_cache_handle);
			}
			cuda_cache_handle = cuda_cache.cache_rw_data(this,rdata,nx,ny,nz);
			if (cuda_cache_handle == -1) throw UnexpectedBehaviorException("Caching rw data on the GPU failed to produce a valid handle");
		}
		flags &= ~EMDATA_GPU_NEEDS_UPDATE;
	}
	return cuda_cache.get_rw_data(cuda_cache_handle);
}

bool EMData::gpu_rw_is_current() const {
	if (cuda_cache_handle != -1 && !(EMDATA_GPU_NEEDS_UPDATE & flags)) return cuda_cache.has_rw_data(cuda_cache_handle);
	else return false;
}

bool EMData::cpu_rw_is_current() const {
	if (!(EMDATA_CPU_NEEDS_UPDATE & flags) && rdata != 0) return true;
	return false;
}

bool EMData::gpu_ro_is_current() const {
	if (cuda_cache_handle != -1 && !(EMDATA_GPU_RO_NEEDS_UPDATE & flags)) return cuda_cache.has_ro_data(cuda_cache_handle);
	else return false;
}

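// bind_cuda_texture()/unbind_cuda_texture():
// Expose the cached read-only cudaArray as a CUDA texture (2D or 3D depending
// on get_ndim()), optionally with interpolation. The cache slot is locked for
// the duration of the binding so it cannot be evicted while a kernel reads
// through the texture.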
void EMData::bind_cuda_texture(const bool interp_mode) const {
	check_cuda_array_update();
	cuda_cache.lock(cuda_cache_handle);
	bind_cuda_array_to_texture(cuda_cache.get_ro_data(cuda_cache_handle),cuda_cache.get_ndim(cuda_cache_handle),interp_mode);
}

void EMData::unbind_cuda_texture() const {
	::unbind_cuda_texture(cuda_cache.get_ndim(cuda_cache_handle));
	cuda_cache.unlock(cuda_cache_handle);
}

cudaArray* EMData::get_cuda_array() const {
	if (get_size() == 0) throw UnexpectedBehaviorException("The size of the data is 0?");
	check_cuda_array_update();
	return cuda_cache.get_ro_data(cuda_cache_handle);
}

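// check_cuda_array_update():
// Make sure the read-only cudaArray copy is current, preferring a
// device-to-device copy from the rw data over a fresh upload from host memory.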
void EMData::check_cuda_array_update() const {
	if (cuda_cache_handle == -1 || EMDATA_GPU_RO_NEEDS_UPDATE & flags) {
		if (cuda_cache_handle != -1 && gpu_rw_is_current()) {
			cuda_cache.copy_rw_to_ro(cuda_cache_handle);
		} else {
			if (cuda_cache_handle != -1) cuda_cache.clear_item(cuda_cache_handle);
			cuda_cache_handle = cuda_cache.cache_ro_data(this,rdata,nx,ny,nz);
			if (cuda_cache_handle >= 50) throw InvalidValueException(cuda_cache_handle,"In check_cuda_array_update, the returned cache handle is unexpectedly large");
			if (cuda_cache_handle == -1) throw UnexpectedBehaviorException("Caching ro data on the GPU failed to produce a valid handle");
		}
		flags &= ~EMDATA_GPU_RO_NEEDS_UPDATE;
	}
}

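// cuda_cache_lost_imminently():
// Called by the cache just before this image's slot is evicted: pull the data
// back to host memory and mark both GPU copies as stale.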
void EMData::cuda_cache_lost_imminently() const {
	//cout << "In cache lost " << cuda_cache_handle << " " << nx << " " << ny << " " << nz << endl;
	get_data(); // This causes cuda memory to be copied to cpu memory
	flags |= EMDATA_GPU_NEEDS_UPDATE | EMDATA_GPU_RO_NEEDS_UPDATE;
	cuda_cache_handle = -1;
}

void EMData::cuda_lock() const {
	if (cuda_cache_handle == -1) throw UnexpectedBehaviorException("No cuda handle, can't lock");
	cuda_cache.lock(cuda_cache_handle);
	//cuda_cache.debug_print();
}

void EMData::cuda_unlock() const {
	//cout << " " << cuda_cache_handle << endl;
	//cuda_cache.debug_print();
	if (cuda_cache_handle == -1) throw UnexpectedBehaviorException("No cuda handle, can't unlock");
	cuda_cache.unlock(cuda_cache_handle);
}

EMDataForCuda EMData::get_data_struct_for_cuda() const {
	EMDataForCuda tmp = {get_cuda_data(),nx,ny,nz};
	return tmp;
}

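// gpu_operation_preferred():
// Returns true only when the rw device copy is current; if neither the CPU
// nor the GPU copy is current (e.g. set_size was called without allocating),
// the CPU path is preferred.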
bool EMData::gpu_operation_preferred() const {
	bool cpu = cpu_rw_is_current();
	bool gpu = gpu_rw_is_current();
	if (!cpu && !gpu) {
		// This is what happens when set_size doesn't allocate
		return false;
//		cout << (!(EMDATA_CPU_NEEDS_UPDATE & flags) && rdata != 0) << " " << (cuda_cache_handle !=-1 && !(EMDATA_GPU_NEEDS_UPDATE & flags) && cuda_cache.has_rw_data(cuda_cache_handle)) << endl;
//		cout << "GPU flag " << !(EMDATA_GPU_NEEDS_UPDATE & flags) << endl;
//		cout << "CPU flag " << !(EMDATA_CPU_NEEDS_UPDATE & flags) << endl;
//		cout << "Rdata " << rdata << endl;
//		cout << "Cuda handle " << cuda_cache_handle << endl;
//		throw UnexpectedBehaviorException("Neither the CPU or GPU data are current");
	}
	if (gpu) return true;
	return false;
}

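// calc_ccf_cuda():
// GPU cross-correlation. The calling image is Fourier transformed (or copied,
// if it is already complex), correlated with "image" either through texture
// fetches or a plain device-pointer kernel, and the inverse FFT of the product
// is returned. A real-space input of a different size is clipped to match
// before being transformed.
//
// A hypothetical call from host code (the two image names are assumptions):
//   EMData* ccf = a->calc_ccf_cuda(b, /*use_texturing=*/false, /*center=*/true);
//   // ... use ccf ...
//   delete ccf;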
EMData* EMData::calc_ccf_cuda( EMData* image, bool use_texturing, bool center ) const {
	EMData* tmp;
	if (is_complex()) {
//		cout << "Tmp is a copy of this" << endl;
		tmp = new EMData(*this);
	} else {
//		cout << "Tmp is this fftd" << endl;
		tmp = do_fft_cuda();
	}

	Dict d;
	EMData* with = 0;
	if (image == this) {
		d["with"] = (EMData*) tmp;
	} else {
		if (!image->is_complex()) {
			int wnx = image->get_xsize(); int wny = image->get_ysize(); int wnz = image->get_zsize();
			if ( wnx != nx || wny != ny || wnz != nz ) {
				Region r;
				if (nz > 1) {
					r = Region((wnx-nx)/2, (wny-ny)/2, (wnz-nz)/2,nx,ny,nz);
				}
				else if (ny > 1) {
					r = Region((wnx-nx)/2, (wny-ny)/2,nx,ny);
				}
				else throw UnexpectedBehaviorException("Calc_ccf_cuda doesn't work on 1D images");
				EMData* clipped = image->get_clip(r);
				with = clipped->do_fft_cuda();
				delete clipped;
			} else {
				with = image->do_fft_cuda();
			}
			d["with"] = (EMData*) with;
		} else {
//			cout << "With is the input image" << endl;
			d["with"] = (EMData*)image;
		}
	}

	EMDataForCuda left = tmp->get_data_struct_for_cuda();
	CudaDataLock lock(tmp);
	if (use_texturing) {
		((EMData*)d["with"])->bind_cuda_texture(false);
		emdata_processor_correlation_texture(&left,center);
		((EMData*)d["with"])->unbind_cuda_texture();
	} else {
		EMDataForCuda right = ((EMData*)d["with"])->get_data_struct_for_cuda();
		CudaDataLock lock2((EMData*)d["with"]);
		emdata_processor_correlation(&left,&right,center);
	}
	tmp->gpu_update();

//	tmp->process_inplace("cuda.correlate",d);
//	return tmp;
	if (with != 0 && image != this) {
		delete with;
		with = 0;
	}

	EMData* soln = tmp->do_ift_cuda(false);
	soln->gpu_update();
	delete tmp;
	tmp = 0;

	return soln;
}

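// make_rotational_footprint_cuda():
// GPU version of the rotational footprint: the image is padded into a static
// scratch image, autocorrelated with calc_ccf_cuda, clipped down to 3/2 of the
// original size and (for 2D data) unwrapped into polar coordinates. When
// unwrap is true the result is cached in rot_fp and a deep copy is returned.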
EMData *EMData::make_rotational_footprint_cuda( bool unwrap)
{
	ENTERFUNC;
//
//	update_stat();
//	float edge_mean = get_edge_mean();
	float edge_mean = 0;
	CudaDataLock lock(this);
	if ( rot_fp != 0 && unwrap == true) {
		return new EMData(*rot_fp);
	}
//
//	//static EMData obj_filt;
//	//EMData* filt = &obj_filt;
//	//filt->set_complex(true);
//
//
	int cs = (((nx * 7 / 4) & 0xfffff8) - nx) / 2; // this pads the image to 1 3/4 * size with result divis. by 8

	static EMData big_clip;
	int big_x = nx+2*cs;
	int big_y = ny+2*cs;
	int big_z = 1;
	if ( nz != 1 ) {
		big_z = nz+2*cs;
	}

	if ( big_clip.get_xsize() != big_x || big_clip.get_ysize() != big_y || big_clip.get_zsize() != big_z ) {
		big_clip.set_size_cuda(big_x,big_y,big_z);
		big_clip.get_cuda_data();
		big_clip.cuda_lock(); // Just lock for the entire duration of the program, it's static anyway...
	}
	big_clip.to_value(edge_mean);

	if (nz != 1) {
		big_clip.insert_clip(this,IntPoint(cs,cs,cs));
	} else {
		big_clip.insert_clip(this,IntPoint(cs,cs,0));
	}
//	// The filter object is nothing more than a cached high pass filter
//	// Ultimately it is used as an argument to the EMData::mult(EMData,prevent_complex_multiplication (bool))
//	// function in calc_mutual_correlation. Note that in that function prevent_complex_multiplication is
//	// set to true, which is used for speed reasons.
//	if (filt->get_xsize() != clipped->get_xsize() +2-(clipped->get_xsize()%2) || filt->get_ysize() != clipped->get_ysize() ||
//		   filt->get_zsize() != clipped->get_zsize()) {
//		filt->set_size(clipped->get_xsize() + 2-(clipped->get_xsize()%2), clipped->get_ysize(), clipped->get_zsize());
//		filt->to_one();
//		filt->process_inplace("eman1.filter.highpass.gaussian", Dict("highpass", 1.5f/nx));
//	}
//
	EMData *mc = big_clip.calc_ccf_cuda(&big_clip,false,true);
	mc->sub(mc->get_edge_mean());

	static EMData sml_clip;
	int sml_x = nx * 3 / 2;
	int sml_y = ny * 3 / 2;
	int sml_z = 1;
	if ( nz != 1 ) {
		sml_z = nz * 3 / 2;
	}

	if ( sml_clip.get_xsize() != sml_x || sml_clip.get_ysize() != sml_y || sml_clip.get_zsize() != sml_z ) {
		sml_clip.set_size_cuda(sml_x,sml_y,sml_z);
		sml_clip.get_cuda_data();
		sml_clip.cuda_lock(); // Just lock for the entire duration of the program, it's static anyway...
	}
	if (nz != 1) {
		sml_clip.insert_clip(mc,IntPoint(-cs+nx/4,-cs+ny/4,-cs+nz/4));
	} else {
		sml_clip.insert_clip(mc,IntPoint(-cs+nx/4,-cs+ny/4,0));
	}

	delete mc; mc = 0;
	EMData * result = NULL;

	if (!unwrap || nz != 1) {
		//clipped_mc->process_inplace("mask.sharp", Dict("outer_radius", -1, "value", 0));
		result = new EMData(sml_clip);
	}
	else {
		result = sml_clip.unwrap();
	}

	result->gpu_update();

	if ( unwrap == true)
	{ // this if statement reflects a strict policy of caching in only one scenario, see comments at the beginning of the function block

		// Note that the if statement at the beginning of this function ensures that rot_fp is zero at this point,
		// so there is no need to throw any exception
		// if ( rot_fp != 0 ) throw UnexpectedBehaviorException("The rotational footprint is only expected to be cached if it is not NULL");

		// Here is where the caching occurs - rot_fp takes ownership of the pointer, and a deep-copied EMData object is returned.
		// The deep copy incurs a cost in terms of CPU cycles and memory, but prevents the need for complicated memory management (reference counting)
		rot_fp = result;
		return new EMData(*rot_fp);
	}
	else return result;
}

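// calc_ccfx_cuda():
// Row-wise (1D) cross-correlation of two equally sized 2D images over the row
// range [y0,y1), computed with batched 1D device FFTs. Static scratch images
// are reused across calls; if no_sum is false the rows are collapsed with
// column_sum_cuda().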
EMData* EMData::calc_ccfx_cuda( EMData * const with, int y0, int y1, bool no_sum)
{
	ENTERFUNC;
//	cout << "calc_ccfx cuda" << endl;
	if (!with) {
		LOGERR("NULL 'with' image. ");
		throw NullPointerException("NULL input image");
	}

	if (!EMUtil::is_same_size(this, with)) {
		LOGERR("images not same size: (%d,%d,%d) != (%d,%d,%d)",
			   nx, ny, nz,
			   with->get_xsize(), with->get_ysize(), with->get_zsize());
		throw ImageFormatException("images not same size");
	}
	if (get_ndim() > 2) {
		LOGERR("2D images only");
		throw ImageDimensionException("2D images only");
	}

	if (y1 <= y0) {
		y1 = ny;
	}

	if (y0 >= y1) {
		y0 = 0;
	}

	if (y0 < 0) {
		y0 = 0;
	}

	if (y1 > ny) {
		y1 = ny;
	}

	static int nx_device_fft = 0;
	static int ny_device_fft = 0;
	static EMData f1;
	static EMData f2;
	static EMData rslt;

	int height = y1-y0;
	int width = (nx+2-(nx%2));
	if (width != nx_device_fft || height != ny_device_fft ) {
		f1.set_size_cuda(width,height);
		f2.set_size_cuda(width,height);
		rslt.set_size_cuda(nx,height);
		nx_device_fft = width;
		ny_device_fft = height;
	}

	{ // Make a local scope so that the locks are destructed
		float * cd = get_cuda_data();
		CudaDataLock lock(this);
		float * f1cd = f1.get_cuda_data();
		CudaDataLock lock2(&f1);
		cuda_dd_fft_real_to_complex_1d(cd,f1cd,nx,height);
	}
	{ // Make a local scope so that the locks are destructed
		float * wcd = with->get_cuda_data();
		CudaDataLock lock(with); // lock the image whose device data is in use
		float * f2cd = f2.get_cuda_data();
		CudaDataLock lock2(&f2);
		cuda_dd_fft_real_to_complex_1d(wcd,f2cd,nx,height);
	}

	EMDataForCuda left = f1.get_data_struct_for_cuda();
	CudaDataLock lock(&f1);

	bool use_texturing = false;
	bool center = false;
	if (use_texturing) {
		f2.bind_cuda_texture(false);
		emdata_processor_correlation_texture(&left,center);
		f2.unbind_cuda_texture();
	} else {
		EMDataForCuda right = f2.get_data_struct_for_cuda();
		CudaDataLock lock2(&f2);
		emdata_processor_correlation(&left,&right,center);
	}

	{ // Make a local scope so that the locks are destructed
		float* rcd = rslt.get_cuda_data();
		CudaDataLock rlock(&rslt);
		float * f1cd = f1.get_cuda_data();
		CudaDataLock lock2(&f1);
		cuda_dd_fft_complex_to_real_1d(f1cd,rcd,nx,height);
	}

	if (no_sum) {
		rslt.gpu_update();
		EXITFUNC;
		return new EMData(rslt);
	}
	else {
		EXITFUNC;
		return rslt.column_sum_cuda();
	}
}

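// column_sum_cuda():
// Sum each column of a 2D image on the GPU; the result is an nx-by-1 image.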
EMData* EMData::column_sum_cuda() const {
	ENTERFUNC;
	if (get_ndim() != 2) throw ImageDimensionException("Column sum cuda has been programmed to work exclusively with 2D data.");
	EMData *cf = new EMData();
	cf->set_size_cuda(nx, 1, 1);
	EMDataForCuda left = cf->get_data_struct_for_cuda();
	CudaDataLock llock(cf);
	bind_cuda_texture(false);
	emdata_column_sum(&left,ny);
	unbind_cuda_texture();
	cf->gpu_update();
	EXITFUNC;
	return cf;
}

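// set_gpu_rw_data():
// Adopt an externally allocated device pointer as this image's rw data,
// updating the image dimensions and either replacing the existing cache entry
// or creating a new one.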
void EMData::set_gpu_rw_data(float* data, const int x, const int y, const int z) {
	nx = x; ny = y; nz = z;
	nxy = nx*ny;
	nxyz = nx*ny*nz;
	if (cuda_cache_handle != -1) {
		cuda_cache.replace_gpu_rw(cuda_cache_handle,data);
	} else {
		cuda_cache_handle = cuda_cache.store_rw_data(this,data);
	}
	gpu_update();
}

void EMData::free_cuda_memory() const {
//	cout << "Death comes to " << this << " " << cuda_cache_handle << endl;
	if (cuda_cache_handle != -1) {
		cuda_cache.clear_item(cuda_cache_handle);
		cuda_cache_handle = -1;
	}
}

void EMData::copy_gpu_rw_to_cpu() {
	get_data();
}

void EMData::copy_cpu_to_gpu_rw() {
	get_cuda_data();
}

void EMData::copy_cpu_to_gpu_ro() {
	get_cuda_array();
}

void EMData::copy_gpu_rw_to_gpu_ro() {
	cuda_cache.copy_rw_to_ro(cuda_cache_handle);
}

void EMData::copy_gpu_ro_to_gpu_rw() const {
	cuda_cache.copy_ro_to_rw(cuda_cache_handle);
}

void EMData::copy_gpu_ro_to_cpu() const {
	cuda_cache.copy_ro_to_cpu(cuda_cache_handle,rdata);
}

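// EMData::CudaCache
// A fixed-size ring of device-memory slots shared by all EMData objects. Each
// slot can hold a linear rw pointer (cudaMalloc), a read-only cudaArray for
// texturing, or both, together with a pointer back to the owning EMData.
// Slots are handed out round-robin; when the next slot is occupied and not
// locked, the current owner is told to save its data via
// cuda_cache_lost_imminently() before the slot is reused.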
EMData::CudaCache::CudaCache(const int size) : cache_size(size), current_insert_idx(0), mem_allocated(0), locked(size,0)
{
	device_init();
	rw_cache = new float *[cache_size];
	caller_cache = new const EMData*[cache_size];
	ro_cache = new cudaArray *[cache_size];

	for(int i = 0; i < cache_size; ++i) {
		rw_cache[i] = 0;
		caller_cache[i] = 0;
		ro_cache[i] = 0;
	}
}

EMData::CudaCache::~CudaCache()
{
	for (int i = 0; i < cache_size; i++) {
		clear_item(i);
	}

	if( rw_cache )
	{
		delete[] rw_cache;
		rw_cache = 0;
	}

	// No deletion responsibility for the caller_cache
	if( caller_cache )
	{
		delete[] caller_cache;
		caller_cache = 0;
	}

	if( ro_cache )
	{
		delete[] ro_cache;
		ro_cache = 0;
	}

	// This might need some thinking
	cleanup_cuda_fft_dd_plan_cache();
}

void EMData::CudaCache::lock(const int idx) {
	if (idx < 0 || idx >= cache_size) throw InvalidValueException(idx,"The idx is beyond the cache size");
	locked[idx] += 1;
//	debug_print();
}

void EMData::CudaCache::unlock(const int idx) {
	if (idx < 0 || idx >= cache_size) throw InvalidValueException(idx,"The idx is beyond the cache size");
	if (locked[idx] == 0) {
//		cout << "Warning - unlocked something that didn't need it" << endl;
		return;
//		throw UnexpectedBehaviorException("Can't unlock, it wasn't locked!");
	}
	locked[idx] -= 1;
}

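// cache_rw_data() / blind_store_rw_data() / store_rw_data():
// Allocate (or adopt) a linear device buffer for an image, optionally copying
// its host data up to the device, and record it in the next free slot. The
// returned index is the handle the EMData object keeps in cuda_cache_handle.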
int EMData::CudaCache::cache_rw_data(const EMData* const emdata, const float* const data, const int nx, const int ny, const int nz)
{
	ensure_slot_space();

	float* cuda_rw_data = alloc_rw_data(nx,ny,nz);

	if (data != 0) { // If rdata is zero it means we're working exclusively on the GPU
		size_t num_bytes = nx*ny*nz*sizeof(float);
		cudaError_t error = cudaMemcpy(cuda_rw_data,data,num_bytes,cudaMemcpyHostToDevice);
		if ( error != cudaSuccess) throw UnexpectedBehaviorException( "CudaMemcpy (host to device) error:" + string(cudaGetErrorString(error)));
	}

	return blind_store_rw_data(emdata,cuda_rw_data);
}

int EMData::CudaCache::blind_store_rw_data(const EMData* const emdata, float* cuda_rw_data)
{
//	debug_print();
	rw_cache[current_insert_idx] = cuda_rw_data;
	caller_cache[current_insert_idx] = emdata;
	ro_cache[current_insert_idx] = 0;

	int ret = current_insert_idx;
	current_insert_idx += 1;
	current_insert_idx %= cache_size; // Potentially inefficient to do this every time, the alternative is an if statement. Which is faster?
	if ( current_insert_idx > cache_size ) throw UnexpectedBehaviorException("The cache insertion index exceeded the cache size"); // This is just for debug
//	cout << "Inserted at " << ret << " inc to " << current_insert_idx << " size " << get_emdata_bytes(ret)/sizeof(float) << endl;
	return ret;
}

int EMData::CudaCache::store_rw_data(const EMData* const emdata, float* cuda_rw_data)
{
	ensure_slot_space();

	int nx = emdata->get_xsize();
	int ny = emdata->get_ysize();
	int nz = emdata->get_zsize();
	size_t num_bytes = nx*ny*nz*sizeof(float);
	mem_allocated += num_bytes;

	return blind_store_rw_data(emdata, cuda_rw_data);
}

void EMData::CudaCache::debug_print() const {
	cout << "Cuda device cache debug. Total mem allocated: " << static_cast<float>(mem_allocated)/1000000.0 << " MB" << endl;
	for(int i = 0; i < cache_size; ++i) {
		int handle = -1;
		int nx = 0;
		int ny = 0;
		int nz = 0;
		if (caller_cache[i] != 0) {
			handle = caller_cache[i]->cuda_cache_handle;
			nx = caller_cache[i]->get_xsize();
			ny = caller_cache[i]->get_ysize();
			nz = caller_cache[i]->get_zsize();
		}
		cout << i << ": " << handle << " " << caller_cache[i] << " dims: " << nx << " " << ny << " " << nz << " locked: " << locked[i] << " rw " << rw_cache[i] << " ro " << ro_cache[i] << endl;
	}
}

void EMData::CudaCache::replace_gpu_rw(const int idx, float* cuda_rw_data)
{
	//clear_item(idx); // The ro data goes out of date anyway
	if ( rw_cache[idx] != 0) {
		mem_allocated -= get_emdata_bytes(idx);
		cudaError_t error = cudaFree(rw_cache[idx]);
		if ( error != cudaSuccess)
			throw UnexpectedBehaviorException( "CudaFree error : " + string(cudaGetErrorString(error)));
	}
	rw_cache[idx] = 0;

	const EMData* d = caller_cache[idx];
	int nx = d->get_xsize();
	int ny = d->get_ysize();
	int nz = d->get_zsize();
	size_t num_bytes = nx*ny*nz*sizeof(float);
	mem_allocated += num_bytes;

	rw_cache[idx] = cuda_rw_data;
}

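// ensure_slot_space():
// Advance current_insert_idx to a usable slot. If the slot is occupied and
// unlocked, its owner is asked to copy its data back to the host and the slot
// is cleared; locked slots are skipped. Throws if every slot is locked.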
void EMData::CudaCache::ensure_slot_space() {

	int checked_entries = 0;
	while ( checked_entries < cache_size) {
		const EMData* previous = caller_cache[current_insert_idx];
		if (previous != 0) {
			if ( locked[current_insert_idx] == 0 ) {
//				cout << "Sending imminent lost sig " << current_insert_idx << endl;
				previous->cuda_cache_lost_imminently();
//				cout << "Clear..." << endl;
				clear_item(current_insert_idx);
				break;
			} else {
//				cout << "Lucky it was locked! " << current_insert_idx << endl;
				current_insert_idx++;
				current_insert_idx %= cache_size;
//				cout << "Incremented to " << current_insert_idx << endl;
				checked_entries++;
			}
		} else break; // There IS space!
	}

	if (checked_entries == cache_size) {
		throw UnexpectedBehaviorException("All of the data objects in the cuda cache are locked! There is no space.");
	}
}

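// alloc_rw_data() / cache_ro_data():
// Low-level device allocation: alloc_rw_data() cudaMallocs a linear float
// buffer and tracks the bytes in mem_allocated; cache_ro_data() builds a
// cudaArray copy of host data (via get_cuda_array_host) and stores it in the
// next free slot.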
float* EMData::CudaCache::alloc_rw_data(const int nx, const int ny, const int nz) {
	float* cuda_rw_data;
	size_t num_bytes = nx*ny*nz*sizeof(float);

	cudaError_t error = cudaMalloc((void**)&cuda_rw_data,num_bytes);
	if ( error != cudaSuccess) {
		debug_print();
		throw BadAllocException( "cudaMalloc error :" + string(cudaGetErrorString(error)));
	}

//	float* testing;
//	size_t pitch;
//	cudaMallocPitch( (void**)&testing, &pitch, nx*sizeof(float), ny*nz);
//	cout << "The pitch of that malloc was " << pitch << endl;
//	cudaFree(testing);

	mem_allocated += num_bytes;
//	cout << "Allocation went up, it is currently " << (float)mem_allocated/1000000.0f << " MB " << endl;
	return cuda_rw_data;
}

int EMData::CudaCache::cache_ro_data(const EMData* const emdata, const float* const data, const int nx, const int ny, const int nz) {
	ensure_slot_space();

	cudaArray *array = get_cuda_array_host(data,nx,ny,nz);
	if (array != 0) {
		mem_allocated += nx*ny*nz*sizeof(float);
//		cout << "Allocation went up, it is currently " << (float)mem_allocated/1000000.0f << " MB " << endl;
		rw_cache[current_insert_idx] = 0;
		caller_cache[current_insert_idx] = emdata;
		ro_cache[current_insert_idx] = array;

		int ret = current_insert_idx;
		current_insert_idx += 1;
		current_insert_idx %= cache_size; // Potentially inefficient to do this every time, the alternative is an if statement. Which is faster?
//		cout << "Inserted at " << ret << " inc to " << current_insert_idx << " size " << nx*ny*nz << endl;
		return ret;
	}
	else {
		throw BadAllocException("The allocation of the CUDA array failed");
	}
}

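// copy_rw_to_ro() / copy_ro_to_rw() / copy_ro_to_cpu():
// Transfers between the two device representations and back to host memory.
// 3D data goes through cudaMemcpy3D with a pitched destination pointer, 2D
// data through cudaMemcpyFromArray; 1D data is not supported.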
void EMData::CudaCache::copy_rw_to_ro(const int idx) {
//	cout << "Copy rw to ro " << idx << endl;
	if (rw_cache[idx] == 0) throw UnexpectedBehaviorException("Can not update RO CUDA data: RW data is null.");

	if (ro_cache[idx] != 0) {
		cudaError_t error = cudaFreeArray(ro_cache[idx]);
		if ( error != cudaSuccess) throw UnexpectedBehaviorException( "CudaFreeArray error " + string(cudaGetErrorString(error)));
		ro_cache[idx] = 0;
	}

	const EMData* d = caller_cache[idx];
	int nx = d->get_xsize();
	int ny = d->get_ysize();
	int nz = d->get_zsize();

	cudaArray *array = get_cuda_array_device(rw_cache[idx],nx,ny,nz);
	if (array == 0) throw BadAllocException("The allocation of the CUDA array failed");
	ro_cache[idx] = array;
}

void EMData::CudaCache::copy_ro_to_rw(const int idx) {
//	cout << "Copy ro to rw " << idx << endl;
	if (ro_cache[idx] == 0) throw UnexpectedBehaviorException("Can not update RW CUDA data: RO data is null.");

	if (rw_cache[idx] != 0) {
		cudaError_t error = cudaFree(rw_cache[idx]);
		if ( error != cudaSuccess)
			throw UnexpectedBehaviorException( "CudaFree error " + string(cudaGetErrorString(error)));
		rw_cache[idx] = 0;
	}

	const EMData* d = caller_cache[idx];
	int nx = d->get_xsize();
	int ny = d->get_ysize();
	int nz = d->get_zsize();
	size_t num_bytes = nx*ny*nz*sizeof(float);

	float* cuda_rw_data = alloc_rw_data(nx,ny,nz);

	if (nz > 1) {
		cudaExtent extent;
		extent.width  = nx;
		extent.height = ny;
		extent.depth  = nz;
		cudaMemcpy3DParms copyParams = {0};
		copyParams.srcArray = ro_cache[idx];
		copyParams.dstPtr = make_cudaPitchedPtr((void*)cuda_rw_data, extent.width*sizeof(float), extent.width, extent.height);
		copyParams.extent = extent;
		copyParams.kind = cudaMemcpyDeviceToDevice;
		cudaError_t error = cudaMemcpy3D(&copyParams);
		if ( error != cudaSuccess)
			throw UnexpectedBehaviorException( "Copying device array to device pointer - CudaMemcpy3D error : " + string(cudaGetErrorString(error)));
	} else if ( ny > 1 ) {
		cudaError_t error = cudaMemcpyFromArray(cuda_rw_data,ro_cache[idx],0,0,num_bytes,cudaMemcpyDeviceToDevice);
		if ( error != cudaSuccess)
			throw UnexpectedBehaviorException( "Copying device array to device pointer - cudaMemcpyFromArray error : " + string(cudaGetErrorString(error)));
	} else throw UnexpectedBehaviorException("Cuda infrastructure has not been designed to work on 1D data");

	rw_cache[idx] = cuda_rw_data;
}

void EMData::CudaCache::copy_ro_to_cpu(const int idx, float* data) {
	if (ro_cache[idx] == 0) throw UnexpectedBehaviorException("Can not copy RO CUDA data to the CPU: RO data is null.");
	if (data == 0) throw NullPointerException("The cpu data pointer is NULL in copy_ro_to_cpu");

	const EMData* d = caller_cache[idx];
	int nx = d->get_xsize();
	int ny = d->get_ysize();
	int nz = d->get_zsize();
	size_t num_bytes = nx*ny*nz*sizeof(float);

	if (nz > 1) {
		cudaExtent extent;
		extent.width  = nx;
		extent.height = ny;
		extent.depth  = nz;
		cudaMemcpy3DParms copyParams = {0};
		copyParams.srcArray = ro_cache[idx];
		copyParams.dstPtr = make_cudaPitchedPtr((void*)data, extent.width*sizeof(float), extent.width, extent.height);
		copyParams.extent = extent;
		copyParams.kind = cudaMemcpyDeviceToHost;
		cudaError_t error = cudaMemcpy3D(&copyParams);
		if ( error != cudaSuccess)
			throw UnexpectedBehaviorException( "Copying device array to host pointer - CudaMemcpy3D error : " + string(cudaGetErrorString(error)));
	} else if ( ny > 1 ) {
		cudaError_t error = cudaMemcpyFromArray(data,ro_cache[idx],0,0,num_bytes,cudaMemcpyDeviceToHost);
		if ( error != cudaSuccess)
			throw UnexpectedBehaviorException( "Copying device array to host pointer - cudaMemcpyFromArray error : " + string(cudaGetErrorString(error)));
	} else throw UnexpectedBehaviorException("Cuda infrastructure has not been designed to work on 1D data");
}

void EMData::CudaCache::clear_item(const int idx) {
//	debug_print();
	if ( rw_cache[idx] != 0) {
		mem_allocated -= get_emdata_bytes(idx);
		cudaError_t error = cudaFree(rw_cache[idx]);
		if ( error != cudaSuccess)
			throw UnexpectedBehaviorException( "CudaFree error : " + string(cudaGetErrorString(error)));
	}
	rw_cache[idx] = 0;

	if ( ro_cache[idx] != 0) {
		mem_allocated -= get_emdata_bytes(idx);
		cudaError_t error = cudaFreeArray(ro_cache[idx]);
		if ( error != cudaSuccess) throw UnexpectedBehaviorException( "CudaFreeArray error : " + string(cudaGetErrorString(error)));
	}
	ro_cache[idx] = 0;

	caller_cache[idx] = 0;

	locked[idx] = 0;
}

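// EMData::CudaDataLock
// RAII guard that pins an image's cache slot for the lifetime of the lock, so
// that ensure_slot_space() cannot recycle the slot while a kernel is using the
// device pointer. A minimal sketch of the intended pattern (the image name is
// hypothetical):
//   {
//       float* d_ptr = img->get_cuda_data();
//       CudaDataLock guard(img);   // slot cannot be evicted in here
//       // ... launch kernels that read or write d_ptr ...
//   }                              // guard unlocks the slot on destruction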
EMData::CudaDataLock::CudaDataLock(const EMData* const emdata) : data_cuda_handle(-1)
{
	emdata->set_gpu_rw_current();
	data_cuda_handle = emdata->cuda_cache_handle;
	EMData::cuda_cache.lock(data_cuda_handle);
}

EMData::CudaDataLock::~CudaDataLock() {
	EMData::cuda_cache.unlock(data_cuda_handle);
}

#endif // EMAN2_USING_CUDA
