EMAN2
pca.cpp
Go to the documentation of this file.
00001 /*
00002  * Author: Chao Yang
00003  * Copyright (c) 2000-2006
00004  *
00005  * This software is issued under a joint BSD/GNU license. You may use the
00006  * source code in this file under either license. However, note that the
00007  * complete EMAN2 and SPARX software packages have some GPL dependencies,
00008  * so you are responsible for compliance with the licenses of these packages
00009  * if you opt to use BSD licensing. The warranty disclaimer below holds
00010  * in either instance.
00011  *
00012  * This complete copyright notice must be included in any revised version of the
00013  * source code. Additional authorship citations may be added, but existing
00014  * author citations must be preserved.
00015  *
00016  * This program is free software; you can redistribute it and/or modify
00017  * it under the terms of the GNU General Public License as published by
00018  * the Free Software Foundation; either version 2 of the License, or
00019  * (at your option) any later version.
00020  *
00021  * This program is distributed in the hope that it will be useful,
00022  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00023  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
00024  * GNU General Public License for more details.
00025  *
00026  * You should have received a copy of the GNU General Public License
00027  * along with this program; if not, write to the Free Software
00028  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00029  *
00030  */
00031 
00032 #include "emdata.h"
00033 #include "util.h"
00034 #include "emutil.h"
00035 
00036 #include "pca.h"
00037 #include "lapackblas.h"
00038 
00039 using namespace EMAN;
00040 
00041 // return all right singular vectors
00042 int PCA::dopca(vector <EMData*> imgstack, EMData *mask)
00043 {
00044    // performs PCA on a list of images (each under a mask)
00045    // returns a list of eigenimages
00046 
00047    int i, status = 0;
00048 
00049    vector<EMData*> img1dlst;
00050    vector<EMData*> eigvecs;
00051 
00052    int nimgs = imgstack.size();
00053 
00054    for (i=0; i<nimgs; i++) {
00055       img1dlst.push_back(Util::compress_image_mask(imgstack[i],mask));
00056    }
00057 
00058    // for right now, compute a full SVD
00059    eigvecs = Util::svdcmp(img1dlst, 0);
00060 
00061    img1dlst.clear();
00062 
00063    for (i=0; i<nimgs; i++) {
00064       eigenimages.push_back(Util::reconstitute_image_mask(eigvecs[i],mask));
00065    }
00066 
00067    eigvecs.clear();
00068 
00069    return status;
00070 }
00071 
00072 //------------------------------------------------------------------
00073 // return a subset of right singular vectors
00074 int PCA::dopca(vector <EMData*> imgstack, EMData *mask, int nvec)
00075 {
00076    // performs PCA on a list of images (each under a mask)
00077    // returns a list of eigenimages
00078 
00079    vector<EMData*> img1dlst;
00080    vector<EMData*> eigvecs;
00081    int status = 0;
00082 
00083    int nimgs = imgstack.size();
00084 
00085    for (int i=0; i<nimgs; i++) {
00086       img1dlst.push_back(Util::compress_image_mask(imgstack[i],mask));
00087    }
00088 
00089    if ( nimgs < nvec || nvec == 0) nvec = nimgs;
00090    eigvecs = Util::svdcmp(img1dlst, nvec);
00091    img1dlst.clear();
00092 
00093    for (int i=0; i<nvec; i++) {
00094       eigenimages.push_back(Util::reconstitute_image_mask(eigvecs[i],mask));
00095    }
00096 
00097    eigvecs.clear();
00098 
00099    return status;
00100 }
00101 
00102 //------------------------------------------------------------------
00103 // PCA by Lanczos
00104 #define qmat(i,j) qmat[((j)-1)*kstep + (i) -1]
00105 #define diag(i)   diag[(i)-1]
00106 
00107 int PCA::dopca_lan(vector <EMData*> imgstack, EMData *mask, int nvec)
00108 {
00109    // performs PCA on a list of images (each under a mask)
00110    // returns a list of eigenimages
00111 
00112    int ione = 1; 
00113    float one = 1.0, zero = 0.0;
00114    char trans;
00115 
00116    vector<EMData*> img1dlst;
00117    vector<EMData*> eigvecs;
00118 
00119    int status = 0;
00120    int nimgs = imgstack.size();
00121    for (int i=0; i<nimgs; i++) {
00122       img1dlst.push_back(Util::compress_image_mask(imgstack[i],mask));
00123    }
00124 
00125    float resnrm = 0.0;
00126 
00127    if ( nvec > nimgs || nvec ==0 ) nvec = nimgs;
00128 
00129    int nx = img1dlst[0]->get_xsize();
00130    // the definition of kstep is purely a heuristic for right now
00131    int kstep = nvec + 20;
00132    if (kstep > nimgs) kstep = nimgs;
00133 
00134    float *diag    = new float[kstep];
00135    float *subdiag = new float[kstep-1];
00136    float *vmat    = new float[nx*kstep];
00137 
00138    // run kstep-step Lanczos factorization
00139    status = Lanczos(img1dlst, &kstep, diag, subdiag, vmat, &resnrm);
00140 
00141    char jobz[2] = "V";
00142    float *qmat  = new float[kstep*kstep];
00143    // workspace size will be optimized later
00144    int   lwork  = 100 + 4*kstep + kstep*kstep;
00145    int   liwork = 3+5*kstep;
00146 
00147    float *work  = new float[lwork];
00148    int   *iwork = new int[liwork]; 
00149    int   info = 0;
00150 
00151    // call LAPACK tridiagonal eigensolver
00152    sstevd_(jobz, &kstep, diag, subdiag, qmat, &kstep, work, &lwork,
00153            iwork, &liwork, &info);
00154 
00155    // store singular values
00156    // eigenvalues of the cov matrix have been sorted in ascending order
00157    for (int j = kstep; j > kstep - nvec; j--) {
00158       singular_vals.push_back((float)sqrt(diag(j)));
00159    }
00160 
00161    img1dlst.clear();
00162    
00163    EMData *eigvec = new EMData();
00164 
00165    eigvec->set_size(nx, 1, 1);
00166    float *ritzvec = eigvec->get_data(); 
00167 
00168    // compute Ritz vectors (approximate eigenvectors) one at a time
00169    for (int j=0; j<nvec; j++) {
00170       trans = 'N';
00171       sgemv_(&trans, &nx, &kstep, &one, vmat, &nx, &qmat(1,kstep-j), &ione,
00172              &zero , ritzvec, &ione);  
00173       eigenimages.push_back(Util::reconstitute_image_mask(eigvec,mask));
00174    }
00175 
00176    eigvecs.clear();
00177 
00178    EMDeleteArray(diag);
00179    EMDeleteArray(subdiag);
00180    EMDeleteArray(vmat);
00181    EMDeleteArray(qmat);
00182    EMDeleteArray(work);
00183    EMDeleteArray(iwork);
00184    EMDeletePtr(eigvec);
00185    return status;
00186 }
00187 
00188 //------------------------------------------------------------------
00189 // out of core version of PCA, not completed yet
00190 int PCA::dopca_ooc(const string &filename_in, const string &filename_out, 
00191                    const string &lanscratch,  EMData *mask, int nvec)
00192 {
00193    int status = 0, ione = 1;
00194    EMData *image_raw = new EMData();
00195    EMData *image_masked = NULL;
00196 
00197    int nimgs = EMUtil::get_image_count(filename_in);
00198 
00199    if (nimgs <= 0) {
00200       status = 2;
00201       fprintf(stderr,"dopca_ooc: no image in %s\n", filename_in.c_str());
00202    }
00203    for (int i=0; i<nimgs; i++) {
00204        image_raw->read_image(filename_in, i);      
00205        image_masked=Util::compress_image_mask(image_raw,mask);
00206        image_masked->write_image("temp_masked_images.img",i); 
00207    }
00208 
00209    int ndim = image_masked->get_ndim();
00210    if (ndim != 1) {
00211        fprintf(stderr,"dopca_ooc: masked image should be 1-D\n");
00212        status = 3; // images should all be 1-d
00213    }
00214 
00215    int nx = image_masked->get_xsize();
00216    
00217    float resnrm = 0.0;
00218 
00219    if ( nvec > nimgs || nvec ==0 ) nvec = nimgs;
00220 
00221    // the definition of kstep is purely a heuristic for right now
00222    int kstep = nvec + 20;
00223    if (kstep > nimgs) kstep = nimgs;
00224 
00225    float *diag    = new float[kstep];
00226    float *subdiag = new float[kstep-1];
00227 
00228    status = Lanczos_ooc("temp_masked_images.img", &kstep, diag, subdiag,
00229                         lanscratch , &resnrm);
00230 
00231    char jobz[2] = "V";
00232    float *qmat  = new float[kstep*kstep];
00233    // workspace size will be optimized later
00234    int   lwork  = 100 + 4*kstep + kstep*kstep;
00235    int   liwork = 3+5*kstep;
00236 
00237    float *work  = new float[lwork];
00238    int   *iwork = new int[liwork]; 
00239    int   info = 0;
00240 
00241    // call LAPACK tridiagonal eigensolver
00242    sstevd_(jobz, &kstep, diag, subdiag, qmat, &kstep, work, &lwork,
00243            iwork, &liwork, &info);
00244 
00245    // store singular values
00246    // eigenvalues of the cov matrix have been sorted in ascending order
00247    for (int j = kstep; j > kstep - nvec; j--) {
00248       singular_vals.push_back((float)sqrt(diag(j)));
00249    }
00250 
00251    EMData *eigvec = new EMData();
00252    eigvec->set_size(nx, 1, 1);
00253    float *ritzvec = eigvec->get_data(); 
00254    EMData *newimage = 0;
00255 
00256    // compute Ritz vectors (approximate eigenvectors) one at a time
00257    FILE *fp;
00258    float *vlan = new float[nx];
00259    fp = fopen(lanscratch.c_str(),"rb");
00260    for (int j=0; j<nvec; j++) {
00261       for (int i = 0; i<nx; i++) ritzvec[i]=0.0;
00262       for (int jlan = 1; jlan <= kstep; jlan++) {
00263            fread(vlan, sizeof(float), nx, fp);
00264            saxpy_(&nx, &qmat(jlan,kstep-j), vlan, &ione, ritzvec, &ione);
00265       }
00266       rewind(fp);
00267       newimage = Util::reconstitute_image_mask(eigvec,mask);
00268       newimage->write_image(filename_out,j);
00269    }
00270    fclose(fp);
00271 
00272    EMDeleteArray(diag);
00273    EMDeleteArray(subdiag);
00274    EMDeleteArray(qmat);
00275    EMDeleteArray(work);
00276    EMDeleteArray(iwork);
00277    EMDeleteArray(vlan);
00278    EMDeletePtr(eigvec);
00279    EMDeletePtr(newimage);
00280 
00281    return status;
00282 }
00283 #undef diag
00284 #undef qmat
00285 
00286 //------------------------------------------------------------------
00287 #define TOL 1e-7
00288 #define V(i,j)      V[((j)-1)*imgsize + (i) - 1]
00289 #define v0(i)       v0[(i)-1]
00290 #define Av(i)       Av[(i)-1]
00291 #define subdiag(i)  subdiag[(i)-1]
00292 #define diag(i)     diag[(i)-1]
00293 #define hvec(i)     hvec[(i)-1]
00294 
00295 int PCA::Lanczos(vector <EMData*> imgstack, int *kstep, 
00296                  float  *diag, float *subdiag, float *V, 
00297                  float  *beta)
00298 {
00299     /*
00300         Purpose: Compute a kstep-step Lanczos factorization
00301                  on the covariant matrix X*trans(X), where 
00302                  X (imgstack) contains a set of images;
00303 
00304         Input: 
00305            imgstack (vector <EMData*>) a set of images on which PCA is 
00306                                        to be performed;
00307            
00308            kstep (int*) The maximum number of Lanczos iterations allowed.
00309                           If Lanczos terminates before kstep steps
00310                           is reached (an invariant subspace is found), 
00311                           kstep returns the number of steps taken;
00312       
00313         Output:
00314            diag (float *) The projection of the covariant matrix into a
00315                           Krylov subspace of dimension at most kstep.
00316                           The projection is a tridiagonal matrix. The
00317                           diagonal elements of this matrix is stored in 
00318                           the diag array.
00319 
00320            subdiag (float*) The subdiagonal elements of the projection
00321                             is stored here.
00322 
00323            V (float *)    an imgsize by kstep array that contains a 
00324                           set of orthonormal Lanczos basis vectors;
00325 
00326            beta (float *) the residual norm of the factorization;
00327     */
00328     int i, iter;
00329     
00330     float alpha;
00331     int   ione = 1;
00332     float zero = 0.0, one = 1.0, mone = -1.0;
00333     int   status = 0;
00334     
00335     char trans;
00336     int  nimgs = 0, imgsize = 0, ndim = 0; 
00337     float *v0, *Av, *hvec, *htmp, *imgdata;
00338 
00339     nimgs   = imgstack.size();
00340     if (nimgs <= 0) {
00341         status = 2; // no image in the stack
00342         goto EXIT; 
00343     }
00344 
00345     ndim = imgstack[0]->get_ndim();
00346     if (ndim != 1) {
00347         status = 3; // images should all be 1-d
00348         goto EXIT; 
00349     }
00350 
00351     imgsize = imgstack[0]->get_xsize();
00352      
00353     v0   = new float[imgsize];
00354     Av   = new float[imgsize];
00355     hvec = new float[*kstep];
00356     htmp = new float[*kstep];
00357 
00358     if (v0 == NULL || Av == NULL || hvec == NULL || htmp == NULL ) {
00359         fprintf(stderr, "Lanczos: failed to allocate v0,Av,hvec,htmp\n"); 
00360         status = -1;
00361         goto EXIT;
00362     }
00363 
00364     // may choose a random starting guess here     
00365     for ( i = 1; i <= imgsize; i++) v0(i) = 1.0;
00366 
00367     // normalize the starting vector
00368     *beta  = snrm2_(&imgsize, v0, &ione);
00369     for (i = 1; i<=imgsize; i++)
00370         V(i,1) = v0(i) / (*beta);
00371 
00372     // do Av <-- A*v0, where A is a cov matrix
00373     for (i = 0; i < nimgs; i++) {
00374        imgdata = imgstack[i]->get_data();
00375        alpha = sdot_(&imgsize, imgdata, &ione, V, &ione); 
00376        saxpy_(&imgsize, &alpha, imgdata, &ione, Av, &ione);
00377     }
00378 
00379     // Av <--- Av - V(:,1)*V(:,1)'*Av 
00380     diag(1) = sdot_(&imgsize, V, &ione, Av, &ione); 
00381     alpha   = -diag(1);
00382     saxpy_(&imgsize, &alpha, V, &ione, Av, &ione);
00383 
00384     // main loop 
00385     for ( iter = 2 ; iter <= *kstep ; iter++ ) {
00386         *beta = snrm2_(&imgsize, Av, &ione);
00387 
00388         if (*beta < TOL) {
00389             // found an invariant subspace, exit
00390             *kstep = iter;
00391             break;
00392         }
00393  
00394         subdiag(iter-1) = *beta;
00395         for ( i = 1 ; i <= imgsize ; i++ ) {
00396             V(i,iter) = Av(i) / (*beta);
00397         }       
00398 
00399         // do Av <-- A*V(:,iter), where A is a cov matrix
00400         for (i = 0; i < imgsize; i++) Av[i] = 0;
00401         for (i = 0; i < nimgs; i++) {
00402            imgdata = imgstack[i]->get_data();
00403            alpha = sdot_(&imgsize, imgdata, &ione, &V(1,iter), &ione); 
00404            saxpy_(&imgsize, &alpha, imgdata, &ione, Av, &ione);
00405         }
00406         
00407         // f <--- Av - V(:,1:iter)*V(:,1:iter)'*Av
00408         trans = 'T';
00409         status = sgemv_(&trans, &imgsize, &iter, &one, V, &imgsize, Av, &ione,
00410                         &zero , hvec    , &ione); 
00411         trans = 'N';
00412         status = sgemv_(&trans, &imgsize, &iter, &mone, V, &imgsize, hvec, 
00413                         &ione , &one    , Av, &ione);
00414 
00415         // one step of reorthogonalization
00416         trans = 'T';
00417         status = sgemv_(&trans, &imgsize, &iter, &one, V, &imgsize, Av, &ione,
00418                         &zero , htmp    , &ione); 
00419         saxpy_(&iter, &one, htmp, &ione, hvec, &ione); 
00420         trans = 'N';
00421         status = sgemv_(&trans, &imgsize, &iter, &mone, V, &imgsize, htmp, 
00422                         &ione , &one    , Av, &ione);
00423         diag(iter) = hvec(iter);
00424     }
00425 
00426     EMDeleteArray(v0);
00427     EMDeleteArray(Av);
00428     EMDeleteArray(hvec);
00429     EMDeleteArray(htmp);
00430 
00431 EXIT:
00432     return status;
00433 }
00434 
00435 #undef v0
00436 #undef Av
00437 #undef V
00438 #undef hvec
00439 #undef diag
00440 #undef subdiag
00441 #undef TOL
00442 
00443 //------------------------------------------------------------------
00444 #define TOL 1e-7
00445 #define v(i)        v[(i) - 1]
00446 #define resid(i)    resid[(i) - 1]
00447 #define Av(i)       Av[(i)-1]
00448 #define subdiag(i)  subdiag[(i)-1]
00449 #define diag(i)     diag[(i)-1]
00450 
00451 // currently reading one image at a time, can be improved by
00452 // allocate a buffer to store a subset of images and Lanczos
00453 // vectors
00454 
00455 int PCA::Lanczos_ooc(string const& filename_in, int *kstep, 
00456     float  *diag, float *subdiag, string const& lanscratch,
00457     float  *beta)
00458 {
00459     /*
00460         Purpose: Compute a kstep-step Lanczos factorization
00461                  on the covariant matrix X*trans(X), where 
00462                  X (imgstack) contains a set of images;
00463 
00464         Input: 
00465            filename_in  The name of the input file that contains
00466            (string)     masked images;
00467            
00468            kstep (int*) The maximum number of Lanczos iterations allowed.
00469                         If Lanczos terminates before kstep steps
00470                         is reached (an invariant subspace is found), 
00471                         kstep returns the number of steps taken;
00472 
00473            lanscratch   The name of the file that will be used to 
00474            (string)     store a set of orthonormal Lanczos basis vectors;
00475 
00476         Output:
00477            diag (float*)    The projection of the covariant matrix into a
00478                             Krylov subspace of dimension at most kstep.
00479                             The projection is a tridiagonal matrix. The
00480                             diagonal elements of this matrix is stored in 
00481                             the diag array;
00482 
00483            subdiag (float*) The subdiagonal elements of the projection
00484                             is stored here;
00485 
00486            beta (float *)   The residual norm of the factorization;
00487     */
00488     int i, iter;
00489     EMData *maskedimage;
00490     
00491     float alpha;
00492     int   ione = 1;
00493     int   status = 0;
00494     
00495     int   imgsize = 0, ndim = 0;
00496     float *v, *Av, *resid, *imgdata;
00497     float h = 0.0, htmp=0.0;
00498 
00499     int nimgs = EMUtil::get_image_count(filename_in);
00500     if (nimgs <= 0) {
00501         status = 2; // no image in the stack
00502         goto EXIT; 
00503     }
00504 
00505     maskedimage = new EMData();
00506     maskedimage->read_image(filename_in,0);
00507     // what happens when filename_in does not exist, or when read fails?
00508 
00509     ndim = maskedimage->get_ndim();
00510     if (ndim != 1) {
00511         status = 3; // images should all be 1-d
00512         goto EXIT; 
00513     }
00514 
00515     imgsize = maskedimage->get_xsize();
00516      
00517     v     = new float[imgsize];
00518     Av    = new float[imgsize];
00519     resid = new float[imgsize];
00520 
00521     if (v == NULL || Av == NULL ) {
00522         fprintf(stderr, "Lanczos: failed to allocate v,Av\n"); 
00523         status = -1;
00524         goto EXIT;
00525     }
00526 
00527     // may choose a random starting guess here     
00528     for ( i = 1; i <= imgsize; i++) v(i) = 1.0;
00529 
00530     // normalize the starting vector
00531     *beta  = snrm2_(&imgsize, v, &ione);
00532     alpha = 1/(*beta);    
00533     sscal_(&imgsize, &alpha, v, &ione);
00534     // write v out to a scratch file
00535     FILE *fp;
00536     fp = fopen(lanscratch.c_str(), "wb");
00537     fwrite(v, sizeof(float), imgsize, fp);
00538     fclose(fp);
00539 
00540     // do Av <-- A*v, where A is a cov matrix
00541     for (i = 0; i < nimgs; i++) {
00542        maskedimage->read_image(filename_in,i);
00543        imgdata = maskedimage->get_data();
00544        alpha = sdot_(&imgsize, imgdata, &ione, v, &ione); 
00545        saxpy_(&imgsize, &alpha, imgdata, &ione, Av, &ione);
00546     }
00547 
00548     // Av <--- Av - V(:,1)*V(:,1)'*Av 
00549     diag(1) = sdot_(&imgsize, v, &ione, Av, &ione); 
00550     alpha   = -diag(1);
00551     scopy_(&imgsize, Av, &ione, resid, &ione);
00552     saxpy_(&imgsize, &alpha, v, &ione, resid, &ione);
00553 
00554     // main loop 
00555     for ( iter = 2 ; iter <= *kstep ; iter++ ) {
00556         *beta = snrm2_(&imgsize, resid, &ione);
00557 
00558         if (*beta < TOL) {
00559             // found an invariant subspace, exit
00560             *kstep = iter;
00561             break;
00562         }
00563  
00564         subdiag(iter-1) = *beta;
00565         for ( i = 1 ; i <= imgsize ; i++ ) {
00566             v(i) = resid(i) / (*beta);
00567         }       
00568 
00569         // write v out to a scratch file at appropriate position
00570         fp = fopen(lanscratch.c_str(),"ab");
00571         fwrite(v, sizeof(float), imgsize, fp);
00572         fclose(fp);
00573 
00574         // do Av <-- A*V(:,iter), where A is a cov matrix
00575         for (i = 0; i < imgsize; i++) Av[i] = 0;
00576         for (i = 0; i < nimgs; i++) {
00577            maskedimage->read_image(filename_in,i);
00578            imgdata = maskedimage->get_data();
00579            alpha = sdot_(&imgsize, imgdata, &ione, v, &ione); 
00580            saxpy_(&imgsize, &alpha, imgdata, &ione, Av, &ione);
00581         }
00582 
00583         // f <--- Av - V(:,1:iter)*V(:,1:iter)'*Av
00584         // the out-of-core version reads one Lanczos vector at a time
00585         scopy_(&imgsize, Av, &ione, resid, &ione);
00586         fp = fopen(lanscratch.c_str(),"rb");
00587         for (int jlan = 1; jlan <= iter; jlan++) {
00588            fread(v, sizeof(float), imgsize, fp);
00589            h     = sdot_(&imgsize, v, &ione, Av, &ione);
00590            alpha = -h;
00591            saxpy_(&imgsize, &alpha, v, &ione, resid, &ione);
00592         }
00593         fclose(fp);
00594 
00595         // one step of reorthogonalization
00596         scopy_(&imgsize, resid, &ione, Av, &ione);
00597         fp = fopen(lanscratch.c_str(),"rb");
00598         for (int jlan = 1; jlan <= iter; jlan++) {
00599            fread(v, sizeof(float), imgsize, fp);
00600            htmp  = sdot_(&imgsize, v, &ione, Av, &ione);
00601            alpha = -htmp;
00602            saxpy_(&imgsize, &alpha, v, &ione, resid, &ione);
00603            h += htmp;
00604         }
00605         fclose(fp);
00606         diag(iter) = h;
00607     }
00608 
00609     EMDeleteArray(v);
00610     EMDeleteArray(Av);
00611     EMDeleteArray(resid);
00612     EMDeletePtr(maskedimage);
00613 
00614 EXIT:
00615     return status;
00616 }
00617 
00618 #undef Av
00619 #undef v
00620 #undef resid
00621 #undef diag
00622 #undef subdiag
00623 #undef TOL
00624 
00625 //------------------------------------------------------------------
00626 vector<float> PCA::get_vals()
00627 {
00628    // method for retrieving singular values
00629    return singular_vals;
00630 }
00631 
00632 vector<EMData*> PCA::get_vecs()
00633 {
00634    // method for retrieving right singular vectors (eigenimages)
00635    return eigenimages;
00636 }
00637 
00638 void PCA::clear()
00639 {
00640     // flush singular values and vectors
00641    singular_vals.clear();
00642    eigenimages.clear();
00643 }