#include <boxingtools.h>
Public Member Functions | |
BoxSVDClassifier (const vector< vector< float > > &data, const unsigned int &classes=4) | |
~BoxSVDClassifier () | |
map< unsigned int, unsigned int > | go () |
Static Public Member Functions | |
map< unsigned int, unsigned int > | colorMappingByClassSize (const map< unsigned int, unsigned int > &grouping) |
Private Member Functions | |
map< unsigned int, unsigned int > | randomSeedCluster (const gsl_matrix *const svd_coords, unsigned int matrix_dims) |
map< unsigned int, unsigned int > | getIterativeCluster (const gsl_matrix *const svd_coords, const map< unsigned int, unsigned int > &current_grouping) |
bool | setDims (const vector< vector< float > > &data) |
vector< vector< float > > | getDistances (const gsl_matrix *const svd_coords, const gsl_matrix *const ref_coords) |
map< unsigned int, unsigned int > | getMapping (const vector< vector< float > > &distances) |
Private Attributes | |
const vector< vector< float > > & | mData |
unsigned int | mColumns |
unsigned int | mRows |
unsigned int | mClasses |
|
Definition at line 102 of file boxingtools.cpp. References data, mData, and setDims().
|
|
Definition at line 109 of file boxingtools.cpp. 00110 { 00111 00112 }
|
|
Definition at line 416 of file boxingtools.cpp. Referenced by EMAN::BoxingTools::classify(). 00417 { 00418 00419 vector<unsigned int> current_mappings; 00420 // Get the extent of the current mappings 00421 for (map< unsigned int, unsigned int>::const_iterator it = grouping.begin(); it != grouping.end(); ++it ) 00422 { 00423 if ( find( current_mappings.begin(), current_mappings.end(), it->second ) == current_mappings.end() ) 00424 { 00425 current_mappings.push_back( it->second ); 00426 } 00427 } 00428 00429 if ( current_mappings.size() < 2 ) 00430 { 00431 cerr << "Error, cannot call colMappingByClassSize when less than 2 classes have been specified, I think you created " << current_mappings.size() << " classes " << endl; 00432 throw; 00433 } 00434 00435 // Record how many data points are in each class. 00436 map<unsigned int, unsigned int> mappings_tally; 00437 for( vector<unsigned int>::const_iterator it = current_mappings.begin(); it != current_mappings.end(); ++it ) 00438 { 00439 // First initialize each total to zero 00440 mappings_tally[*it] = 0; 00441 } 00442 00443 // Now do the actual counting 00444 for (map< unsigned int, unsigned int>::const_iterator it = grouping.begin(); it != grouping.end(); ++it ) 00445 { 00446 mappings_tally[it->second] += 1; 00447 } 00448 00449 // find the largest tally 00450 unsigned int current_mapping_idx = 0; 00451 map< unsigned int, unsigned int> return_map; 00452 while ( mappings_tally.size() > 0 ) 00453 { 00454 #if SVD_CLASSIFIER_DEBUG 00455 cout << "Printing mappings_tally" << endl; 00456 print_map(mappings_tally); 00457 #endif 00458 00459 map< unsigned int, unsigned int>::iterator it = mappings_tally.begin(); 00460 map< unsigned int, unsigned int>::iterator track_it = mappings_tally.begin(); 00461 unsigned int current_max = it->second; 00462 unsigned int current_idx = it->first; 00463 ++it; 00464 for (; it != mappings_tally.end(); ++it ) 00465 { 00466 if ( it->second > current_max ) 00467 { 00468 current_max = it->second; 
00469 current_idx = it->first; 00470 track_it = it; 00471 } 00472 } 00473 00474 #if SVD_CLASSIFIER_DEBUG 00475 cout << "The mapping is " << current_idx << " to " << current_mapping_idx << endl; 00476 #endif 00477 for (map< unsigned int, unsigned int>::const_iterator group_it = grouping.begin(); group_it != grouping.end(); ++group_it ) 00478 { 00479 if ( group_it->second == current_idx ) 00480 { 00481 return_map[group_it->first] = current_mapping_idx; 00482 } 00483 } 00484 00485 mappings_tally.erase( current_idx ); 00486 00487 current_mapping_idx++; 00488 } 00489 00490 00491 #if SVD_CLASSIFIER_DEBUG 00492 cout << "Printing adjusted classification map" << endl; 00493 print_map(return_map); 00494 #endif 00495 00496 00497 return return_map; 00498 }
|
|
Definition at line 366 of file boxingtools.cpp. Referenced by getIterativeCluster(), and randomSeedCluster(). 00367 { 00368 // accrue the distance data - this could be done more concisely, but there shouldn't be much cost 00369 // because the data should be fairly small. By more concisely I mean, the distance data would not need 00370 // to be stored, it could be determined without storing it in distances. 00371 vector<vector<float> > distances; 00372 for (unsigned int i = 0; i < mColumns; ++i ) 00373 { 00374 vector<float> ith_distances; 00375 for( unsigned int random_seed_idx = 0; random_seed_idx < mClasses; ++random_seed_idx ) 00376 { 00377 float distance = 0; 00378 for (unsigned int j = 0; j < mColumns; ++j ) 00379 { 00380 float value = (float)( (gsl_matrix_get( ref_coords, random_seed_idx, j) - gsl_matrix_get( svd_coords, i, j)) ); 00381 distance += value * value; 00382 } 00383 ith_distances.push_back(distance); 00384 } 00385 distances.push_back(ith_distances); 00386 } 00387 00388 return distances; 00389 }
|
|
Definition at line 243 of file boxingtools.cpp. References getDistances(), getMapping(), mClasses, and mColumns. Referenced by go(). 00244 { 00245 // Space to store the reference vectors 00246 gsl_matrix * ref_coords = gsl_matrix_calloc( mClasses, mColumns ); 00247 00248 // Assumes there are a total of mClasses in the current_groupings mapping 00249 for(unsigned int i = 0; i < mClasses; ++i) 00250 { 00251 unsigned int tally = 0; 00252 for (map< unsigned int, unsigned int>::const_iterator it = current_grouping.begin(); it != current_grouping.end(); ++it ) 00253 { 00254 if ( it->second == i ) 00255 { 00256 for( unsigned int j = 0; j < mColumns; ++j ) 00257 { 00258 gsl_matrix_set(ref_coords, i, j, gsl_matrix_get( svd_coords, it->first, j ) + gsl_matrix_get( ref_coords, i, j)); 00259 } 00260 ++tally; 00261 } 00262 00263 } 00264 // then normalize the the addition 00265 if (tally != 0) 00266 for( unsigned int j = 0; j < mColumns; ++j ) 00267 { 00268 gsl_matrix_set(ref_coords, i, j, gsl_matrix_get( ref_coords, i, j )/((float) tally)); 00269 } 00270 } 00271 00272 vector<vector<float> > distances = getDistances(svd_coords, ref_coords); 00273 00274 #if SVD_CLASSIFIER_DEBUG 00275 cout << "The distance matrix is " << endl; 00276 for( unsigned int i = 0; i < distances.size(); ++i ) 00277 { 00278 for( unsigned int j = 0; j < distances[i].size(); ++j ) 00279 { 00280 cout << distances[i][j] << " "; 00281 } 00282 cout << endl; 00283 } 00284 #endif 00285 00286 00287 // Finally decide which of the randomly chosen vectors is closest to each of the input vectors 00288 // and use that as the basis of the grouping 00289 map< unsigned int, unsigned int> return_map = getMapping(distances); 00290 00291 #if SVD_CLASSIFIER_DEBUG 00292 cout << "Printing classification map" << endl; 00293 print_map(return_map); 00294 #endif 00295 00296 gsl_matrix_free(ref_coords); 00297 00298 return return_map; 00299 }
|
|
Definition at line 391 of file boxingtools.cpp. References min. Referenced by getIterativeCluster(), and randomSeedCluster(). 00392 { 00393 // Finally decide which of the randomly chosen vectors is closest to each of the input vectors 00394 // and use that as the basis of the grouping 00395 map< unsigned int, unsigned int> return_map; 00396 unsigned int vector_idx = 0; 00397 for( vector<vector<float> >::const_iterator it = distances.begin(); it != distances.end(); ++it, ++vector_idx ) 00398 { 00399 vector<float>::const_iterator mIt = it->begin(); 00400 float min = *mIt; 00401 unsigned int min_idx = 0; 00402 for ( unsigned int current_idx = 0; mIt != it->end(); ++mIt, ++current_idx ) 00403 { 00404 if ( *mIt < min ) 00405 { 00406 min = *mIt; 00407 min_idx = current_idx; 00408 } 00409 } 00410 return_map[vector_idx] = min_idx; 00411 } 00412 00413 return return_map; 00414 }
|
|
Definition at line 134 of file boxingtools.cpp. References getIterativeCluster(), mColumns, mData, mRows, norm(), randomSeedCluster(), and V. Referenced by EMAN::BoxingTools::classify(). 00135 { 00136 // This is done in the constructor 00137 // setDims(mData); 00138 00139 00140 unsigned int local_columns = mColumns; 00141 if ( mRows < mColumns ) 00142 { 00143 // cerr << "Warning: gsl SVD works only when m > n, you have m = " << mRows << " and n = " << mColumns << endl; 00144 // This local adaptation means things will proceed the same way even if there are more columns in A then rows 00145 // Every input data is still classified, just the SVD eigenvectors are found using a subset of all the data 00146 local_columns = mRows; 00147 } 00148 00149 gsl_matrix * U = gsl_matrix_calloc( mRows, local_columns ); 00150 gsl_matrix * A = gsl_matrix_calloc( mRows, mColumns ); 00151 for ( unsigned int i = 0; i < mRows; ++i ) 00152 { 00153 for ( unsigned int j = 0; j < mColumns; ++j ) 00154 { 00155 gsl_matrix_set( A, i, j, mData[j][i] ); 00156 if ( j < local_columns ) 00157 gsl_matrix_set( U, i, j, mData[j][i] ); 00158 } 00159 } 00160 #if SVD_CLASSIFIER_DEBUG 00161 printMatrix( A, mRows, mColumns, "A" ); 00162 #endif 00163 00164 gsl_matrix * V = gsl_matrix_calloc( local_columns, local_columns ); 00165 gsl_vector * S = gsl_vector_calloc( local_columns ); 00166 gsl_vector * work = gsl_vector_calloc( local_columns ); 00167 00168 if ( gsl_linalg_SV_decomp (U, V, S, work) ) 00169 { 00170 cerr << "ERROR: gsl returned a non zero value on application of the SVD" << endl; 00171 } 00172 00173 #if SVD_CLASSIFIER_DEBUG 00174 printMatrix( U, mRows, local_columns, "U" ); 00175 printVector( S, local_columns, "S" ); 00176 printMatrix( V, local_columns, local_columns, "V"); 00177 #endif 00178 00179 // normalize the columns of matrix A 00180 for ( unsigned int j = 0; j < mColumns; ++j ) 00181 { 00182 float norm = 0; 00183 for ( unsigned int i = 0; i < mRows; ++i ) 00184 { 00185 norm += 
(float)(gsl_matrix_get( A, i, j)*gsl_matrix_get( A, i, j)); 00186 } 00187 norm = sqrtf(norm); 00188 for ( unsigned int i = 0; i < mRows; ++i ) 00189 { 00190 gsl_matrix_set( A, i, j, gsl_matrix_get(A,i,j)/norm); 00191 } 00192 } 00193 00194 #if SVD_CLASSIFIER_DEBUG 00195 for ( unsigned int j = 0; j < mColumns; ++j ) 00196 { 00197 double norm = 0; 00198 for ( unsigned int i = 0; i < mRows; ++i ) 00199 { 00200 norm += gsl_matrix_get( A, i, j)*gsl_matrix_get( A, i, j); 00201 } 00202 cout << "For column " << j << " the squared norm is " << norm << endl; 00203 } 00204 #endif 00205 00206 00207 gsl_matrix * svd_coords = gsl_matrix_calloc( mColumns, mColumns ); 00208 // Correlate the columns of A with the columns of U and store the information in a martrix called svd_coords 00209 for ( unsigned int i = 0; i < mColumns; ++i ) 00210 { 00211 for ( unsigned int j = 0; j < local_columns; ++j ) 00212 { 00213 double result = 0.0; 00214 for ( unsigned int k = 0; k < mRows; ++k ) 00215 { 00216 result += gsl_matrix_get(A,k,i)*gsl_matrix_get(U,k,j); 00217 } 00218 gsl_matrix_set( svd_coords, i, j, result); 00219 } 00220 } 00221 00222 #if SVD_CLASSIFIER_DEBUG 00223 printMatrix( svd_coords, mColumns, mColumns, "svd_coords" ); 00224 #endif 00225 00226 map< unsigned int, unsigned int> grouping = randomSeedCluster(svd_coords, mColumns); 00227 00228 for ( unsigned int i = 0; i < 20; ++ i ) 00229 { 00230 grouping = getIterativeCluster(svd_coords, grouping); 00231 } 00232 00233 gsl_matrix_free(A); 00234 gsl_matrix_free(U); 00235 gsl_matrix_free(V); 00236 gsl_vector_free(S); 00237 gsl_vector_free(work); 00238 gsl_matrix_free(svd_coords); 00239 00240 return grouping; 00241 }
|
|
Definition at line 302 of file boxingtools.cpp. References getDistances(), getMapping(), mClasses, and mColumns. Referenced by go(). 00303 { 00304 // Seed the random number generator 00305 srand(static_cast<unsigned int>(time(0))); 00306 00307 vector<unsigned int> random_seed_indices; 00308 while ( random_seed_indices.size() < mClasses ) 00309 { 00310 unsigned int random_idx = static_cast<int>(((float)rand()/RAND_MAX)*matrix_dims); 00311 if ( find( random_seed_indices.begin(), random_seed_indices.end(), random_idx ) == random_seed_indices.end() ) 00312 { 00313 random_seed_indices.push_back( random_idx ); 00314 } 00315 } 00316 00317 // Space to store the reference vectors 00318 gsl_matrix * ref_coords = gsl_matrix_calloc( mClasses, mColumns ); 00319 00320 // Put the reference vectors into a matrix to make the approach transparent to the reader 00321 for(unsigned int i = 0; i < random_seed_indices.size(); ++i) 00322 { 00323 for( unsigned int j = 0; j < matrix_dims; ++j ) 00324 { 00325 gsl_matrix_set(ref_coords, i, j, gsl_matrix_get( svd_coords, random_seed_indices[i], j )); 00326 } 00327 } 00328 00329 #if SVD_CLASSIFIER_DEBUG 00330 printMatrix( ref_coords, mClasses, matrix_dims, "Reference matrix in first grouping"); 00331 #endif 00332 00333 // accrue the distance data - this could be done more concisely, but there shouldn't be much cost 00334 // because the data should be fairl small. By more concisely I mean, the distance data would not need 00335 // to be stored, it could be determined without storing it in distances. 
00336 vector<vector<float> > distances = getDistances(svd_coords, ref_coords); 00337 00338 #if SVD_CLASSIFIER_DEBUG 00339 cout << "The distance matrix is " << endl; 00340 for( unsigned int i = 0; i < distances.size(); ++i ) 00341 { 00342 for( unsigned int j = 0; j < distances[i].size(); ++j ) 00343 { 00344 cout << distances[i][j] << " "; 00345 } 00346 cout << endl; 00347 } 00348 #endif 00349 00350 00351 // Finally decide which of the randomly chosen vectors is closest to each of the input vectors 00352 // and use that as the basis of the grouping 00353 map< unsigned int, unsigned int> return_map = getMapping(distances); 00354 00355 #if SVD_CLASSIFIER_DEBUG 00356 cout << "Printing classification map, randomly seeded" << endl; 00357 print_map(return_map); 00358 #endif 00359 00360 gsl_matrix_free(ref_coords); 00361 00362 return return_map; 00363 }
|
|
Definition at line 114 of file boxingtools.cpp. References data, mColumns, mData, and mRows. Referenced by BoxSVDClassifier(). 00115 { 00116 mColumns = mData.size(); 00117 vector<vector<float> >::const_iterator it = data.begin(); 00118 mRows = it->size(); 00119 it++; 00120 for( ; it != data.end(); ++it ) 00121 { 00122 if ( it->size() != mRows ) 00123 { 00124 cerr << "ERROR: can not initial the BoxSVDClassifier with vectors of un-equal lengths " << endl; 00125 cerr << "The vector lengths that did not agree were " << mRows << " and " << it->size() << endl; 00126 return false; 00127 } 00128 } 00129 00130 return true; 00131 }
|
|
Definition at line 139 of file boxingtools.h. Referenced by getIterativeCluster(), and randomSeedCluster(). |
|
Definition at line 136 of file boxingtools.h. Referenced by getIterativeCluster(), go(), randomSeedCluster(), and setDims(). |
|
Definition at line 134 of file boxingtools.h. Referenced by BoxSVDClassifier(), go(), and setDims(). |
|
Definition at line 137 of file boxingtools.h. |