
rf_algorithm.hxx
1 /************************************************************************/
2 /* */
3 /* Copyright 2008-2009 by Rahul Nair */
4 /* */
5 /* This file is part of the VIGRA computer vision library. */
6 /* The VIGRA Website is */
7 /* http://hci.iwr.uni-heidelberg.de/vigra/ */
8 /* Please direct questions, bug reports, and contributions to */
9 /* ullrich.koethe@iwr.uni-heidelberg.de or */
10 /* vigra@informatik.uni-hamburg.de */
11 /* */
12 /* Permission is hereby granted, free of charge, to any person */
13 /* obtaining a copy of this software and associated documentation */
14 /* files (the "Software"), to deal in the Software without */
15 /* restriction, including without limitation the rights to use, */
16 /* copy, modify, merge, publish, distribute, sublicense, and/or */
17 /* sell copies of the Software, and to permit persons to whom the */
18 /* Software is furnished to do so, subject to the following */
19 /* conditions: */
20 /* */
21 /* The above copyright notice and this permission notice shall be */
22 /* included in all copies or substantial portions of the */
23 /* Software. */
24 /* */
25 /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND */
26 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES */
27 /* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */
28 /* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT */
29 /* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, */
30 /* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING */
31 /* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR */
32 /* OTHER DEALINGS IN THE SOFTWARE. */
33 /* */
34 /************************************************************************/
35 #ifndef VIGRA_RF_ALGORITHM_HXX
36 #define VIGRA_RF_ALGORITHM_HXX
37 #include <vector>
38 #include "splices.hxx"
39 #include <queue>
40 namespace vigra
41 {
42 
43 namespace rf
44 {
45 /** This namespace contains all algorithms developed for feature
46  * selection
47  *
48  */
49 namespace algorithms
50 {
51 
52 namespace detail
53 {
54  /** create a MultiArray containing only columns supplied between iterators
55  b and e
56  */
57  template<class OrigMultiArray,
58  class Iter,
59  class DestMultiArray>
60  void choose(OrigMultiArray const & in,
61  Iter const & b,
62  Iter const & e,
63  DestMultiArray & out)
64  {
65  int columnCount = std::distance(b, e);
66  int rowCount = in.shape(0);
67  out.reshape(MultiArrayShape<2>::type(rowCount, columnCount));
68  int ii = 0;
69  for(Iter iter = b; iter != e; ++iter, ++ii)
70  {
71  columnVector(out, ii) = columnVector(in, *iter);
72  }
73  }
74 }
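// A minimal usage sketch for detail::choose() (editorial example, not part
// of the library interface): copy columns 0 and 3 of an n x p feature matrix.
//
//     MultiArray<2, double> features;        // n x p, filled elsewhere
//     MultiArray<2, double> subset;
//     int cols[] = {0, 3};
//     detail::choose(features, cols, cols + 2, subset);   // subset: n x 2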
75 
76 
77 
78 /** Standard random forest error rate callback functor
79  *
80  * returns the random forest error estimate when invoked.
81  */
82 class RFErrorCallback
83 {
84  RandomForestOptions options;
85 
86  public:
87  /** Default constructor
88  *
89  * optionally supply options to the random forest classifier
90  * \sa RandomForestOptions
91  */
92  RFErrorCallback(RandomForestOptions opt = RandomForestOptions())
93  : options(opt)
94  {}
95 
96  /** returns the RF OOB error estimate given features and
97  * labels
98  */
99  template<class Feature_t, class Response_t>
100  double operator() (Feature_t const & features,
101  Response_t const & response)
102  {
103  RandomForest<> rf(options);
104  visitors::OOB_Error oob;
105  rf.learn(features,
106  response,
107  visitors::create_visitor(oob));
108  return oob.oob_breiman;
109  }
110 };
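// Any functor with this call signature can be passed as the ErrorRateCallBack
// parameter of the selection routines below; a sketch (MyErrorRate and
// my_error_estimate are illustrative, not part of VIGRA):
//
//     struct MyErrorRate
//     {
//         template<class Feature_t, class Response_t>
//         double operator()(Feature_t const & features,
//                           Response_t const & response)
//         {
//             // any error estimate in [0, 1], e.g. a cross-validated error
//             return my_error_estimate(features, response);
//         }
//     };
//
//     // forward_selection(features, labels, result, MyErrorRate());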
111 
112 
113 /** Structure to hold Variable Selection results
114  */
115 class VariableSelectionResult
116 {
117  bool initialized;
118 
119  public:
120  VariableSelectionResult()
121  : initialized(false)
122  {}
123 
124  typedef std::vector<int> FeatureList_t;
125  typedef std::vector<double> ErrorList_t;
126  typedef FeatureList_t::iterator Pivot_t;
127 
128  Pivot_t pivot;
129 
130  /** list of features.
131  */
132  FeatureList_t selected;
133 
134  /** vector of size (number of features)
135  *
136  * the i-th entry contains the error rate obtained
137  * while using the features [0, i] (including i)
138  *
139  * if the i-th entry is -1, no error rate was computed;
140  * this may happen if more than one feature is added to the
141  * selected list in one step of the algorithm.
142  *
143  * during initialisation the last entry (the error rate using all features) is always filled
144  */
145  ErrorList_t errors;
146 
147 
148  /** error rate using no features
149  */
150  double no_features;
151 
152  template<class FeatureT,
153  class ResponseT,
154  class Iter,
155  class ErrorRateCallBack>
156  bool init(FeatureT const & all_features,
157  ResponseT const & response,
158  Iter b,
159  Iter e,
160  ErrorRateCallBack errorcallback)
161  {
162  bool ret_ = init(all_features, response, errorcallback);
163  if(!ret_)
164  return false;
165  vigra_precondition(std::distance(b, e) == selected.size(),
166  "Number of features in ranking != number of features in matrix");
167  std::copy(b, e, selected.begin());
168  return true;
169  }
170 
171  template<class FeatureT,
172  class ResponseT,
173  class Iter>
174  bool init(FeatureT const & all_features,
175  ResponseT const & response,
176  Iter b,
177  Iter e)
178  {
179  RFErrorCallback ecallback;
180  return init(all_features, response, b, e, ecallback);
181  }
182 
183 
184  template<class FeatureT,
185  class ResponseT>
186  bool init(FeatureT const & all_features,
187  ResponseT const & response)
188  {
189  return init(all_features, response, RFErrorCallback());
190  }
191  /** Initialization routine. Will be called only once in the lifetime
192  * of a VariableSelectionResult. Subsequent calls will not reinitialize
193  * member variables.
194  *
195  * This is intentional: it allows variable selection to be continued
196  * from the point where an earlier run stopped.
197  *
198  * returns true if initialization was successful and false if
199  * the object was already initialized before.
200  */
201  template<class FeatureT,
202  class ResponseT,
203  class ErrorRateCallBack>
204  bool init(FeatureT const & all_features,
205  ResponseT const & response,
206  ErrorRateCallBack errorcallback)
207  {
208  if(initialized)
209  {
210  return false;
211  }
212  // calculate error with all features
213  selected.resize(all_features.shape(1), 0);
214  for(unsigned int ii = 0; ii < selected.size(); ++ii)
215  selected[ii] = ii;
216  errors.resize(all_features.shape(1), -1);
217  errors.back() = errorcallback(all_features, response);
218 
219  // calculate error rate if no features are chosen
220  // corresponds to max(prior probability) of the classes
221  std::map<typename ResponseT::value_type, int> res_map;
222  std::vector<int> cts;
223  int counter = 0;
224  for(int ii = 0; ii < response.shape(0); ++ii)
225  {
226  if(res_map.find(response(ii, 0)) == res_map.end())
227  {
228  res_map[response(ii, 0)] = counter;
229  ++counter;
230  cts.push_back(0);
231  }
232  cts[res_map[response(ii,0)]] +=1;
233  }
234  no_features = double(*(std::max_element(cts.begin(),
235  cts.end())))
236  / double(response.shape(0));
237 
238  /*init not_selected vector;
239  not_selected.resize(all_features.shape(1), 0);
240  for(int ii = 0; ii < not_selected.size(); ++ii)
241  {
242  not_selected[ii] = ii;
243  }
244  */
245  initialized = true;
246  pivot = selected.begin();
247  return true;
248  }
249 };
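// Reading out a VariableSelectionResult (sketch): after a selection run,
// the first k+1 entries of result.selected are the chosen features and
// result.errors[k] is the corresponding error rate (-1 where none was
// computed).
//
//     VariableSelectionResult result;
//     forward_selection(features, labels, result);
//     for(unsigned int k = 0; k < result.errors.size(); ++k)
//         if(result.errors[k] != -1)
//             std::cout << (k+1) << " features -> error rate "
//                       << result.errors[k] << std::endl;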
250 
251 
252 
253 /** Perform forward selection
254  *
255  * \param features IN: n x p matrix containing n instances with p attributes/features
256  * used in the variable selection algorithm
257  * \param response IN: n x 1 matrix containing the corresponding response
258  * \param result IN/OUT: VariableSelectionResult struct which will contain the results
259  * of the algorithm.
260  * Features between result.selected.begin() and result.pivot will
261  * be left untouched.
262  * \sa VariableSelectionResult
263  * \param errorcallback
264  * IN, OPTIONAL:
265  * Functor that returns the error rate given a set of
266  * features and labels. Default is the RandomForest OOB Error.
267  *
268  * Forward selection successively adds the feature that decreases the error rate the most.
269  *
270  * usage:
271  * \code
272  * MultiArray<2, double> features = createSomeFeatures();
273  * MultiArray<2, int> labels = createCorrespondingLabels();
274  * VariableSelectionResult result;
275  * forward_selection(features, labels, result);
276  * \endcode
277  * To use forward selection but ensure that a specific feature e.g. feature 5 is always
278  * included one would do the following
279  *
280  * \code
281  * VariableSelectionResult result;
282  * result.init(features, labels);
283  * std::swap(result.selected[0], result.selected[5]);
284  * result.pivot = result.selected.begin() + 1;
285  * forward_selection(features, labels, result);
286  * \endcode
287  *
288  * \sa VariableSelectionResult
289  *
290  */
291 template<class FeatureT, class ResponseT, class ErrorRateCallBack>
292 void forward_selection(FeatureT const & features,
293  ResponseT const & response,
294  VariableSelectionResult & result,
295  ErrorRateCallBack errorcallback)
296 {
297  VariableSelectionResult::FeatureList_t & selected = result.selected;
298  VariableSelectionResult::ErrorList_t & errors = result.errors;
299  VariableSelectionResult::Pivot_t & pivot = result.pivot;
300  int featureCount = features.shape(1);
301  // initialize result struct if in use for the first time
302  if(!result.init(features, response, errorcallback))
303  {
304  //result is being reused; just ensure that the number of features
305  //is the same.
306  vigra_precondition(selected.size() == featureCount,
307  "forward_selection(): Number of features in Feature "
308  "matrix and number of features in previously used "
309  "result struct mismatch!");
310  }
311 
312 
313  int not_selected_size = std::distance(pivot, selected.end());
314  while(not_selected_size > 1)
315  {
316  std::vector<double> current_errors;
317  VariableSelectionResult::Pivot_t next = pivot;
318  for(int ii = 0; ii < not_selected_size; ++ii, ++next)
319  {
320  std::swap(*pivot, *next);
321  MultiArray<2, double> cur_feats;
322  detail::choose( features,
323  selected.begin(),
324  pivot+1,
325  cur_feats);
326  double error = errorcallback(cur_feats, response);
327  current_errors.push_back(error);
328  std::swap(*pivot, *next);
329  }
330  int pos = std::distance(current_errors.begin(),
331  std::min_element(current_errors.begin(),
332  current_errors.end()));
333  next = pivot;
334  std::advance(next, pos);
335  std::swap(*pivot, *next);
336  errors[std::distance(selected.begin(), pivot)] = current_errors[pos];
337  ++pivot;
338  not_selected_size = std::distance(pivot, selected.end());
339  }
340 }
341 template<class FeatureT, class ResponseT>
342 void forward_selection(FeatureT const & features,
343  ResponseT const & response,
344  VariableSelectionResult & result)
345 {
346  forward_selection(features, response, result, RFErrorCallback());
347 }
348 
349 
350 /** Perform backward elimination
351  *
352  * \param features IN: n x p matrix containing n instances with p attributes/features
353  * used in the variable selection algorithm
354  * \param response IN: n x 1 matrix containing the corresponding response
355  * \param result IN/OUT: VariableSelectionResult struct which will contain the results
356  * of the algorithm.
357  * Features between result.pivot and result.selected.end() will
358  * be left untouched.
359  * \sa VariableSelectionResult
360  * \param errorcallback
361  * IN, OPTIONAL:
362  * Functor that returns the error rate given a set of
363  * features and labels. Default is the RandomForest OOB Error.
364  *
365  * Backward elimination successively eliminates the feature that has the least influence
366  * on the error rate.
367  *
368  * usage:
369  * \code
370  * MultiArray<2, double> features = createSomeFeatures();
371  * MultiArray<2, int> labels = createCorrespondingLabels();
372  * VariableSelectionResult result;
373  * backward_elimination(features, labels, result);
374  * \endcode
375  * To use backward elimination but ensure that a specific feature e.g. feature 5 is always
376  * excluded one would do the following:
377  *
378  * \code
379  * VariableSelectionResult result;
380  * result.init(features, labels);
381  * std::swap(result.selected[result.selected.size()-1], result.selected[5]);
382  * result.pivot = result.selected.end() - 1;
383  * backward_elimination(features, labels, result);
384  * \endcode
385  *
386  * \sa VariableSelectionResult
387  *
388  */
389 template<class FeatureT, class ResponseT, class ErrorRateCallBack>
390 void backward_elimination(FeatureT const & features,
391  ResponseT const & response,
392  VariableSelectionResult & result,
393  ErrorRateCallBack errorcallback)
394 {
395  int featureCount = features.shape(1);
396  VariableSelectionResult::FeatureList_t & selected = result.selected;
397  VariableSelectionResult::ErrorList_t & errors = result.errors;
398  VariableSelectionResult::Pivot_t & pivot = result.pivot;
399 
400  // initialize result struct if in use for the first time
401  if(!result.init(features, response, errorcallback))
402  {
403  //result is being reused; just ensure that the number of features
404  //is the same.
405  vigra_precondition(selected.size() == featureCount,
406  "backward_elimination(): Number of features in Feature "
407  "matrix and number of features in previously used "
408  "result struct mismatch!");
409  }
410  pivot = selected.end() - 1;
411 
412  int selected_size = std::distance(selected.begin(), pivot);
413  while(selected_size > 1)
414  {
415  VariableSelectionResult::Pivot_t next = selected.begin();
416  std::vector<double> current_errors;
417  for(int ii = 0; ii < selected_size; ++ii, ++next)
418  {
419  std::swap(*pivot, *next);
420  MultiArray<2, double> cur_feats;
421  detail::choose( features,
422  selected.begin(),
423  pivot,
424  cur_feats);
425  double error = errorcallback(cur_feats, response);
426  current_errors.push_back(error);
427  std::swap(*pivot, *next);
428  }
429  int pos = std::distance(current_errors.begin(),
430  std::max_element(current_errors.begin(),
431  current_errors.end()));
432  next = selected.begin();
433  std::advance(next, pos);
434  std::swap(*pivot, *next);
435 // std::cerr << std::distance(selected.begin(), pivot) << " " << pos << " " << current_errors.size() << " " << errors.size() << std::endl;
436  errors[std::distance(selected.begin(), pivot)] = current_errors[pos];
437  selected_size = std::distance(selected.begin(), pivot);
438  --pivot;
439  }
440 }
441 
442 template<class FeatureT, class ResponseT>
443 void backward_elimination(FeatureT const & features,
444  ResponseT const & response,
445  VariableSelectionResult & result)
446 {
447  backward_elimination(features, response, result, RFErrorCallback());
448 }
449 
450 /** Perform rank selection using a predefined ranking
451  *
452  * \param features IN: n x p matrix containing n instances with p attributes/features
453  * used in the variable selection algorithm
454  * \param response IN: n x 1 matrix containing the corresponding response
455  * \param result IN/OUT: VariableSelectionResult struct which will contain the results
456  * of the algorithm. The struct should be initialized with the
457  * predefined ranking.
458  *
459  * \sa VariableSelectionResult
460  * \param errorcallback
461  * IN, OPTIONAL:
462  * Functor that returns the error rate given a set of
463  * features and labels. Default is the RandomForest OOB Error.
464  *
465  * Often a variable importance or score measure is used to create the ordering in which
466  * variables have to be selected. This method takes such a ranking and calculates the
467  * corresponding error rates.
468  *
469  * usage:
470  * \code
471  * MultiArray<2, double> features = createSomeFeatures();
472  * MultiArray<2, int> labels = createCorrespondingLabels();
473  * std::vector<int> ranking = createRanking(features);
474  * VariableSelectionResult result;
475  * result.init(features, labels, ranking.begin(), ranking.end());
476  * rank_selection(features, labels, result);
477  * \endcode
478  *
479  * \sa VariableSelectionResult
480  *
481  */
482 template<class FeatureT, class ResponseT, class ErrorRateCallBack>
483 void rank_selection (FeatureT const & features,
484  ResponseT const & response,
485  VariableSelectionResult & result,
486  ErrorRateCallBack errorcallback)
487 {
488  VariableSelectionResult::FeatureList_t & selected = result.selected;
489  VariableSelectionResult::ErrorList_t & errors = result.errors;
490  VariableSelectionResult::Pivot_t & iter = result.pivot;
491  int featureCount = features.shape(1);
492  // initialize result struct if in use for the first time
493  if(!result.init(features, response, errorcallback))
494  {
495  //result is being reused; just ensure that the number of features
496  //is the same.
497  vigra_precondition(selected.size() == featureCount,
498  "rank_selection(): Number of features in Feature "
499  "matrix and number of features in previously used "
500  "result struct mismatch!");
501  }
502 
503  int ii = 0;
504  for(; iter != selected.end(); ++iter)
505  {
506 // std::cerr << ii<< std::endl;
507  ++ii;
508  MultiArray<2, double> cur_feats;
509  detail::choose( features,
510  selected.begin(),
511  iter,
512  cur_feats);
513  double error = errorcallback(cur_feats, response);
514  errors[std::distance(selected.begin(), iter)] = error;
515 
516  }
517 }
518 
519 template<class FeatureT, class ResponseT>
520 void rank_selection (FeatureT const & features,
521  ResponseT const & response,
522  VariableSelectionResult & result)
523 {
524  rank_selection(features, response, result, RFErrorCallback());
525 }
526 
527 
528 
529 enum ClusterLeafTypes{c_Leaf = 95, c_Node = 99};
530 
531 /* View of a node in the hierarchical clustering tree.
532  *
533  * For internal use only.
534  * \sa NodeBase
535  */
536 class ClusterNode
537 : public NodeBase
538 {
539  public:
540 
541  typedef NodeBase BT;
542 
543  /**constructors **/
544  ClusterNode():NodeBase(){}
545  ClusterNode( int nCol,
546  BT::T_Container_type & topology,
547  BT::P_Container_type & split_param)
548  : BT(nCol + 5, 5,topology, split_param)
549  {
550  status() = 0;
551  BT::column_data()[0] = nCol;
552  if(nCol == 1)
553  BT::typeID() = c_Leaf;
554  else
555  BT::typeID() = c_Node;
556  }
557 
558  ClusterNode( BT::T_Container_type const & topology,
559  BT::P_Container_type const & split_param,
560  int n )
561  : NodeBase(5 , 5,topology, split_param, n)
562  {
563  //TODO : is there a more elegant way to do this?
564  BT::topology_size_ += BT::column_data()[0];
565  }
566 
567  ClusterNode( BT & node_)
568  : BT(5, 5, node_)
569  {
570  //TODO : is there a more elegant way to do this?
571  BT::topology_size_ += BT::column_data()[0];
572  BT::parameter_size_ += 0;
573  }
574  int index()
575  {
576  return static_cast<int>(BT::parameters_begin()[1]);
577  }
578  void set_index(int in)
579  {
580  BT::parameters_begin()[1] = in;
581  }
582  double& mean()
583  {
584  return BT::parameters_begin()[2];
585  }
586  double& stdev()
587  {
588  return BT::parameters_begin()[3];
589  }
590  double& status()
591  {
592  return BT::parameters_begin()[4];
593  }
594 };
595 
596 /** Stack entry class for the HClustering class
597  */
598 struct HC_Entry
599 {
600  int parent;
601  int level;
602  int addr;
603  bool infm;
604  HC_Entry(int p, int l, int a, bool in)
605  : parent(p), level(l), addr(a), infm(in)
606  {}
607 };
608 
609 
610 /** Hierarchical Clustering class.
611  * Performs single linkage clustering
612  * \code
613  * Matrix<double> distance = get_distance_matrix();
614  * HClustering linkage; linkage.cluster(distance);
615  * // Draw clustering tree.
616  * Draw<double, int> draw(features, labels, "linkagetree.graph");
617  * linkage.breadth_first_traversal(draw);
618  * \endcode
619  * \sa ClusterImportanceVisitor
620  *
621  * Once the clustering has taken place, information queries can be made
622  * using the breadth_first_traversal() and iterate() methods.
623  *
624  */
625 class HClustering
626 {
627 public:
628  typedef MultiArrayShape<2>::type Shp;
629  ArrayVector<int> topology_;
630  ArrayVector<double> parameters_;
631  int begin_addr;
632 
633  // Merge the distances of two clusters: single linkage keeps the minimum.
634  double dist_func(double a, double b)
635  {
636  return std::min(a, b);
637  }
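 // Note: swapping std::min for std::max here would turn the algorithm
 // into complete linkage clustering; the minimum as written yields
 // single linkage.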
638 
639  /** Visit each node with a functor
640  * (depth first, starting at the root)
641  */
642  template<class Functor>
643  void iterate(Functor & tester)
644  {
645 
646  std::vector<int> stack;
647  stack.push_back(begin_addr);
648  while(!stack.empty())
649  {
650  ClusterNode node(topology_, parameters_, stack.back());
651  stack.pop_back();
652  if(!tester(node))
653  {
654  if(node.columns_size() != 1)
655  {
656  stack.push_back(node.child(0));
657  stack.push_back(node.child(1));
658  }
659  }
660  }
661  }
662 
663  /** Perform breadth first traversal of hierarchical cluster tree
664  */
665  template<class Functor>
666  void breadth_first_traversal(Functor & tester)
667  {
668 
669  std::queue<HC_Entry> queue;
670  int level = 0;
671  int parent = -1;
672  int addr = -1;
673  bool infm = false;
674  queue.push(HC_Entry(parent,level,begin_addr, infm));
675  while(!queue.empty())
676  {
677  level = queue.front().level;
678  parent = queue.front().parent;
679  addr = queue.front().addr;
680  infm = queue.front().infm;
681  ClusterNode node(topology_, parameters_, queue.front().addr);
682  ClusterNode parnt;
683  if(parent != -1)
684  {
685  parnt = ClusterNode(topology_, parameters_, parent);
686  }
687  queue.pop();
688  bool istrue = tester(node, level, parnt, infm);
689  if(node.columns_size() != 1)
690  {
691  queue.push(HC_Entry(addr, level +1,node.child(0),istrue));
692  queue.push(HC_Entry(addr, level +1,node.child(1),istrue));
693  }
694  }
695  }
696  /** save to HDF5 - defunct; has to be updated to the new HDF5 interface
697  */
698  void save(std::string file, std::string prefix)
699  {
700 
701  vigra::writeHDF5(file.c_str(), (prefix + "topology").c_str(),
702  MultiArrayView<2, int>(
703  Shp(topology_.size(),1),
704  topology_.data()));
705  vigra::writeHDF5(file.c_str(), (prefix + "parameters").c_str(),
706  MultiArrayView<2, double>(
707  Shp(parameters_.size(), 1),
708  parameters_.data()));
709  vigra::writeHDF5(file.c_str(), (prefix + "begin_addr").c_str(),
710  MultiArrayView<2, int>(Shp(1,1), &begin_addr));
711 
712  }
713 
714  /**Perform single linkage clustering
715  * \param distance distance matrix used. \sa CorrelationVisitor
716  */
717  template<class T, class C>
718  void cluster(MultiArrayView<2, T, C> distance)
719  {
720  MultiArray<2, T> dist(distance);
721  std::vector<std::pair<int, int> > addr;
722  typedef std::pair<int, int> Entry;
723  int index = 0;
724  for(int ii = 0; ii < distance.shape(0); ++ii)
725  {
726  addr.push_back(std::make_pair(topology_.size(), ii));
727  ClusterNode leaf(1, topology_, parameters_);
728  leaf.set_index(index);
729  ++index;
730  leaf.columns_begin()[0] = ii;
731  }
732 
733  while(addr.size() != 1)
734  {
735  //find the two nodes with the smallest distance
736  int ii_min = 0;
737  int jj_min = 1;
738  double min_dist = dist((addr.begin()+ii_min)->second,
739  (addr.begin()+jj_min)->second);
740  for(unsigned int ii = 0; ii < addr.size(); ++ii)
741  {
742  for(unsigned int jj = ii+1; jj < addr.size(); ++jj)
743  {
744  if( dist((addr.begin()+ii_min)->second,
745  (addr.begin()+jj_min)->second)
746  > dist((addr.begin()+ii)->second,
747  (addr.begin()+jj)->second))
748  {
749  min_dist = dist((addr.begin()+ii)->second,
750  (addr.begin()+jj)->second);
751  ii_min = ii;
752  jj_min = jj;
753  }
754  }
755  }
756 
757  //merge two nodes
758  int col_size = 0;
759  // The problem is that creating a new node invalidates the iterators stored
760  // in firstChild and secondChild.
761  {
762  ClusterNode firstChild(topology_,
763  parameters_,
764  (addr.begin() +ii_min)->first);
765  ClusterNode secondChild(topology_,
766  parameters_,
767  (addr.begin() +jj_min)->first);
768  col_size = firstChild.columns_size() + secondChild.columns_size();
769  }
770  int cur_addr = topology_.size();
771  begin_addr = cur_addr;
772 // std::cerr << col_size << std::endl;
773  ClusterNode parent(col_size,
774  topology_,
775  parameters_);
776  ClusterNode firstChild(topology_,
777  parameters_,
778  (addr.begin() +ii_min)->first);
779  ClusterNode secondChild(topology_,
780  parameters_,
781  (addr.begin() +jj_min)->first);
782  parent.parameters_begin()[0] = min_dist;
783  parent.set_index(index);
784  ++index;
785  std::merge(firstChild.columns_begin(), firstChild.columns_end(),
786  secondChild.columns_begin(),secondChild.columns_end(),
787  parent.columns_begin());
788  //merge nodes in addr
789  int to_keep;
790  int to_desc;
791  int ii_keep;
792  if(*parent.columns_begin() == *firstChild.columns_begin())
793  {
794  parent.child(0) = (addr.begin()+ii_min)->first;
795  parent.child(1) = (addr.begin()+jj_min)->first;
796  (addr.begin()+ii_min)->first = cur_addr;
797  ii_keep = ii_min;
798  to_keep = (addr.begin()+ii_min)->second;
799  to_desc = (addr.begin()+jj_min)->second;
800  addr.erase(addr.begin()+jj_min);
801  }
802  else
803  {
804  parent.child(1) = (addr.begin()+ii_min)->first;
805  parent.child(0) = (addr.begin()+jj_min)->first;
806  (addr.begin()+jj_min)->first = cur_addr;
807  ii_keep = jj_min;
808  to_keep = (addr.begin()+jj_min)->second;
809  to_desc = (addr.begin()+ii_min)->second;
810  addr.erase(addr.begin()+ii_min);
811  }
812  //update distances;
813 
814  for(unsigned int jj = 0 ; jj < addr.size(); ++jj)
815  {
816  if(jj == ii_keep)
817  continue;
818  double merged = dist_func(
819  dist(to_desc, (addr.begin()+jj)->second),
820  dist((addr.begin()+ii_keep)->second,
821  (addr.begin()+jj)->second));
822 
823  dist((addr.begin()+ii_keep)->second,
824  (addr.begin()+jj)->second) = merged;
825  dist((addr.begin()+jj)->second,
826  (addr.begin()+ii_keep)->second) = merged;
827  }
828  }
829  }
830 
831 };
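// A visitor for iterate() or breadth_first_traversal() is any functor
// returning bool; in iterate() a return value of true prunes the descent
// into the children. A sketch (PrintStatus is illustrative only):
//
//     struct PrintStatus
//     {
//         template<class Node>
//         bool operator()(Node & node)
//         {
//             std::cout << node.index() << ": " << node.status() << "\n";
//             return false;              // false -> also visit children
//         }
//     };
//
//     PrintStatus print;
//     linkage.iterate(print);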
832 
833 
834 /** Normalize the status value in the HClustering tree (HClustering Visitor)
835  */
836 class NormalizeStatus
837 {
838 public:
839  double n;
840  /** Constructor
841  * \param m normalize status() by m
842  */
843  NormalizeStatus(double m)
844  :n(m)
845  {}
846  template<class Node>
847  bool operator()(Node& node)
848  {
849  node.status()/=n;
850  return false;
851  }
852 };
853 
854 
855 /** Perform Permutation importance on HClustering clusters
856  * (See the visit_after_tree() method of visitors::VariableImportance for
857  * the basic idea; here the permutation is applied not only to single
858  * variables but to whole clusters.)
859  */
860 template<class Iter, class DT>
861 class PermuteCluster
862 {
863 public:
865  Matrix<double> tmp_mem_;
866  MultiArrayView<2, double> perm_imp;
867  MultiArrayView<2, double> orig_imp;
868  Matrix<double> feats_;
869  Matrix<int> labels_;
870  const int nPerm;
871  DT const & dt;
872  int index;
873  int oob_size;
874 
875  template<class Feat_T, class Label_T>
876  PermuteCluster(Iter a,
877  Iter b,
878  Feat_T const & feats,
879  Label_T const & labls,
880  MultiArrayView<2, double> p_imp,
881  MultiArrayView<2, double> o_imp,
882  int np,
883  DT const & dt_)
884  :tmp_mem_(_spl(a, b).size(), feats.shape(1)),
885  perm_imp(p_imp),
886  orig_imp(o_imp),
887  feats_(_spl(a,b).size(), feats.shape(1)),
888  labels_(_spl(a,b).size(),1),
889  nPerm(np),
890  dt(dt_),
891  index(0),
892  oob_size(b-a)
893  {
894  copy_splice(_spl(a,b),
895  _spl(feats.shape(1)),
896  feats,
897  feats_);
898  copy_splice(_spl(a,b),
899  _spl(labls.shape(1)),
900  labls,
901  labels_);
902  }
903 
904  template<class Node>
905  bool operator()(Node& node)
906  {
907  tmp_mem_ = feats_;
908  RandomMT19937 random;
909  int class_count = perm_imp.shape(1) - 1;
910  //permute columns together
911  for(int kk = 0; kk < nPerm; ++kk)
912  {
913  tmp_mem_ = feats_;
914  for(int ii = 0; ii < rowCount(feats_); ++ii)
915  {
916  int swap_index = random.uniformInt(rowCount(feats_) - ii) + ii; // random row in [ii, rowCount)
917  for(int jj = 0; jj < node.columns_size(); ++jj)
918  {
919  if(node.columns_begin()[jj] != feats_.shape(1))
920  tmp_mem_(ii, node.columns_begin()[jj])
921  = tmp_mem_(swap_index, node.columns_begin()[jj]);
922  }
923  }
924 
925  for(int ii = 0; ii < rowCount(tmp_mem_); ++ii)
926  {
927  if(dt
928  .predictLabel(rowVector(tmp_mem_, ii))
929  == labels_(ii, 0))
930  {
931  //per class
932  ++perm_imp(index,labels_(ii, 0));
933  //total
934  ++perm_imp(index, class_count);
935  }
936  }
937  }
938  double node_status = perm_imp(index, class_count);
939  node_status /= nPerm;
940  node_status -= orig_imp(0, class_count);
941  node_status *= -1;
942  node_status /= oob_size;
943  node.status() += node_status;
944  ++index;
945 
946  return false;
947  }
948 };
949 
950 /** Convert ClusteringTree into a list (HClustering visitor)
951  */
952 class GetClusterVariables
953 {
954 public:
955  /** NumberOfClusters x NumberOfVariables MultiArrayView containing
956  * in each row the variable belonging to a cluster
957  */
958  MultiArrayView<2, int> variables;
959  int index;
960  GetClusterVariables(MultiArrayView<2, int> vars)
961  :variables(vars), index(0)
962  {}
963  void save(std::string file, std::string prefix)
964  {
965  vigra::writeHDF5(file.c_str(), (prefix + "_variables").c_str(),
966  variables);
967  }
968 
969  template<class Node>
970  bool operator()(Node& node)
971  {
972  for(int ii = 0; ii < node.columns_size(); ++ii)
973  variables(index, ii) = node.columns_begin()[ii];
974  ++index;
975  return false;
976  }
977 };
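// Usage sketch, mirroring ClusterImportanceVisitor::visit_at_beginning():
// a linkage tree over p leaves has at most 2*p - 1 nodes, so allocate
// accordingly (p stands for the number of features):
//
//     MultiArray<2, int> vars(MultiArrayShape<2>::type(2*p - 1, p), -1);
//     GetClusterVariables gcv(vars);
//     linkage.iterate(gcv);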
978 /** corrects the status fields of a linkage Clustering (HClustering Visitor)
979  *
980  * such that status(currentNode) = min(status(parent), status(currentNode))
981  * \sa cluster_permutation_importance()
982  */
983 class CorrectStatus
984 {
985 public:
986  template<class Nde>
987  bool operator()(Nde & cur, int level, Nde parent, bool infm)
988  {
989  if(parent.hasData_)
990  cur.status() = std::min(parent.status(), cur.status());
991  return true;
992  }
993 };
994 
995 
996 /** draw current linkage Clustering (HClustering Visitor)
997  *
998  * create a graphviz .dot file
999  * usage:
1000  * \code
1001  * Matrix<double> distance = get_distance_matrix();
1002  * HClustering linkage; linkage.cluster(distance);
1003  * Draw<double, int> draw(features, labels, "linkagetree.graph");
1004  * linkage.breadth_first_traversal(draw);
1005  * \endcode
1006  */
1007 template<class T1,
1008  class T2,
1009  class C1 = UnstridedArrayTag,
1010  class C2 = UnstridedArrayTag>
1011 class Draw
1012 {
1013 public:
1014  typedef MultiArrayShape<2>::type Shp;
1015  MultiArrayView<2, T1, C1> const & features_;
1016  MultiArrayView<2, T2, C2> const & labels_;
1017  std::ofstream graphviz;
1018 
1019 
1020  Draw(MultiArrayView<2, T1, C1> const & features,
1021  MultiArrayView<2, T2, C2> const& labels,
1022  std::string const gz)
1023  :features_(features), labels_(labels),
1024  graphviz(gz.c_str(), std::ios::out)
1025  {
1026  graphviz << "digraph G\n{\n node [shape=\"record\"]";
1027  }
1028  ~Draw()
1029  {
1030  graphviz << "\n}\n";
1031  graphviz.close();
1032  }
1033 
1034  template<class Nde>
1035  bool operator()(Nde & cur, int level, Nde parent, bool infm)
1036  {
1037  graphviz << "node" << cur.index() << " [style=\"filled\"][label = \" #Feats: "<< cur.columns_size() << "\\n";
1038  graphviz << " status: " << cur.status() << "\\n";
1039  for(int kk = 0; kk < cur.columns_size(); ++kk)
1040  {
1041  graphviz << cur.columns_begin()[kk] << " ";
1042  if(kk % 15 == 14)
1043  graphviz << "\\n";
1044  }
1045  graphviz << "\"] [color = \"" <<cur.status() << " 1.000 1.000\"];\n";
1046  if(parent.hasData_)
1047  graphviz << "\"node" << parent.index() << "\" -> \"node" << cur.index() <<"\";\n";
1048  return true;
1049  }
1050 };
1051 
1052 /** calculate Cluster based permutation importance while learning. (RandomForestVisitor)
1053  */
1054 class ClusterImportanceVisitor : public visitors::VisitorBase
1055 {
1056  public:
1057 
1058  /** List of variables as produced by GetClusterVariables
1059  */
1060  MultiArray<2, int> variables;
1061  /** Corresponding importance measures
1062  */
1063  MultiArray<2, double> cluster_importance_;
1064  /** Corresponding error
1065  */
1066  MultiArray<2, double> cluster_stdev_;
1067  int repetition_count_;
1068  bool in_place_;
1069  HClustering & clustering;
1070 
1071 
1072 #ifdef HasHDF5
1073  void save(std::string filename, std::string prefix)
1074  {
1075  std::string prefix1 = "cluster_importance_" + prefix;
1076  writeHDF5(filename.c_str(),
1077  prefix1.c_str(),
1078  cluster_importance_);
1079  prefix1 = "vars_" + prefix;
1080  writeHDF5(filename.c_str(),
1081  prefix1.c_str(),
1082  variables);
1083  }
1084 #endif
1085 
1086  ClusterImportanceVisitor(HClustering & clst, int rep_cnt = 10)
1087  : repetition_count_(rep_cnt), in_place_(false), clustering(clst)
1088 
1089  {}
1090 
1091  /** Allocate enough memory
1092  */
1093  template<class RF, class PR>
1094  void visit_at_beginning(RF const & rf, PR const & pr)
1095  {
1096  Int32 const class_count = rf.ext_param_.class_count_;
1097  Int32 const column_count = rf.ext_param_.column_count_+1;
1098  cluster_importance_
1099  .reshape(MultiArrayShape<2>::type(2*column_count-1,
1100  class_count+1));
1101  cluster_stdev_
1102  .reshape(MultiArrayShape<2>::type(2*column_count-1,
1103  class_count+1));
1104  variables
1105  .reshape(MultiArrayShape<2>::type(2*column_count-1,
1106  column_count), -1);
1107  GetClusterVariables gcv(variables);
1108  clustering.iterate(gcv);
1109 
1110  }
1111 
1112  /** compute permutation based variable importance.
1113  * (Only an array of size oob_sample_count x 1 is created
1114  * - as opposed to oob_sample_count x feature_count in the other method.)
1115  *
1116  * \sa FieldProxy
1117  */
1118  template<class RF, class PR, class SM, class ST>
1119  void after_tree_ip_impl(RF& rf, PR & pr, SM & sm, ST & st, int index)
1120  {
1121  typedef MultiArrayShape<2>::type Shp_t;
1122  Int32 column_count = rf.ext_param_.column_count_ +1;
1123  Int32 class_count = rf.ext_param_.class_count_;
1124 
1125  // cast the constness of the features away (yep, I know what I am
1126  // doing here.) The data is not modified.
1127  typename PR::Feature_t & features
1128  = const_cast<typename PR::Feature_t &>(pr.features());
1129 
1130  //find the oob indices of current tree.
1131  ArrayVector<Int32> oob_indices;
1132  ArrayVector<Int32>::iterator
1133  iter;
1134 
1135  if(rf.ext_param_.actual_msample_ < pr.features().shape(0)- 10000)
1136  {
1137  ArrayVector<int> cts(2, 0);
1138  ArrayVector<Int32> indices; // filled below; a sized constructor here would prepend zeros
1139  for(int ii = 0; ii < pr.features().shape(0); ++ii)
1140  indices.push_back(ii);
1141  std::random_shuffle(indices.begin(), indices.end());
1142  for(int ii = 0; ii < rf.ext_param_.row_count_; ++ii)
1143  {
1144  if(!sm.is_used()[indices[ii]] && cts[pr.response()(indices[ii], 0)] < 3000)
1145  {
1146  oob_indices.push_back(indices[ii]);
1147  ++cts[pr.response()(indices[ii], 0)];
1148  }
1149  }
1150  }
1151  else
1152  {
1153  for(int ii = 0; ii < rf.ext_param_.row_count_; ++ii)
1154  if(!sm.is_used()[ii])
1155  oob_indices.push_back(ii);
1156  }
1157 
1158  // set up the random number generation
1159  RandomMT19937 random(RandomSeed);
1160  UniformIntRandomFunctor<RandomMT19937>
1161  randint(random);
1162 
1163  //make some space for the results
1164  MultiArray<2, double>
1165  oob_right(Shp_t(1, class_count + 1));
1166 
1167  // get the oob success rate with the original samples
1168  for(iter = oob_indices.begin();
1169  iter != oob_indices.end();
1170  ++iter)
1171  {
1172  if(rf.tree(index)
1173  .predictLabel(rowVector(features, *iter))
1174  == pr.response()(*iter, 0))
1175  {
1176  //per class
1177  ++oob_right[pr.response()(*iter,0)];
1178  //total
1179  ++oob_right[class_count];
1180  }
1181  }
1182 
1183  MultiArray<2, double>
1184  perm_oob_right (Shp_t(2* column_count-1, class_count + 1));
1185 
1186  PermuteCluster<ArrayVector<Int32>::iterator,typename RF::DecisionTree_t>
1187  pc(oob_indices.begin(), oob_indices.end(),
1188  pr.features(),
1189  pr.response(),
1190  perm_oob_right,
1191  oob_right,
1192  repetition_count_,
1193  rf.tree(index));
1194  clustering.iterate(pc);
1195 
1196  perm_oob_right /= repetition_count_;
1197  for(int ii = 0; ii < rowCount(perm_oob_right); ++ii)
1198  rowVector(perm_oob_right, ii) -= oob_right;
1199 
1200  perm_oob_right *= -1;
1201  perm_oob_right /= oob_indices.size();
1202  cluster_importance_ += perm_oob_right;
1203  }
1204 
1205  /** calculate permutation based importance after every tree has been
1206  * learned. The default behaviour is that this happens out of place.
1207  * If you have very big data sets and want to avoid copying of data
1208  * set the in_place_ flag to true.
1209  */
1210  template<class RF, class PR, class SM, class ST>
1211  void visit_after_tree(RF& rf, PR & pr, SM & sm, ST & st, int index)
1212  {
1213  after_tree_ip_impl(rf, pr, sm, st, index);
1214  }
1215 
1216  /** Normalise variable importance after the number of trees is known.
1217  */
1218  template<class RF, class PR>
1219  void visit_at_end(RF & rf, PR & pr)
1220  {
1221  NormalizeStatus nrm(rf.tree_count());
1222  clustering.iterate(nrm);
1223  cluster_importance_ /= rf.trees_.size();
1224  }
1225 };
1226 
1227 /** Perform hierarchical clustering of variables and assess importance of clusters
1228  *
1229  * \param features IN: n x p matrix containing n instances with p attributes/features
1230  * used in the variable selection algorithm
1231  * \param response IN: n x 1 matrix containing the corresponding response
1232  * \param linkage OUT: Hierarchical grouping of variables.
1233  * \param distance OUT: distance matrix used for creating the linkage
1234  *
1235  * Performs hierarchical clustering of variables and calculates the permutation importance
1236  * measure of each of the clusters. Use the Draw functor to create human readable output.
1237  * The cluster-permutation importance measure corresponds to the normal permutation importance
1238  * measure with all columns corresponding to a cluster permuted.
1239  * The importance measure for each cluster is stored in the status() field of each cluster node.
1240  * \sa HClustering
1241  *
1242  * usage:
1243  * \code
1244  * MultiArray<2, double> features = createSomeFeatures();
1245  * MultiArray<2, int> labels = createCorrespondingLabels();
1246  * HClustering linkage;
1247  * MultiArray<2, double> distance;
1248  * cluster_permutation_importance(features, labels, linkage, distance);
1249  * // create graphviz output
1250  *
1251  * Draw<double, int> draw(features, labels, "linkagetree.graph");
1252  * linkage.breadth_first_traversal(draw);
1253  *
1254  * \endcode
1255  *
1256  *
1257  */
1258 template<class FeatureT, class ResponseT>
1259 void cluster_permutation_importance(FeatureT const & features,
1260  ResponseT const & response,
1261  HClustering & linkage,
1262  MultiArray<2, double> & distance)
1263 {
1264 
1265  RandomForestOptions opt;
1266  opt.tree_count(100);
1267  if(features.shape(0) > 40000)
1268  opt.samples_per_tree(20000).use_stratification(RF_EQUAL);
1269 
1270 
1271  vigra::RandomForest<int> RF(opt);
1272  visitors::RandomForestProgressVisitor progress;
1273  visitors::CorrelationVisitor missc;
1274  RF.learn(features, response,
1275  create_visitor(missc, progress));
1276  distance = missc.distance;
1277  /*
1278  missc.save(exp_dir + dset.name() + "_result.h5", dset.name()+"MACH");
1279  */
1280 
1281 
1282  // Produce linkage
1283  linkage.cluster(distance);
1284 
1285  //linkage.save(exp_dir + dset.name() + "_result.h5", "_linkage_CC/");
1286  vigra::RandomForest<int> RF2(opt);
1287  ClusterImportanceVisitor ci(linkage);
1288  RF2.learn(features,
1289  response,
1290  create_visitor(progress, ci));
1291 
1292 
1293  CorrectStatus cs;
1294  linkage.breadth_first_traversal(cs);
1295 
1296  //ci.save(exp_dir + dset.name() + "_result.h5", dset.name());
1297  //Draw<double, int> draw(dset.features(), dset.response(), exp_dir+ dset.name() + ".graph");
1298  //linkage.breadth_first_traversal(draw);
1299 
1300 }
1301 
1302 
1303 template<class FeatureT, class ResponseT>
1304 void cluster_permutation_importance(FeatureT const & features,
1305  ResponseT const & response,
1306  HClustering & linkage)
1307 {
1308  MultiArray<2, double> distance;
1309  cluster_permutation_importance(features, response, linkage, distance);
1310 }
1311 }//namespace algorithms
1312 }//namespace rf
1313 }//namespace vigra
1314 #endif // VIGRA_RF_ALGORITHM_HXX
