casacore
FitToHalfStatistics.h
Go to the documentation of this file.
1 //# Copyright (C) 2000,2001
2 //# Associated Universities, Inc. Washington DC, USA.
3 //#
4 //# This library is free software; you can redistribute it and/or modify it
5 //# under the terms of the GNU Library General Public License as published by
6 //# the Free Software Foundation; either version 2 of the License, or (at your
7 //# option) any later version.
8 //#
9 //# This library is distributed in the hope that it will be useful, but WITHOUT
10 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 //# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
12 //# License for more details.
13 //#
14 //# You should have received a copy of the GNU Library General Public License
15 //# along with this library; if not, write to the Free Software Foundation,
16 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA.
17 //#
18 //# Correspondence concerning AIPS++ should be addressed as follows:
19 //# Internet email: aips2-request@nrao.edu.
20 //# Postal address: AIPS++ Project Office
21 //# National Radio Astronomy Observatory
22 //# 520 Edgemont Road
23 //# Charlottesville, VA 22903-2475 USA
24 //#
25 
26 #ifndef SCIMATH_FITTOHALFSTATISTICS_H
27 #define SCIMATH_FITTOHALFSTATISTICS_H
28 
29 #include <casacore/casa/aips.h>
30 
31 #include <casacore/scimath/StatsFramework/ConstrainedRangeStatistics.h>
32 #include <casacore/scimath/StatsFramework/FitToHalfStatisticsData.h>
33 
34 namespace casacore {
35 
36 // Class to calculate statistics using the so-called fit to half algorithm. In
37 // this algorithm, a center value is specified, and only points greater or equal
38 // or less or equal this value are included. Furthermore, each of the included
39 // points is reflected about the center value, and these virtual points are
40 // added to the included points and the union of sets of included real points
41 // and virtual points are used for computing statistics. The specified center
42 // point is therefore the mean and median of the resulting distribution, and the
43 // total number of points is exactly twice the number of real data points that
44 // are included.
45 //
46 // This class uses a ConstrainedRangeQuantileComputer object for computing
47 // quantile-like statistics. See class documentation for StatisticsAlgorithm for
48 // details regarding QuantileComputer classes.
49 
50 template <
51  class AccumType, class DataIterator, class MaskIterator=const Bool *,
52  class WeightsIterator=DataIterator
53 >
55  : public ConstrainedRangeStatistics<CASA_STATP> {
56 public:
57 
58  const static AccumType TWO;
59 
60  // <src>value</src> is only used if <src>center</src>=CVALUE
65  );
66 
67  // copy semantics
69 
71 
72  // copy semantics
75  );
76 
77  // Clone this instance. Caller is responsible for deleting the returned
78  // pointer.
80 
81  // get the algorithm that this object uses for computing stats
84  };
85 
86  // The median is just the center value, so none of the parameters to this
87  // method are used.
88  AccumType getMedian(
89  CountedPtr<uInt64> knownNpts=nullptr,
90  CountedPtr<AccumType> knownMin=nullptr,
91  CountedPtr<AccumType> knownMax=nullptr,
92  uInt binningThreshholdSizeBytes=4096*4096,
93  Bool persistSortedArray=False, uInt nBins=10000
94  );
95 
96  // <group>
97  // In the following group of methods, if the size of the composite dataset
98  // is smaller than <src>binningThreshholdSizeBytes</src>, the composite
99  // dataset will be (perhaps partially) sorted and persisted in memory during
100  // the call. In that case, and if <src>persistSortedArray</src> is True,
101  // this sorted array will remain in memory after the call and will be used
102  // on subsequent calls of this method when
103  // <src>binningThreshholdSizeBytes</src> is greater than the size of the
104  // composite dataset. If <src>persistSortedArray</src> is False, the sorted
105  // array will not be stored after this call completes and so any subsequent
106  // calls for which the dataset size is less than
107  // <src>binningThreshholdSizeBytes</src>, the dataset will be sorted from
108  // scratch. Values which are not included due to non-unity strides, are not
109  // included in any specified ranges, are masked, or have associated weights
110  // of zero are not considered as dataset members for quantile computations.
111  // If one has a priori information regarding the number of points (npts)
112  // and/or the minimum and maximum values of the data set, these can be
113  // supplied to improve performance. Note however, that if these values are
114  // not correct, the resulting median and/or quantile values will also not be
115  // correct (although see the following notes regarding max/min). Note that
116  // if this object has already had getStatistics() called, and the min and
117  // max were calculated, there is no need to pass these values in as they
118  // have been stored internally and will be used (although passing them
119  // explicitly shouldn't hurt anything). If provided, npts, the number of
120  // points falling in the specified ranges which are not masked and have
121  // weights > 0, should be correct. <src>min</src> can be less than the true
122  // minimum, and <src>max</src> can be greater than the True maximum, but for
123  // best performance, these should be as close to the actual min and max as
124  // possible (and ideally the actual min/max values of the data set).
126  std::map<Double, AccumType>& quantiles,
127  const std::set<Double>& fractions,
128  CountedPtr<uInt64> knownNpts=nullptr,
129  CountedPtr<AccumType> knownMin=nullptr,
130  CountedPtr<AccumType> knownMax=nullptr,
131  uInt binningThreshholdSizeBytes=4096*4096,
132  Bool persistSortedArray=False, uInt nBins=10000
133  );
134 
135  // get the median of the absolute deviation about the median of the data.
137  CountedPtr<uInt64> knownNpts=nullptr,
138  CountedPtr<AccumType> knownMin=nullptr,
139  CountedPtr<AccumType> knownMax=nullptr,
140  uInt binningThreshholdSizeBytes=4096*4096,
141  Bool persistSortedArray=False, uInt nBins=10000
142  );
143 
144  // Get the specified quantiles. <src>fractions</src> must be between 0 and
145  // 1, noninclusive.
146  std::map<Double, AccumType> getQuantiles(
147  const std::set<Double>& fractions,
148  CountedPtr<uInt64> knownNpts=nullptr,
149  CountedPtr<AccumType> knownMin=nullptr,
150  CountedPtr<AccumType> knownMax=nullptr,
151  uInt binningThreshholdSizeBytes=4096*4096,
152  Bool persistSortedArray=False, uInt nBins=10000
153  );
154  // </group>
155 
156  // scan the dataset(s) that have been added, and find the min and max.
157  // This method may be called even if setStatsToCaclulate has been called and
158  // MAX and MIN has been excluded.
159  virtual void getMinMax(AccumType& mymin, AccumType& mymax);
160 
161  // scan the dataset(s) that have been added, and find the number of good
162  // points. This method may be called even if setStatsToCaclulate has been
163  // called and NPTS has been excluded. If setCalculateAsAdded(True) has
164  // previously been called after this object has been (re)initialized, an
165  // exception will be thrown.
167 
168  // reset object to initial state. Clears all private fields including data,
169  // accumulators, global range. It does not affect the center type, center
170  // value, or which "side" to use which were set at construction.
171  virtual void reset();
172 
173  // This class does not allow statistics to be calculated as datasets are
174  // added, so an exception will be thrown if <src>c</src> is True.
176 
177  // Override base class method by requiring mean to be computed in addition
178  // to what is added in stats if the requested center value is CMEAN.
179  virtual void setStatsToCalculate(std::set<StatisticsData::STATS>& stats);
180 
181 protected:
182 
184 
186 
188 
189  inline const StatsData<AccumType>& _getStatsData() const {
190  return _statsData;
191  }
192 
193  // <group>
194  // no weights, no mask, no ranges
196  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
197  const DataIterator& dataBegin, uInt64 nr, uInt dataStride
198  );
199 
200  // no weights, no mask
202  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
203  const DataIterator& dataBegin, uInt64 nr, uInt dataStride,
204  const DataRanges& ranges, Bool isInclude
205  );
206 
208  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
209  const DataIterator& dataBegin, uInt64 nr, uInt dataStride,
210  const MaskIterator& maskBegin, uInt maskStride
211  );
212 
214  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
215  const DataIterator& dataBegin, uInt64 nr, uInt dataStride,
216  const MaskIterator& maskBegin, uInt maskStride,
217  const DataRanges& ranges, Bool isInclude
218  );
219  // </group>
220 
222 
223  // <group>
224  // has weights, but no mask, no ranges
226  StatsData<AccumType>& stats, LocationType& location,
227  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
228  uInt64 nr, uInt dataStride
229  );
230 
232  StatsData<AccumType>& stats, LocationType& location,
233  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
234  uInt64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
235  );
236 
238  StatsData<AccumType>& stats, LocationType& location,
239  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
240  uInt64 nr, uInt dataStride, const MaskIterator& maskBegin,
241  uInt maskStride
242  );
243 
245  StatsData<AccumType>& stats, LocationType& location,
246  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
247  uInt64 nr, uInt dataStride, const MaskIterator& maskBegin,
248  uInt maskStride, const DataRanges& ranges, Bool isInclude
249  );
250  // </group>
251 
252 private:
255  AccumType _centerValue;
258  // these are the max and min for the real portion of the dataset
261  // This is used for convenience and performance. It should always
262  // be the same as the _range value used in the base
263  // ConstrainedRangeStatistics object
265 
266  // get the min max of the entire (real + virtual) data set. Only
267  // used for quantile computation
269  CountedPtr<AccumType>& realMin, CountedPtr<AccumType>& realMax,
271  );
272 
273  // get the min/max of the real portion only of the dataset
274  void _getRealMinMax(AccumType& realMin, AccumType& realMax);
275 
276  void _setRange();
277 
278 };
279 
280 }
281 
282 #ifndef CASACORE_NO_AUTO_TEMPLATES
283 #include <casacore/scimath/StatsFramework/FitToHalfStatistics.tcc>
284 #endif
285 
286 #endif
#define DataRanges
Abstract base class for statistics algorithms which are characterized by a range of good values.
Referenced counted pointer for constant data.
Definition: CountedPtr.h:81
USE_DATA
which section of data to use, greater than or less than the center value
CENTER
choice of center point based on the corresponding statistics from the entire distribution of data,...
Class to calculate statistics using the so-called fit to half algorithm.
void _weightedStats(StatsData< AccumType > &stats, LocationType &location, const DataIterator &dataBegin, const WeightsIterator &weightBegin, uInt64 nr, uInt dataStride, const MaskIterator &maskBegin, uInt maskStride, const DataRanges &ranges, Bool isInclude)
void _updateDataProviderMaxMin(const StatsData< AccumType > &threadStats)
virtual void getMinMax(AccumType &mymin, AccumType &mymax)
scan the dataset(s) that have been added, and find the min and max.
FitToHalfStatistics(const FitToHalfStatistics< CASA_STATP > &other)
copy semantics
StatsData< AccumType > & _getStatsData()
Retrieve stats structure.
CountedPtr< AccumType > _realMax
these are the max and min for the real portion of the dataset
AccumType getMedianAbsDevMed(CountedPtr< uInt64 > knownNpts=nullptr, CountedPtr< AccumType > knownMin=nullptr, CountedPtr< AccumType > knownMax=nullptr, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt nBins=10000)
get the median of the absolute deviation about the median of the data.
CountedPtr< std::pair< AccumType, AccumType > > _range
This is used for convenience and performance.
FitToHalfStatisticsData::CENTER _centerType
const StatsData< AccumType > & _getStatsData() const
AccumType getMedian(CountedPtr< uInt64 > knownNpts=nullptr, CountedPtr< AccumType > knownMin=nullptr, CountedPtr< AccumType > knownMax=nullptr, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt nBins=10000)
The median is just the center value, so none of the parameters to this method are used.
void _unweightedStats(StatsData< AccumType > &stats, uInt64 &ngood, LocationType &location, const DataIterator &dataBegin, uInt64 nr, uInt dataStride, const MaskIterator &maskBegin, uInt maskStride, const DataRanges &ranges, Bool isInclude)
void _weightedStats(StatsData< AccumType > &stats, LocationType &location, const DataIterator &dataBegin, const WeightsIterator &weightsBegin, uInt64 nr, uInt dataStride, const DataRanges &ranges, Bool isInclude)
virtual StatsData< AccumType > _getStatistics()
virtual StatsData< AccumType > _getInitialStats() const
void _setRange()
derived classes need to implement how to set their respective range
FitToHalfStatistics(FitToHalfStatisticsData::CENTER center=FitToHalfStatisticsData::CMEAN, FitToHalfStatisticsData::USE_DATA useData=FitToHalfStatisticsData::LE_CENTER, AccumType value=0)
value is only used if center=CVALUE
void _unweightedStats(StatsData< AccumType > &stats, uInt64 &ngood, LocationType &location, const DataIterator &dataBegin, uInt64 nr, uInt dataStride, const DataRanges &ranges, Bool isInclude)
no weights, no mask
FitToHalfStatistics< CASA_STATP > & operator=(const FitToHalfStatistics< CASA_STATP > &other)
copy semantics
void _unweightedStats(StatsData< AccumType > &stats, uInt64 &ngood, LocationType &location, const DataIterator &dataBegin, uInt64 nr, uInt dataStride)
no weights, no mask, no ranges
virtual void setStatsToCalculate(std::set< StatisticsData::STATS > &stats)
Override base class method by requiring mean to be computed in addition to what is added in stats if ...
virtual StatisticsAlgorithm< CASA_STATP > * clone() const
Clone this instance.
StatsData< AccumType > _statsData
virtual StatisticsData::ALGORITHM algorithm() const
get the algorithm that this object uses for computing stats
void _weightedStats(StatsData< AccumType > &stats, LocationType &location, const DataIterator &dataBegin, const WeightsIterator &weightBegin, uInt64 nr, uInt dataStride, const MaskIterator &maskBegin, uInt maskStride)
std::map< Double, AccumType > getQuantiles(const std::set< Double > &fractions, CountedPtr< uInt64 > knownNpts=nullptr, CountedPtr< AccumType > knownMin=nullptr, CountedPtr< AccumType > knownMax=nullptr, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt nBins=10000)
Get the specified quantiles.
void _unweightedStats(StatsData< AccumType > &stats, uInt64 &ngood, LocationType &location, const DataIterator &dataBegin, uInt64 nr, uInt dataStride, const MaskIterator &maskBegin, uInt maskStride)
uInt64 getNPts()
scan the dataset(s) that have been added, and find the number of good points.
void setCalculateAsAdded(Bool c)
This class does not allow statistics to be calculated as datasets are added, so an exception will be ...
AccumType getMedianAndQuantiles(std::map< Double, AccumType > &quantiles, const std::set< Double > &fractions, CountedPtr< uInt64 > knownNpts=nullptr, CountedPtr< AccumType > knownMin=nullptr, CountedPtr< AccumType > knownMax=nullptr, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt nBins=10000)
In the following group of methods, if the size of the composite dataset is smaller than binningThresh...
void _getMinMax(CountedPtr< AccumType > &realMin, CountedPtr< AccumType > &realMax, CountedPtr< AccumType > knownMin, CountedPtr< AccumType > knownMax)
get the min max of the entire (real + virtual) data set.
virtual void reset()
reset object to initial state.
CountedPtr< AccumType > _realMin
void _getRealMinMax(AccumType &realMin, AccumType &realMax)
get the min/max of the real portion only of the dataset
void _weightedStats(StatsData< AccumType > &stats, LocationType &location, const DataIterator &dataBegin, const WeightsIterator &weightsBegin, uInt64 nr, uInt dataStride)
has weights, but no mask, no ranges
ALGORITHM
implemented algorithms
const Double c
Fundamental physical constants (SI units):
this file contains all the compiler specific defines
Definition: mainpage.dox:28
const Bool False
Definition: aipstype.h:44
unsigned int uInt
Definition: aipstype.h:51
bool Bool
Define the standard types used by Casacore.
Definition: aipstype.h:42
LatticeExprNode value(const LatticeExprNode &expr)
This function returns the value of the expression without a mask.
unsigned long long uInt64
Definition: aipsxtype.h:39
std::pair< Int64, Int64 > LocationType