casacore
StringDistance.h
Go to the documentation of this file.
1 //# StringDistance.h: Class to deal with Levensthein distance of strings
2 //# Copyright (C) 2009
3 //# Associated Universities, Inc. Washington DC, USA.
4 //#
5 //# This library is free software; you can redistribute it and/or modify it
6 //# under the terms of the GNU Library General Public License as published by
7 //# the Free Software Foundation; either version 2 of the License, or (at your
8 //# option) any later version.
9 //#
10 //# This library is distributed in the hope that it will be useful, but WITHOUT
11 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 //# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
13 //# License for more details.
14 //#
15 //# You should have received a copy of the GNU Library General Public License
16 //# along with this library; if not, write to the Free Software Foundation,
17 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA.
18 //#
19 //# Correspondence concerning AIPS++ should be addressed as follows:
20 //# Internet email: aips2-request@nrao.edu.
21 //# Postal address: AIPS++ Project Office
22 //# National Radio Astronomy Observatory
23 //# 520 Edgemont Road
24 //# Charlottesville, VA 22903-2475 USA
25 //#
26 //# $Id$
27 
28 #ifndef CASA_STRINGDISTANCE_H
29 #define CASA_STRINGDISTANCE_H
30 
31 //# Includes
32 #include <casacore/casa/aips.h>
33 #include <casacore/casa/BasicSL/String.h>
34 #include <casacore/casa/Arrays/Matrix.h>
35 
36 namespace casacore {
37 
38 // <summary>
39 // Class to deal with Levensthein distance of strings.
40 // </summary>
41 
42 // <synopsis>
43 // The Levenshtein Distance is a metric telling how similar strings are.
44 // It is also known as the Edit Distance.
45 //
46 // The distance tells how many operations (i.e., character substitutions,
47 // insertions, and deletions are needed to transform one string into another.
48 // <br>There are several extensions to the basic definition:
49 // <ul>
50 // <li> Treat a swap of adjacent characters as one operation.
51 // <li> Give a weight to operations (e.g., insertions and deletions have
52 // half the weight of the other operations).
53 // <li> Take locality into account. For example, successive substitutions
54 // weigh more than non-successive.
55 // <li> Extend to wildcarded strings.
56 // </ul>
57 // This class optionally uses the swap extension. Furthermore one can
58 // optionally ignore blanks. By default both options are used.
59 //
60 // The code is based on code written by Anders Sewerin Johansen.
61 // Calculating the distance is an expensive O(N^2) operation, thus
62 // should be used with care.
63 //
64 // The class is constructed with the source string to compare against.
65 // Thereafter its <code>match</code> or <code>distance</code>
66 // function can be used for each target string.
67 // </synopsis>
68 
70 {
71 public:
72  // Default constructor sets maxDistance to 0.
74 
75  // Construct from the source string and maximum distance.
76  // If the maximum distance is negative, it defaults to 1+strlength/3.
77  // Note that maximum distance 0 means that the strings must match exactly.
78  explicit StringDistance (const String& source, Int maxDistance=-1,
79  Bool countSwaps=True, Bool ignoreBlanks=True,
80  Bool caseInsensitive=False);
81 
82  // Get data members.
83  // <group>
84  const string& source() const
85  { return itsSource; }
86  Int maxDistance() const
87  { return itsMaxDistance; }
88  const Matrix<Int>& matrix() const
89  { return itsMatrix; }
90  // </group>
91 
92  // Test if the given target string is within the maximum distance.
93  Bool match (const String& target) const;
94 
95  // Calculate the distance from the string to the string given in the constructor.
96  // If the length of target exceeds source length + maxDistance,
97  // the difference in lengths is returned.
98  Int distance (const String& target) const;
99 
100  // Calculate the distance between the two strings.
101  // This is slower than the <src>distance</src> member function, because
102  // it has to allocate the underlying Matrix for each invocation.
103  static Int distance (const String& source, const String& target,
104  Bool countSwaps=True);
105 
106  // Remove blanks from the given string.
107  static String removeBlanks (const String& source);
108 
109 private:
110  // Calculate the distance.
111  static Int doDistance (const String& source, const String& target,
112  Bool countSwaps, Matrix<Int>& matrix);
113 
114 
115 private:
122 };
123 
124 } //# end namespace
125 
126 #endif
StringDistance(const String &source, Int maxDistance=-1, Bool countSwaps=True, Bool ignoreBlanks=True, Bool caseInsensitive=False)
Construct from the source string and maximum distance.
static Int doDistance(const String &source, const String &target, Bool countSwaps, Matrix< Int > &matrix)
Calculate the distance.
const Matrix< Int > & matrix() const
static Int distance(const String &source, const String &target, Bool countSwaps=True)
Calculate the distance between the two strings.
static String removeBlanks(const String &source)
Remove blanks from the given string.
Int distance(const String &target) const
Calculate the distance from the string to the string given in the constructor.
const string & source() const
Get data members.
StringDistance()
Default constructor sets maxDistance to 0.
Bool match(const String &target) const
Test if the given target string is within the maximum distance.
String: the storage and methods of handling collections of characters.
Definition: String.h:225
this file contains all the compiler specific defines
Definition: mainpage.dox:28
const Bool False
Definition: aipstype.h:44
int Int
Definition: aipstype.h:50
bool Bool
Define the standard types used by Casacore.
Definition: aipstype.h:42
const Bool True
Definition: aipstype.h:43