casacore
Regex.h
Go to the documentation of this file.
1 //# Regex.h: Regular expression class
2 //# Copyright (C) 1993,1994,1995,1996,1997,1999,2000,2001,2003
3 //# Associated Universities, Inc. Washington DC, USA.
4 //#
5 //# This library is free software; you can redistribute it and/or modify it
6 //# under the terms of the GNU Library General Public License as published by
7 //# the Free Software Foundation; either version 2 of the License, or (at your
8 //# option) any later version.
9 //#
10 //# This library is distributed in the hope that it will be useful, but WITHOUT
11 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 //# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
13 //# License for more details.
14 //#
15 //# You should have received a copy of the GNU Library General Public License
16 //# along with this library; if not, write to the Free Software Foundation,
17 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA.
18 //#
19 //# Correspondence concerning AIPS++ should be addressed as follows:
20 //# Internet email: aips2-request@nrao.edu.
21 //# Postal address: AIPS++ Project Office
22 //# National Radio Astronomy Observatory
23 //# 520 Edgemont Road
24 //# Charlottesville, VA 22903-2475 USA
25 //#
26 //# $Id$
27 
28 #ifndef CASA_REGEX_H
29 #define CASA_REGEX_H
30 
31 //# Includes
32 #include <casacore/casa/aips.h>
33 #include <casacore/casa/iosfwd.h>
34 #include <regex>
35 #include <casacore/casa/BasicSL/String.h>
36 
37 namespace casacore { //# NAMESPACE CASACORE - BEGIN
38 
39 //# Forward declarations.
40 struct re_pattern_buffer;
41 struct re_registers;
42 
43 // <summary>
44 // Regular expression class (based on std::regex)
45 // </summary>
46 
47 // <use visibility=export>
48 
49 // <reviewed reviewer="Friso Olnon" date="1995/03/20" tests="tRegex" demos="">
50 // </reviewed>
51 
52 // <synopsis>
53 // This class provides regular expression functionality, such as
54 // matching and searching in strings, comparison of expressions, and
55 // input/output. It is built on the standard C++ regular expression class
56 // using the ECMAScript syntax. It is almost the same as the regular expression
57 // syntax used until March 2019 which used GNU's cregex.cc.
58 // ECMAScript offers more functionality (such as non-greedy matching),
59 // but there is a slight difference in how brackets are used. In the old
60 // regex they did not need to be escaped, while they have to be for ECMAScript.
61 // Furthermore, in the old Regex up to 9 backreferences could be given, so
62 // \15 meant the first backreference followed by a 5. In ECMAScript it means
63 // the 15th and parentheses are needed to get the old meaning.
64 // These differences are solved in the Regex constructor which adds escape
65 // characters as needed. Thus existing code using Regex does not need to be changed.
66 //
67 // Apart from proper regular expressions, it also supports glob patterns
68 // (UNIX file name patterns) by means of a conversion to a proper regex string.
69 // Also ordinary strings and SQL-style patterns can be converted to a proper
70 // regex string.
71 // <p>
72 // See http://www.cplusplus.com/reference/regex/ECMAScript for the syntax.
73 // <dl>
74 // <dt> ^
75 // <dd> matches the beginning of a line.
76 // <dt> $
77 // <dd> matches the end of a line.
78 // <dt> .
79 // <dd> matches any character
80 // <dt> *
81 // <dd> zero or more times the previous subexpression.
82 // <dt> +
83 // <dd> one or more times the previous subexpression.
84 // <dt> ?
85 // <dd> zero or one time the previous subexpression.
86 // <dt> {n,m}
87 // <dd> interval operator to specify how many times a subexpression
88 // can match. See man page of egrep or regexp for more detail.
89 // <dt> []
90 // <dd> matches any character inside the brackets; e.g. <src>[abc]</src>.
91 // A hyphen can be used for a character range; e.g. <src>[a-z]</src>.
92 // <br>
93 // A ^ right after the opening bracket indicates "not";
94 // e.g. <src>[^abc]</src> means any character but a, b, and c.
95 // If ^ is not the first character, it is a literal caret.
96 // If - is the last character, it is a literal hyphen.
97 // If ] is the first character, it is a literal closing bracket.
98 // <br>
99 // Special character classes are
100 // [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:],
101 // [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:].
102 // The brackets are part of the name; e.g.
103 // <src>[^[:upper:]]</src> is equal to <src>[^A-Z]</src>.
104 // Note that [:upper:] is more portable, because A-Z fails
105 // for the EBCDIC character set.
106 // <dt> ( )
107 // <dd> grouping to change the normal operator precedence.
108 // <dt> |
109 // <dd> or operator. Matches left side or right side.
110 // <dt> \\1 till \\9 <dd> Backreference to a subexpression. Matches the part
111 // of the string equal to the part that matched the subexpression.
112 // </dl>
113 // Special characters have to be escaped with a backslash to use them
114 // literally. Only inside the square brackets, escaping should not be done.
115 // See the man page of egrep or regexp for more information about
116 // regular expressions.
117 // <p>
118 // Several global Regex objects are predefined for common functionality.
119 // <dl>
120 // <dt> RXwhite
121 // <dd> one or more whitespace characters
122 // <dt> RXint
123 // <dd> integer number (also negative)
124 // <dt> RXdouble
125 // <dd> double number (with e or E as exponent)
126 // <dt> RXalpha
127 // <dd> one or more alphabetic characters (lowercase and/or uppercase)
128 // <dt> RXlowercase
129 // <dd> lowercase alphabetic
130 // <dt> RXuppercase
131 // <dd> uppercase alphabetic
132 // <dt> RXalphanum
133 // <dd> one or more alphabetic/numeric characters (lowercase and/or uppercase)
134 // <dt> RXidentifier
135 // <dd> identifier name (first alphabetic or underscore, then zero or
136 // more alphanumerics and/or underscores)
137 // </dl>
138 // The static member function <src>fromPattern</src> converts a shell-like
139 // pattern to a String which can be used to create a Regex from it.
140 // A pattern has the following special characters:
141 // <dl>
142 // <dt> *
143 // <dd> Zero or more arbitrary characters.
144 // <dt> ?
145 // <dd> One arbitrary character
146 // <dt> []
147 // <dd> The same as [] in a regular expression (see above).
148 // In addition to ^ a ! can be used to indicate "not".
149 // <dt> {,}
150 // <dd> A brace expression which is like brace expansion in some shells.
151 // It is similar to the | construct in a regular expression.
152 // <br>
153 // E.g. <src>{abc,defg}</src> means <src>abc</src> or <src>defg</src>.
154 // Brace expressions can be nested and can contain other
155 // special characters.
156 // <br>
157 // E.g. St{Man*.{h,cc},Col?*.{h,cc,l,y}}
158 // <br>A literal comma or brace in a brace expression can be given by
159 // escaping it with a backslash.
160 // </dl>
161 // The static member function <src>fromSQLPattern</src> converts an SQL-like
162 // pattern to a String which can be used to create a Regex from it.
163 // A pattern has the following special characters:
164 // <dl>
165 // <dt> %
166 // <dd> Zero or more arbitrary characters.
167 // <dt> _
168 // <dd> One arbitrary character
169 // </dl>
170 // The static member function <src>fromString</src> converts a normal
171 // string to a regular expression. This function escapes characters in
172 // the string which are special in a regular expression. In this way a
173 // normal string can be passed to a function taking a regular expression.
174 //
175 // The static member function <src>makeCaseInsensitive</src> returns a
176 // new regular expression string containing the case-insensitive version of
177 // the given expression string.
178 // </synopsis>
179 
180 // <example>
181 // <srcblock>
182 // Regex RXwhite("[ \n\t\r\v\f]+");
183 // (blank, newline, tab, return, vertical tab, formfeed)
184 // Regex RXint("[-+]?[0-9]+");
185 // Regex RXdouble("[-+]?(([0-9]+\\.[0-9]*)|([0-9]+)|(\\.[0-9]+))([eE][+-]?[0-9]+)?");
186 // Regex RXalpha("[A-Za-z]+");
187 // Regex RXlowercase("[a-z]+");
188 // Regex RXuppercase("[A-Z]+");
189 // Regex RXalphanum("[0-9A-Za-z]+");
190 // Regex RXidentifier("[A-Za-z_][A-Za-z0-9_]*");
191 // </srcblock>
192 // In RXdouble the . is escaped via a backslash to get it literally.
193 // The second backslash is needed to escape the backslash in C++.
194 // <srcblock>
195 // Regex rx1 (Regex::fromPattern ("St*.{h,cc}"));
196 // results in regexp "St.*\.((h)|(cc))"
197 // Regex rx2 (Regex::fromString ("tRegex.cc"));
198 // results in regexp "tRegex\.cc"
199 // </srcblock>
200 // </example>
201 
202 //# <todo asof="2001/07/15">
203 //# </todo>
204 
205 
207 {
208 public:
209  // Default constructor uses a zero-length regular expression.
210  Regex();
211 
212  // Construct a regular expression from the string.
213  // If toECMAScript=True, function toEcma is called to convert the old cregex
214  // syntax to the new ECMAScript syntax.
215  // If fast=True, matching efficiency is preferred over the efficiency of
216  // constructing the regex object.
217  explicit Regex(const String& exp, Bool fast=False, Bool toECMAScript=True);
218 
219  // Construct a new regex (using the default Regex constructor arguments).
220  void operator=(const String& str);
221 
222  // Convert the possibly old-style regex to the Ecma regex which means
223  // that unescaped [ and ] inside a bracket expression will be escaped and
224  // that a numeric character after a backreference is enclosed in brackets
225  // (otherwise the backreference uses multiple characters).
226  static String toEcma(const String& rx);
227 
228  // Convert a shell-like pattern to a regular expression string.
229  // This is useful for people who are more familiar with patterns
230  // than with regular expressions.
232 
233  // Convert an SQL-like pattern to a regular expression string.
234  // This is useful for TaQL, which mimics SQL.
236 
237  // Convert a normal string to a regular expression string.
238  // This consists of escaping the special characters.
239  // This is useful when one wants to provide a normal string
240  // (which may contain special characters) to a function working
241  // on regular expressions.
242  static String fromString(const String& str);
243 
244  // Create a case-insensitive regular expression string from the given
245  // regular expression string.
246  // It does it by inserting the lowercase and uppercase version of
247  // characters in the input string into the output string.
248  static String makeCaseInsensitive (const String& str);
249 
250  // Get the regular expression string.
251  const String& regexp() const
252  { return itsStr; }
253 
254  // Test if the regular expression matches (first part of) string <src>s</src>.
255  // The return value gives the length of the matching string part,
256  // or String::npos if there is no match or an error.
257  // The string has <src>len</src> characters and the test starts at
258  // position <src>pos</src>. The string may contain null characters.
259  // Negative p is allowed to define the start from the end.
260  //
261  // <note role=tip>
262  // Use the appropriate <linkto class=String>String</linkto> functions
263  // to test if a string matches a regular expression.
264  // <src>Regex::match</src> is pretty low-level.
265  // </note>
267  String::size_type len,
268  String::size_type pos=0) const;
269 
270  // Test if the regular expression matches the entire string.
271  Bool fullMatch(const Char* s, String::size_type len) const;
272 
273  // Test if the regular expression occurs anywhere in string <src>s</src>.
274  // The return value gives the position of the first substring
275  // matching the regular expression. The length of that substring
276  // is returned in <src>matchlen</src>.
277  // The string has <src>len</src> characters and the test starts at
278  // position <src>pos</src>. The string may contain null characters.
279  // If the pos given is negative, the search starts -pos from the end.
280  // <note role=tip>
281  // Use the appropriate <linkto class=String>String</linkto> functions
282  // to test if a regular expression occurs in a string.
283  // <src>Regex::search</src> is pretty low-level.
284  // </note>
285  // <group>
287  String::size_type len,
288  Int& matchlen,
289  Int pos=0) const;
291  Int& matchlen,
292  String::size_type pos=0) const;
293  // </group>
294 
295  // Search backwards.
297  Int& matchlen,
298  uInt pos) const;
299 
300  // Write the regex string.
301  friend ostream& operator<<(ostream& ios, const Regex& exp);
302 
303 protected:
304  String itsStr; // the reg. exp. string
305 };
306 
307 
308 // some built in regular expressions
309 
310 extern const Regex RXwhite; //# = "[ \n\t\r\v\f]+"
311 extern const Regex RXint; //# = "[-+]?[0-9]+"
312 extern const Regex RXdouble; //# = "[-+]?(([0-9]+\\.[0-9]*)|
313  //# ([0-9]+)|(\\.[0-9]+))
314  //# ([eE][+-]?[0-9]+)?"
315 extern const Regex RXalpha; //# = "[A-Za-z]+"
316 extern const Regex RXlowercase; //# = "[a-z]+"
317 extern const Regex RXuppercase; //# = "[A-Z]+"
318 extern const Regex RXalphanum; //# = "[0-9A-Za-z]+"
319 extern const Regex RXidentifier; //# = "[A-Za-z_][A-Za-z0-9_]*"
320 
321 
322 } //# NAMESPACE CASACORE - END
323 
324 #endif
friend ostream & operator<<(ostream &ios, const Regex &exp)
Write the regex string.
void operator=(const String &str)
Construct a new regex (using the default Regex constructor arguments).
const String & regexp() const
Get the regular expression string.
Definition: Regex.h:251
Regex()
Default constructor uses a zero-length regular expression.
String::size_type search(const Char *s, String::size_type len, Int &matchlen, Int pos=0) const
Test if the regular expression occurs anywhere in string s.
static String fromPattern(const String &pattern)
Convert a shell-like pattern to a regular expression string.
Bool fullMatch(const Char *s, String::size_type len) const
Test if the regular expression matches the entire string.
String::size_type searchBack(const Char *s, String::size_type len, Int &matchlen, uInt pos) const
Search backwards.
Regex(const String &exp, Bool fast=False, Bool toECMAScript=True)
Construct a regular expression from the string.
static String toEcma(const String &rx)
Convert the possibly old-style regex to the Ecma regex which means that unescaped [ and ] inside a br...
String::size_type find(const Char *s, String::size_type len, Int &matchlen, String::size_type pos=0) const
static String fromString(const String &str)
Convert a normal string to a regular expression string.
static String makeCaseInsensitive(const String &str)
Create a case-insensitive regular expression string from the given regular expression string.
static String fromSQLPattern(const String &pattern)
Convert an SQL-like pattern to a regular expression string.
String::size_type match(const Char *s, String::size_type len, String::size_type pos=0) const
Test if the regular expression matches (first part of) string s.
String itsStr
Definition: Regex.h:304
String: the storage and methods of handling collections of characters.
Definition: String.h:225
string::size_type size_type
Definition: String.h:233
this file contains all the compiler specific defines
Definition: mainpage.dox:28
TableExprNode pattern(const TableExprNode &node)
Definition: ExprNode.h:1487
const Regex RXint
const Bool False
Definition: aipstype.h:44
LatticeExprNode exp(const LatticeExprNode &expr)
const Regex RXalpha
unsigned int uInt
Definition: aipstype.h:51
const Regex RXuppercase
const Regex RXdouble
TableExprNode regex(const TableExprNode &node)
Functions for regular expression matching and pattern matching.
Definition: ExprNode.h:1483
int Int
Definition: aipstype.h:50
bool Bool
Define the standard types used by Casacore.
Definition: aipstype.h:42
const Regex RXidentifier
const Regex RXlowercase
const Bool True
Definition: aipstype.h:43
char Char
Definition: aipstype.h:46
const Regex RXwhite
some built in regular expressions
const Regex RXalphanum