Frames | No Frames |
1: /* Matcher.java -- Instance of a regular expression applied to a char sequence. 2: Copyright (C) 2002, 2004, 2006 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. */ 37: 38: 39: package java.util.regex; 40: 41: import gnu.java.lang.CPStringBuilder; 42: 43: import gnu.java.util.regex.CharIndexed; 44: import gnu.java.util.regex.RE; 45: import gnu.java.util.regex.REMatch; 46: 47: /** 48: * Instance of a regular expression applied to a char sequence. 49: * 50: * @since 1.4 51: */ 52: public final class Matcher implements MatchResult 53: { 54: private Pattern pattern; 55: private CharSequence input; 56: // We use CharIndexed as an input object to the getMatch method in order 57: // that /\G/ (the end of the previous match) may work. The information 58: // of the previous match is stored in the CharIndexed object. 59: private CharIndexed inputCharIndexed; 60: private int position; 61: private int appendPosition; 62: private REMatch match; 63: 64: /** 65: * The start of the region of the input on which to match. 66: */ 67: private int regionStart; 68: 69: /** 70: * The end of the region of the input on which to match. 71: */ 72: private int regionEnd; 73: 74: /** 75: * True if the match process should look beyond the 76: * region marked by regionStart to regionEnd when 77: * performing lookAhead, lookBehind and boundary 78: * matching. 79: */ 80: private boolean transparentBounds; 81: 82: /** 83: * The flags that affect the anchoring bounds. 84: * If {@link #hasAnchoringBounds()} is {@code true}, 85: * the match process will honour the 86: * anchoring bounds: ^, \A, \Z, \z and $. If 87: * {@link #hasAnchoringBounds()} is {@code false}, 88: * the anchors are ignored and appropriate flags, 89: * stored in this variable, are used to provide this 90: * behaviour. 91: */ 92: private int anchoringBounds; 93: 94: Matcher(Pattern pattern, CharSequence input) 95: { 96: this.pattern = pattern; 97: this.input = input; 98: this.inputCharIndexed = RE.makeCharIndexed(input, 0); 99: regionStart = 0; 100: regionEnd = input.length(); 101: transparentBounds = false; 102: anchoringBounds = 0; 103: } 104: 105: /** 106: * Changes the pattern used by the {@link Matcher} to 107: * the one specified. Existing match information is lost, 108: * but the input and the matcher's position within it is 109: * retained. 110: * 111: * @param newPattern the new pattern to use. 112: * @return this matcher. 113: * @throws IllegalArgumentException if {@code newPattern} is 114: * {@code null}. 115: * @since 1.5 116: */ 117: public Matcher usePattern(Pattern newPattern) 118: { 119: if (newPattern == null) 120: throw new IllegalArgumentException("The new pattern was null."); 121: pattern = newPattern; 122: match = null; 123: 124: return this; 125: } 126: 127: /** 128: * @param sb The target string buffer 129: * @param replacement The replacement string 130: * 131: * @exception IllegalStateException If no match has yet been attempted, 132: * or if the previous match operation failed 133: * @exception IndexOutOfBoundsException If the replacement string refers 134: * to a capturing group that does not exist in the pattern 135: */ 136: public Matcher appendReplacement (StringBuffer sb, String replacement) 137: throws IllegalStateException 138: { 139: assertMatchOp(); 140: sb.append(input.subSequence(appendPosition, 141: match.getStartIndex()).toString()); 142: sb.append(RE.getReplacement(replacement, match, 143: RE.REG_REPLACE_USE_BACKSLASHESCAPE)); 144: appendPosition = match.getEndIndex(); 145: return this; 146: } 147: 148: /** 149: * @param sb The target string buffer 150: */ 151: public StringBuffer appendTail (StringBuffer sb) 152: { 153: sb.append(input.subSequence(appendPosition, input.length()).toString()); 154: return sb; 155: } 156: 157: /** 158: * @exception IllegalStateException If no match has yet been attempted, 159: * or if the previous match operation failed 160: */ 161: public int end () 162: throws IllegalStateException 163: { 164: assertMatchOp(); 165: return match.getEndIndex(); 166: } 167: 168: /** 169: * @param group The index of a capturing group in this matcher's pattern 170: * 171: * @exception IllegalStateException If no match has yet been attempted, 172: * or if the previous match operation failed 173: * @exception IndexOutOfBoundsException If the replacement string refers 174: * to a capturing group that does not exist in the pattern 175: */ 176: public int end (int group) 177: throws IllegalStateException 178: { 179: assertMatchOp(); 180: return match.getEndIndex(group); 181: } 182: 183: public boolean find () 184: { 185: boolean first = (match == null); 186: if (transparentBounds || (regionStart == 0 && regionEnd == input.length())) 187: match = pattern.getRE().getMatch(inputCharIndexed, position, anchoringBounds); 188: else 189: match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 190: position, anchoringBounds); 191: if (match != null) 192: { 193: int endIndex = match.getEndIndex(); 194: // Is the match within input limits? 195: if (endIndex > input.length()) 196: { 197: match = null; 198: return false; 199: } 200: // Are we stuck at the same position? 201: if (!first && endIndex == position) 202: { 203: match = null; 204: // Not at the end of the input yet? 205: if (position < input.length() - 1) 206: { 207: position++; 208: return find(position); 209: } 210: else 211: return false; 212: } 213: position = endIndex; 214: return true; 215: } 216: return false; 217: } 218: 219: /** 220: * @param start The index to start the new pattern matching 221: * 222: * @exception IndexOutOfBoundsException If the replacement string refers 223: * to a capturing group that does not exist in the pattern 224: */ 225: public boolean find (int start) 226: { 227: if (transparentBounds || (regionStart == 0 && regionEnd == input.length())) 228: match = pattern.getRE().getMatch(inputCharIndexed, start, anchoringBounds); 229: else 230: match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 231: start, anchoringBounds); 232: if (match != null) 233: { 234: position = match.getEndIndex(); 235: return true; 236: } 237: return false; 238: } 239: 240: /** 241: * @exception IllegalStateException If no match has yet been attempted, 242: * or if the previous match operation failed 243: */ 244: public String group () 245: { 246: assertMatchOp(); 247: return match.toString(); 248: } 249: 250: /** 251: * @param group The index of a capturing group in this matcher's pattern 252: * 253: * @exception IllegalStateException If no match has yet been attempted, 254: * or if the previous match operation failed 255: * @exception IndexOutOfBoundsException If the replacement string refers 256: * to a capturing group that does not exist in the pattern 257: */ 258: public String group (int group) 259: throws IllegalStateException 260: { 261: assertMatchOp(); 262: return match.toString(group); 263: } 264: 265: /** 266: * @param replacement The replacement string 267: */ 268: public String replaceFirst (String replacement) 269: { 270: reset(); 271: // Semantics might not quite match 272: return pattern.getRE().substitute(input, replacement, position, 273: RE.REG_REPLACE_USE_BACKSLASHESCAPE); 274: } 275: 276: /** 277: * @param replacement The replacement string 278: */ 279: public String replaceAll (String replacement) 280: { 281: reset(); 282: return pattern.getRE().substituteAll(input, replacement, position, 283: RE.REG_REPLACE_USE_BACKSLASHESCAPE); 284: } 285: 286: public int groupCount () 287: { 288: return pattern.getRE().getNumSubs(); 289: } 290: 291: public boolean lookingAt () 292: { 293: if (transparentBounds || (regionStart == 0 && regionEnd == input.length())) 294: match = pattern.getRE().getMatch(inputCharIndexed, regionStart, 295: anchoringBounds|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX); 296: else 297: match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0, 298: anchoringBounds|RE.REG_FIX_STARTING_POSITION); 299: if (match != null) 300: { 301: if (match.getStartIndex() == 0) 302: { 303: position = match.getEndIndex(); 304: return true; 305: } 306: match = null; 307: } 308: return false; 309: } 310: 311: /** 312: * Attempts to match the entire input sequence against the pattern. 313: * 314: * If the match succeeds then more information can be obtained via the 315: * start, end, and group methods. 316: * 317: * @see #start() 318: * @see #end() 319: * @see #group() 320: */ 321: public boolean matches () 322: { 323: if (transparentBounds || (regionStart == 0 && regionEnd == input.length())) 324: match = pattern.getRE().getMatch(inputCharIndexed, regionStart, 325: anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX); 326: else 327: match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0, 328: anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION); 329: if (match != null) 330: { 331: if (match.getStartIndex() == 0) 332: { 333: position = match.getEndIndex(); 334: if (position == input.length()) 335: return true; 336: } 337: match = null; 338: } 339: return false; 340: } 341: 342: /** 343: * Returns the Pattern that is interpreted by this Matcher 344: */ 345: public Pattern pattern () 346: { 347: return pattern; 348: } 349: 350: /** 351: * Resets the internal state of the matcher, including 352: * resetting the region to its default state of encompassing 353: * the whole input. The state of {@link #hasTransparentBounds()} 354: * and {@link #hasAnchoringBounds()} are unaffected. 355: * 356: * @return a reference to this matcher. 357: * @see #regionStart() 358: * @see #regionEnd() 359: * @see #hasTransparentBounds() 360: * @see #hasAnchoringBounds() 361: */ 362: public Matcher reset () 363: { 364: position = 0; 365: match = null; 366: regionStart = 0; 367: regionEnd = input.length(); 368: appendPosition = 0; 369: return this; 370: } 371: 372: /** 373: * Resets the internal state of the matcher, including 374: * resetting the region to its default state of encompassing 375: * the whole input. The state of {@link #hasTransparentBounds()} 376: * and {@link #hasAnchoringBounds()} are unaffected. 377: * 378: * @param input The new input character sequence. 379: * @return a reference to this matcher. 380: * @see #regionStart() 381: * @see #regionEnd() 382: * @see #hasTransparentBounds() 383: * @see #hasAnchoringBounds() 384: */ 385: public Matcher reset (CharSequence input) 386: { 387: this.input = input; 388: this.inputCharIndexed = RE.makeCharIndexed(input, 0); 389: return reset(); 390: } 391: 392: /** 393: * @return the index of a capturing group in this matcher's pattern 394: * 395: * @exception IllegalStateException If no match has yet been attempted, 396: * or if the previous match operation failed 397: */ 398: public int start () 399: throws IllegalStateException 400: { 401: assertMatchOp(); 402: return match.getStartIndex(); 403: } 404: 405: /** 406: * @param group The index of a capturing group in this matcher's pattern 407: * 408: * @exception IllegalStateException If no match has yet been attempted, 409: * or if the previous match operation failed 410: * @exception IndexOutOfBoundsException If the replacement string refers 411: * to a capturing group that does not exist in the pattern 412: */ 413: public int start (int group) 414: throws IllegalStateException 415: { 416: assertMatchOp(); 417: return match.getStartIndex(group); 418: } 419: 420: /** 421: * @return True if and only if the matcher hit the end of input. 422: * @since 1.5 423: */ 424: public boolean hitEnd() 425: { 426: return inputCharIndexed.hitEnd(); 427: } 428: 429: /** 430: * @return A string expression of this matcher. 431: */ 432: public String toString() 433: { 434: CPStringBuilder sb = new CPStringBuilder(); 435: sb.append(this.getClass().getName()) 436: .append("[pattern=").append(pattern.pattern()) 437: .append(" region=").append(regionStart).append(",").append(regionEnd) 438: .append(" anchoringBounds=").append(anchoringBounds == 0) 439: .append(" transparentBounds=").append(transparentBounds) 440: .append(" lastmatch=").append(match == null ? "" : match.toString()) 441: .append("]"); 442: return sb.toString(); 443: } 444: 445: private void assertMatchOp() 446: { 447: if (match == null) throw new IllegalStateException(); 448: } 449: 450: /** 451: * <p> 452: * Defines the region of the input on which to match. 453: * By default, the {@link Matcher} attempts to match 454: * the whole string (from 0 to the length of the input), 455: * but a region between {@code start} (inclusive) and 456: * {@code end} (exclusive) on which to match may instead 457: * be defined using this method. 458: * </p> 459: * <p> 460: * The behaviour of region matching is further affected 461: * by the use of transparent or opaque bounds (see 462: * {@link #useTransparentBounds(boolean)}) and whether or not 463: * anchors ({@code ^} and {@code $}) are in use 464: * (see {@link #useAnchoringBounds(boolean)}). With transparent 465: * bounds, the matcher is aware of input outside the bounds 466: * set by this method, whereas, with opaque bounds (the default) 467: * only the input within the bounds is used. The use of 468: * anchors are affected by this setting; with transparent 469: * bounds, anchors will match the beginning of the real input, 470: * while with opaque bounds they match the beginning of the 471: * region. {@link #useAnchoringBounds(boolean)} can be used 472: * to turn on or off the matching of anchors. 473: * </p> 474: * 475: * @param start the start of the region (inclusive). 476: * @param end the end of the region (exclusive). 477: * @return a reference to this matcher. 478: * @throws IndexOutOfBoundsException if either {@code start} or 479: * {@code end} are less than zero, 480: * if either {@code start} or 481: * {@code end} are greater than the 482: * length of the input, or if 483: * {@code start} is greater than 484: * {@code end}. 485: * @see #regionStart() 486: * @see #regionEnd() 487: * @see #hasTransparentBounds() 488: * @see #useTransparentBounds(boolean) 489: * @see #hasAnchoringBounds() 490: * @see #useAnchoringBounds(boolean) 491: * @since 1.5 492: */ 493: public Matcher region(int start, int end) 494: { 495: int length = input.length(); 496: if (start < 0) 497: throw new IndexOutOfBoundsException("The start position was less than zero."); 498: if (start >= length) 499: throw new IndexOutOfBoundsException("The start position is after the end of the input."); 500: if (end < 0) 501: throw new IndexOutOfBoundsException("The end position was less than zero."); 502: if (end > length) 503: throw new IndexOutOfBoundsException("The end position is after the end of the input."); 504: if (start > end) 505: throw new IndexOutOfBoundsException("The start position is after the end position."); 506: reset(); 507: regionStart = start; 508: regionEnd = end; 509: return this; 510: } 511: 512: /** 513: * The start of the region on which to perform matches (inclusive). 514: * 515: * @return the start index of the region. 516: * @see #region(int,int) 517: * #see #regionEnd() 518: * @since 1.5 519: */ 520: public int regionStart() 521: { 522: return regionStart; 523: } 524: 525: /** 526: * The end of the region on which to perform matches (exclusive). 527: * 528: * @return the end index of the region. 529: * @see #region(int,int) 530: * @see #regionStart() 531: * @since 1.5 532: */ 533: public int regionEnd() 534: { 535: return regionEnd; 536: } 537: 538: /** 539: * Returns true if the bounds of the region marked by 540: * {@link #regionStart()} and {@link #regionEnd()} are 541: * transparent. When these bounds are transparent, the 542: * matching process can look beyond them to perform 543: * lookahead, lookbehind and boundary matching operations. 544: * By default, the bounds are opaque. 545: * 546: * @return true if the bounds of the matching region are 547: * transparent. 548: * @see #useTransparentBounds(boolean) 549: * @see #region(int,int) 550: * @see #regionStart() 551: * @see #regionEnd() 552: * @since 1.5 553: */ 554: public boolean hasTransparentBounds() 555: { 556: return transparentBounds; 557: } 558: 559: /** 560: * Sets the transparency of the bounds of the region 561: * marked by {@link #regionStart()} and {@link #regionEnd()}. 562: * A value of {@code true} makes the bounds transparent, 563: * so the matcher can see beyond them to perform lookahead, 564: * lookbehind and boundary matching operations. A value 565: * of {@code false} (the default) makes the bounds opaque, 566: * restricting the match to the input region denoted 567: * by {@link #regionStart()} and {@link #regionEnd()}. 568: * 569: * @param transparent true if the bounds should be transparent. 570: * @return a reference to this matcher. 571: * @see #hasTransparentBounds() 572: * @see #region(int,int) 573: * @see #regionStart() 574: * @see #regionEnd() 575: * @since 1.5 576: */ 577: public Matcher useTransparentBounds(boolean transparent) 578: { 579: transparentBounds = transparent; 580: return this; 581: } 582: 583: /** 584: * Returns true if the matcher will honour the use of 585: * the anchoring bounds: {@code ^}, {@code \A}, {@code \Z}, 586: * {@code \z} and {@code $}. By default, the anchors 587: * are used. Note that the effect of the anchors is 588: * also affected by {@link #hasTransparentBounds()}. 589: * 590: * @return true if the matcher will attempt to match 591: * the anchoring bounds. 592: * @see #useAnchoringBounds(boolean) 593: * @see #hasTransparentBounds() 594: * @since 1.5 595: */ 596: public boolean hasAnchoringBounds() 597: { 598: return anchoringBounds == 0; 599: } 600: 601: /** 602: * Enables or disables the use of the anchoring bounds: 603: * {@code ^}, {@code \A}, {@code \Z}, {@code \z} and 604: * {@code $}. By default, their use is enabled. When 605: * disabled, the matcher will not attempt to match 606: * the anchors. 607: * 608: * @param useAnchors true if anchoring bounds should be used. 609: * @return a reference to this matcher. 610: * @since 1.5 611: * @see #hasAnchoringBounds() 612: */ 613: public Matcher useAnchoringBounds(boolean useAnchors) 614: { 615: if (useAnchors) 616: anchoringBounds = 0; 617: else 618: anchoringBounds = RE.REG_NOTBOL|RE.REG_NOTEOL; 619: return this; 620: } 621: 622: /** 623: * Returns a read-only snapshot of the current state of 624: * the {@link Matcher} as a {@link MatchResult}. Any 625: * subsequent changes to this instance are not reflected 626: * in the returned {@link MatchResult}. 627: * 628: * @return a {@link MatchResult} instance representing the 629: * current state of the {@link Matcher}. 630: */ 631: public MatchResult toMatchResult() 632: { 633: Matcher snapshot = new Matcher(pattern, input); 634: if (match != null) 635: snapshot.match = (REMatch) match.clone(); 636: return snapshot; 637: } 638: 639: /** 640: * Returns a literalized string of s where characters {@code $} and {@code 641: * \\} are escaped. 642: * 643: * @param s the string to literalize. 644: * @return the literalized string. 645: * @since 1.5 646: */ 647: public static String quoteReplacement(String s) 648: { 649: if (s == null) 650: throw new NullPointerException(); 651: CPStringBuilder sb = new CPStringBuilder(); 652: for (int i = 0; i < s.length(); i++) 653: { 654: char ch = s.charAt(i); 655: if (ch == '$' || ch == '\\') 656: sb.append('\\'); 657: sb.append(ch); 658: } 659: return sb.toString(); 660: } 661: 662: }