Frames | No Frames |
/* ReaderTokenizer.java -- splits the input char sequence into tokens.
   Copyright (C) 2005 Free Software Foundation, Inc.

This file is part of GNU Classpath.

GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING.  If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA.

Linking this library statically or dynamically with other modules is
making a combined work based on this library.  Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.

As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module.  An independent module is a module which is not derived from
or based on this library.  If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so.  If you do not wish to do so, delete this
exception statement from your version. */


package gnu.javax.swing.text.html.parser.support.low;

import java.io.IOException;
import java.io.Reader;

/**
 * Splits the input char sequence into tokens. Characters are read from
 * the reader into an internal buffer; each time the end of the buffer is
 * recognised as a token, that token is moved to the queue of found tokens.
 *
 * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
 */
public class ReaderTokenizer
  extends Constants
{
  /**
   * This is set to true each time the getNextToken is called.
   * Used in preventing loops when all patterns refuse to accept
   * the invalid input.
   */
  protected boolean advanced;

  /**
   * If true, the returned tokens are also placed in the backup
   * queue.
   */
  protected boolean backupMode;

  /**
   * The buffer to read document into.
   */
  Buffer buffer = new Buffer();

  /**
   * The queue for supporting mark().
   */
  Queue backup = new Queue();

  /**
   * The queue of found tokens.
   */
  Queue queue = new Queue();

  /**
   * The reader to read the document from.
   */
  Reader reader;

  /**
   * Array of char tokens. Not used by this class itself.
   */
  char[] charTokens;

  /**
   * Array of string tokens. Not used by this class itself.
   */
  String[] stringTokens;

  /**
   * The current reader position. It is -1 until the first character
   * has been requested from the reader.
   */
  int readerPosition = -1;

  /**
   * Creates a new ReaderTokenizer. The reset(...) method must be
   * subsequently called to set the reader.
   */
  public ReaderTokenizer()
  {
  }

  /**
   * Return the sequence, used to separate lines in the document.
   *
   * @return the value reported by the buffer; one of \n, \r or \r\n.
   */
  public String getEndOfLineSequence()
  {
    return buffer.getEndOfLineSequence();
  }

  /**
   * Get the next token and remove it from the queue of found tokens.
   * If the backup mode is on, the token is also stored in the backup
   * queue. Sets {@link #advanced} to true.
   *
   * @return the next token, or the EOF token if nothing more could be
   * read.
   * @throws ParseException wrapping any IOException thrown by the reader.
   */
  public Token getNextToken()
  {
    advanced = true;

    Token rt;
    try
      {
        if (queue.isEmpty())
          read(1);

        // read(1) may legitimately leave the queue empty (see readToken()),
        // so the EOF fallback is still required. Use the shared factory so
        // that every EOF token produced by this class looks the same.
        rt = queue.isEmpty() ? eofToken() : queue.next();
      }
    catch (IOException ex)
      {
        throw new ParseException("IO Exception", ex);
      }

    if (backupMode)
      backup.add(rt);
    return rt;
  }

  /**
   * Get a token, lying the given number of tokens ahead.
   * getTokenAhead(0) returns the same token that would be returned by
   * getNextToken(). The token is only looked up in the queue, not taken
   * from it, so getTokenAhead(..) does not change the current position
   * in the token sequence. If the end of stream is reached, the EOF
   * token is returned.
   *
   * @param ahead the zero based offset of the requested token.
   * @return the requested token or the EOF token.
   * @throws ParseException wrapping any IOException thrown by the reader.
   */
  public Token getTokenAhead(int ahead)
  {
    try
      {
        // Make sure the queue holds at least ahead + 1 tokens, if possible.
        read(ahead - queue.size() + 1);

        // Index 'ahead' is only valid when the queue holds MORE than
        // 'ahead' tokens.
        return queue.size() > ahead ? queue.get(ahead) : eofToken();
      }
    catch (IOException ex)
      {
        throw new ParseException("IO Exception", ex);
      }
  }

  /**
   * Get the token being immediately ahead, without taking it from the
   * queue. If the end of stream is reached, the EOF token is returned.
   * The method is equivalent to calling getTokenAhead(0).
   *
   * @return the token immediately ahead or the EOF token.
   * @throws ParseException wrapping any IOException thrown by the reader.
   */
  public Token getTokenAhead()
  {
    try
      {
        if (queue.isEmpty())
          read(1);

        return queue.isEmpty() ? eofToken() : queue.get(0);
      }
    catch (IOException ex)
      {
        throw new ParseException("IO Exception", ex);
      }
  }

  /**
   * Invokes the error handler. This implementation only prints the
   * message to the standard output and ignores the token.
   *
   * @param msg the error message.
   * @param at the token where the error was detected.
   */
  public void error(String msg, Token at)
  {
    System.out.println(msg);
  }

  /**
   * Turns the backup mode on or off. Any previously backed up tokens
   * are discarded. It is possible to return where the mark(true) was
   * last called by calling reset().
   *
   * @param mode True if it is required to save tokens, making
   * returning to the current point possible.
   */
  public void mark(boolean mode)
  {
    backup.clear();
    backupMode = mode;
  }

  /**
   * Prepare for new parsing from the given stream. All state belonging
   * to the previously parsed document is dropped, including a mark that
   * may still be active.
   *
   * @param a_reader A reader to parse from.
   */
  public void reset(Reader a_reader)
  {
    reader = a_reader;
    readerPosition = -1;
    buffer.reset();
    queue.clear();

    // A mark left over from the previous document must not be able to
    // replay (or keep collecting) tokens for the new one.
    backup.clear();
    backupMode = false;
  }

  /**
   * Reset the internal cursor to the position where the mark()
   * was last time called. Switches the backup mode off.
   *
   * @throws AssertionError if the backup mode is not on.
   */
  public void reset()
  {
    if (!backupMode)
      throw new AssertionError("Call mark(true) before using reset()!");
    backupMode = false;

    // Tokens that were read ahead but not yet returned follow the backed
    // up ones, so move them to the end of backup.
    while (!queue.isEmpty())
      backup.add(queue.next());

    // The backup becomes the working queue; the (now empty) old queue is
    // recycled as the new backup.
    Queue drained = queue;
    queue = backup;
    backup = drained;
    backup.clear();
  }

  /**
   * Invoke readToken() the given number of times. Does nothing if the
   * number is zero or negative.
   *
   * @param numberOfTokens The number of additional tokens to read.
   * @throws IOException if the reader fails.
   */
  void read(int numberOfTokens)
    throws IOException
  {
    for (int i = 0; i < numberOfTokens; i++)
      readToken();
  }

  /**
   * Read next token from the reader, add it to the queue.
   *
   * Characters are appended to the buffer one by one until
   * tokenMatches() recognises a token at the end of the buffer (it also
   * places that token into the queue). An ETX character found in the
   * input is replaced by a space. When the reader is exhausted:
   * <ul>
   * <li>if the buffer is empty, the EOF token is queued;</li>
   * <li>otherwise a single ETX is appended and matching is retried
   * (presumably to let a pending token see a terminating character);</li>
   * <li>if the buffer already ends with ETX, the ETX is discarded and the
   * remaining text, if any, is queued as an OTHER token. If nothing
   * remains, the method returns without queueing anything.</li>
   * </ul>
   *
   * @throws IOException if the reader fails.
   */
  void readToken()
    throws IOException
  {
    while (tokenMatches() == null)
      {
        int ch = reader.read();
        readerPosition++;

        // ETX is reserved as the internal end marker.
        if (ch == ETX)
          ch = ' ';

        if (ch >= 0)
          {
            buffer.append((char) ch, readerPosition);
            continue;
          }

        // End of the input stream.
        if (buffer.length() == 0)
          {
            queue.add(eofToken());
            return;
          }

        if (buffer.charAt(buffer.length() - 1) != ETX)
          {
            // Terminate the buffer and give the patterns one more try.
            buffer.append(ETX, readerPosition++);
            continue;
          }

        // Discard terminating ETX
        buffer.setLength(buffer.length() - 1);
        if (buffer.length() > 0)
          {
            Token remainder =
              new Token(OTHER, buffer.toString(),
                        buffer.getLocation(0, buffer.length())
                       );
            queue.add(remainder);
            buffer.setLength(0);
          }
        return;
      }
  }

  /**
   * Check if the end of buffer matches one of the tokens. If it does,
   * remove the token sequence from the end of buffer, queue whatever
   * unmatched text preceded it as an OTHER token, then queue the matched
   * token itself.
   *
   * @return The matching token, or null if nothing matches yet.
   */
  Token tokenMatches()
  {
    Token rt = endMatches(buffer);
    if (rt == null)
      return null;

    // Remove the matched image.
    if (rt.kind == ENTITY)
      {
        // Consume future character if it was an entity and the future
        // character is semicolon.
        if (buffer.charAt(buffer.length() - 1) == ';')
          buffer.setLength(buffer.length() - rt.getImage().length() - 1);
        else
          {
            error("Missing closing semicolon for entity '" + rt.getImage() +
                  "'", rt
                 );
            consumeBuffer(rt);
          }
      }
    else
      consumeBuffer(rt);

    // If more than one character is left in the buffer, some sequence did
    // not match any token. Add everything but the last character to the
    // queue as "OTHER", ahead of the matched token.
    if (buffer.length() > 1)
      {
        String rest = buffer.toString();
        rest = rest.substring(0, rest.length() - 1);

        Token other =
          new Token(OTHER, rest, buffer.getLocation(0, buffer.length()));
        queue.add(other);
        consumeBuffer(other);
      }

    queue.add(rt);
    return rt;
  }

  /**
   * Remove the image of the given token from the buffer. The removed
   * range ends one position before the end of the buffer, so the last
   * buffered character is kept (assuming Buffer.delete takes a start and
   * an exclusive end index, as StringBuffer.delete does).
   *
   * @param rt the token whose image must be removed.
   */
  private void consumeBuffer(Token rt)
  {
    int end = buffer.length() - 1;
    buffer.delete(end - rt.getImage().length(), end);
  }

  /**
   * Create EOF token, located at the current reader position.
   */
  private Token eofToken()
  {
    return new Token(EOF, "#", new Location(readerPosition));
  }
}