Source for gnu.javax.swing.text.html.parser.support.low.ReaderTokenizer

   1: /* ReaderTokenizer.java -- splits the input char sequence into tokens.
   2:    Copyright (C) 2005 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.javax.swing.text.html.parser.support.low;
  40: 
  41: import java.io.IOException;
  42: import java.io.Reader;
  43: 
  44: /**
  45:  * Reader splits the input char sequence into tokens.
  46:  * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
  47:  */
  48: public class ReaderTokenizer
  49:   extends Constants
  50: {
  51:   /**
  52:    * This is set to true each time the getNextToken is called.
  53:    * Used in preventing loops when all patterns refuse to accept
  54:    * the invalid input.
  55:    */
  56:   protected boolean advanced;
  57: 
  58:   /**
  59:    * If true, the returned tokens are also placed in the backup
  60:    * queue.
  61:    */
  62:   protected boolean backupMode;
  63: 
  64:   /**
  65:    * The buffer to read document into.
  66:    */
  67:   Buffer buffer = new Buffer();
  68: 
  69:   /**
  70:    * The queue for supporting mark().
  71:    */
  72:   Queue backup = new Queue();
  73: 
  74:   /**
  75:    * The queue of found tokens.
  76:    */
  77:   Queue queue = new Queue();
  78: 
  79:   /**
  80:    * The reader to read the document from.
  81:    */
  82:   Reader reader;
  83: 
  84:   /**
  85:    * Array of char tokens
  86:    */
  87:   char[] charTokens;
  88: 
  89:   /**
  90:    * Array of string tokens.
  91:    */
  92:   String[] stringTokens;
  93: 
  94:   /**
  95:    * The current reader position.
  96:    */
  97:   int readerPosition = -1;
  98: 
  99:   /**
 100:    * Creates a new ReaderTokenizer. The reset(...) method must be
 101:    * subsequently called to set the reader.
 102:    */
 103:   public ReaderTokenizer()
 104:   {
 105:   }
 106: 
 107:   /**
 108:    * Return the sequence, used to separate lines in the document.
 109:    * @return one of \n, \r or \r\n.
 110:    */
 111:   public String getEndOfLineSequence()
 112:   {
 113:     return buffer.getEndOfLineSequence();
 114:   }
 115: 
 116:   /**
 117:    * Get the next token.
 118:    * @return
 119:    */
 120:   public Token getNextToken()
 121:   {
 122:     Token rt;
 123:     advanced = true;
 124:     try
 125:       {
 126:         if (queue.isEmpty())
 127:           read(1);
 128: 
 129:         if (!queue.isEmpty())
 130:           rt = queue.next();
 131:         else
 132:           rt = new Token(EOF, new Location(readerPosition));
 133:       }
 134:     catch (IOException ex)
 135:       {
 136:         throw new ParseException("IO Exception", ex);
 137:       }
 138:     if (backupMode)
 139:       backup.add(rt);
 140:     return rt;
 141:   }
 142: 
 143:   /**
 144:    * Get a token, lying the given number of tokens
 145:    * ahead. getToken(0) will return the same token,
 146:    * what would be returned by getNextToken().
 147:    * getToken(..) does change the current position
 148:    * in the input stream. If the end of stream is
 149:    * reached, the EOF token is always returned.
 150:    */
 151:   public Token getTokenAhead(int ahead)
 152:   {
 153:     try
 154:       {
 155:         read(ahead - queue.size() + 1);
 156:         return queue.size() >= ahead ? queue.get(ahead) : eofToken();
 157:       }
 158:     catch (IOException ex)
 159:       {
 160:         throw new ParseException("IO Exception", ex);
 161:       }
 162:   }
 163: 
 164:   /**
 165:    * Get a token, bein immediatley ahead.
 166:    * If the end of stream is
 167:    * reached, the EOF token is always returned.
 168:    * The method is equivalent calling getTokenAhead(0).
 169:    */
 170:   public Token getTokenAhead()
 171:   {
 172:     try
 173:       {
 174:         if (queue.isEmpty())
 175:           read(1);
 176:         if (!queue.isEmpty())
 177:           return queue.get(0);
 178:         else
 179:           return eofToken();
 180:       }
 181:     catch (IOException ex)
 182:       {
 183:         throw new ParseException("IO Exception", ex);
 184:       }
 185:   }
 186: 
 187:   /**
 188:    * Invokes the error handler.
 189:    */
 190:   public void error(String msg, Token at)
 191:   {
 192:     System.out.println(msg);
 193:   }
 194: 
 195:   /**
 196:    * Turns the backup mode on or off.
 197:    * It is possible to return where the mark(true) was last called
 198:    * by calling reset().
 199:    * @param mode True if it is required to save tokens, making
 200:    * returning to the current point possible.
 201:    */
 202:   public void mark(boolean mode)
 203:   {
 204:     backup.clear();
 205:     backupMode = mode;
 206:   }
 207: 
 208:   /**
 209:    * Prepare for new parsing from the given stream.
 210:    * @param a_reader A reader to parse from.
 211:    */
 212:   public void reset(Reader a_reader)
 213:   {
 214:     reader = a_reader;
 215:     readerPosition = -1;
 216:     buffer.reset();
 217:     queue.clear();
 218:   }
 219: 
 220:   /**
 221:    * Reset the internal cursor to the position where the mark()
 222:    * was last time called. Switches the backup mode off.
 223:    */
 224:   public void reset()
 225:   {
 226:     if (!backupMode)
 227:       throw new AssertionError("Call mark(true) before using reset()!");
 228:     backupMode = false;
 229: 
 230:     // That is now in the queue, will be appended to the end of backup.
 231:     while (!queue.isEmpty())
 232:       backup.add(queue.next());
 233: 
 234:     Queue t = queue;
 235:     queue = backup;
 236:     backup = t;
 237:     backup.clear();
 238:   }
 239: 
 240:   /**
 241:    * Read the given number of the tokens. Add the needed number of EOF
 242:    * tokens if there are no more data in the stream.
 243:    * @param numberOfTokens The number of additional tokens to read.
 244:    */
 245:   void read(int numberOfTokens)
 246:      throws IOException
 247:   {
 248:     if (numberOfTokens <= 0)
 249:       return;
 250: 
 251:     for (int i = 0; i < numberOfTokens; i++)
 252:       readToken();
 253:   }
 254: 
 255:   /**
 256:    * Read next token from the reader, add it to the queue
 257:    */
 258:   void readToken()
 259:           throws IOException
 260:   {
 261:     Token t;
 262:     int ch;
 263: 
 264:     enlarging:
 265:     while (true)
 266:       {
 267:         t = tokenMatches();
 268:         if (t != null)
 269:           break enlarging;
 270:         else
 271:           {
 272:             ch = reader.read();
 273:             readerPosition++;
 274:             if (ch == ETX)
 275:               ch = ' ';
 276:             if (ch < 0)
 277:               {
 278:                 if (buffer.length() == 0)
 279:                   {
 280:                     queue.add(eofToken());
 281:                     return;
 282:                   }
 283:                 else
 284:                   {
 285:                     if (buffer.charAt(buffer.length() - 1) != ETX)
 286:                       buffer.append(ETX, readerPosition++);
 287:                     else
 288:                       {
 289:                         // Discard terminating ETX
 290:                         buffer.setLength(buffer.length() - 1);
 291:                         if (buffer.length() > 0)
 292:                           {
 293:                             t = new Token(OTHER, buffer.toString(),
 294:                                           buffer.getLocation(0, buffer.length())
 295:                                          );
 296:                             queue.add(t);
 297:                             buffer.setLength(0);
 298:                           }
 299:                         return;
 300:                       }
 301:                   }
 302:               }
 303:             else
 304:               buffer.append((char) ch, readerPosition);
 305:           }
 306:       }
 307:   }
 308: 
 309:   /**
 310:    * Check if the end of buffer matches one of the tokens. If it does,
 311:    * return this token and remove the token sequence from the end of
 312:    * buffer.
 313:    * @return The matching token.
 314:    */
 315:   Token tokenMatches()
 316:   {
 317:     Token rt = endMatches(buffer);
 318:     if (rt != null) // Remove the matched image
 319:       {
 320:         // Consume future character if it was an entity and the future
 321:         // character is semicolon.
 322:         if (rt.kind == ENTITY)
 323:           {
 324:             if (buffer.charAt(buffer.length() - 1) == ';')
 325:               buffer.setLength(buffer.length() - rt.getImage().length() - 1);
 326:             else
 327:               {
 328:                 error("Missing closing semicolon for entity '" + rt.getImage() +
 329:                       "'", rt
 330:                      );
 331:                 consumeBuffer(rt);
 332:               }
 333:           }
 334:         else
 335:           {
 336:             consumeBuffer(rt);
 337:           }
 338:       }
 339: 
 340:     // If the buffer is not empty, some sequence does not match any tokens.
 341:     // Add it to the queue as "OTHER".
 342:     if (rt != null)
 343:       {
 344:         if (buffer.length() > 1)
 345:           {
 346:             String rest = buffer.toString();
 347:             rest = rest.substring(0, rest.length() - 1);
 348: 
 349:             Token other =
 350:               new Token(OTHER, rest, buffer.getLocation(0, buffer.length));
 351:             queue.add(other);
 352:             consumeBuffer(other);
 353:           }
 354:         queue.add(rt);
 355:       }
 356:     return rt;
 357:   }
 358: 
 359:   private void consumeBuffer(Token rt)
 360:   {
 361:     buffer.delete(buffer.length() - rt.getImage().length() - 1,
 362:                   buffer.length() - 1
 363:                  );
 364:   }
 365: 
 366:   /**
 367:    * Create EOF token.
 368:    */
 369:   private Token eofToken()
 370:   {
 371:     return new Token(EOF, "#", new Location(readerPosition));
 372:   }
 373: }