Source for gnu.javax.swing.text.html.parser.support.low.Constants

   1: /* Constants.java --
   2:    Copyright (C) 2005 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.javax.swing.text.html.parser.support.low;
  40: 
  41: import java.util.BitSet;
  42: 
  43: /**
  44:  * The parser constants and operations, directly related to the parser
  45:  * constants.
  46:  * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
  47:  */
  48: public class Constants
  49: {
  50:   /* Single character tokens are reflected into they ASCII codes. */
  51: 
  52:   /**
  53:    * Start of HTML token.
  54:    */
  55:   public static final int BEGIN = '<';
  56: 
  57:   /**
  58:    * End of HTML token.
  59:    */
  60:   public static final int END = '>';
  61: 
  62:   /**
  63:    * Exclamation (indicates SGML or comment).
  64:    */
  65:   public static final int EXCLAMATION = '!';
  66: 
  67:   /**
  68:    * Slash (indicates closing tag).
  69:    */
  70:   public static final int SLASH = '/';
  71: 
  72:   /**
  73:    * Equals sign.
  74:    */
  75:   public static final int EQ = '=';
  76: 
  77:   /**
  78:    * Quoting sign.
  79:    */
  80:   public static final int AP = '\'';
  81: 
  82:   /**
  83:    * Quoting sign.
  84:    */
  85:   public static final int QUOT = '"';
  86: 
  87:   /* The numbers of other tokens start outside the ascii space. */
  88:   /* String tokens */
  89: 
  90:   /**
  91:    * Double dash (--)
  92:    */
  93:   public static final int DOUBLE_DASH = 1000;
  94: 
  95:   /**
  96:    * The STYLE tag (needs special handling).
  97:    */
  98:   public static final int STYLE = 1001;
  99: 
 100:   /**
 101:    * The SCRIPT tag (needs special handling).
 102:    */
 103:   public static final int SCRIPT = 1002;
 104: 
 105:   /* Pattern tokens */
 106: 
 107:   /**
 108:    * HTML whitespace.
 109:    */
 110:   public static final int WS = 1003;
 111: 
 112:   /**
 113:    * Named or numeric entity,
 114:    */
 115:   public static final int ENTITY = 1004;
 116: 
 117:   /**
 118:    * Sequence of valid name characters (can start from digit).
 119:    */
 120:   public static final int NUMTOKEN = 1005;
 121: 
 122:   /* Complex tokens */
 123: 
 124:   /**
 125:    * Comment opening sequence.
 126:    */
 127:   public static final pattern COMMENT_OPEN =
 128:     new pattern(new node[]
 129:                 {
 130:                   new node(BEGIN), new node(WS, true), new node(EXCLAMATION),
 131:                   new node(WS, true), new node(DOUBLE_DASH),
 132:                 }
 133:                );
 134: 
 135:   /**
 136:    * Comment closing sequence
 137:    */
 138:   public static final pattern COMMENT_END =
 139:     new pattern(new node[]
 140:                 {
 141:                   new node(DOUBLE_DASH), new node(WS, true), new node(END)
 142:                 }
 143:                );
 144: 
 145:   /**
 146:    * Special case ---> (also is treated as end of comment).
 147:    */
 148:   public static final pattern COMMENT_TRIPLEDASH_END =
 149:     new pattern(new node[]
 150:                 {
 151:                   new node(DOUBLE_DASH), new node(NUMTOKEN), new node(END)
 152:                 }
 153:                );
 154: 
 155:   /**
 156:    * STYLE element heading pattern.
 157:    */
 158:   public static final pattern STYLE_OPEN =
 159:     new pattern(new node[] { new node(BEGIN), new node(WS, true), new node(STYLE) });
 160: 
 161:   /**
 162:    * SCRIPT element heading pattern.
 163:    */
 164:   public static final pattern SCRIPT_OPEN =
 165:     new pattern(new node[] { new node(BEGIN), new node(WS, true), new node(SCRIPT) });
 166: 
 167:   /**
 168:    * SGML element heading pattern.
 169:    */
 170:   public static final pattern SGML =
 171:     new pattern(new node[]
 172:                 {
 173:                   new node(BEGIN), new node(WS, true), new node(EXCLAMATION)
 174:                 }
 175:                );
 176: 
 177:   /**
 178:    * SCRIPT element closing pattern.
 179:    */
 180:   public static final pattern SCRIPT_CLOSE =
 181:     new pattern(new node[]
 182:                 {
 183:                   new node(BEGIN), new node(WS, true), new node(SLASH),
 184:                   new node(WS, true), new node(SCRIPT), new node(WS, true),
 185:                   new node(END)
 186:                 }
 187:                );
 188: 
 189:   /**
 190:    * STYLE element closing pattern.
 191:    */
 192:   public static final pattern STYLE_CLOSE =
 193:     new pattern(new node[]
 194:                 {
 195:                   new node(BEGIN), new node(WS, true), new node(SLASH),
 196:                   new node(WS, true), new node(STYLE), new node(WS, true),
 197:                   new node(END)
 198:                 }
 199:                );
 200: 
 201:   /**
 202:    * Ordinary HTML tag heading pattern.
 203:    */
 204:   public static final pattern TAG =
 205:     new pattern(new node[]
 206:                 {
 207:                   new node(BEGIN), new node(WS, true), new node(SLASH, true),
 208:                   new node(WS, true), new node(NUMTOKEN)
 209:                 }
 210:                );
 211: 
 212:   /**
 213:    * Ordinary HTML tag closing pattern.
 214:    */
 215:   public static final pattern TAG_CLOSE =
 216:     new pattern(new node[]
 217:                 {
 218:                   new node(BEGIN), new node(WS, true), new node(SLASH),
 219:                   new node(WS, true), new node(NUMTOKEN)
 220:                 }
 221:                );
 222: 
 223:   /* Special tokens */
 224: 
 225:   /**
 226:    * All other tokens.
 227:    */
 228:   public static final int OTHER = 1999;
 229: 
 230:   /**
 231:    * The UNICODE "end of text" control code
 232:    */
 233:   static final char ETX = 3;
 234: 
 235:   /**
 236:    * End of file.
 237:    */
 238:   public static final int EOF = ETX;
 239: 
 240:   /* Character categories */
 241: 
 242:   /**
 243:    * All single char tokens.
 244:    */
 245:   public static final BitSet bSINGLE_CHAR_TOKEN = new BitSet();
 246: 
 247:   /**
 248:    * Non letters and non numbers, allowed in HTML names.
 249:    */
 250:   public static final BitSet bSPECIAL = new BitSet();
 251: 
 252:   /**
 253:    * All letters, used in HTML names.
 254:    */
 255:   public static final BitSet bLETTER = new BitSet();
 256: 
 257:   /**
 258:    * Digits.
 259:    */
 260:   public static final BitSet bDIGIT = new BitSet();
 261: 
 262:   /**
 263:    * Both line breaks.
 264:    */
 265:   public static final BitSet bLINEBREAK = new BitSet();
 266: 
 267:   /**
 268:    * All whitespace.
 269:    */
 270:   public static final BitSet bWHITESPACE = new BitSet();
 271: 
 272:   /**
 273:    * Both quoting characters.
 274:    */
 275:   public static final BitSet bQUOTING = new BitSet();
 276: 
 277:   /**
 278:    * Valid name characters.
 279:    */
 280:   public static final BitSet bNAME = new BitSet();
 281: 
 282:   /* Entity subcategories */
 283: 
 284:   /**
 285:    * Named entity.
 286:    */
 287:   public static final int ENTITY_NAMED = 1;
 288: 
 289:   /**
 290:    * Numeric entity.
 291:    */
 292:   public static final int ENTITY_NUMERIC = 2;
 293: 
 294:   static
 295:   {
 296:     bQUOTING.set(AP);
 297:     bQUOTING.set(QUOT);
 298: 
 299:     bSINGLE_CHAR_TOKEN.set(BEGIN);
 300:     bSINGLE_CHAR_TOKEN.set(END);
 301:     bSINGLE_CHAR_TOKEN.set(EXCLAMATION);
 302:     bSINGLE_CHAR_TOKEN.set(SLASH);
 303:     bSINGLE_CHAR_TOKEN.set(EQ);
 304:     bSINGLE_CHAR_TOKEN.set(EOF);
 305: 
 306:     bSINGLE_CHAR_TOKEN.or(bQUOTING);
 307: 
 308:     bLINEBREAK.set('\r');
 309:     bLINEBREAK.set('\n');
 310: 
 311:     bWHITESPACE.set(' ');
 312:     bWHITESPACE.set('\t');
 313:     bWHITESPACE.set(0xC);
 314:     bWHITESPACE.or(bLINEBREAK);
 315: 
 316:     for (char i = '0'; i <= '9'; i++)
 317:       {
 318:         bDIGIT.set(i);
 319:       }
 320: 
 321:     for (char i = 'a'; i <= 'z'; i++)
 322:       {
 323:         bLETTER.set(i);
 324:       }
 325: 
 326:     for (char i = 'A'; i <= 'Z'; i++)
 327:       {
 328:         bLETTER.set(i);
 329:       }
 330: 
 331:     bSPECIAL.set('-');
 332:     bSPECIAL.set('_');
 333:     bSPECIAL.set(':');
 334:     bSPECIAL.set('.');
 335: 
 336:     bNAME.or(bLETTER);
 337:     bNAME.or(bDIGIT);
 338:     bNAME.or(bSPECIAL);
 339:   }
 340: 
 341:   /**
 342:    * Verifies if one of the tokens matches the end of string
 343:    * buffer. The last character in the string buffer is the
 344:    * "future character", some tokens needs to verify it the
 345:    * token does not continue "towards the future". If the token
 346:    * matches, it matches till "pre-last" character in the buffer.
 347:    * @param b
 348:    * @return
 349:    */
 350:   public Token endMatches(Buffer b)
 351:   {
 352:     if (b.length() < 2)
 353:       return null;
 354: 
 355:     int p = b.length() - 2;
 356: 
 357:     if (b.length() > 2 && b.charAt(p) == '-' && b.charAt(p - 1) == '-')
 358:       return new Token(DOUBLE_DASH, "--", b.getLocation(p - 1, p + 1));
 359: 
 360:     char last = b.charAt(p);
 361: 
 362:     if (bSINGLE_CHAR_TOKEN.get(last))
 363:       return new Token(last, last, b.getLocation(p, p + 1));
 364: 
 365:     char future = b.charAt(p + 1);
 366: 
 367:     // Check for numtokens, script and style:
 368:     if (bNAME.get(last) && !bNAME.get(future))
 369:       {
 370:         // Scan the history up:
 371:         int u = p - 1;
 372:         while (u >= 0 && bNAME.get(b.charAt(u)))
 373:           u--;
 374:         u++;
 375: 
 376:         char[] token = new char[ p - u + 1 ];
 377: 
 378:         // Found a numtoken
 379:         b.getChars(u, p + 1, token, 0);
 380: 
 381:         // Verify for the built-in tokens:
 382:         String e = new String(token);
 383: 
 384:         // found the entity reference
 385:         if (u > 0 && b.charAt(u - 1) == '&')
 386:           {
 387:             // The subsequent semicolon may be the part of the token
 388:             // as well. The semicolon must be ignored. This must be
 389:             // handled elsewhere.
 390:             return new Token(ENTITY, ENTITY_NAMED, "&" + e,
 391:                              b.getLocation(u - 1, p + 1)
 392:                             );
 393:           }
 394: 
 395:         // found the numeric entity reference
 396:         if (u > 1 && b.charAt(u - 1) == '#' && b.charAt(u - 2) == '&')
 397:           {
 398:             // The subsequent semicolon may be the part of the token
 399:             // as well. The semicolon must be ignored. This must be
 400:             // handled elsewhere.
 401:             return new Token(ENTITY, ENTITY_NUMERIC, "&#" + e,
 402:                              b.getLocation(u - 2, p + 2)
 403:                             );
 404:           }
 405: 
 406:         Location le = b.getLocation(u, p + 1);
 407: 
 408:         if (e.equalsIgnoreCase("SCRIPT"))
 409:           return new Token(SCRIPT, e, le);
 410:         else if (e.equalsIgnoreCase("STYLE"))
 411:           return new Token(STYLE, e, le);
 412:         else
 413:           return new Token(NUMTOKEN, e, le);
 414:       }
 415: 
 416:     // Check for whitespace
 417:     if (bWHITESPACE.get(last) && !bWHITESPACE.get(future))
 418:       {
 419:         // Scan the history up:
 420:         int u = p - 1;
 421:         while (u >= 0 && bWHITESPACE.get(b.charAt(u)))
 422:           u--;
 423:         u++;
 424: 
 425:         char[] token = new char[ p - u + 1 ];
 426:         b.getChars(u, p + 1, token, 0);
 427: 
 428:         return new Token(WS, new String(token), b.getLocation(u, p + 1));
 429:       }
 430: 
 431:     return null;
 432:   }
 433: }