Source for gnu.javax.swing.text.html.parser.support.Parser

   1: /* Parser.java -- HTML parser.
   2:    Copyright (C) 2005 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.javax.swing.text.html.parser.support;
  40: 
  41: import gnu.java.lang.CPStringBuilder;
  42: 
  43: import gnu.javax.swing.text.html.parser.htmlAttributeSet;
  44: import gnu.javax.swing.text.html.parser.htmlValidator;
  45: import gnu.javax.swing.text.html.parser.support.low.Constants;
  46: import gnu.javax.swing.text.html.parser.support.low.ParseException;
  47: import gnu.javax.swing.text.html.parser.support.low.ReaderTokenizer;
  48: import gnu.javax.swing.text.html.parser.support.low.Token;
  49: import gnu.javax.swing.text.html.parser.support.low.node;
  50: import gnu.javax.swing.text.html.parser.support.low.pattern;
  51: 
  52: import java.io.IOException;
  53: import java.io.Reader;
  54: 
  55: import java.util.Comparator;
  56: import java.util.Set;
  57: import java.util.TreeSet;
  58: import java.util.Vector;
  59: 
  60: import javax.swing.text.ChangedCharSetException;
  61: import javax.swing.text.SimpleAttributeSet;
  62: import javax.swing.text.html.HTML;
  63: import javax.swing.text.html.parser.AttributeList;
  64: import javax.swing.text.html.parser.DTD;
  65: import javax.swing.text.html.parser.DTDConstants;
  66: import javax.swing.text.html.parser.Element;
  67: import javax.swing.text.html.parser.Entity;
  68: import javax.swing.text.html.parser.TagElement;
  69: 
  70: /**
  71:  * <p>A simple error-tolerant HTML parser that uses a DTD document
  72:  * to access data on the possible tokens, arguments and syntax.</p>
  73:  * <p> The parser reads an HTML content from a Reader and calls various
  74:  * notifying methods (which should be overridden in a subclass)
  75:  * when tags or data are encountered.</p>
  76:  * <p>Some HTML elements need no opening or closing tags. The
  77:  * task of this parser is to invoke the tag handling methods also when
  78:  * the tags are not explicitly specified and must be supposed using
  79:  * information, stored in the DTD.
  80:  * For  example, parsing the document
  81:  * <p>&lt;table&gt;&lt;tr&gt;&lt;td&gt;a&lt;td&gt;b&lt;td&gt;c&lt;/tr&gt; <br>
  82:  * will invoke the handling methods in exactly the same order
  83:  * (and with the same parameters) as if parsing the document: <br>
  84:  * <em>&lt;html&gt;&lt;head&gt;&lt;/head&gt;&lt;body&gt;&lt;table&gt;&lt;
  85:  * tbody&gt;</em>&lt;tr&gt;&lt;td&gt;a<em>&lt;/td&gt;</em>&lt;td&gt;b<em>
  86:  * &lt;/td&gt;</em>&lt;td&gt;c<em>&lt;/td&gt;&lt;/tr&gt;</em>&lt;
  87:  * <em>/tbody&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;</em></p>
  88:  * (supposed tags are given in italics). The parser also supports
  89:  * obsolete elements of HTML syntax.<p>
  90:  * </p>
  91:  * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
  92:  */
  93: public class Parser
  94:   extends ReaderTokenizer
  95:   implements DTDConstants
  96: {
  97:   /**
  98:    * The current html tag token; getCurrentLine() reports its first line.
  99:    */
 100:   public Token hTag = new Token();
 101: 
 102:   /**
 103:    * The document type definition (DTD) that is used to parse the documents.
 104:    */
 105:   protected DTD dtd;
 106: 
 107:   /**
 108:    * The value of this field determines whether or not the Parser will be
 109:    * strict in enforcing SGML compatibility. The default value is false,
 110:    * stating that the parser should do everything to parse and get at least
 111:    * some information even from the incorrectly written HTML input.
 112:    */
 113:   protected boolean strict;
 114: 
 115:   /**
 116:    * This field has a positive value inside preformatted tags.
 117:    */
 118:   protected int preformatted = 0;
 119: 
 120:   /**
 121:    * The set of the document tags, ordered ignoring the case. This field
 122:    * is used for supporting markFirstTime().
 123:    */
 124:   private Set documentTags =
 125:     new TreeSet(new Comparator()
 126:       {
 127:         public int compare(Object a, Object b)
 128:         {
 129:           return ((String) a).compareToIgnoreCase((String) b);
 130:         }
 131:       }
 132:                );
 133: 
 134:   /**
 135:   * The buffer to collect the incremental output like text or comment.
 136:   */
 137:   private final StringBuffer buffer = new StringBuffer();
 138: 
 139:   /**
 140:    * The buffer to store the document title.
 141:    */
 142:   private final StringBuffer title = new StringBuffer();
 143: 
 144:   /**
 145:    * The current token.
 146:    */
 147:   private Token t;
 148: 
 149:   /**
 150:    * True means that the 'title' tag of this document has
 151:    * already been handled.
 152:    */
 153:   private boolean titleHandled;
 154: 
 155:   /**
 156:    * True means that the 'title' tag is currently open and all
 157:    * text is also added to the title buffer.
 158:    */
 159:   private boolean titleOpen;
 160: 
 161:   /**
 162:    * The attributes of the current HTML element.
 163:    * Package-private to avoid an accessor method.
 164:    */
 165:   htmlAttributeSet attributes =
 166:     htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
 167: 
 168:   /**
 169:    * The validator, controlling the forcible closing of the tags that
 170:    * (in accordance with the dtd) are not allowed in the current context.
 171:    */
 172:   private htmlValidator validator;
 173: 
 174:   /**
 175:    * Provides the default values for parameters in the case when these
 176:    * values are defined in the DTD.
 177:    */
 178:   private parameterDefaulter defaulter;
 179: 
 180:   /**
 181:    * The text pre-processor for handling line ends and tabs.
 182:    */
 183:   private textPreProcessor textProcessor = new textPreProcessor();
 184: 
 185:   /**
 186:    * Creates a new Parser that works with the given
 187:    * {@link javax.swing.text.html.parser.DTD}. The constructor also sets
 188:    * up the parameter defaulter and the tag validator, whose callbacks
 189:    * are forwarded to the handling methods of this parser.
 190:    * @param a_dtd The DTD to use. If null is passed, this implementation
 191:    * uses the DTD returned by HTML_401F.getInstance() instead.
 192:    */
 193:   public Parser(DTD a_dtd)
 194:   {
 195:     // Fall back to the built-in DTD when the caller supplies none.
 196:     dtd = (a_dtd != null)
 197:       ? a_dtd
 198:       : gnu.javax.swing.text.html.parser.HTML_401F.getInstance();
 199: 
 200:     defaulter = new parameterDefaulter(dtd);
 201: 
 202:     validator =
 203:       new htmlValidator(dtd)
 204:         {
 205:           /**
 206:            * Passes the validator error message on to the error
 207:            * reporting of the enclosing parser.
 208:            * @param msg The message text.
 209:            */
 210:           protected void s_error(String msg)
 211:           {
 212:             error(msg);
 213:           }
 214: 
 215:           /**
 216:            * Invoked when the tag validator closes a tag on its own
 217:            * initiative. After reaching the end of stream, the tag
 218:            * validator closes all unclosed elements that are required
 219:            * to have the end (closing) tag.
 220:            *
 221:            * @param tElement The tag being fictionally (forcibly) closed.
 222:            */
 223:           protected void handleSupposedEndTag(Element tElement)
 224:           {
 225:             // A separate tag object is made because tElement comes
 226:             // from the starting tag and may still be in use
 227:             // somewhere else.
 228:             TagElement closing = makeTag(tElement, true);
 229:             _handleEndTag_remaining(closing);
 230:           }
 231: 
 232:           /**
 233:            * Invoked when the tag validator opens a new tag on its own
 234:            * initiative. The tags, opened in this way, are HTML, HEAD
 235:            * and BODY. While the tag is handled the attribute set is
 236:            * the empty one; the previous set is put back before
 237:            * returning.
 238:            *
 239:            * @param tElement The tag being fictionally (forcibly) opened.
 240:            */
 241:           protected void handleSupposedStartTag(Element tElement)
 242:           {
 243:             TagElement opening = makeTag(tElement, true);
 244:             htmlAttributeSet previous = attributes;
 245:             attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
 246:             _handleStartTag(opening);
 247:             attributes = previous;
 248:           }
 249:         };
 250:   }
 251: 
 252:   /**
 253:    * Get a copy of the attributes of the current tag.
 254:    * @return A new attribute set built from the current tag attributes.
 255:    */
 256:   public SimpleAttributeSet getAttributes()
 257:   {
 258:     return new SimpleAttributeSet(attributes);
 259:   }
 260: 
 261:   /**
 262:    * Invokes the error handler. The default method in this implementation
 263:    * delegates the call to handleError, also providing the current line.
 264:    */
 265:   public void error(String msg)
 266:   {
 267:     error(msg, getTokenAhead());
 268:   }
 269: 
 270:   /** Reports msg at atToken; without a token, line 0 is reported. */
 271:   public void error(String msg, Token atToken)
 272:   {
 273:     if (atToken == null)
 274:       handleError(0, msg);
 275:     else
 276:       handleError(atToken.where.beginLine,
 277:                   msg + ": line " + atToken.where.beginLine
 278:                   + ", absolute pos " + atToken.where.startPosition);
 279:   }
 280: 
 281:   /**
 282:    * Invokes the error handler. The default method in this implementation
 283:    * delegates the call to error (parm1+": '"+parm2+"'").
 284:    */
 285:   public void error(String msg, String invalid)
 286:   {
 287:     error(msg + ": '" + invalid + "'");
 288:   }
 289: 
 290:   /**
 291:    * Invokes the error handler. The default method in this implementation
 292:    * delegates the call to error (parm1+" "+ parm2+" "+ parm3).
 293:    */
 294:   public void error(String parm1, String parm2, String parm3)
 295:   {
 296:     error(parm1 + " " + parm2 + " " + parm3);
 297:   }
 298: 
 299:   /**
 300:    * Invokes the error handler. The default method in this implementation
 301:    * delegates the call to error (parm1+" "+ parm2+" "+ parm3+" "+ parm4).
 302:    */
 303:   public void error(String parm1, String parm2, String parm3, String parm4)
 304:   {
 305:     error(parm1 + " " + parm2 + " " + parm3 + " " + parm4);
 306:   }
 307: 
 308:   public void flushAttributes() // No action in this implementation.
 309:   {
 310:   }
 311: 
 312:   /**
 313:    * Parse the HTML text, invoking the handler methods in response to the
 314:    * occurrence of the corresponding HTML constructions.
 315:    * @param reader The reader to read the source HTML from.
 316:    * @throws IOException If the reader throws one.
 317:    */
 318:   public synchronized void parse(Reader reader)
 319:                           throws IOException
 320:   {
 321:     reset(reader);
 322:     restart();
 323:     try
 324:       {
 325:         parseDocument();
 326:         validator.closeAll();
 327:       }
 328:     catch (ParseException failure)
 329:       {
 330:         // A caught exception is never null, so no null test is needed.
 331:         error("Unable to continue parsing the document",
 332:               failure.getMessage());
 333: 
 334:         // Only an underlying I/O problem is propagated to the caller.
 335:         Throwable reason = failure.getCause();
 336:         if (reason instanceof IOException)
 337:           throw (IOException) reason;
 338:       }
 339:   }
 340: 
 341:   /**
 342:    * Parses DTD markup declaration. Currently returns null without action.
 343:    * @return null in this implementation.
 344:    * @throws IOException Never thrown by this implementation.
 345:    */
 346:   public String parseDTDMarkup()
 347:                         throws IOException
 348:   {
 349:     return null;
 350:   }
 351: 
 352:   /**
 353:    * Parse SGML insertion ( &lt;! ... &gt; ). When a non-comment SGML
 354:    * insertion is found, this method is called with its content in the
 355:    * string buffer. The default method returns false without action and
 356:    * can be overridden to implement user-defined SGML support.
 357:    * <p>
 358:    * If you need more information about SGML insertions in HTML documents,
 359:    * the author suggests reading the SGML tutorial at
 360:    * <a href="http://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html">
 361:    * http://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html</a>.
 362:    * We also recommend Goldfarb C.F (1991) <i>The SGML Handbook</i>,
 363:    * Oxford University Press, 688 p, ISBN: 0198537379.
 364:    * </p>
 365:    * @param strBuff The buffer holding the content of the SGML insertion.
 366:    * @return true if this is a valid DTD markup declaration.
 367:    * @throws IOException
 368:    */
 369:   public boolean parseMarkupDeclarations(StringBuffer strBuff)
 370:                                   throws IOException
 371:   {
 372:     return false;
 373:   }
 374: 
 375:   /** Get the line on which the current tag token (hTag) begins. */
 376:   protected int getCurrentLine()
 377:   {
 378:     // hTag is reassigned by the parsing methods as they read the input.
 379:     int firstLine = hTag.where.beginLine;
 380:     return firstLine;
 381:   }
 382: 
 383:   /**
 384:    * Read parseable character data, add to buffer. Reading stops before
 385:    * a BEGIN token (the start of a markup construct) or at the end of
 386:    * input; the collected text, if any, is passed to _handleText().
 387:    * @param clearBuffer If true, the buffer is emptied first, otherwise the
 388:    * section is appended to the existing content of the buffer.
 389:    * @throws ParseException
 390:    */
 391:   protected void CDATA(boolean clearBuffer)
 392:                 throws ParseException
 393:   {
 394:     hTag = getTokenAhead();
 395:     Token first = hTag;
 396: 
 397:     if (clearBuffer)
 398:       buffer.setLength(0);
 399: 
 400:     // With the input already exhausted there is nothing to read.
 401:     if (first.kind == EOF)
 402:       return;
 403: 
 404:     // Each pass examines the token ahead and consumes it only after it
 405:     // has been added to the buffer; a BEGIN token is left unconsumed.
 406:     for (t = getTokenAhead(); t.kind != BEGIN; t = getTokenAhead())
 407:       {
 408:         if (t.kind == EOF)
 409:           {
 410:             error("unexpected eof", t);
 411:             break;
 412:           }
 413: 
 414:         if (t.kind == Constants.ENTITY)
 415:           resolveAndAppendEntity(t);
 416:         else
 417:           append(t);
 418: 
 419:         getNextToken();
 420:       }
 421: 
 422:     // Rebuild the current tag token from the starting token and the
 423:     // token returned by getTokenAhead(0).
 424:     hTag = new Token(first, getTokenAhead(0));
 425: 
 426:     if (buffer.length() > 0)
 427:       _handleText();
 428:   }
 429: 
 430:   /**
 431:   * Process Comment. This method reads till --> without taking SGML
 432:   * constructs into consideration (they are handled separately); the
 433:   * text is collected in the buffer and handleComment() is then called.
 434:   */
 435:   protected void Comment()
 436:                   throws ParseException
 437:   {
 438:     buffer.setLength(0);
 439: 
 440:     Token start = hTag = mustBe(BEGIN);
 441:     optional(WS);
 442:     mustBe(EXCLAMATION);
 443:     optional(WS);
 444:     mustBe(DOUBLE_DASH);
 445: 
 446:     Token t; // Shadows the field t inside this method.
 447:     Token last;
 448: 
 449:     comment:
 450:     while (true)
 451:       {
 452:         t = getTokenAhead();
 453:         if (t.kind == EOF)
 454:           {
 455:             handleEOFInComment();
 456:             last = t;
 457:             break comment;
 458:           }
 459:         else if (COMMENT_END.matches(this))
 460:           {
 461:             mustBe(DOUBLE_DASH);
 462:             optional(WS);
 463:             last = mustBe(END);
 464:             break comment;
 465:           }
 466:         else if (COMMENT_TRIPLEDASH_END.matches(this))
 467:           {
 468:             mustBe(DOUBLE_DASH);
 469:             t = mustBe(NUMTOKEN);
 470:             if (t.getImage().equals("-"))
 471:               {
 472:                 append(t);
 473:                 last = mustBe(END);
 474:                 break comment;
 475:               }
 476:             else
 477:               {
 478:                 buffer.append("--");
 479:                 append(t);
 480:                 t = getTokenAhead(); // NOTE(review): consumed by the final mustBe without being appended - confirm.
 481:               }
 482:           }
 483:         else
 484:         /* A token like lllll-- can match as NUMTOKEN. */
 485:         if ((t.getImage().endsWith("--")) &&
 486:             (
 487:               getTokenAhead(1).kind == END ||
 488:               (getTokenAhead(1).kind == WS && getTokenAhead(2).kind == END)
 489:             )
 490:            )
 491:           {
 492:             buffer.append(t.getImage().substring(0, t.getImage().length() - 2));
 493: 
 494:             /* Consumes t itself. NOTE(review): the checked > seems to stay unread - confirm. */
 495:             last = mustBe(t.kind);
 496:             break comment;
 497:           }
 498:         else
 499:           append(t);
 500:         mustBe(t.kind);
 501:       }
 502:     hTag = new Token(start, last);
 503: 
 504:     // Consume any whitespace immediately following a comment.
 505:     optional(WS);
 506:     handleComment();
 507:   }
 508: 
 509:   /**
 510:    * Read a script. The unchanged text ends at the closing tag SCRIPT; if
 511:    * the input ends first, an error is reported and the tag is closed.
 512:    */
 513:   protected void Script()
 514:                  throws ParseException
 515:   {
 516:     Token start = hTag = mustBe(BEGIN);
 517:     optional(WS);
 518:     Token name = mustBe(SCRIPT);
 519:     optional(WS);
 520:     restOfTag(false, name, start);
 521: 
 522:     buffer.setLength(0);
 523:     boolean closed = true;
 524:     while (closed && !SCRIPT_CLOSE.matches(this))
 525:       {
 526:         // As in CDATA() and Sgml(), the end of input stops the reading;
 527:         // without this test the loop may never end on an unclosed tag.
 528:         if (getTokenAhead().kind == EOF)
 529:           closed = false;
 530:         else
 531:           append(getNextToken());
 532:       }
 533:     if (closed)
 534:       consume(SCRIPT_CLOSE);
 535:     else
 536:       error("unexpected eof", getTokenAhead());
 537:     _handleText();
 538:     endTag(!closed); // The end tag is an omitted one if EOF was met.
 539:     _handleEndTag(makeTagElement(name.getImage(), false));
 540:   }
 541: 
 542:   /**
 543:   * Process SGML insertion that is not a comment.
 544:   */
 545:   protected void Sgml()
 546:                throws ParseException
 547:   {
 548:     if (COMMENT_OPEN.matches(this))
 549:       Comment();
 550:     else // skip till ">"
 551:       {
 552:         Token start = hTag = mustBe(BEGIN);
 553:         optional(WS);
 554:         mustBe(EXCLAMATION);
 555: 
 556:         buffer.setLength(0);
 557:         read:
 558:         while (true)
 559:           {
 560:             t = getNextToken();
 561:             if (t.kind == Constants.ENTITY)
 562:               {
 563:                 resolveAndAppendEntity(t);
 564:               }
 565:             else if (t.kind == EOF)
 566:               {
 567:                 error("unexpected eof", t);
 568:                 break read;
 569:               }
 570:             else if (t.kind == END)
 571:               break read;
 572:             else
 573:               append(t);
 574:           }
 575: 
 576:         try
 577:           {
 578:             parseMarkupDeclarations(buffer);
 579:           }
 580:         catch (IOException ex)
 581:           {
 582:             error("Unable to parse SGML insertion: '" + buffer + "'",
 583:                   new Token(start, t)
 584:                  );
 585:           }
 586:       }
 587:     // Consume any whitespace that follows the Sgml insertion.
 588:     optional(WS);
 589:   }
 590: 
 591:   /**
 592:    * Read a style definition. The unchanged text ends at the closing tag
 593:    * STYLE; if input ends first, an error is reported and the tag is closed.
 594:    */
 595:   protected void Style()
 596:                 throws ParseException
 597:   {
 598:     Token start = hTag = mustBe(BEGIN);
 599:     optional(WS);
 600:     Token name = mustBe(STYLE);
 601:     optional(WS);
 602:     restOfTag(false, name, start);
 603: 
 604:     buffer.setLength(0);
 605:     boolean closed = true;
 606:     while (closed && !STYLE_CLOSE.matches(this))
 607:       {
 608:         // As in CDATA() and Sgml(), the end of input stops the reading;
 609:         // without this test the loop may never end on an unclosed tag.
 610:         if (getTokenAhead().kind == EOF)
 611:           closed = false;
 612:         else
 613:           append(getNextToken());
 614:       }
 615:     if (closed)
 616:       consume(STYLE_CLOSE);
 617:     else
 618:       error("unexpected eof", getTokenAhead());
 619:     _handleText();
 620:     endTag(!closed); // The end tag is an omitted one if EOF was met.
 621:     _handleEndTag(makeTagElement(name.getImage(), false));
 622:   }
 623: 
 624:   /**
 625:    * Read a html tag, either an opening or a closing one.
 626:    */
 627:   protected void Tag()
 628:               throws ParseException
 629:   {
 630:     mark(true);
 631: 
 632:     hTag = mustBe(BEGIN);
 633:     Token opening = hTag;
 634: 
 635:     optional(WS);
 636:     Token tagName = getNextToken();
 637:     optional(WS);
 638: 
 639:     // A slash in place of the name marks a closing tag; the name itself
 640:     // is then the token that follows.
 641:     boolean isClosing = tagName.kind == SLASH;
 642:     if (isClosing)
 643:       tagName = getNextToken();
 644: 
 645:     // The remainder of the tag is handed over to restOfTag().
 646:     restOfTag(isClosing, tagName, opening);
 647:   }
 648: 
 649:   /**
 650:    * A hook for the operations preceding the call to handleText.
 651:    * The buffered text is run through the text pre-processor (using its
 652:    * preformatted variant while the preformatted counter is positive).
 653:    * A non-empty result is reported to handleText() after an empty
 654:    * #pcdata tag has been handled; a lone space standing directly
 655:    * before a closing tag is dropped. While the title is open, the text
 656:    * is also added to the title buffer.
 657:    */
 658:   protected void _handleText()
 659:   {
 660:     char[] text = (preformatted > 0)
 661:       ? textProcessor.preprocessPreformatted(buffer)
 662:       : textProcessor.preprocess(buffer);
 663: 
 664:     if (text == null || text.length == 0)
 665:       return;
 666: 
 667:     // According to the specs we need to discard whitespace immediately
 668:     // before a closing tag.
 669:     boolean loneSpace = text.length == 1 && text[0] == ' ';
 670:     if (loneSpace && TAG_CLOSE.matches(this))
 671:       return;
 672: 
 673:     TagElement pcdata = new TagElement(dtd.getElement("#pcdata"));
 674:     attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
 675:     _handleEmptyTag(pcdata);
 676: 
 677:     handleText(text);
 678:     if (titleOpen)
 679:       title.append(text);
 680:   }
 681: 
 682:   /**
 683:    * Add the image of this token to the buffer, unless it is the EOF token.
 684:    * @param t A token to append.
 685:    */
 686:   protected final void append(Token t)
 687:   {
 688:     if (t.kind == EOF) return;
 689:     t.appendTo(buffer);
 690:   }
 691: 
 692:   /**
 693:    * Consume pattern that must match. Optional nodes are consumed when
 694:    * present; a missing mandatory node makes mustBe() throw.
 695:    * @param p A pattern to consume.
 696:    */
 697:   protected final void consume(pattern p)
 698:   {
 699:     // The nodes are consumed in the order they have in the pattern.
 700:     for (node step : p.nodes)
 701:       {
 702:         if (step.optional)
 703:           optional(step.kind);
 704:         else
 705:           mustBe(step.kind);
 706:       }
 707:   }
 708: 
 709:   /**
 710:    * The method is called when the HTML end (closing) tag is found or if
 711:    * the parser concludes that the one should be present in the
 712:    * current position. The method is called immediately
 713:    * before calling the handleEndTag(). The default method does nothing.
 714:    * @param omitted True if the tag is not actually present in the document,
 715:    * but is supposed by the parser (like &lt;/html&gt; at the end of the
 716:    * document).
 717:    */
 718:   protected void endTag(boolean omitted)
 719:   {
 720:   }
 721: 
 722:   /**
 723:    * Handle HTML comment. The default method returns without action.
 724:    * @param comment The text of the comment.
 725:    */
 726:   protected void handleComment(char[] comment)
 727:   {
 728:   }
 729: 
 730:   /**
 731:    * This is additionally called when the HTML content terminates
 732:    * without closing the HTML comment. This can only happen if the
 733:    * HTML document contains errors (for example, the closing --&gt; is
 734:    * missing). The default method reports the "Unclosed comment" error.
 735:    */
 736:   protected void handleEOFInComment()
 737:   {
 738:     error("Unclosed comment");
 739:   }
 740: 
 741:   /**
 742:    * Handle the tag with no content, like &lt;br&gt;. The method is
 743:    * called for the elements that, in accordance with the current DTD,
 744:    * have an empty content. The default method returns without action.
 745:    * @param tag The tag being handled.
 746:    * @throws javax.swing.text.ChangedCharSetException
 747:    */
 748:   protected void handleEmptyTag(TagElement tag)
 749:                          throws javax.swing.text.ChangedCharSetException
 750:   {
 751:   }
 752: 
 753:   /**
 754:    * The method is called when the HTML closing tag (like &lt;/table&gt;)
 755:    * is found or if the parser concludes that the one should be present
 756:    * in the current position. The default method returns without action.
 757:    * @param tag The tag being closed.
 758:    */
 759:   protected void handleEndTag(TagElement tag)
 760:   {
 761:   }
 762: 
 763:   /* Handle error that has occurred in the given line; no action here. */
 764:   protected void handleError(int line, String message)
 765:   {
 766:   }
 767: 
 768:   /**
 769:    * The method is called when the HTML opening tag (like &lt;table&gt;)
 770:    * is found or if the parser concludes that the one should be present
 771:    * in the current position. The default method returns without action.
 772:    * @param tag The tag being opened.
 773:    */
 774:   protected void handleStartTag(TagElement tag)
 775:   {
 776:   }
 777: 
 778:   /**
 779:    * Handle the text section. The default method returns without action.
 780:    * <p> For non-preformatted section, the parser replaces
 781:    * \t, \r and \n by spaces and then multiple spaces
 782:    * by a single space. Additionally, all whitespace around
 783:    * tags is discarded.
 784:    * </p>
 785:    * <p> For pre-formatted text (inside TEXTAREA and PRE), the parser preserves
 786:    * all tabs and spaces, but removes <b>one</b> bounding \r, \n or \r\n,
 787:    * if it is present. Additionally, it replaces each occurrence of \r or \r\n
 788:    * by a single \n.</p>
 789:    *
 790:    * @param text A section text.
 791:    */
 792:   protected void handleText(char[] text)
 793:   {
 794:   }
 795: 
 796:   /**
 797:    * Handle HTML &lt;title&gt; tag. This method is invoked when
 798:    * both title starting and closing tags are already behind.
 799:    * The passed argument contains the concatenation of all
 800:    * title text sections. The default method returns without action.
 801:    * @param title The title text.
 802:    */
 803:   protected void handleTitle(char[] title)
 804:   {
 805:   }
 806: 
 807:   /**
 808:    * Constructs a non-supposed tag from the given element. In this
 809:    * implementation, this is defined, but never called.
 810:    * @param element the tag base element. @return the tag
 811:    */
 812:   protected TagElement makeTag(Element element)
 813:   {
 814:     return makeTag(element, false);
 815:   }
 816: 
 817:   /**
 818:    * Constructs the tag from the given element.
 819:    * @param element the tag base {@link javax.swing.text.html.parser.Element}
 820:    * @param isSupposed true if the tag is not actually present in the
 821:    * html input, but the parser supposes that it should occur in
 822:    * the current location.
 823:    * @return the tag
 824:    */
 825:   protected TagElement makeTag(Element element, boolean isSupposed)
 826:   {
 827:     return new TagElement(element, isSupposed);
 828:   }
 829: 
 830:   /**
 831:    * This is called when the tag, representing the given element,
 832:    * occurs first time in the document. The default method does nothing.
 833:    * @param element The element whose tag occurs for the first time.
 834:    */
 835:   protected void markFirstTime(Element element)
 836:   {
 837:   }
 838: 
 839:   /**
 840:    * Consume the token that was checked before and hence MUST be present.
 841:    * @param kind The kind of token to consume.
 842:    * @return The consumed token.
 843:    * @throws AssertionError If the token ahead is of another kind.
 844:    */
 845:   protected Token mustBe(int kind)
 846:   {
 847:     if (getTokenAhead().kind == kind)
 848:       return getNextToken();
 849: 
 850:     // The kind was expected to be verified by the caller, so reaching
 851:     // this point is a programming error. Kinds below 1000 are also
 852:     // shown as a character in the message.
 853:     String asChar = (kind < 1000) ? " ('" + (char) kind + "') " : "";
 854:     String message = "The token of kind " + kind + asChar + " MUST be here,";
 855:     throw new AssertionError(message);
 856:   }
 857: 
 858:   /**
 859:    * Handle attribute without value. The default method uses
 860:    * the only allowed attribute value from DTD.
 861:    * If the attribute is unknown or allows several values,
 862:    * the HTML.NULL_ATTRIBUTE_VALUE is used. The attribute with
 863:    * this value is added to the attribute set.
 864:    * @param element The name of element.
 865:    * @param attribute The name of attribute without value.
 866:    */
 867:   protected void noValueAttribute(String element, String attribute)
 868:   {
 869:     Object value = HTML.NULL_ATTRIBUTE_VALUE;
 870: 
 871:     Element e = dtd.elementHash.get(element.toLowerCase());
 872:     if (e != null)
 873:       {
 874:         AttributeList attr = e.getAttribute(attribute);
 875:         if (attr != null)
 876:           {
 877:             Vector values = attr.values;
 878:             if (values != null && values.size() == 1)
 879:               value = values.get(0);
 880:           }
 881:       }
 882:     attributes.addAttribute(attribute, value);
 883:   }
 884: 
 885:   /**
 886:    * Consume the optional token, if present.
 887:    * @param kind The kind of token to consume.
 888:    * @return The consumed token, or null if the token ahead is of
 889:    * another kind (nothing is consumed then).
 890:    */
 891:   protected Token optional(int kind)
 892:   {
 893:     boolean present = getTokenAhead().kind == kind;
 894:     return present ? getNextToken() : null;
 895:   }
 896: 
 897:   /** Parse the html document. */
 898:   protected void parseDocument()
 899:                         throws ParseException
 900:   {
 901:     // Read up any initial whitespace.
 902:     optional(WS);
 903:     while (getTokenAhead().kind != EOF)
 904:       {
 905:         advanced = false;
 906:         if (TAG.matches(this))
 907:           Tag();
 908:         else if (COMMENT_OPEN.matches(this))
 909:           Comment();
 910:         else if (STYLE_OPEN.matches(this))
 911:           Style();
 912:         else if (SCRIPT_OPEN.matches(this))
 913:           Script();
 914:         else if (SGML.matches(this))
 915:           Sgml();
 916:         else
 917:           CDATA(true);
 918: 
 919:         // Surely HTML error, treat as a text.
 920:         if (!advanced)
 921:           {
 922:             Token wrong = getNextToken();
 923:             error("unexpected '" + wrong.getImage() + "'", wrong);
 924:             buffer.setLength(0);
 925:             buffer.append(wrong.getImage());
 926:             _handleText();
 927:           }
 928:       }
 929:   }
 930: 
  /**
   * Read the element attributes, adding them into attribute set.
   * The <code>attributes</code> field is replaced by a fresh, empty set
   * before reading starts. Reading continues for as long as the token
   * ahead is a NUMTOKEN (taken as the attribute name). A name followed
   * by '=' gets the value that follows (double quoted, single quoted or
   * unquoted); a name without '=' is passed to
   * <code>noValueAttribute</code>.
   *
   * @param element The element name (needed to access attribute
   * information in dtd).
   */
  protected void readAttributes(String element)
  {
    Token name;
    Token value;
    Token next;
    String attrValue;

    // Start from an empty set: attributes of a previous tag are dropped.
    attributes = new htmlAttributeSet();

    optional(WS);

    attributeReading:
      while (getTokenAhead().kind == NUMTOKEN)
      {
        name = getNextToken();
        optional(WS);

        next = getTokenAhead();
        if (next.kind == EQ)
          {
            mustBe(EQ);
            optional(WS);

            // The first token of the value decides how the value is read.
            next = getNextToken();

            switch (next.kind)
              {
              case QUOT:

                // Read a "double quoted" value up to the closing quote.
                buffer.setLength(0);
                readTillTokenE(QUOT);
                attrValue = buffer.toString();
                break;

              case AP:

                // Read a 'single quoted' value up to the closing apostrophe.
                buffer.setLength(0);
                readTillTokenE(AP);
                attrValue = buffer.toString();
                break;

              // Read an unquoted value that starts with a NUMTOKEN.
              case NUMTOKEN:
                value = next;
                // NOTE(review): whitespace is consumed before the look-ahead
                // below, so a slash or OTHER token that follows the value
                // after whitespace is also appended to the value.
                optional(WS);

                // Check whether a quoting token follows: that means the
                // opening quote is missing.
                next = getTokenAhead();
                if (bQUOTING.get(next.kind))
                  {
                    hTag = next;
                    error("The value without opening quote is closed with '"
                          + next.getImage() + "'");
                    attrValue = value.getImage();
                  }
                else if (next.kind == SLASH || next.kind == OTHER)
                // The slash and other characters (like %) in this context
                // are treated as ordinary characters, not as tokens. Such
                // a character may be part of an unquoted URL. All following
                // NUMTOKEN, SLASH and OTHER tokens are glued to the value.
                  {
                    CPStringBuilder image = new CPStringBuilder(value.getImage());
                    while (next.kind == NUMTOKEN || next.kind == SLASH
                           || next.kind == OTHER)
                      {
                        image.append(getNextToken().getImage());
                        next = getTokenAhead();
                      }
                    attrValue = image.toString();
                  }
                else
                  attrValue = value.getImage();
                break;

              // Read an unquoted value that starts with a slash.
              case SLASH:
                value = next;
                optional(WS);

                // Check whether a quoting token follows: that means the
                // opening quote is missing.
                next = getTokenAhead();
                if (bQUOTING.get(next.kind))
                  {
                    hTag = next;
                    error("The value without opening quote is closed with '"
                          + next.getImage() + "'");
                    attrValue = value.getImage();
                  }
                else if (next.kind == NUMTOKEN || next.kind == SLASH)
                // The slash in this context is treated as an ordinary
                // character, not as a token. The slash may be part of
                // an unquoted URL. Unlike the NUMTOKEN case above, OTHER
                // tokens are not glued to the value here.
                  {
                    CPStringBuilder image = new CPStringBuilder(value.getImage());
                    while (next.kind == NUMTOKEN || next.kind == SLASH)
                      {
                        image.append(getNextToken().getImage());
                        next = getTokenAhead();
                      }
                    attrValue = image.toString();
                  }
                else
                  attrValue = value.getImage();
                break;
              default:
                // Any other token cannot start a value: stop reading the
                // attributes. The name, the '=' and this token are already
                // consumed, and the attribute is not added.
                break attributeReading;
              }
            attributes.addAttribute(name.getImage(), attrValue);
            optional(WS);
          }
        else
          // The '=' is missing: attribute without value.
          {
            noValueAttribute(element, name.getImage());
          }
      }
  }
1054: 
1055:   /**
1056:    * Return string, corresponding the given named entity. The name is passed
1057:    * with the preceeding &, but without the ending semicolon.
1058:    */
1059:   protected String resolveNamedEntity(final String a_tag)
1060:   {
1061:     // Discard &
1062:     if (!a_tag.startsWith("&"))
1063:       throw new AssertionError("Named entity " + a_tag +
1064:                                " must start witn '&'."
1065:                               );
1066: 
1067:     String tag = a_tag.substring(1);
1068: 
1069:     try
1070:       {
1071:         Entity entity = dtd.getEntity(tag);
1072:         if (entity != null)
1073:           return entity.getString();
1074: 
1075:         entity = dtd.getEntity(tag.toLowerCase());
1076: 
1077:         if (entity != null)
1078:           {
1079:             error("The name of this entity should be in lowercase", a_tag);
1080:             return entity.getString();
1081:           }
1082:       }
1083:     catch (IndexOutOfBoundsException ibx)
1084:       {
1085:         /* The error will be reported. */
1086:       }
1087: 
1088:     error("Unknown named entity", a_tag);
1089:     return a_tag;
1090:   }
1091: 
1092:   /**
1093:    * Return char, corresponding the given numeric entity.
1094:    * The name is passed with the preceeding &#, but without
1095:    * the ending semicolon.
1096:    */
1097:   protected char resolveNumericEntity(final String a_tag)
1098:   {
1099:     // Discard &#
1100:     if (!a_tag.startsWith("&#"))
1101:       throw new AssertionError("Numeric entity " + a_tag +
1102:                                " must start witn '&#'."
1103:                               );
1104: 
1105:     String tag = a_tag.substring(2);
1106: 
1107:     try
1108:       {
1109:         // Determine the encoding type:
1110:         char cx = tag.charAt(0);
1111:         if (cx == 'x' || cx == 'X') // Hexadecimal &#Xnnn;
1112: 
1113:           return (char) Integer.parseInt(tag.substring(1), 16);
1114: 
1115:         return (char) Integer.parseInt(tag);
1116:       }
1117: 
1118:     /* The error will be reported. */
1119:     catch (NumberFormatException nex)
1120:       {
1121:       }
1122:     catch (IndexOutOfBoundsException ix)
1123:       {
1124:       }
1125: 
1126:     error("Invalid numeric entity", a_tag);
1127:     return '?';
1128:   }
1129: 
1130:   /**
1131:    * Reset all fields into the intial default state, preparing the
1132:    * parset for parsing the next document.
1133:    */
1134:   protected void restart()
1135:   {
1136:     documentTags.clear();
1137:     titleHandled = false;
1138:     titleOpen = false;
1139:     buffer.setLength(0);
1140:     title.setLength(0);
1141:     validator.restart();
1142:   }
1143: 
  /**
   * The method is called when the HTML opening tag (like &lt;table&gt;)
   * is found or if the parser concludes that the one should be present
   * in the current position. The method is called immediately before
   * calling the handleStartTag.
   * This implementation is empty; it is a hook that subclasses may
   * override.
   * @param tag The tag
   * @throws ChangedCharSetException declared for overriding methods;
   * this empty implementation never throws it.
   */
  protected void startTag(TagElement tag)
                   throws ChangedCharSetException
  {
  }
1155: 
1156:   /**
1157:    * Handle a complete element, when the tag content is already present in the
1158:    * buffer and both starting and heading tags behind. This is called
1159:    * in the case when the tag text must not be parsed for the nested
1160:    * elements (elements STYLE and SCRIPT).
1161:    */
1162:   private void _handleCompleteElement(TagElement tag)
1163:   {
1164:     _handleStartTag(tag);
1165: 
1166:     // Suppress inclusion of the SCRIPT ans STYLE texts into the title.
1167:     HTML.Tag h = tag.getHTMLTag();
1168:     if (h == HTML.Tag.SCRIPT || h == HTML.Tag.STYLE)
1169:       {
1170:         boolean tmp = titleOpen;
1171:         titleOpen = false;
1172:         _handleText();
1173:         titleOpen = tmp;
1174:       }
1175:     else
1176:       _handleText();
1177: 
1178:     _handleEndTag(tag);
1179:   }
1180: 
1181:   /**
1182:    * A hooks for operations, preceeding call to handleEmptyTag().
1183:    * Handle the tag with no content, like &lt;br&gt;. As no any
1184:    * nested tags are expected, the tag validator is not involved.
1185:    * @param tag The tag being handled.
1186:    */
1187:   private void _handleEmptyTag(TagElement tag)
1188:   {
1189:     try
1190:       {
1191:         validator.validateTag(tag, attributes);
1192:         handleEmptyTag(tag);
1193:         HTML.Tag h = tag.getHTMLTag();
1194:         // When a block tag is closed, consume whitespace that follows after
1195:         // it.
1196:         // For some unknown reason a FRAME tag is not treated as block element.
1197:         // However in this case it should be treated as such.
1198:         if (isBlock(h))
1199:           optional(WS);
1200:       }
1201:     catch (ChangedCharSetException ex)
1202:       {
1203:         error("Changed charset exception:", ex.getMessage());
1204:       }
1205:   }
1206: 
1207:   /**
1208:    * A hooks for operations, preceeding call to handleEndTag().
1209:    * The method is called when the HTML closing tag
1210:    * is found. Calls handleTitle after closing the 'title' tag.
1211:    * @param tag The tag
1212:    */
1213:   private void _handleEndTag(TagElement tag)
1214:   {
1215:     if (validator.closeTag(tag))
1216:        _handleEndTag_remaining(tag);
1217:   }
1218: 
1219:   /**
1220:    * Actions that are also required if the closing action was
1221:    * initiated by the tag validator.
1222:    * Package-private to avoid an accessor method.
1223:    */
1224:   void _handleEndTag_remaining(TagElement tag)
1225:   {
1226:     HTML.Tag h = tag.getHTMLTag();
1227: 
1228:     handleEndTag(tag);
1229:     endTag(tag.fictional());
1230: 
1231:     if (h.isPreformatted())
1232:       preformatted--;
1233:     if (preformatted < 0)
1234:       preformatted = 0;
1235: 
1236:     // When a block tag is closed, consume whitespace that follows after
1237:     // it.
1238:     if (isBlock(h))
1239:       optional(WS);
1240: 
1241:     if (h == HTML.Tag.TITLE)
1242:       {
1243:         titleOpen = false;
1244:         titleHandled = true;
1245: 
1246:         char[] a = new char[ title.length() ];
1247:         title.getChars(0, a.length, a, 0);
1248:         handleTitle(a);
1249:       }
1250:   }
1251: 
1252:   /**
1253:    * A hooks for operations, preceeding call to handleStartTag().
1254:    * The method is called when the HTML opening tag ((like &lt;table&gt;)
1255:    * is found.
1256:    * Package-private to avoid an accessor method.
1257:    * @param tag The tag
1258:    */
1259:   void _handleStartTag(TagElement tag)
1260:   {
1261:     validator.openTag(tag, attributes);
1262:     startingTag(tag);
1263:     handleStartTag(tag);
1264: 
1265:     HTML.Tag h = tag.getHTMLTag();
1266: 
1267:     if (isBlock(h))
1268:       optional(WS);
1269: 
1270:     if (h.isPreformatted())
1271:       preformatted++;
1272: 
1273:     if (h == HTML.Tag.TITLE)
1274:       {
1275:         if (titleHandled)
1276:           error("Repetetive <TITLE> tag");
1277:         titleOpen = true;
1278:         titleHandled = false;
1279:       }
1280:   }
1281: 
1282:   /**
1283:    * Resume parsing after heavy errors in HTML tag structure.
1284:    * @throws ParseException
1285:    */
1286:   private void forciblyCloseTheTag()
1287:                             throws ParseException
1288:   {
1289:     int closeAt = 0;
1290:     buffer.setLength(0);
1291: 
1292:     ahead:
1293:     for (int i = 1; i < 100; i++)
1294:       {
1295:         t = getTokenAhead(i - 1);
1296:         if (t.kind == EOF || t.kind == BEGIN)
1297:           break ahead;
1298:         if (t.kind == END)
1299:           {
1300:             /* Closing '>' found. */
1301:             closeAt = i;
1302:             break ahead;
1303:           }
1304:       }
1305:     if (closeAt > 0)
1306:       {
1307:         buffer.append("Ignoring '");
1308:         for (int i = 1; i <= closeAt; i++)
1309:           {
1310:             t = getNextToken();
1311:             append(t);
1312:           }
1313:         buffer.append('\'');
1314:         error(buffer.toString());
1315:       }
1316:   }
1317: 
1318:   /**
1319:    * Handle comment in string buffer. You can avoid allocating a char
1320:    * array each time by processing your comment directly here.
1321:    */
1322:   private void handleComment()
1323:   {
1324:     char[] a = new char[ buffer.length() ];
1325:     buffer.getChars(0, a.length, a, 0);
1326:     handleComment(a);
1327:   }
1328: 
1329:   private TagElement makeTagElement(String name, boolean isSupposed)
1330:   {
1331:     Element e = dtd.elementHash.get(name.toLowerCase());
1332:     if (e == null)
1333:       {
1334:         error("Unknown tag <" + name + ">");
1335:         e = dtd.getElement(name);
1336:         e.name = name.toUpperCase();
1337:         e.index = -1;
1338:       }
1339: 
1340:     if (!documentTags.contains(e.name))
1341:       {
1342:         markFirstTime(e);
1343:         documentTags.add(e.name);
1344:       }
1345: 
1346:     return makeTag(e, isSupposed);
1347:   }
1348: 
1349:   /**
1350:    * Read till the given token, resolving entities. Consume the given
1351:    * token without adding it to buffer.
1352:    * @param till The token to read till
1353:    * @throws ParseException
1354:    */
1355:   private void readTillTokenE(int till)
1356:                        throws ParseException
1357:   {
1358:     buffer.setLength(0);
1359:     read:
1360:     while (true)
1361:       {
1362:         t = getNextToken();
1363:         if (t.kind == Constants.ENTITY)
1364:           {
1365:             resolveAndAppendEntity(t);
1366:           }
1367:         else if (t.kind == EOF)
1368:           {
1369:             error("unexpected eof", t);
1370:             break read;
1371:           }
1372:         else if (t.kind == till)
1373:           break read;
1374:         else if (t.kind == WS)
1375:           {
1376:             // Processing whitespace in accordance with CDATA rules:
1377:             String s = t.getImage();
1378:             char c;
1379:             for (int i = 0; i < s.length(); i++)
1380:               {
1381:                 c = s.charAt(i);
1382:                 if (c == '\r')
1383:                   buffer.append(' '); // CR replaced by space
1384:                 else if (c == '\n')
1385:                   { /* LF ignored */ }
1386:                 else if (c == '\t')
1387:                   buffer.append(' '); // Tab replaced by space
1388:                 else
1389:                   buffer.append(c);
1390:               }
1391:           }
1392:         else
1393:           append(t);
1394:       }
1395:   }
1396: 
1397:   /**
1398:    * Resolve the entity and append it to the end of buffer.
1399:    * @param entity
1400:    */
1401:   private void resolveAndAppendEntity(Token entity)
1402:   {
1403:     switch (entity.category)
1404:       {
1405:         case ENTITY_NAMED :
1406:           buffer.append(resolveNamedEntity(entity.getImage()));
1407:           break;
1408: 
1409:         case ENTITY_NUMERIC :
1410:           buffer.append(resolveNumericEntity(entity.getImage()));
1411:           break;
1412: 
1413:         default :
1414:           throw new AssertionError("Invalid entity category " +
1415:                                    entity.category
1416:                                   );
1417:       }
1418:   }
1419: 
1420:   /**
1421:    * Handle the remaining of HTML tags. This is a common end for
1422:    * TAG, SCRIPT and STYLE.
1423:    * @param closing True for closing tags ( &lt;/TAG&gt; ).
1424:    * @param name Name of element
1425:    * @param start Token where element has started
1426:    * @throws ParseException
1427:    */
1428:   private void restOfTag(boolean closing, Token name, Token start)
1429:                   throws ParseException
1430:   {
1431:     boolean end = false;
1432:     Token next;
1433: 
1434:     optional(WS);
1435: 
1436:     readAttributes(name.getImage());
1437: 
1438:     optional(WS);
1439: 
1440:     next = getTokenAhead();
1441:     if (next.kind == END)
1442:       {
1443:         mustBe(END);
1444:         end = true;
1445:       }
1446: 
1447:     hTag = new Token(start, next);
1448: 
1449:     if (!end)
1450:       {
1451:         // The tag body contains errors. If additionally the tag
1452:         // name is not valid, this construction is treated as text.
1453:         if (dtd.elementHash.get(name.getImage().toLowerCase()) == null &&
1454:             backupMode
1455:            )
1456:           {
1457:             error("Errors in tag body and unknown tag name. " +
1458:                   "Treating the tag as a text."
1459:                  );
1460:             reset();
1461: 
1462:             hTag = mustBe(BEGIN);
1463:             buffer.setLength(0);
1464:             buffer.append(hTag.getImage());
1465:             CDATA(false);
1466:             return;
1467:           }
1468:         else
1469:           {
1470:             error("Forcibly closing invalid parameter list");
1471:             forciblyCloseTheTag();
1472:           }
1473:       }
1474: 
1475:     if (closing)
1476:       {
1477:         endTag(false);
1478:         _handleEndTag(makeTagElement(name.getImage(), false));
1479:       }
1480:     else
1481:       {
1482:         TagElement te = makeTagElement(name.getImage(), false);
1483:         if (te.getElement().type == DTDConstants.EMPTY)
1484:           _handleEmptyTag(te);
1485:         else
1486:           {
1487:             // According to the specs we need to consume whitespace following
1488:             // immediately after a opening tag.
1489:             optional(WS);
1490:             _handleStartTag(te);
1491:           }
1492:       }
1493:   }
1494: 
  /**
   * Calls startTag() for the given tag. If startTag() throws the
   * ChangedCharSetException, the exception is not propagated: the
   * error "Invalid change of charset" is reported instead and no
   * further action is taken.
   * @param tag The tag being opened.
   */
  private void startingTag(TagElement tag)
  {
    try
      {
        startTag(tag);
      }
    catch (ChangedCharSetException cax)
      {
        // The exception itself is dropped; only the error is reported.
        error("Invalid change of charset");
      }
  }
1512: 
  /**
   * Reports the error about whitespace in a position where it is
   * not permitted.
   */
  private void ws_error()
  {
    error("Whitespace here is not permitted");
  }
1517: 
1518:   /**
1519:    * Returns true when the specified tag should be considered a block tag
1520:    * wrt whitespace handling. We need this special handling, since there
1521:    * are a couple of tags that we must treat as block tags but which aren't
1522:    * officially block tags.
1523:    *
1524:    * @param tag the tag to check
1525:    * @return true when the specified tag should be considered a block tag
1526:    *         wrt whitespace handling
1527:    */
1528:   private boolean isBlock(HTML.Tag tag)
1529:   {
1530:     return tag.isBlock() || tag == HTML.Tag.STYLE || tag == HTML.Tag.FRAME;
1531:   }
1532: }