Source for gnu.javax.swing.text.html.parser.htmlValidator

   1: /* htmlValidator.java -- The HTML content validator.
   2:    Copyright (C) 2005 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.javax.swing.text.html.parser;
  40: 
  41: import gnu.java.lang.CPStringBuilder;
  42: 
  43: import gnu.javax.swing.text.html.parser.models.node;
  44: import gnu.javax.swing.text.html.parser.models.transformer;
  45: 
  46: import java.util.BitSet;
  47: import java.util.Enumeration;
  48: import java.util.LinkedList;
  49: import java.util.ListIterator;
  50: 
  51: import javax.swing.text.SimpleAttributeSet;
  52: import javax.swing.text.html.HTML;
  53: import javax.swing.text.html.parser.*;
  54: 
  55: /**
  56:  * <p>The HTML content validator is responsible for opening and
  57:  * closing elements with optional start/end tags, detecting
  58:  * wrongly placed HTML tags and reporting errors. The working instance
  59:  * is an inner class inside the {@link javax.swing.text.html.parser.Parser}.
  60:  * </p>
  61:  * <p>This class could potentially
  62:  * provide a basis for automated closing and insertion of HTML tags,
  63:  * correcting the HTML errors that were found.</p>
  64:  * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
  65:  */
  66: public abstract class htmlValidator
  67: {
  68:   /**
  69:    * The tag reference, holding additional information that the tag
  70:    * has been forcibly closed.
  71:    */
  72:   protected class hTag
  73:   {
  74:     protected final Element element;
  75:     protected final HTML.Tag tag;
  76:     protected final TagElement tgElement;
  77:     protected boolean forcibly_closed;
  78:     protected node validationTrace;
  79: 
  80:     protected hTag(TagElement an_element)
  81:     {
  82:       element = an_element.getElement();
  83:       tag = an_element.getHTMLTag();
  84:       tgElement = an_element;
  85: 
  86:       if (element.content != null)
  87:         validationTrace = transformer.transform(element.content, dtd);
  88:     }
  89: 
  90:     /**
  91:      * This is called when the tag must be forcibly closed because
  92:      * it would make the newly appearing tag invalid.
  93:      * The parser is not notified about such event (just the error
  94:      * is reported). For such tags, the closing message does not
  95:      * appear when later reaching the end of stream. The exception is
  96:      * the &lt;head&gt; tag: the parser is notified about its silent closing
  97:      * when &lt;body&gt; or other html content appears.
  98:      */
  99:     protected void forciblyCloseDueContext()
 100:     {
 101:       forcibly_closed = true;
 102:     }
 103: 
 104:     /**
 105:      * This is called when the tag must be forcibly closed after
 106:      * reaching the end of stream. The parser is notified as if
 107:      * closing the tag explicitly.
 108:      */
 109:     protected void forciblyCloseDueEndOfStream()
 110:     {
 111:       forcibly_closed = true;
 112:       handleSupposedEndTag(element);
 113:     }
 114:   }
 115: 
 116:   /**
 117:    * The DTD, providing information about the valid document structure.
 118:    */
 119:   protected final DTD dtd;
 120: 
 121:   /**
 122:   * The stack, holding the current tag context.
 123:   */
 124:   protected final LinkedList stack = new LinkedList();
 125: 
 126:   /**
 127:    * Creates a new tag stack, using the given DTD.
 128:    * @param a_dtd A DTD, providing the information about the valid
 129:    * tag content.
 130:    */
 131:   public htmlValidator(DTD a_dtd)
 132:   {
 133:     dtd = a_dtd;
 134:   }
 135: 
 136:   /**
 137:    * Close all opened tags (called at the end of parsing).
 138:    */
 139:   public void closeAll()
 140:   {
 141:     hTag h;
 142:     while (!stack.isEmpty())
 143:       {
 144:         h = (hTag) stack.getLast();
 145:         if (!h.forcibly_closed && !h.element.omitEnd())
 146:           s_error("Unclosed <" + h.tag + ">, closing at the end of stream");
 147: 
 148:         handleSupposedEndTag(h.element);
 149: 
 150:         closeTag(h.tgElement);
 151:       }
 152:   }
 153: 
 154:   /**
 155:    * Remove the given tag from the stack or (if found) from the list
 156:    * of the forcibly closed tags.
 157:    */
 158:   public boolean closeTag(TagElement tElement)
 159:   {
 160:     HTML.Tag tag = tElement.getHTMLTag();
 161:     hTag x;
 162:     hTag close;
 163: 
 164:     if (!stack.isEmpty())
 165:       {
 166:         ListIterator iter = stack.listIterator(stack.size());
 167: 
 168:         while (iter.hasPrevious())
 169:           {
 170:             x = (hTag) iter.previous();
 171:             if (tag.equals(x.tag))
 172:               {
 173:                 if (x.forcibly_closed && !x.element.omitEnd())
 174:                   s_error("The tag <" + x.tag +
 175:                           "> has already been forcibly closed"
 176:                          );
 177: 
 178: 
 179:                 // If the tag has a content model defined, forcibly close all
 180:                 // tags that were opened after the tag being currently closed.
 181:                 closing:
 182:                 if (x.element.content != null)
 183:                   {
 184:                     iter = stack.listIterator(stack.size());
 185:                     while (iter.hasPrevious())
 186:                       {
 187:                         close = (hTag) iter.previous();
 188:                         if (close == x)
 189:                           break closing;
 190:                         handleSupposedEndTag(close.element);
 191:                         iter.remove();
 192:                       }
 193:                   }
 194: 
 195:                 stack.remove(x);
 196:                 return true;
 197:               }
 198:           }
 199:       }
 200:     s_error("Closing unopened <" + tag + ">");
 201:     return false;
 202:   }
 203: 
 204:   /**
 205:    * Add the given HTML tag to the stack of the opened tags. Forcibly closes
 206:    * all tags in the stack that does not allow this tag in they content (error
 207:    * is reported).
 208:    * @param element
 209:    */
 210:   public void openTag(TagElement tElement, htmlAttributeSet parameters)
 211:   {
 212:     // If this is a fictional call, the message from the parser
 213:     // has recursively returned - ignore.
 214:     if (tElement.fictional())
 215:       return;
 216: 
 217:     validateParameters(tElement, parameters);
 218: 
 219:     // If the stack is empty, start from HTML
 220:     if (stack.isEmpty() && tElement.getHTMLTag() != HTML.Tag.HTML)
 221:       {
 222:         Element html = dtd.getElement(HTML.Tag.HTML.toString());
 223:         openFictionalTag(html);
 224:       }
 225: 
 226:     Object v = tagIsValidForContext(tElement);
 227:     if (v != Boolean.TRUE)
 228:       {
 229:         // The tag is not valid for context, the content
 230:         // model suggest to open another tag.
 231:         if (v instanceof Element)
 232:           {
 233:             int n = 0;
 234:             while (v instanceof Element && (n++ < 100))
 235:               {
 236:                 Element fe = (Element) v;
 237: 
 238:                 // notify the content model that we add the proposed tag
 239:                 node ccm = getCurrentContentModel();
 240:                 if (ccm != null)
 241:                   ccm.show(fe);
 242:                 openFictionalTag(fe);
 243: 
 244:                 Object vv = tagIsValidForContext(tElement);
 245:                 if (vv instanceof Element) // One level of nesting is supported.
 246:                   {
 247:                     openFictionalTag((Element) vv);
 248: 
 249:                     Object vx = tagIsValidForContext(tElement);
 250:                     if (vx instanceof Element)
 251:                       openFictionalTag((Element) vx);
 252:                   }
 253:                 else if (vv == Boolean.FALSE)
 254:                   {
 255:                     // The tag is still not valid for the current
 256:                     // content after opening a fictional element.
 257:                     if (fe.omitEnd())
 258:                       {
 259:                         // close the previously opened fictional tag.
 260:                         closeLast();
 261:                         vv = tagIsValidForContext(tElement);
 262:                         if (vv instanceof Element)
 263: 
 264:                           // another tag was suggested by the content model
 265:                           openFictionalTag((Element) vv);
 266:                       }
 267:                   }
 268:                 v = tagIsValidForContext(tElement);
 269:               }
 270:           }
 271:         else // If the current element has the optional end tag, close it.
 272:           {
 273:             if (!stack.isEmpty())
 274:               {
 275:                 closing:
 276:                 do
 277:                   {
 278:                     hTag last = (hTag) stack.getLast();
 279:                     if (last.element.omitEnd())
 280:                       {
 281:                         closeLast();
 282:                         v = tagIsValidForContext(tElement);
 283:                         if (v instanceof Element) // another tag was suggested by the content model
 284:                           {
 285:                             openFictionalTag((Element) v);
 286:                             break closing;
 287:                           }
 288:                       }
 289:                     else
 290:                       break closing;
 291:                   }
 292:                 while (v == Boolean.FALSE && !stack.isEmpty());
 293:               }
 294:           }
 295:       }
 296: 
 297:     stack.add(new hTag(tElement));
 298:   }
 299: 
 300:   /**
 301:    * Clear the stack.
 302:    */
 303:   public void restart()
 304:   {
 305:     stack.clear();
 306:   }
 307: 
 308:   /**
 309:    * Check if this tag is valid for the current context. Return Boolean.True if
 310:    * it is OK, Boolean.False if it is surely not OK or the Element that the
 311:    * content model recommends to insert making the situation ok. If Boolean.True
 312:    * is returned, the content model current position is moved forward. Otherwise
 313:    * this position remains the same.
 314:    *
 315:    * @param tElement
 316:    * @return
 317:    */
 318:   public Object tagIsValidForContext(TagElement tElement)
 319:   {
 320:     // Check the current content model, if one is available.
 321:     node cv = getCurrentContentModel();
 322: 
 323:     if (cv != null)
 324:       return cv.show(tElement.getElement());
 325: 
 326:     // Check exclusions and inclusions.
 327:     ListIterator iter = stack.listIterator(stack.size());
 328:     hTag t = null;
 329:     final int idx = tElement.getElement().index;
 330: 
 331:     // Check only known tags.
 332:     if (idx >= 0)
 333:       {
 334:         BitSet inclusions = new BitSet();
 335:         while (iter.hasPrevious())
 336:           {
 337:             t = (hTag) iter.previous();
 338:             if (! t.forcibly_closed)
 339:               {
 340:                 if (t.element.exclusions != null
 341:                     && t.element.exclusions.get(idx))
 342:                   return Boolean.FALSE;
 343: 
 344:                 if (t.element.inclusions != null)
 345:                   inclusions.or(t.element.inclusions);
 346:               }
 347:           }
 348:         if (! inclusions.get(idx))
 349:           {
 350:             // If we need to insert something, and cannot do this, but
 351:             // it is allowed to insert the paragraph here, insert the
 352:             // paragraph.
 353:             Element P = dtd.getElement(HTML_401F.P);
 354:             if (inclusions.get(P.index))
 355:               return P;
 356:             else
 357:               return Boolean.FALSE;
 358:           }
 359:       }
 360:     return Boolean.TRUE;
 361:   }
 362: 
 363:   /**
 364:    * Validate tag without storing in into the tag stack. This is called
 365:    * for the empty tags and results the subsequent calls to the openTag
 366:    * and closeTag.
 367:    */
 368:   public void validateTag(TagElement tElement, htmlAttributeSet parameters)
 369:   {
 370:     openTag(tElement, parameters);
 371:     closeTag(tElement);
 372:   }
 373: 
 374:   /**
 375:    * Check for mandatory elements, subsequent to the last tag:
 376:    * @param tElement The element that will be inserted next.
 377:    */
 378:   protected void checkContentModel(TagElement tElement, boolean first)
 379:   {
 380:     if (stack.isEmpty())
 381:       return;
 382: 
 383:     hTag last = (hTag) stack.getLast();
 384:     if (last.validationTrace == null)
 385:       return;
 386: 
 387:     Object r = last.validationTrace.show(tElement.getElement());
 388:     if (r == Boolean.FALSE)
 389:       s_error("The <" + last.element + "> does not match the content model " +
 390:               last.validationTrace
 391:              );
 392:     else if (r instanceof Element) // The content model recommends insertion of this element
 393:       {
 394:         if (!first)
 395:           closeTag(last.tgElement);
 396:         handleSupposedStartTag((Element) r);
 397:         openTag(new TagElement((Element) r), null);
 398:       }
 399:   }
 400: 
 401:   /**
 402:    * The method is called when the tag must be closed because
 403:    * it does not allow the subsequent elements inside its context
 404:    * or the end of stream has been reached. The parser is only
 405:    * informed if the element being closed does not require the
 406:    * end tag (the "omitEnd" flag is set).
 407:    * The closing message must be passed to the parser mechanism
 408:    * before passing message about the opening the next tag.
 409:    *
 410:    * @param element The tag being fictionally (forcibly) closed.
 411:    */
 412:   protected abstract void handleSupposedEndTag(Element element);
 413: 
 414:   /**
 415:    * The method is called when the validator decides to open the
 416:    * tag on its own initiative. This may happen if the content model
 417:    * includes the element with the optional (supposed) start tag.
 418:    *
 419:    * @param element The tag being opened.
 420:    */
 421:   protected abstract void handleSupposedStartTag(Element element);
 422: 
 423:   /**
 424:    * Handles the error message. This method must be overridden to pass
 425:    * the message where required.
 426:    * @param msg The message text.
 427:    */
 428:   protected abstract void s_error(String msg);
 429: 
 430:   /**
 431:    * Validate the parameters, report the error if the given parameter is
 432:    * not in the parameter set, valid for the given attribute. The information
 433:    * about the valid parameter set is taken from the Element, enclosed
 434:    * inside the tag. The method does not validate the default parameters.
 435:    * @param tag The tag
 436:    * @param parameters The parameters of this tag.
 437:    */
 438:   protected void validateParameters(TagElement tag, htmlAttributeSet parameters)
 439:   {
 440:     if (parameters == null ||
 441:         parameters == htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET ||
 442:         parameters == SimpleAttributeSet.EMPTY
 443:        )
 444:       return;
 445: 
 446:     Enumeration enumeration = parameters.getAttributeNames();
 447: 
 448:     while (enumeration.hasMoreElements())
 449:       {
 450:         validateAttribute(tag, parameters, enumeration);
 451:       }
 452: 
 453:     // Check for missing required values.
 454:     AttributeList a = tag.getElement().getAttributes();
 455: 
 456:     while (a != null)
 457:       {
 458:         if (a.getModifier() == DTDConstants.REQUIRED)
 459:           if (parameters.getAttribute(a.getName()) == null)
 460:             {
 461:               s_error("Missing required attribute '" + a.getName() + "' for <" +
 462:                       tag.getHTMLTag() + ">"
 463:                      );
 464:             }
 465:         a = a.next;
 466:       }
 467:   }
 468: 
 469:   private node getCurrentContentModel()
 470:   {
 471:     if (!stack.isEmpty())
 472:       {
 473:         hTag last = (hTag) stack.getLast();
 474:         return last.validationTrace;
 475:       }
 476:     else
 477:       return null;
 478:   }
 479: 
 480:   private void closeLast()
 481:   {
 482:     handleSupposedEndTag(((hTag) stack.getLast()).element);
 483:     stack.removeLast();
 484:   }
 485: 
 486:   private void openFictionalTag(Element e)
 487:   {
 488:     handleSupposedStartTag(e);
 489:     stack.add(new hTag(new TagElement(e, true)));
 490:     if (!e.omitStart())
 491:       s_error("<" + e + "> is expected (supposing it)");
 492:   }
 493: 
 494:   private void validateAttribute(TagElement tag, htmlAttributeSet parameters,
 495:                                  Enumeration enumeration
 496:                                 )
 497:   {
 498:     Object foundAttribute;
 499:     AttributeList dtdAttribute;
 500:     foundAttribute = enumeration.nextElement();
 501:     dtdAttribute = tag.getElement().getAttribute(foundAttribute.toString());
 502:     if (dtdAttribute == null)
 503:       {
 504:         CPStringBuilder valid =
 505:           new CPStringBuilder("The tag <" + tag.getHTMLTag() +
 506:                               "> cannot contain the attribute '" + foundAttribute +
 507:                               "'. The valid attributes for this tag are: "
 508:                               );
 509: 
 510:         AttributeList a = tag.getElement().getAttributes();
 511: 
 512:         while (a != null)
 513:           {
 514:             valid.append(a.name.toUpperCase());
 515:             valid.append(' ');
 516:             a = a.next;
 517:           }
 518:         s_error(valid.toString());
 519:       }
 520: 
 521:     else
 522:       {
 523:         String value = parameters.getAttribute(foundAttribute).toString();
 524: 
 525:         if (dtdAttribute.type == DTDConstants.NUMBER)
 526:           validateNumberAttribute(tag, foundAttribute, value);
 527: 
 528:         if (dtdAttribute.type == DTDConstants.NAME ||
 529:             dtdAttribute.type == DTDConstants.ID
 530:            )
 531:           validateNameOrIdAttribute(tag, foundAttribute, value);
 532: 
 533:         if (dtdAttribute.values != null)
 534:           validateAttributeWithValueList(tag, foundAttribute, dtdAttribute,
 535:                                          value
 536:                                         );
 537:       }
 538:   }
 539: 
 540:   private void validateAttributeWithValueList(TagElement tag,
 541:                                               Object foundAttribute,
 542:                                               AttributeList dtdAttribute,
 543:                                               String value
 544:                                              )
 545:   {
 546:     if (!dtdAttribute.values.contains(value.toLowerCase()) &&
 547:         !dtdAttribute.values.contains(value.toUpperCase())
 548:        )
 549:       {
 550:         CPStringBuilder valid;
 551:         if (dtdAttribute.values.size() == 1)
 552:           valid =
 553:             new CPStringBuilder("The attribute '" + foundAttribute +
 554:                                 "' of the tag <" + tag.getHTMLTag() +
 555:                                 "> cannot have the value '" + value +
 556:                                 "'. The only valid value is "
 557:                                 );
 558:         else
 559:           valid =
 560:             new CPStringBuilder("The attribute '" + foundAttribute +
 561:                                 "' of the tag <" + tag.getHTMLTag() +
 562:                                 "> cannot have the value '" + value + "'. The " +
 563:                                 dtdAttribute.values.size() +
 564:                                 " valid values are: "
 565:                                 );
 566: 
 567:         Enumeration vv = dtdAttribute.values.elements();
 568:         while (vv.hasMoreElements())
 569:           {
 570:             valid.append('"');
 571:             valid.append(vv.nextElement());
 572:             valid.append("\"  ");
 573:           }
 574:         s_error(valid.toString());
 575:       }
 576:   }
 577: 
 578:   private void validateNameOrIdAttribute(TagElement tag, Object foundAttribute,
 579:                                          String value
 580:                                         )
 581:   {
 582:     boolean ok = true;
 583: 
 584:     if (!Character.isLetter(value.charAt(0)))
 585:       ok = false;
 586: 
 587:     char c;
 588:     for (int i = 0; i < value.length(); i++)
 589:       {
 590:         c = value.charAt(i);
 591:         if (!(
 592:               Character.isLetter(c) || Character.isDigit(c) ||
 593:               "".indexOf(c) >= 0
 594:             )
 595:            )
 596:           ok = false;
 597:       }
 598:     if (!ok)
 599:       s_error("The '" + foundAttribute + "' attribute of the tag <" +
 600:               tag.getHTMLTag() + "> must start from letter and consist of " +
 601:               "letters, digits, hypens, colons, underscores and periods. " +
 602:               "It cannot be '" + value + "'"
 603:              );
 604:   }
 605: 
 606:   private void validateNumberAttribute(TagElement tag, Object foundAttribute,
 607:                                        String value
 608:                                       )
 609:   {
 610:     try
 611:       {
 612:         Integer.parseInt(value);
 613:       }
 614:     catch (NumberFormatException ex)
 615:       {
 616:         s_error("The '" + foundAttribute + "' attribute of the tag <" +
 617:                 tag.getHTMLTag() + "> must be a valid number and not '" +
 618:                 value + "'"
 619:                );
 620:       }
 621:   }
 622: }