Source for gnu.xml.pipeline.ValidationConsumer

   1: /* ValidationConsumer.java --
   2:    Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: package gnu.xml.pipeline;
  39: 
  40: import java.io.IOException;
  41: import java.io.StringReader;
  42: import java.io.StringWriter;
  43: import java.util.EmptyStackException;
  44: import java.util.Enumeration;
  45: import java.util.Hashtable;
  46: import java.util.Stack;
  47: import java.util.StringTokenizer;
  48: import java.util.Vector;
  49: 
  50: import org.xml.sax.Attributes;
  51: import org.xml.sax.EntityResolver;
  52: import org.xml.sax.ErrorHandler;
  53: import org.xml.sax.InputSource;
  54: import org.xml.sax.Locator;
  55: import org.xml.sax.SAXException;
  56: import org.xml.sax.SAXParseException;
  57: import org.xml.sax.XMLReader;
  58: import org.xml.sax.helpers.XMLReaderFactory;
  59: 
  60: /**
  61:  * This class checks SAX2 events to report validity errors; it works as
  62:  * both a filter and a terminus on an event pipeline.  It relies on the
  63:  * producer of SAX events to: <ol>
  64:  *
  65:  *      <li> Conform to the specification of a non-validating XML parser that
  66:  *      reads all external entities, reported using SAX2 events. </li>
  67:  *
  68:  *      <li> Report ignorable whitespace as such (through the ContentHandler
  69:  *      interface).  This is, strictly speaking, optional for nonvalidating
  70:  *      XML processors.  </li>
  71:  *
  72:  *      <li> Make SAX2 DeclHandler callbacks, with default
  73:  *      attribute values already normalized (and without "&lt;").</li>
  74:  *
  75:  *      <li> Make SAX2 LexicalHandler startDTD() and endDTD ()
  76:  *      callbacks. </li>
  77:  *
  78:  *      <li> Act as if the <em>(URI)/namespace-prefixes</em> property were
  79:  *      set to true, by providing XML 1.0 names and all <code>xmlns*</code>
  80:  *      attributes (rather than omitting either or both). </li>
  81:  *
  82:  *      </ol>
  83:  *
  84:  * <p> At this writing, the major SAX2 parsers (such as &AElig;lfred2,
  85:  * Crimson, and Xerces) meet these requirements, and this validation
  86:  * module is used by the optional &AElig;lfred2 validation support.
  87:  * </p>
  88:  *
  89:  * <p> Note that because this is a layered validator, it has to duplicate some
  90:  * work that the parser is doing; there are also other costs to layering.
  91:  * However, <em>because of layering it doesn't need a parser</em> in order
  92:  * to work! You can use it with anything that generates SAX events, such
  93:  * as an application component that wants to detect invalid content in
  94:  * a changed area without validating an entire document, or which wants to
  95:  * ensure that it doesn't write invalid data to a communications partner.</p>
  96:  *
  97:  * <p> Also, note that because this is a layered validator, the line numbers
  98:  * reported for some errors may seem strange.  For example, if an element does
  99:  * not permit character content, the validator
 100:  * will use the locator provided to it.
 101:  * That might reflect the last character of a <em>characters</em> event
 102:  * callback, rather than the first non-whitespace character. </p>
 103:  *
 104:  * <hr />
 105:  *
 106:  * <!--
 107:  * <p> Of interest is the fact that unlike most currently known XML validators,
 108:  * this one can report some cases of non-determinism in element content models.
 109:  * It is a compile-time option, enabled by default.  This will only report
 110:  * such XML errors if they relate to content actually appearing in a document;
 111:  * content models aren't aggressively scanned for non-deterministic structure.
 112:  * Documents which trigger such non-deterministic transitions may be handled
 113:  * differently by different validating parsers, without losing conformance
 114:  * to the XML specification. </p>
 115:  * -->
 116:  *
 117:  * <p> Current limitations of the validation performed are in roughly three
 118:  * categories.  </p>
 119:  *
 120:  * <p> The first category represents constraints which demand violations
 121:  * of software layering:  exposing lexical details, one of the first things
 122:  * that <em>application</em> programming interfaces (APIs) hide.  These
 123:  * invariably relate to XML entity handling, and to historical oddities
 124:  * of the XML validation semantics.  Curiously,
 125:  * recent (Autumn 1999) conformance testing showed that these constraints are
 126:  * among those handled worst by existing XML validating parsers.  Arguments
 127:  * have been made that each of these VCs should be turned into WFCs (most
 128:  * of them) or discarded (popular for the standalone declaration); in short,
 129:  * that these are bugs in the XML specification (not all via SGML): </p><ul>
 130:  *
 131:  *      <li> The <em>Proper Declaration/PE Nesting</em> and
 132:  *      <em>Proper Group/PE Nesting</em> VCs can't be tested because they
 133:  *      require access to particularly low level lexical level information.
 134:  *      In essence, the reason XML isn't a simple thing to parse is that
 135:  *      it's not a context free grammar, and these constraints elevate that
 136:  *      SGML-derived context sensitivity to the level of a semantic rule.
 137:  *
 138:  *      <li> The <em>Standalone Document Declaration</em> VC can't be
 139:  *      tested.  This is for two reasons.  First, this flag isn't made
 140:  *      available through SAX2.  Second, it also requires breaking that
 141:  *      lexical layering boundary.  (If you ever wondered why classes
 142:  *      in compiler construction or language design barely mention the
 143:  *      existence of context-sensitive grammars, it's because of messy
 144:  *      issues like these.)
 145:  *
 146:  *      <li> The <em>Entity Declared</em> VC can't be tested, because it
 147:  *      also requires breaking that lexical layering boundary!  There's also
 148:  *      another issue: the VC wording (and seemingly intent) is ambiguous.
 149:  *      (This is still true in the "Second edition" XML spec.)
 150:  *      Since there is a WFC of the same name, everyone's life would be
 151:  *      easier if references to undeclared parsed entities were always well
 152:  *      formedness errors, regardless of whether they're parameter entities
 153:  *      or not.  (Note that nonvalidating parsers are not required
 154:  *      to report all such well formedness errors if they don't read external
 155:  *      parameter entities, although currently most XML parsers read them
 156:  *      in an attempt to avoid problems from inconsistent parser behavior.)
 157:  *
 158:  *      </ul>
 159:  *
 160:  * <p> The second category of limitations on this validation represent
 161:  * constraints associated with information that is not guaranteed to be
 162:  * available (or in one case, <em>is guaranteed not to be available</em>),
 163:  * through the SAX2 API: </p><ul>
 164:  *
 165:  *      <li> The <em>Unique Element Type Declaration</em> VC may not be
 166:  *      reportable, if the underlying parser happens not to expose
 167:  *      multiple declarations.   (&AElig;lfred2 reports these validity
 168:  *      errors directly.)</li>
 169:  *
 170:  *      <li> Similarly, the <em>Unique Notation Name</em> VC, added in the
 171:  *      14-January-2000 XML spec errata to restrict typing models used by
 172:  *      elements, may not be reportable.  (&AElig;lfred reports these
 173:  *      validity errors directly.) </li>
 174:  *
 175:  *      </ul>
 176:  *
 177:  * <p> A third category relates to ease of implementation.  (Think of this
 178:  * as "bugs".)  The most notable issue here is character handling.  Rather
 179:  * than attempting to implement the voluminous character tables in the XML
 180:  * specification (Appendix B), Unicode rules are used directly from
 181:  * the java.lang.Character class.  Recent JVMs have begun to diverge from
 182:  * the original specification for that class (Unicode 2.0), meaning that
 183:  * different JVMs may handle that aspect of conformance differently.
 184:  * </p>
 185:  *
 186:  * <p> Note that for some of the validity errors that SAX2 does not
 187:  * expose, a nonvalidating parser is permitted (by the XML specification)
 188:  * to report validity errors.  When used with a parser that does so for
 189:  * the validity constraints mentioned above (or any other SAX2 event
 190:  * stream producer that does the same thing), overall conformance is
 191:  * substantially improved.
 192:  *
 193:  * @see gnu.xml.aelfred2.SAXDriver
 194:  * @see gnu.xml.aelfred2.XmlReader
 195:  *
 196:  * @author David Brownell
 197:  */
 198: public final class ValidationConsumer extends EventFilter
 199: {
    // report error if we happen to notice a non-deterministic choice?
    // we won't report buggy content models; just buggy instances
    // (compile-time switch; currently off)
    private static final boolean        warnNonDeterministic = false;

    // for tracking active content models
    // rootName is recorded by startDTD() and cleared by resetState()
    private String              rootName;
    private Stack               contentStack = new Stack ();

    // flags for "saved DTD" processing
    // disableDeclarations: when set, startDTD(), endDTD() and
    //  attributeDecl() return without recording or forwarding anything
    // disableReset: when set, resetState() leaves all recorded state alone
    private boolean             disableDeclarations;
    private boolean             disableReset;

    //
    // most VCs get tested when we see element start tags.  the per-element
    // info (including attributes) recorded here duplicates that found inside
    // many nonvalidating parsers, hence dual lookups etc ... that's why a
    // layered validator isn't going to be as fast as a non-layered one.
    //

    // key = element name; value = ElementInfo
    // (attributeDecl() creates an entry on demand when an attribute is
    // declared before the element's content model has been seen)
    private Hashtable           elements = new Hashtable ();

    // some VCs relate to ID/IDREF/IDREFS attributes
    // key = id; value = boolean true (defd) or false (refd)
    private Hashtable           ids = new Hashtable ();

    // we just record declared notation and unparsed entity names.
    // the implementation here is simple/slow; these features
    // are seldom used, one hopes they'll wither away soon
    //
    // nDeferred holds notation names that were referenced before any
    // declaration for them was seen; uDeferred does the same for unparsed
    // entity names used in attribute defaults.  endDTD() checks both lists
    // against the declared names and then empties them.
    private Vector              notations = new Vector (5, 5);
    private Vector              nDeferred = new Vector (5, 5);
    private Vector              unparsed = new Vector (5, 5);
    private Vector              uDeferred = new Vector (5, 5);

        // note: DocBk 3.1.7 XML defines over 2 dozen notations,
        // used when defining unparsed entities for graphics
        // (and maybe in other places)
 237: 
 238: 
 239: 
 240:     /**
 241:      * Creates a pipeline terminus which consumes all events passed to
 242:      * it; this will report validity errors as if they were fatal errors,
 243:      * unless an error handler is assigned.
 244:      *
 245:      * @see #setErrorHandler
 246:      */
 247:         // constructor used by PipelineFactory
 248:             // ... and want one taking system ID of an external subset
 249:     public ValidationConsumer ()
 250:     {
 251:         this (null);
 252:     }
 253: 
 254:     /**
 255:      * Creates a pipeline filter which reports validity errors and then
 256:      * passes events on to the next consumer if they were not fatal.
 257:      *
 258:      * @see #setErrorHandler
 259:      */
 260:         // constructor used by PipelineFactory
 261:             // ... and want one taking system ID of an external subset
 262:             // (which won't send declaration events)
 263:     public ValidationConsumer (EventConsumer next)
 264:     {
 265:         super (next);
 266: 
 267:         setContentHandler (this);
 268:         setDTDHandler (this);
 269:         try { setProperty (DECL_HANDLER, this); }
 270:         catch (Exception e) { /* "can't happen" */ }
 271:         try { setProperty (LEXICAL_HANDLER, this); }
 272:         catch (Exception e) { /* "can't happen" */ }
 273:     }
 274: 
 275: 
    // Placeholder root element name.  The DTD-preloading constructor
    // substitutes it when the caller passes a null root name, declares it
    // as an EMPTY element in the synthesized document, and clears the
    // recorded root name again once the DTD has been loaded.
    private static final String fakeRootName
        = ":Nobody:in:their_Right.Mind_would:use:this-name:1x:";
 278: 
 279:     /**
 280:      * Creates a validation consumer which is preloaded with the DTD provided.
 281:      * It does this by constructing a document with that DTD, then parsing
 282:      * that document and recording its DTD declarations.  Then it arranges
 283:      * not to modify that information.
 284:      *
 285:      * <p> The resulting validation consumer will only validate against
 286:      * the specified DTD, regardless of whether some other DTD is found
 287:      * in a document being parsed.
 288:      *
 289:      * @param rootName The name of the required root element; if this is
 290:      *  null, any root element name will be accepted.
 291:      * @param publicId If non-null and there is a non-null systemId, this
 292:      *  identifier provides an alternate access identifier for the DTD's
 293:      *  external subset.
 294:      * @param systemId If non-null, this is a URI (normally URL) that
 295:      *  may be used to access the DTD's external subset.
 296:      * @param internalSubset If non-null, holds literal markup declarations
 297:      *  comprising the DTD's internal subset.
 298:      * @param resolver If non-null, this will be provided to the parser for
 299:      *  use when resolving parameter entities (including any external subset).
 302:      * @param minimalDocument If non-null, a minimal valid document.
 303:      *
 304:      * @exception SAXNotSupportedException If the default SAX parser does
 305:      *  not support the standard lexical or declaration handlers.
 306:      * @exception SAXParseException If the specified DTD has either
 307:      *  well-formedness or validity errors
 308:      * @exception IOException If the specified DTD can't be read for
 309:      *  some reason
 310:      */
 311:     public ValidationConsumer (
 312:         String          rootName,
 313:         String          publicId,
 314:         String          systemId,
 315:         String          internalSubset,
 316:         EntityResolver  resolver,
 317:         String          minimalDocument
 318:     ) throws SAXException, IOException
 319:     {
 320:         this (null);
 321: 
 322:         disableReset = true;
 323:         if (rootName == null)
 324:             rootName = fakeRootName;
 325: 
 326:         //
 327:         // Synthesize document with that DTD; is it possible to do
 328:         // better for the declaration of the root element?
 329:         //
 330:         // NOTE:  can't use SAX2 to write internal subsets.
 331:         //
 332:         StringWriter    writer = new StringWriter ();
 333: 
 334:         writer.write ("<!DOCTYPE ");
 335:         writer.write (rootName);
 336:         if (systemId != null) {
 337:             writer.write ("\n  ");
 338:             if (publicId != null) {
 339:                 writer.write ("PUBLIC '");
 340:                 writer.write (publicId);
 341:                 writer.write ("'\n\t'");
 342:             } else
 343:                 writer.write ("SYSTEM '");
 344:             writer.write (systemId);
 345:             writer.write ("'");
 346:         }
 347:         writer.write (" [ ");
 348:         if (rootName == fakeRootName) {
 349:             writer.write ("\n<!ELEMENT ");
 350:             writer.write (rootName);
 351:             writer.write (" EMPTY>");
 352:         }
 353:         if (internalSubset != null)
 354:             writer.write (internalSubset);
 355:         writer.write ("\n ]>");
 356: 
 357:         if (minimalDocument != null) {
 358:             writer.write ("\n");
 359:             writer.write (minimalDocument);
 360:             writer.write ("\n");
 361:         } else {
 362:             writer.write (" <");
 363:             writer.write (rootName);
 364:             writer.write ("/>\n");
 365:         }
 366:         minimalDocument = writer.toString ();
 367: 
 368:         //
 369:         // OK, load it
 370:         //
 371:         XMLReader       producer;
 372: 
 373:         producer = XMLReaderFactory.createXMLReader ();
 374:         bind (producer, this);
 375: 
 376:         if (resolver != null)
 377:             producer.setEntityResolver (resolver);
 378: 
 379:         InputSource     in;
 380: 
 381:         in = new InputSource (new StringReader (minimalDocument));
 382:         producer.parse (in);
 383: 
 384:         disableDeclarations = true;
 385:         if (rootName == fakeRootName)
 386:             this.rootName = null;
 387:     }
 388: 
 389:     private void resetState ()
 390:     {
 391:         if (!disableReset) {
 392:             rootName = null;
 393:             contentStack.removeAllElements ();
 394:             elements.clear ();
 395:             ids.clear ();
 396: 
 397:             notations.removeAllElements ();
 398:             nDeferred.removeAllElements ();
 399:             unparsed.removeAllElements ();
 400:             uDeferred.removeAllElements ();
 401:         }
 402:     }
 403: 
 404: 
 405:     private void warning (String description)
 406:     throws SAXException
 407:     {
 408:         ErrorHandler            errHandler = getErrorHandler ();
 409:         Locator                 locator = getDocumentLocator ();
 410:         SAXParseException       err;
 411: 
 412:         if (errHandler == null)
 413:             return;
 414: 
 415:         if (locator == null)
 416:             err = new SAXParseException (description, null, null, -1, -1);
 417:         else
 418:             err = new SAXParseException (description, locator);
 419:         errHandler.warning (err);
 420:     }
 421: 
 422:     // package private (for ChildrenRecognizer)
 423:     private void error (String description)
 424:     throws SAXException
 425:     {
 426:         ErrorHandler            errHandler = getErrorHandler ();
 427:         Locator                 locator = getDocumentLocator ();
 428:         SAXParseException       err;
 429: 
 430:         if (locator == null)
 431:             err = new SAXParseException (description, null, null, -1, -1);
 432:         else
 433:             err = new SAXParseException (description, locator);
 434:         if (errHandler != null)
 435:             errHandler.error (err);
 436:         else    // else we always treat it as fatal!
 437:             throw err;
 438:     }
 439: 
 440:     private void fatalError (String description)
 441:     throws SAXException
 442:     {
 443:         ErrorHandler            errHandler = getErrorHandler ();
 444:         Locator                 locator = getDocumentLocator ();
 445:         SAXParseException       err;
 446: 
 447:         if (locator != null)
 448:             err = new SAXParseException (description, locator);
 449:         else
 450:             err = new SAXParseException (description, null, null, -1, -1);
 451:         if (errHandler != null)
 452:             errHandler.fatalError (err);
 453:         // we always treat this as fatal, regardless of the handler
 454:         throw err;
 455:     }
 456: 
 457: 
 458:     private static boolean isExtender (char c)
 459:     {
 460:         // [88] Extender ::= ...
 461:         return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
 462:                || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
 463:                || (c >= 0x3031 && c <= 0x3035)
 464:                || (c >= 0x309d && c <= 0x309e)
 465:                || (c >= 0x30fc && c <= 0x30fe);
 466:     }
 467: 
 468: 
 469:     // use augmented Unicode rules, not full XML rules
 470:     private boolean isName (String name, String context, String id)
 471:     throws SAXException
 472:     {
 473:         char    buf [] = name.toCharArray ();
 474:         boolean pass = true;
 475: 
 476:         if (!Character.isUnicodeIdentifierStart (buf [0])
 477:                 && ":_".indexOf (buf [0]) == -1)
 478:             pass = false;
 479:         else {
 480:             int max = buf.length;
 481:             for (int i = 1; pass && i < max; i++) {
 482:                 char c = buf [i];
 483:                 if (!Character.isUnicodeIdentifierPart (c)
 484:                         && ":-_.".indexOf (c) == -1
 485:                         && !isExtender (c))
 486:                     pass = false;
 487:             }
 488:         }
 489: 
 490:         if (!pass)
 491:             error ("In " + context + " for " + id
 492:                 + ", '" + name + "' is not a name");
 493:         return pass;    // true == OK
 494:     }
 495: 
 496:     // use augmented Unicode rules, not full XML rules
 497:     private boolean isNmtoken (String nmtoken, String context, String id)
 498:     throws SAXException
 499:     {
 500:         char    buf [] = nmtoken.toCharArray ();
 501:         boolean pass = true;
 502:         int     max = buf.length;
 503: 
 504:         // XXX make this share code with isName
 505: 
 506:         for (int i = 0; pass && i < max; i++) {
 507:                 char c = buf [i];
 508:             if (!Character.isUnicodeIdentifierPart (c)
 509:                     && ":-_.".indexOf (c) == -1
 510:                     && !isExtender (c))
 511:                 pass = false;
 512:         }
 513: 
 514:         if (!pass)
 515:             error ("In " + context + " for " + id
 516:                 + ", '" + nmtoken + "' is not a name token");
 517:         return pass;    // true == OK
 518:     }
 519: 
 520:     private void checkEnumeration (String value, String type, String name)
 521:     throws SAXException
 522:     {
 523:         if (!hasMatch (value, type))
 524:             // VC: Enumeration
 525:             error ("Value '" + value
 526:                 + "' for attribute '" + name
 527:                 + "' is not permitted: " + type);
 528:     }
 529: 
 530:     // used to test enumerated attributes and mixed content models
 531:     // package private
 532:     static boolean hasMatch (String value, String orList)
 533:     {
 534:         int len = value.length ();
 535:         int max = orList.length () - len;
 536: 
 537:         for (int start = 0;
 538:                 (start = orList.indexOf (value, start)) != -1;
 539:                 start++) {
 540:             char c;
 541: 
 542:             if (start > max)
 543:                 break;
 544:             c = orList.charAt (start - 1);
 545:             if (c != '|' && c != '('/*)*/)
 546:                 continue;
 547:             c = orList.charAt (start + len);
 548:             if (c != '|' && /*(*/ c != ')')
 549:                 continue;
 550:             return true;
 551:         }
 552:         return false;
 553:     }
 554: 
 555:     /**
 556:      * <b>LexicalHandler</b> Records the declaration of the root
 557:      * element, so it can be verified later.
 558:      * Passed to the next consumer, unless this one was
 559:      * preloaded with a particular DTD.
 560:      */
 561:     public void startDTD (String name, String publicId, String systemId)
 562:     throws SAXException
 563:     {
 564:         if (disableDeclarations)
 565:             return;
 566: 
 567:         rootName = name;
 568:         super.startDTD (name, publicId, systemId);
 569:     }
 570: 
 571:     /**
 572:      * <b>LexicalHandler</b> Verifies that all referenced notations
 573:      * and unparsed entities have been declared.
 574:      * Passed to the next consumer, unless this one was
 575:      * preloaded with a particular DTD.
 576:      */
 577:     public void endDTD ()
 578:     throws SAXException
 579:     {
 580:         if (disableDeclarations)
 581:             return;
 582: 
 583:         // this is a convenient hook for end-of-dtd checks, but we
 584:         // could also trigger it in the first startElement call.
 585:         // locator info is more appropriate here though.
 586: 
 587:         // VC: Notation Declared (NDATA can refer to them before decls,
 588:         //      as can NOTATION attribute enumerations and defaults)
 589:         int length = nDeferred.size ();
 590:         for (int i = 0; i < length; i++) {
 591:             String notation = (String) nDeferred.elementAt (i);
 592:             if (!notations.contains (notation)) {
 593:                 error ("A declaration referred to notation '" + notation
 594:                         + "' which was never declared");
 595:             }
 596:         }
 597:         nDeferred.removeAllElements ();
 598: 
 599:         // VC: Entity Name (attribute values can refer to them
 600:         //      before they're declared); VC Attribute Default Legal
 601:         length = uDeferred.size ();
 602:         for (int i = 0; i < length; i++) {
 603:             String entity = (String) uDeferred.elementAt (i);
 604:             if (!unparsed.contains (entity)) {
 605:                 error ("An attribute default referred to entity '" + entity
 606:                         + "' which was never declared");
 607:             }
 608:         }
 609:         uDeferred.removeAllElements ();
 610:         super.endDTD ();
 611:     }
 612: 
 613: 
    // These are interned, so we can rely on "==" to find the type of
    // all attributes except enumerations ...
    // "(this|or|that|...)" and "NOTATION (this|or|that|...)"
    //
    // attributeDecl() replaces a declared type name that equals one of
    // these entries with the entry itself, which is what makes the later
    // "==" comparisons against the same literals work.
    static final String types [] = {
        "CDATA",
        "ID", "IDREF", "IDREFS",
        "NMTOKEN", "NMTOKENS",
        "ENTITY", "ENTITIES"
    };
 623: 
 624: 
 625:     /**
 626:      * <b>DeclHandler</b> Records attribute declaration for later use
 627:      * in validating document content, and checks validity constraints
 628:      * that are applicable to attribute declarations.
 629:      * Passed to the next consumer, unless this one was
 630:      * preloaded with a particular DTD.
 631:      */
 632:     public void attributeDecl (
 633:         String eName,
 634:         String aName,
 635:         String type,
 636:         String mode,
 637:         String value
 638:     ) throws SAXException
 639:     {
 640:         if (disableDeclarations)
 641:             return;
 642: 
 643:         ElementInfo     info = (ElementInfo) elements.get (eName);
 644:         AttributeInfo   ainfo = new AttributeInfo ();
 645:         boolean         checkOne = false;
 646:         boolean         interned = false;
 647: 
 648:         // cheap interning of type names and #FIXED, #REQUIRED
 649:         // for faster startElement (we can use "==")
 650:         for (int i = 0; i < types.length; i++) {
 651:             if (types [i].equals (type)) {
 652:                 type = types [i];
 653:                 interned = true;
 654:                 break;
 655:             }
 656:         }
 657:         if ("#FIXED".equals (mode))
 658:             mode = "#FIXED";
 659:         else if ("#REQUIRED".equals (mode))
 660:             mode = "#REQUIRED";
 661: 
 662:         ainfo.type = type;
 663:         ainfo.mode = mode;
 664:         ainfo.value = value;
 665: 
 666:         // we might not have seen the content model yet
 667:         if (info == null) {
 668:             info = new ElementInfo (eName);
 669:             elements.put (eName, info);
 670:         }
 671:         if ("ID" == type) {
 672:             checkOne = true;
 673:             if (!("#REQUIRED" == mode || "#IMPLIED".equals (mode))) {
 674:                 // VC: ID Attribute Default
 675:                 error ("ID attribute '" + aName
 676:                     + "' must be #IMPLIED or #REQUIRED");
 677:             }
 678: 
 679:         } else if (!interned && type.startsWith ("NOTATION ")) {
 680:             checkOne = true;
 681: 
 682:             // VC: Notation Attributes (notations must be declared)
 683:             StringTokenizer     tokens = new StringTokenizer (
 684:                 type.substring (10, type.lastIndexOf (')')),
 685:                 "|");
 686:             while (tokens.hasMoreTokens ()) {
 687:                 String  token = tokens.nextToken ();
 688:                 if (!notations.contains (token))
 689:                     nDeferred.addElement (token);
 690:             }
 691:         }
 692:         if (checkOne) {
 693:             for (Enumeration e = info.attributes.keys ();
 694:                     e.hasMoreElements ();
 695:                     /* NOP */) {
 696:                 String          name;
 697:                 AttributeInfo   ainfo2;
 698: 
 699:                 name = (String) e.nextElement ();
 700:                 ainfo2 = (AttributeInfo) info.attributes.get (name);
 701:                 if (type == ainfo2.type || !interned /* NOTATION */) {
 702:                     // VC: One ID per Element Type
 703:                     // VC: One Notation per Element TYpe
 704:                     error ("Element '" + eName
 705:                         + "' already has an attribute of type "
 706:                         + (interned ? "NOTATION" : type)
 707:                         + " ('" + name
 708:                         + "') so '" + aName
 709:                         + "' is a validity error");
 710:                 }
 711:             }
 712:         }
 713: 
 714:         // VC: Attribute Default Legal
 715:         if (value != null) {
 716: 
 717:             if ("CDATA" == type) {
 718:                 // event source rejected '<'
 719: 
 720:             } else if ("NMTOKEN" == type) {
 721:                 // VC: Name Token (is a nmtoken)
 722:                 isNmtoken (value, "attribute default", aName);
 723: 
 724:             } else if ("NMTOKENS" == type) {
 725:                 // VC: Name Token (is a nmtoken; at least one value)
 726:                 StringTokenizer tokens = new StringTokenizer (value);
 727:                 if (!tokens.hasMoreTokens ())
 728:                     error ("Default for attribute '" + aName
 729:                         + "' must have at least one name token.");
 730:                 else do {
 731:                     String token = tokens.nextToken ();
 732:                     isNmtoken (token, "attribute default", aName);
 733:                 } while (tokens.hasMoreTokens ());
 734: 
 735:             } else if ("IDREF" == type || "ENTITY" == type) {
 736:                 // VC: Entity Name (is a name)
 737:                 // VC: IDREF (is a name) (is declared)
 738:                 isName (value, "attribute default", aName);
 739:                 if ("ENTITY" == type && !unparsed.contains (value))
 740:                     uDeferred.addElement (value);
 741: 
 742:             } else if ("IDREFS" == type || "ENTITIES" == type) {
 743:                 // VC: Entity Name (is a name; at least one value)
 744:                 // VC: IDREF (is a name; at least one value)
 745:                 StringTokenizer names = new StringTokenizer (value);
 746:                 if (!names.hasMoreTokens ())
 747:                     error ("Default for attribute '" + aName
 748:                         + "' must have at least one name.");
 749:                 else do {
 750:                     String name = names.nextToken ();
 751:                     isName (name, "attribute default", aName);
 752:                     if ("ENTITIES" == type && !unparsed.contains (name))
 753:                         uDeferred.addElement (value);
 754:                 } while (names.hasMoreTokens ());
 755: 
 756:             } else if (type.charAt (0) == '(' /*)*/ ) {
 757:                 // VC: Enumeration (must match)
 758:                 checkEnumeration (value, type, aName);
 759: 
 760:             } else if (!interned && checkOne) { /* NOTATION */
 761:                 // VC: Notation attributes (must be names)
 762:                 isName (value, "attribute default", aName);
 763: 
 764:                 // VC: Notation attributes (must be declared)
 765:                 if (!notations.contains (value))
 766:                     nDeferred.addElement (value);
 767: 
 768:                 // VC: Enumeration (must match)
 769:                 checkEnumeration (value, type, aName);
 770: 
 771:             } else if ("ID" != type)
 772:                 throw new RuntimeException ("illegal attribute type: " + type);
 773:         }
 774: 
 775:         if (info.attributes.get (aName) == null)
 776:             info.attributes.put (aName, ainfo);
 777:         /*
 778:         else
 779:             warning ("Element '" + eName
 780:                 + "' already has an attribute named '" + aName + "'");
 781:         */
 782: 
 783:         if ("xml:space".equals (aName)) {
 784:             if (!("(default|preserve)".equals (type)
 785:                     || "(preserve|default)".equals (type)
 786:                         // these next two are arguable; XHTML's DTD doesn't
 787:                         // deserve errors.  After all, it's not like any
 788:                         // illegal _value_ could pass ...
 789:                     || "(preserve)".equals (type)
 790:                     || "(default)".equals (type)
 791:                     ))
 792:                 error (
 793:                     "xml:space attribute type must be like '(default|preserve)'"
 794:                     + " not '" + type + "'"
 795:                     );
 796: 
 797:         }
 798:         super.attributeDecl (eName, aName, type, mode, value);
 799:     }
 800: 
 801:     /**
 802:      * <b>DecllHandler</b> Records the element declaration for later use
 803:      * when checking document content, and checks validity constraints that
 804:      * apply to element declarations.  Passed to the next consumer, unless
 805:      * this one was preloaded with a particular DTD.
 806:      */
 807:     public void elementDecl (String name, String model)
 808:     throws SAXException
 809:     {
 810:         if (disableDeclarations)
 811:             return;
 812: 
 813:         ElementInfo     info = (ElementInfo) elements.get (name);
 814: 
 815:         // we might have seen an attribute decl already
 816:         if (info == null) {
 817:             info = new ElementInfo (name);
 818:             elements.put (name, info);
 819:         }
 820:         if (info.model != null) {
 821:             // NOTE:  not all parsers can report such duplicates.
 822:             // VC: Unique Element Type Declaration
 823:             error ("Element type '" + name
 824:                 + "' was already declared.");
 825:         } else {
 826:             info.model = model;
 827: 
 828:             // VC: No Duplicate Types (in mixed content models)
 829:             if (model.charAt (1) == '#')        // (#PCDATA...
 830:                 info.getRecognizer (this);
 831:         }
 832:         super.elementDecl (name, model);
 833:     }
 834: 
 835:     /**
 836:      * <b>DecllHandler</b> passed to the next consumer, unless this
 837:      * one was preloaded with a particular DTD
 838:      */
 839:     public void internalEntityDecl (String name, String value)
 840:     throws SAXException
 841:     {
 842:         if (!disableDeclarations)
 843:             super.internalEntityDecl (name, value);
 844:     }
 845: 
 846:     /**
 847:      * <b>DecllHandler</b> passed to the next consumer, unless this
 848:      * one was preloaded with a particular DTD
 849:      */
 850:     public void externalEntityDecl (String name,
 851:         String publicId, String systemId)
 852:     throws SAXException
 853:     {
 854:         if (!disableDeclarations)
 855:             super.externalEntityDecl (name, publicId, systemId);
 856:     }
 857: 
 858: 
 859:     /**
 860:      * <b>DTDHandler</b> Records the notation name, for checking
 861:      * NOTATIONS attribute values and declararations of unparsed
 862:      * entities.  Passed to the next consumer, unless this one was
 863:      * preloaded with a particular DTD.
 864:      */
 865:     public void notationDecl (String name, String publicId, String systemId)
 866:     throws SAXException
 867:     {
 868:         if (disableDeclarations)
 869:             return;
 870: 
 871:         notations.addElement (name);
 872:         super.notationDecl (name, publicId, systemId);
 873:     }
 874: 
 875:     /**
 876:      * <b>DTDHandler</b> Records the entity name, for checking
 877:      * ENTITY and ENTITIES attribute values; records the notation
 878:      * name if it hasn't yet been declared.  Passed to the next consumer,
 879:      * unless this one was preloaded with a particular DTD.
 880:      */
 881:     public void unparsedEntityDecl (
 882:         String name,
 883:         String publicId,
 884:         String systemId,
 885:         String notationName
 886:     ) throws SAXException
 887:     {
 888:         if (disableDeclarations)
 889:             return;
 890: 
 891:         unparsed.addElement (name);
 892:         if (!notations.contains (notationName))
 893:             nDeferred.addElement (notationName);
 894:         super.unparsedEntityDecl (name, publicId, systemId, notationName);
 895:     }
 896: 
 897: 
 898:     /**
 899:      * <b>ContentHandler</b> Ensures that state from any previous parse
 900:      * has been deleted.
 901:      * Passed to the next consumer.
 902:      */
 903:     public void startDocument ()
 904:     throws SAXException
 905:     {
 906:         resetState ();
 907:         super.startDocument ();
 908:     }
 909: 
 910: 
 911:     private static boolean isAsciiLetter (char c)
 912:     {
 913:         return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
 914:     }
 915: 
 916: 
 917:     /**
 918:      * <b>ContentHandler</b> Reports a fatal exception.  Validating
 919:      * XML processors may not skip any entities.
 920:      */
 921:     public void skippedEntity (String name)
 922:     throws SAXException
 923:     {
 924:         fatalError ("may not skip entities");
 925:     }
 926: 
 927:     /*
 928:      * SAX2 doesn't expand non-PE refs in attribute defaults...
 929:      */
 930:     private String expandDefaultRefs (String s)
 931:     throws SAXException
 932:     {
 933:         if (s.indexOf ('&') < 0)
 934:             return s;
 935: 
 936: // FIXME: handle &#nn; &#xnn; &name;
 937:         String message = "Can't expand refs in attribute default: " + s;
 938:         warning (message);
 939: 
 940:         return s;
 941:     }
 942: 
 943:     /**
 944:      * <b>ContentHandler</b> Performs validity checks against element
 945:      * (and document) content models, and attribute values.
 946:      * Passed to the next consumer.
 947:      */
 948:     public void startElement (
 949:         String          uri,
 950:         String          localName,
 951:         String          qName,
 952:         Attributes      atts
 953:     ) throws SAXException
 954:     {
 955:         //
 956:         // First check content model for the enclosing scope.
 957:         //
 958:         if (contentStack.isEmpty ()) {
 959:             // VC:  Root Element Type
 960:             if (!qName.equals (rootName)) {
 961:                 if (rootName == null)
 962:                     warning ("This document has no DTD, can't be valid");
 963:                 else
 964:                     error ("Root element type '" + qName
 965:                         + "' was declared to be '" + rootName + "'");
 966:             }
 967:         } else {
 968:             Recognizer state = (Recognizer) contentStack.peek ();
 969: 
 970:             if (state != null) {
 971:                 Recognizer newstate = state.acceptElement (qName);
 972: 
 973:                 if (newstate == null)
 974:                     error ("Element type '" + qName
 975:                         + "' in element '" + state.type.name
 976:                         + "' violates content model " + state.type.model
 977:                         );
 978:                 if (newstate != state) {
 979:                     contentStack.pop ();
 980:                     contentStack.push (newstate);
 981:                 }
 982:             }
 983:         }
 984: 
 985:         //
 986:         // Then check that this element was declared, and push the
 987:         // object used to validate its content model onto our stack.
 988:         //
 989:         // This is where the recognizer gets created, if needed; if
 990:         // it's a "children" (elements) content model, an NDFA is
 991:         // created.  (One recognizer is used per content type, no
 992:         // matter how complex that recognizer is.)
 993:         //
 994:         ElementInfo             info;
 995: 
 996:         info = (ElementInfo) elements.get (qName);
 997:         if (info == null || info.model == null) {
 998:             // VC: Element Valid (base clause)
 999:             error ("Element type '" + qName + "' was not declared");
1000:             contentStack.push (null);
1001: 
1002:             // for less diagnostic noise, fake a declaration.
1003:             elementDecl (qName, "ANY");
1004:         } else
1005:             contentStack.push (info.getRecognizer (this));
1006: 
1007:         //
1008:         // Then check each attribute present
1009:         //
1010:         int                     len;
1011:         String                  aname;
1012:         AttributeInfo           ainfo;
1013: 
1014:         if (atts != null)
1015:             len = atts.getLength ();
1016:         else
1017:             len = 0;
1018: 
1019:         for (int i = 0; i < len; i++) {
1020:             aname = atts.getQName (i);
1021: 
1022:             if (info == null
1023:                     || (ainfo = (AttributeInfo) info.attributes.get (aname))
1024:                             == null) {
1025:                 // VC: Attribute Value Type
1026:                 error ("Attribute '" + aname
1027:                     + "' was not declared for element type " + qName);
1028:                 continue;
1029:             }
1030: 
1031:             String value = atts.getValue (i);
1032: 
1033:             // note that "==" for type names and "#FIXED" is correct
1034:             // (and fast) since we've interned those literals.
1035: 
1036:             if ("#FIXED" == ainfo.mode) {
1037:                 String expanded = expandDefaultRefs (ainfo.value);
1038: 
1039:                 // VC: Fixed Attribute Default
1040:                 if (!value.equals (expanded)) {
1041:                     error ("Attribute '" + aname
1042:                         + "' must match " + expanded
1043:                         );
1044:                     continue;
1045:                 }
1046:             }
1047: 
1048:             if ("CDATA" == ainfo.type)
1049:                 continue;
1050: 
1051:             //
1052:             // For all other attribute types, there are various
1053:             // rules to follow.
1054:             //
1055: 
1056:             if ("ID" == ainfo.type) {
1057:                 // VC: ID (must be a name)
1058:                 if (isName (value, "ID attribute", aname)) {
1059:                     if (Boolean.TRUE == ids.get (value))
1060:                         // VC: ID (appears once)
1061:                         error ("ID attribute " + aname
1062:                             + " uses an ID value '" + value
1063:                             + "' which was already declared.");
1064:                     else
1065:                         // any forward refs are no longer problems
1066:                         ids.put (value, Boolean.TRUE);
1067:                 }
1068:                 continue;
1069:             }
1070: 
1071:             if ("IDREF" == ainfo.type) {
1072:                 // VC: IDREF (value must be a name)
1073:                 if (isName (value, "IDREF attribute", aname)) {
1074:                     // VC: IDREF (must match some ID attribute)
1075:                     if (ids.get (value) == null)
1076:                         // new -- assume it's a forward ref
1077:                         ids.put (value, Boolean.FALSE);
1078:                 }
1079:                 continue;
1080:             }
1081: 
1082:             if ("IDREFS" == ainfo.type) {
1083:                 StringTokenizer tokens = new StringTokenizer (value, " ");
1084: 
1085:                 if (!tokens.hasMoreTokens ()) {
1086:                     // VC: IDREF (one or more values)
1087:                     error ("IDREFS attribute " + aname
1088:                         + " must have at least one ID ref");
1089:                 } else do {
1090:                     String id = tokens.nextToken ();
1091: 
1092:                     // VC: IDREF (value must be a name)
1093:                     if (isName (id, "IDREFS attribute", aname)) {
1094:                         // VC: IDREF (must match some ID attribute)
1095:                         if (ids.get (id) == null)
1096:                             // new -- assume it's a forward ref
1097:                             ids.put (id, Boolean.FALSE);
1098:                     }
1099:                 } while (tokens.hasMoreTokens ());
1100:                 continue;
1101:             }
1102: 
1103:             if ("NMTOKEN" == ainfo.type) {
1104:                 // VC: Name Token (is a name token)
1105:                 isNmtoken (value, "NMTOKEN attribute", aname);
1106:                 continue;
1107:             }
1108: 
1109:             if ("NMTOKENS" == ainfo.type) {
1110:                 StringTokenizer tokens = new StringTokenizer (value, " ");
1111: 
1112:                 if (!tokens.hasMoreTokens ()) {
1113:                     // VC: Name Token (one or more values)
1114:                     error ("NMTOKENS attribute " + aname
1115:                         + " must have at least one name token");
1116:                 } else do {
1117:                     String token = tokens.nextToken ();
1118: 
1119:                     // VC: Name Token (is a name token)
1120:                     isNmtoken (token, "NMTOKENS attribute", aname);
1121:                 } while (tokens.hasMoreTokens ());
1122:                 continue;
1123:             }
1124: 
1125:             if ("ENTITY" == ainfo.type) {
1126:                 if (!unparsed.contains (value))
1127:                     // VC: Entity Name
1128:                     error ("Value of attribute '" + aname
1129:                         + "' refers to unparsed entity '" + value
1130:                         + "' which was not declared.");
1131:                 continue;
1132:             }
1133: 
1134:             if ("ENTITIES" == ainfo.type) {
1135:                 StringTokenizer tokens = new StringTokenizer (value, " ");
1136: 
1137:                 if (!tokens.hasMoreTokens ()) {
1138:                     // VC: Entity Name (one or more values)
1139:                     error ("ENTITIES attribute " + aname
1140:                         + " must have at least one name token");
1141:                 } else do {
1142:                     String entity = tokens.nextToken ();
1143: 
1144:                     if (!unparsed.contains (entity))
1145:                         // VC: Entity Name
1146:                         error ("Value of attribute '" + aname
1147:                             + "' refers to unparsed entity '" + entity
1148:                             + "' which was not declared.");
1149:                 } while (tokens.hasMoreTokens ());
1150:                 continue;
1151:             }
1152: 
1153:             //
1154:             // check for enumerations last; more expensive
1155:             //
1156:             if (ainfo.type.charAt (0) == '(' /*)*/
1157:                     || ainfo.type.startsWith ("NOTATION ")
1158:                     ) {
1159:                 // VC: Enumeration (value must be defined)
1160:                 checkEnumeration (value, ainfo.type, aname);
1161:                 continue;
1162:             }
1163:         }
1164: 
1165:         //
1166:         // Last, check that all #REQUIRED attributes were provided
1167:         //
1168:         if (info != null) {
1169:             Hashtable   table = info.attributes;
1170: 
1171:             if (table.size () != 0) {
1172:                 Enumeration     e = table.keys ();
1173: 
1174:                 // XXX table.keys uses the heap, bleech -- slows things
1175: 
1176:                 while (e.hasMoreElements ()) {
1177:                     aname = (String) e.nextElement ();
1178:                     ainfo = (AttributeInfo) table.get (aname);
1179: 
1180:                     // "#REQUIRED" mode was interned in attributeDecl
1181:                     if ("#REQUIRED" == ainfo.mode
1182:                             && atts.getValue (aname) == null) {
1183:                         // VC: Required Attribute
1184:                         error ("Attribute '" + aname + "' must be specified "
1185:                             + "for element type " + qName);
1186:                     }
1187:                 }
1188:             }
1189:         }
1190:         super.startElement (uri, localName, qName, atts);
1191:     }
1192: 
1193:     /**
1194:      * <b>ContentHandler</b> Reports a validity error if the element's content
1195:      * model does not permit character data.
1196:      * Passed to the next consumer.
1197:      */
1198:     public void characters (char ch [], int start, int length)
1199:     throws SAXException
1200:     {
1201:         Recognizer state;
1202: 
1203:         if (contentStack.empty ())
1204:             state = null;
1205:         else
1206:             state = (Recognizer) contentStack.peek ();
1207: 
1208:         // NOTE:  if this ever supports with SAX parsers that don't
1209:         // report ignorable whitespace as such (only XP?), this class
1210:         // needs to morph it into ignorableWhitespace() as needed ...
1211: 
1212:         if (state != null && !state.acceptCharacters ())
1213:             // VC: Element Valid (clauses three, four -- see recognizer)
1214:             error ("Character content not allowed in element "
1215:                 + state.type.name);
1216: 
1217:         super.characters (ch, start, length);
1218:     }
1219: 
1220: 
1221:     /**
1222:      * <b>ContentHandler</b> Reports a validity error if the element's content
1223:      * model does not permit end-of-element yet, or a well formedness error
1224:      * if there was no matching startElement call.
1225:      * Passed to the next consumer.
1226:      */
1227:     public void endElement (String uri, String localName, String qName)
1228:     throws SAXException
1229:     {
1230:         try {
1231:             Recognizer state = (Recognizer) contentStack.pop ();
1232: 
1233:             if (state != null && !state.completed ())
1234:                 // VC: Element valid (clauses two, three, four; see Recognizer)
1235:                 error ("Premature end for element '"
1236:                     + state.type.name
1237:                     + "', content model "
1238:                     + state.type.model);
1239: 
1240:             // could insist on match of start element, but that's
1241:             // something the input stream must to guarantee.
1242: 
1243:         } catch (EmptyStackException e) {
1244:             fatalError ("endElement without startElement: " + qName
1245:                 + ((uri == null)
1246:                     ? ""
1247:                     : ( " { '" + uri + "', " + localName + " }")));
1248:         }
1249:         super.endElement (uri, localName, qName);
1250:     }
1251: 
1252:     /**
1253:      * <b>ContentHandler</b> Checks whether all ID values that were
1254:      * referenced have been declared, and releases all resources.
1255:      * Passed to the next consumer.
1256:      *
1257:      * @see #setDocumentLocator
1258:      */
1259:     public void endDocument ()
1260:     throws SAXException
1261:     {
1262:         for (Enumeration idNames = ids.keys ();
1263:                 idNames.hasMoreElements ();
1264:                 /* NOP */) {
1265:             String id = (String) idNames.nextElement ();
1266: 
1267:             if (Boolean.FALSE == ids.get (id)) {
1268:                 // VC: IDREF (must match ID)
1269:                 error ("Undeclared ID value '" + id
1270:                     + "' was referred to by an IDREF/IDREFS attribute");
1271:             }
1272:         }
1273: 
1274:         resetState ();
1275:         super.endDocument ();
1276:     }
1277: 
1278: 
1279:     /** Holds per-element declarations */
1280:     static private final class ElementInfo
1281:     {
1282:         String                  name;
1283:         String                  model;
1284: 
1285:         // key = attribute name; value = AttributeInfo
1286:         Hashtable               attributes = new Hashtable (11);
1287: 
1288:         ElementInfo (String n) { name = n; }
1289: 
1290:         private Recognizer      recognizer;
1291: 
1292:         // for validating content models:  one per type, shared,
1293:         // and constructed only on demand ... so unused elements do
1294:         // not need to consume resources.
1295:         Recognizer      getRecognizer (ValidationConsumer consumer)
1296:         throws SAXException
1297:         {
1298:             if (recognizer == null) {
1299:                 if ("ANY".equals (model))
1300:                     recognizer = ANY;
1301:                 else if ("EMPTY".equals (model))
1302:                     recognizer = new EmptyRecognizer (this);
1303:                 else if ('#' == model.charAt (1))
1304:                     // n.b. this constructor does a validity check
1305:                     recognizer = new MixedRecognizer (this, consumer);
1306:                 else
1307:                     recognizer = new ChildrenRecognizer (this, consumer);
1308:             }
1309:             return recognizer;
1310:         }
1311:     }
1312: 
1313:     /** Holds per-attribute declarations */
1314:     static private final class AttributeInfo
1315:     {
1316:         String  type;
1317:         String  mode;           // #REQUIRED, etc (or null)
1318:         String  value;          // or null
1319:     }
1320: 
1321: 
1322:     //
1323:     // Content model validation
1324:     //
1325: 
1326:     static private final Recognizer     ANY = new Recognizer (null);
1327: 
1328: 
1329:     // Base class defines the calls used to validate content,
1330:     // and supports the "ANY" content model
1331:     static private class Recognizer
1332:     {
1333:         final ElementInfo       type;
1334: 
1335:         Recognizer (ElementInfo t) { type = t; }
1336: 
1337:         // return true iff character data is legal here
1338:         boolean acceptCharacters ()
1339:         throws SAXException
1340:             // VC: Element Valid (third and fourth clauses)
1341:             { return true; }
1342: 
1343:         // null return = failure
1344:         // otherwise, next state (like an FSM)
1345:         // prerequisite: tested that name was declared
1346:         Recognizer acceptElement (String name)
1347:         throws SAXException
1348:             // VC: Element Valid (fourth clause)
1349:             { return this; }
1350: 
1351:         // return true iff model is completed, can finish
1352:         boolean completed ()
1353:         throws SAXException
1354:             // VC: Element Valid (fourth clause)
1355:             { return true; }
1356: 
1357:         public String toString ()
1358:             // n.b. "children" is the interesting case!
1359:             { return (type == null) ? "ANY" : type.model; }
1360:     }
1361: 
1362:     // "EMPTY" content model -- no characters or elements
1363:     private static final class EmptyRecognizer extends Recognizer
1364:     {
1365:         public EmptyRecognizer (ElementInfo type)
1366:             { super (type); }
1367: 
1368:         // VC: Element Valid (first clause)
1369:         boolean acceptCharacters ()
1370:             { return false; }
1371: 
1372:         // VC: Element Valid (first clause)
1373:         Recognizer acceptElement (String name)
1374:             { return null; }
1375:     }
1376: 
1377:     // "Mixed" content model -- ANY, but restricts elements
1378:     private static final class MixedRecognizer extends Recognizer
1379:     {
1380:         private String  permitted [];
1381: 
1382:         // N.B. constructor tests for duplicated element names (VC)
1383:         public MixedRecognizer (ElementInfo t, ValidationConsumer v)
1384:         throws SAXException
1385:         {
1386:             super (t);
1387: 
1388:             // (#PCDATA...)* or (#PCDATA) ==> ... or empty
1389:             // with the "..." being "|elname|..."
1390:             StringTokenizer     tokens = new StringTokenizer (
1391:                 t.model.substring (8, t.model.lastIndexOf (')')),
1392:                 "|");
1393:             Vector              vec = new Vector ();
1394: 
1395:             while (tokens.hasMoreTokens ()) {
1396:                 String token = tokens.nextToken ();
1397: 
1398:                 if (vec.contains (token))
1399:                     v.error ("element " + token
1400:                         + " is repeated in mixed content model: "
1401:                         + t.model);
1402:                 else
1403:                     vec.addElement (token.intern ());
1404:             }
1405:             permitted = new String [vec.size ()];
1406:             for (int i = 0; i < permitted.length; i++)
1407:                 permitted [i] = (String) vec.elementAt (i);
1408: 
1409:             // in one large machine-derived DTD sample, most of about
1410:             // 250 mixed content models were empty, and 25 had ten or
1411:             // more entries.  2 had over a hundred elements.  Linear
1412:             // search isn't obviously wrong.
1413:         }
1414: 
1415:         // VC: Element Valid (third clause)
1416:         Recognizer acceptElement (String name)
1417:         {
1418:             int         length = permitted.length;
1419: 
1420:             // first pass -- optimistic w.r.t. event source interning
1421:             // (and document validity)
1422:             for (int i = 0; i < length; i++)
1423:                 if (permitted [i] == name)
1424:                     return this;
1425:             // second pass -- pessimistic w.r.t. event source interning
1426:             for (int i = 0; i < length; i++)
1427:                 if (permitted [i].equals (name))
1428:                     return this;
1429:             return null;
1430:         }
1431:     }
1432: 
1433: 
1434:     // recognizer loop flags, see later
1435:     private static final int            F_LOOPHEAD = 0x01;
1436:     private static final int            F_LOOPNEXT = 0x02;
1437: 
1438:     // for debugging -- used to label/count nodes in toString()
1439:     private static int                  nodeCount;
1440: 
1441:     /**
1442:      * "Children" content model -- these are nodes in NDFA state graphs.
1443:      * They work in fixed space.  Note that these graphs commonly have
1444:      * cycles, handling features such as zero-or-more and one-or-more.
1445:      *
1446:      * <p>It's readonly, so only one copy is ever needed.  The content model
1447:      * stack may have any number of pointers into each graph, when a model
1448:      * happens to be needed more than once due to element nesting.  Since
1449:      * traversing the graph just moves to another node, and never changes
1450:      * it, traversals never interfere with each other.
1451:      *
1452:      * <p>There is an option to report non-deterministic models.  These are
1453:      * always XML errors, but ones which are not often reported despite the
1454:      * fact that they can lead to different validating parsers giving
1455:      * different results for the same input.  (The XML spec doesn't require
1456:      * them to be reported.)
1457:      *
1458:      * <p><b>FIXME</b> There's currently at least one known bug here, in that
1459:      * it's not actually detecting the non-determinism it tries to detect.
1460:      * (Of the "optional.xml" test, the once-or-twice-2* tests are all non-D;
1461:      * maybe some others.)  This may relate to the issue flagged below as
1462:      * "should not" happen (but it was), which showed up when patching the
1463:      * graph to have one exit node (or more EMPTY nodes).
1464:      */
1465:     private static final class ChildrenRecognizer extends Recognizer
1466:         implements Cloneable
1467:     {
1468:         // for reporting non-deterministic content models
1469:         // ... a waste of space if we're not reporting those!
1470:         // ... along with the 'model' member (in base class)
1471:         private ValidationConsumer      consumer;
1472: 
1473:         // for CHOICE nodes -- each component is an arc that
1474:         // accepts a different NAME (or is EMPTY indicating
1475:         // NDFA termination).
1476:         private Recognizer              components [];
1477: 
1478:         // for NAME/SEQUENCE nodes -- accepts that NAME and
1479:         // then goes to the next node (CHOICE, NAME, EMPTY).
1480:         private String                  name;
1481:         private Recognizer              next;
1482: 
1483:         // loops always point back to a CHOICE node. we mark such choice
1484:         // nodes (F_LOOPHEAD) for diagnostics and faster deep cloning.
1485:         // We also mark nodes before back pointers (F_LOOPNEXT), to ensure
1486:         // termination when we patch sequences and loops.
1487:         private int                     flags;
1488: 
1489: 
1490:         // prevent a needless indirection between 'this' and 'node'
1491:         private void copyIn (ChildrenRecognizer node)
1492:         {
1493:             // model & consumer are already set
1494:             components = node.components;
1495:             name = node.name;
1496:             next = node.next;
1497:             flags = node.flags;
1498:         }
1499: 
1500:         // used to construct top level "children" content models,
1501:         public ChildrenRecognizer (ElementInfo type, ValidationConsumer vc)
1502:         {
1503:             this (vc, type);
1504:             populate (type.model.toCharArray (), 0);
1505:             patchNext (new EmptyRecognizer (type), null);
1506:         }
1507: 
1508:         // used internally; populating is separate
1509:         private ChildrenRecognizer (ValidationConsumer vc, ElementInfo type)
1510:         {
1511:             super (type);
1512:             consumer = vc;
1513:         }
1514: 
1515: 
1516:         //
1517:         // When rewriting some graph nodes we need deep clones in one case;
1518:         // mostly shallow clones (what the JVM handles for us) are fine.
1519:         //
1520:         private ChildrenRecognizer shallowClone ()
1521:         {
1522:             try {
1523:                 return (ChildrenRecognizer) clone ();
1524:             } catch (CloneNotSupportedException e) {
1525:                 throw new Error ("clone");
1526:             }
1527:         }
1528: 
1529:         private ChildrenRecognizer deepClone ()
1530:         {
1531:             return deepClone (new Hashtable (37));
1532:         }
1533: 
1534:         private ChildrenRecognizer deepClone (Hashtable table)
1535:         {
1536:             ChildrenRecognizer retval;
1537: 
1538:             if ((flags & F_LOOPHEAD) != 0) {
1539:                 retval = (ChildrenRecognizer) table.get (this);
1540:                 if (retval != null)
1541:                     return this;
1542: 
1543:                 retval = shallowClone ();
1544:                 table.put (this, retval);
1545:             } else
1546:                 retval = shallowClone ();
1547: 
1548:             if (next != null) {
1549:                 if (next instanceof ChildrenRecognizer)
1550:                     retval.next = ((ChildrenRecognizer)next)
1551:                             .deepClone (table);
1552:                 else if (!(next instanceof EmptyRecognizer))
1553:                     throw new RuntimeException ("deepClone");
1554:             }
1555: 
1556:             if (components != null) {
1557:                 retval.components = new Recognizer [components.length];
1558:                 for (int i = 0; i < components.length; i++) {
1559:                     Recognizer temp = components [i];
1560: 
1561:                     if (temp == null)
1562:                         retval.components [i] = null;
1563:                     else if (temp instanceof ChildrenRecognizer)
1564:                         retval.components [i] = ((ChildrenRecognizer)temp)
1565:                                 .deepClone (table);
1566:                     else if (!(temp instanceof EmptyRecognizer))
1567:                         throw new RuntimeException ("deepClone");
1568:                 }
1569:             }
1570: 
1571:             return retval;
1572:         }
1573: 
1574:         // connect subgraphs, first to next (sequencing)
1575:         private void patchNext (Recognizer theNext, Hashtable table)
1576:         {
1577:             // backpointers must not be repatched or followed
1578:             if ((flags & F_LOOPNEXT) != 0)
1579:                 return;
1580: 
1581:             // XXX this table "shouldn't" be needed, right?
1582:             // but some choice nodes looped if it isn't there.
1583:             if (table != null && table.get (this) != null)
1584:                 return;
1585:             if (table == null)
1586:                 table = new Hashtable ();
1587: 
1588:             // NAME/SEQUENCE
1589:             if (name != null) {
1590:                 if (next == null)
1591:                     next = theNext;
1592:                 else if (next instanceof ChildrenRecognizer) {
1593:                     ((ChildrenRecognizer)next).patchNext (theNext, table);
1594:                 } else if (!(next instanceof EmptyRecognizer))
1595:                     throw new RuntimeException ("patchNext");
1596:                 return;
1597:             }
1598: 
1599:             // CHOICE
1600:             for (int i = 0; i < components.length; i++) {
1601:                 if (components [i] == null)
1602:                     components [i] = theNext;
1603:                 else if (components [i] instanceof ChildrenRecognizer) {
1604:                     ((ChildrenRecognizer)components [i])
1605:                             .patchNext (theNext, table);
1606:                 } else if (!(components [i] instanceof EmptyRecognizer))
1607:                     throw new RuntimeException ("patchNext");
1608:             }
1609: 
1610:             if (table != null && (flags & F_LOOPHEAD) != 0)
1611:                 table.put (this, this);
1612:         }
1613: 
1614:         /**
1615:          * Parses a 'children' spec (or recursively 'cp') and makes this
1616:          * become a regular graph node.
1617:          *
1618:          * @return index after this particle
1619:          */
1620:         private int populate (char parseBuf [], int startPos)
1621:         {
1622:             int         nextPos = startPos + 1;
1623:             char        c;
1624: 
1625:             if (nextPos < 0 || nextPos >= parseBuf.length)
1626:                 throw new IndexOutOfBoundsException ();
1627: 
1628:             // Grammar of the string is from the XML spec, but
1629:             // with whitespace removed by the SAX parser.
1630: 
1631:             // children ::= (choice | seq) ('?' | '*' | '+')?
1632:             // cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1633:             // choice ::= '(' cp ('|' choice)* ')'
1634:             // seq ::= '(' cp (',' choice)* ')'
1635: 
1636:             // interior nodes only
1637:             //   cp ::= name ...
1638:             if (parseBuf [startPos] != '('/*)*/) {
1639:                 boolean         done = false;
1640:                 do {
1641:                     switch (c = parseBuf [nextPos]) {
1642:                         case '?': case '*': case '+':
1643:                         case '|': case ',':
1644:                         case /*(*/ ')':
1645:                             done = true;
1646:                             continue;
1647:                         default:
1648:                             nextPos++;
1649:                             continue;
1650:                     }
1651:                 } while (!done);
1652:                 name = new String (parseBuf, startPos, nextPos - startPos);
1653: 
1654:             // interior OR toplevel nodes
1655:             //   cp ::= choice ..
1656:             //   cp ::= seq ..
1657:             } else {
1658:                 // collect everything as a separate list, and merge it
1659:                 // into "this" later if we can (SEQUENCE or singleton)
1660:                 ChildrenRecognizer      first;
1661: 
1662:                 first = new ChildrenRecognizer (consumer, type);
1663:                 nextPos = first.populate (parseBuf, nextPos);
1664:                 c = parseBuf [nextPos++];
1665: 
1666:                 if (c == ',' || c == '|') {
1667:                     ChildrenRecognizer  current = first;
1668:                     char                separator = c;
1669:                     Vector              v = null;
1670: 
1671:                     if (separator == '|') {
1672:                         v = new Vector ();
1673:                         v.addElement (first);
1674:                     }
1675: 
1676:                     do {
1677:                         ChildrenRecognizer link;
1678: 
1679:                         link = new ChildrenRecognizer (consumer, type);
1680:                         nextPos = link.populate (parseBuf, nextPos);
1681: 
1682:                         if (separator == ',') {
1683:                             current.patchNext (link, null);
1684:                             current = link;
1685:                         } else
1686:                             v.addElement (link);
1687: 
1688:                         c = parseBuf [nextPos++];
1689:                     } while (c == separator);
1690: 
1691:                     // choice ... collect everything into one array.
1692:                     if (separator == '|') {
1693:                         // assert v.size() > 1
1694:                         components = new Recognizer [v.size ()];
1695:                         for (int i = 0; i < components.length; i++) {
1696:                             components [i] = (Recognizer)
1697:                                     v.elementAt (i);
1698:                         }
1699:                         // assert flags == 0
1700: 
1701:                     // sequence ... merge into "this" to be smaller.
1702:                     } else
1703:                         copyIn (first);
1704: 
1705:                 // treat singletons like one-node sequences.
1706:                 } else
1707:                     copyIn (first);
1708: 
1709:                 if (c != /*(*/ ')')
1710:                     throw new RuntimeException ("corrupt content model");
1711:             }
1712: 
1713:             //
1714:             // Arity is optional, and the root of all fun.  We keep the
1715:             // FSM state graph simple by only having NAME/SEQUENCE and
1716:             // CHOICE nodes (or EMPTY to terminate a model), easily
1717:             // evaluated.  So we rewrite each node that has arity, using
1718:             // those primitives.  We create loops here, if needed.
1719:             //
1720:             if (nextPos < parseBuf.length) {
1721:                 c = parseBuf [nextPos];
1722:                 if (c == '?' || c == '*' || c == '+') {
1723:                     nextPos++;
1724: 
1725:                     // Rewrite 'zero-or-one' "?" arity to a CHOICE:
1726:                     //   - SEQUENCE (clone, what's next)
1727:                     //   - or, what's next
1728:                     // Size cost: N --> N + 1
1729:                     if (c == '?') {
1730:                         Recognizer              once = shallowClone ();
1731: 
1732:                         components = new Recognizer [2];
1733:                         components [0] = once;
1734:                         // components [1] initted to null
1735:                         name = null;
1736:                         next = null;
1737:                         flags = 0;
1738: 
1739: 
1740:                     // Rewrite 'zero-or-more' "*" arity to a CHOICE.
1741:                     //   - LOOP (clone, back to this CHOICE)
1742:                     //   - or, what's next
1743:                     // Size cost: N --> N + 1
1744:                     } else if (c == '*') {
1745:                         ChildrenRecognizer      loop = shallowClone ();
1746: 
1747:                         loop.patchNext (this, null);
1748:                         loop.flags |= F_LOOPNEXT;
1749:                         flags = F_LOOPHEAD;
1750: 
1751:                         components = new Recognizer [2];
1752:                         components [0] = loop;
1753:                         // components [1] initted to null
1754:                         name = null;
1755:                         next = null;
1756: 
1757: 
1758:                     // Rewrite 'one-or-more' "+" arity to a SEQUENCE.
1759:                     // Basically (a)+ --> ((a),(a)*).
1760:                     //   - this
1761:                     //   - CHOICE
1762:                     //      * LOOP (clone, back to the CHOICE)
1763:                     //      * or, whatever's next
1764:                     // Size cost: N --> 2N + 1
1765:                     } else if (c == '+') {
1766:                         ChildrenRecognizer loop = deepClone ();
1767:                         ChildrenRecognizer choice;
1768: 
1769:                         choice = new ChildrenRecognizer (consumer, type);
1770:                         loop.patchNext (choice, null);
1771:                         loop.flags |= F_LOOPNEXT;
1772:                         choice.flags = F_LOOPHEAD;
1773: 
1774:                         choice.components = new Recognizer [2];
1775:                         choice.components [0] = loop;
1776:                         // choice.components [1] initted to null
1777:                         // choice.name, choice.next initted to null
1778: 
1779:                         patchNext (choice, null);
1780:                     }
1781:                 }
1782:             }
1783: 
1784:             return nextPos;
1785:         }
1786: 
1787:         // VC: Element Valid (second clause)
1788:         boolean acceptCharacters ()
1789:             { return false; }
1790: 
1791:         // VC: Element Valid (second clause)
1792:         Recognizer acceptElement (String type)
1793:         throws SAXException
1794:         {
1795:             // NAME/SEQUENCE
1796:             if (name != null) {
1797:                 if (name.equals (type))
1798:                     return next;
1799:                 return null;
1800:             }
1801: 
1802:             // CHOICE ... optionally reporting nondeterminism we
1803:             // run across.  we won't check out every transition
1804:             // for nondeterminism; only the ones we follow.
1805:             Recognizer  retval = null;
1806: 
1807:             for (int i = 0; i < components.length; i++) {
1808:                 Recognizer temp = components [i].acceptElement (type);
1809: 
1810:                 if (temp == null)
1811:                     continue;
1812:                 else if (!warnNonDeterministic)
1813:                     return temp;
1814:                 else if (retval == null)
1815:                     retval = temp;
1816:                 else if (retval != temp)
1817:                     consumer.error ("Content model " + this.type.model
1818:                         + " is non-deterministic for " + type);
1819:             }
1820:             return retval;
1821:         }
1822: 
1823:         // VC: Element Valid (second clause)
1824:         boolean completed ()
1825:         throws SAXException
1826:         {
1827:             // expecting a specific element
1828:             if (name != null)
1829:                 return false;
1830: 
1831:             // choice, some sequences
1832:             for (int i = 0; i < components.length; i++) {
1833:                 if (components [i].completed ())
1834:                     return true;
1835:             }
1836: 
1837:             return false;
1838:         }
1839: 
1840: /** /
1841:         // FOR DEBUGGING ... flattens the graph for printing.
1842: 
1843:         public String toString ()
1844:         {
1845:             StringBuffer buf = new StringBuffer ();
1846: 
1847:             // only one set of loop labels can be generated
1848:             // at a time...
1849:             synchronized (ANY) {
1850:                 nodeCount = 0;
1851: 
1852:                 toString (buf, new Hashtable ());
1853:                 return buf.toString ();
1854:             }
1855:         }
1856: 
1857:         private void toString (StringBuffer buf, Hashtable table)
1858:         {
1859:             // When we visit a node, label and count it.
1860:             // Nodes are never visited/counted more than once.
1861:             // For small models labels waste space, but if arity
1862:             // mappings were used the savings are substantial.
1863:             // (Plus, the output can be more readily understood.)
1864:             String temp = (String) table.get (this);
1865: 
1866:             if (temp != null) {
1867:                 buf.append ('{');
1868:                 buf.append (temp);
1869:                 buf.append ('}');
1870:                 return;
1871:             } else {
1872:                 StringBuffer scratch = new StringBuffer (15);
1873: 
1874:                 if ((flags & F_LOOPHEAD) != 0)
1875:                     scratch.append ("loop");
1876:                 else
1877:                     scratch.append ("node");
1878:                 scratch.append ('-');
1879:                 scratch.append (++nodeCount);
1880:                 temp = scratch.toString ();
1881: 
1882:                 table.put (this, temp);
1883:                 buf.append ('[');
1884:                 buf.append (temp);
1885:                 buf.append (']');
1886:                 buf.append (':');
1887:             }
1888: 
1889:             // NAME/SEQUENCE
1890:             if (name != null) {
1891:                 // n.b. some output encodings turn some name chars into '?'
1892:                 // e.g. with Japanese names and ASCII output
1893:                 buf.append (name);
1894:                 if (components != null)         // bug!
1895:                     buf.append ('$');
1896:                 if (next == null)
1897:                     buf.append (",*");
1898:                 else if (next instanceof EmptyRecognizer) // patch-to-next
1899:                     buf.append (",{}");
1900:                 else if (next instanceof ChildrenRecognizer) {
1901:                     buf.append (',');
1902:                     ((ChildrenRecognizer)next).toString (buf, table);
1903:                 } else                          // bug!
1904:                     buf.append (",+");
1905:                 return;
1906:             }
1907: 
1908:             // CHOICE
1909:             buf.append ("<");
1910:             for (int i = 0; i < components.length; i++) {
1911:                 if (i != 0)
1912:                     buf.append ("|");
1913:                 if (components [i] instanceof EmptyRecognizer) {
1914:                     buf.append ("{}");
1915:                 } else if (components [i] == null) {    // patch-to-next
1916:                     buf.append ('*');
1917:                 } else {
1918:                     ChildrenRecognizer r;
1919: 
1920:                     r = (ChildrenRecognizer) components [i];
1921:                     r.toString (buf, table);
1922:                 }
1923:             }
1924:             buf.append (">");
1925:         }
1926: /**/
1927:     }
1928: }
Overview Package Class Use Source Tree Index Deprecated About
		Frames \| No Frames