Source for gnu.xml.pipeline.XIncludeFilter

   1: /* XIncludeFilter.java --
   2:    Copyright (C) 2001,2002 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: package gnu.xml.pipeline;
  39: 
  40: import java.io.IOException;
  41: import java.io.InputStream;
  42: import java.io.InputStreamReader;
  43: import java.net.URL;
  44: import java.net.URLConnection;
  45: import java.util.Hashtable;
  46: import java.util.Stack;
  47: import java.util.Vector;
  48: 
  49: import org.xml.sax.Attributes;
  50: import org.xml.sax.ErrorHandler;
  51: import org.xml.sax.InputSource;
  52: import org.xml.sax.Locator;
  53: import org.xml.sax.SAXException;
  54: import org.xml.sax.SAXParseException;
  55: import org.xml.sax.XMLReader;
  56: import org.xml.sax.helpers.XMLReaderFactory;
  57: 
  58: import gnu.xml.util.Resolver;
  59: 
  60: 
  61: 
  62: /**
  63:  * Filter to process an XPointer-free subset of
  64:  * <a href="http://www.w3.org/TR/xinclude">XInclude</a>, supporting its
  65:  * use as a kind of replacement for parsed general entities.
  66:  * XInclude works much like the <code>#include</code> of C/C++ but
  67:  * works for XML documents as well as unparsed text files.
  68:  * Restrictions from the 17-Sept-2002 CR draft of XInclude are as follows:
  69:  *
  70:  * <ul>
  71:  *
  72:  * <li> URIs must not include fragment identifiers.
  73:  * The CR specifies support for XPointer <em>element()</em> fragment IDs,
  74:  * which is not currently implemented here.
  75:  *
  76:  * <li> <em>xi:fallback</em> handling of resource errors is not
  77:  * currently supported.
  78:  *
  79:  * <li> DTDs are not supported in included files, since the SAX DTD events
  80:  * must have completely preceded any included file.
  81:  * The CR explicitly allows the DTD related portions of the infoset to
  82:  * grow as an effect of including XML documents.
  83:  *
  84:  * <li> <em>xml:base</em> fixup isn't done.
  85:  *
  86:  * </ul>
  87:  *
  88:  * <p> XML documents that are included will normally be processed using
  89:  * the default SAX namespace rules, meaning that prefix information may
  90:  * be discarded.  This may be changed with {@link #setSavingPrefixes
  91:  * setSavingPrefixes()}.  <em>You are strongly advised to do this.</em>
  92:  *
  93:  * <p> Note that XInclude allows highly incompatible implementations, which
  94:  * are specialized to handle application-specific infoset extensions.  Some
  95:  * such implementations can be implemented by subclassing this one, but
  96:  * they may only be substituted in applications at "user option".
  97:  *
  98:  * <p>TBD: "IURI" handling.
  99:  *
 100:  * @author David Brownell
 101:  */
 102: public class XIncludeFilter extends EventFilter implements Locator
 103: {
 104:     private Hashtable           extEntities = new Hashtable (5, 5);
 105:     private int                 ignoreCount;
 106:     private Stack               uris = new Stack ();
 107:     private Locator             locator;
 108:     private Vector              inclusions = new Vector (5, 5);
 109:     private boolean             savingPrefixes;
 110: 
 111:     /**
 112:      */
 113:     public XIncludeFilter (EventConsumer next)
 114:     throws SAXException
 115:     {
 116:         super (next);
 117:         setContentHandler (this);
 118:         // DTDHandler callbacks pass straight through
 119:         setProperty (DECL_HANDLER, this);
 120:         setProperty (LEXICAL_HANDLER, this);
 121:     }
 122: 
 123:     private void fatal (SAXParseException e) throws SAXException
 124:     {
 125:         ErrorHandler            eh;
 126: 
 127:         eh = getErrorHandler ();
 128:         if (eh != null)
 129:             eh.fatalError (e);
 130:         throw e;
 131:     }
 132: 
 133:     /**
 134:      * Passes "this" down the filter chain as a proxy locator.
 135:      */
 136:     public void setDocumentLocator (Locator locator)
 137:     {
 138:         this.locator = locator;
 139:         super.setDocumentLocator (this);
 140:     }
 141: 
 142:     /** Used for proxy locator; do not call directly. */
 143:     public String getSystemId ()
 144:         { return (locator == null) ? null : locator.getSystemId (); }
 145:     /** Used for proxy locator; do not call directly. */
 146:     public String getPublicId ()
 147:         { return (locator == null) ? null : locator.getPublicId (); }
 148:     /** Used for proxy locator; do not call directly. */
 149:     public int getLineNumber ()
 150:         { return (locator == null) ? -1 : locator.getLineNumber (); }
 151:     /** Used for proxy locator; do not call directly. */
 152:     public int getColumnNumber ()
 153:         { return (locator == null) ? -1 : locator.getColumnNumber (); }
 154: 
 155:     /**
 156:      * Assigns the flag controlling the setting of the SAX2
 157:      * <em>namespace-prefixes</em> flag.
 158:      */
 159:     public void setSavingPrefixes (boolean flag)
 160:         { savingPrefixes = flag; }
 161: 
 162:     /**
 163:      * Returns the flag controlling the setting of the SAX2
 164:      * <em>namespace-prefixes</em> flag when parsing included documents.
 165:      * The default value is the SAX2 default (false), which discards
 166:      * information that can be useful.
 167:      */
 168:     public boolean isSavingPrefixes ()
 169:         { return savingPrefixes; }
 170: 
 171:     //
 172:     // Two mechanisms are interacting here.
 173:     //
 174:     //  - XML Base implies a stack of base URIs, updated both by
 175:     //    "real entity" boundaries and element boundaries.
 176:     //
 177:     //  - Active "Real Entities" (for document and general entities,
 178:     //    and by xincluded files) are tracked to prevent circular
 179:     //    inclusions.
 180:     //
 181:     private String addMarker (String uri)
 182:     throws SAXException
 183:     {
 184:         if (locator != null && locator.getSystemId () != null)
 185:             uri = locator.getSystemId ();
 186: 
 187:         // guard against InputSource objects without system IDs
 188:         if (uri == null)
 189:             fatal (new SAXParseException ("Entity URI is unknown", locator));
 190: 
 191:         try {
 192:             URL url = new URL (uri);
 193: 
 194:             uri = url.toString ();
 195:             if (inclusions.contains (uri))
 196:                 fatal (new SAXParseException (
 197:                         "XInclude, circular inclusion", locator));
 198:             inclusions.addElement (uri);
 199:             uris.push (url);
 200:         } catch (IOException e) {
 201:             // guard against illegal relative URIs (Xerces)
 202:             fatal (new SAXParseException ("parser bug: relative URI",
 203:                 locator, e));
 204:         }
 205:         return uri;
 206:     }
 207: 
 208:     private void pop (String uri)
 209:     {
 210:         inclusions.removeElement (uri);
 211:         uris.pop ();
 212:     }
 213: 
 214:     //
 215:     // Document entity boundaries get both treatments.
 216:     //
 217:     public void startDocument () throws SAXException
 218:     {
 219:         ignoreCount = 0;
 220:         addMarker (null);
 221:         super.startDocument ();
 222:     }
 223: 
 224:     public void endDocument () throws SAXException
 225:     {
 226:         inclusions.setSize (0);
 227:         extEntities.clear ();
 228:         uris.setSize (0);
 229:         super.endDocument ();
 230:     }
 231: 
 232:     //
 233:     // External general entity boundaries get both treatments.
 234:     //
 235:     public void externalEntityDecl (String name,
 236:         String publicId, String systemId)
 237:     throws SAXException
 238:     {
 239:         if (name.charAt (0) == '%')
 240:             return;
 241:         try {
 242:             URL url = new URL (locator.getSystemId ());
 243:             systemId = new URL (url, systemId).toString ();
 244:         } catch (IOException e) {
 245:             // what could we do?
 246:         }
 247:         extEntities.put (name, systemId);
 248:     }
 249: 
 250:     public void startEntity (String name)
 251:     throws SAXException
 252:     {
 253:         if (ignoreCount != 0) {
 254:             ignoreCount++;
 255:             return;
 256:         }
 257: 
 258:         String  uri = (String) extEntities.get (name);
 259:         if (uri != null)
 260:             addMarker (uri);
 261:         super.startEntity (name);
 262:     }
 263: 
 264:     public void endEntity (String name)
 265:     throws SAXException
 266:     {
 267:         if (ignoreCount != 0) {
 268:             if (--ignoreCount != 0)
 269:                 return;
 270:         }
 271: 
 272:         String  uri = (String) extEntities.get (name);
 273: 
 274:         if (uri != null)
 275:             pop (uri);
 276:         super.endEntity (name);
 277:     }
 278: 
 279:     //
 280:     // element boundaries only affect the base URI stack,
 281:     // unless they're XInclude elements.
 282:     //
 283:     public void
 284:     startElement (String uri, String localName, String qName, Attributes atts)
 285:     throws SAXException
 286:     {
 287:         if (ignoreCount != 0) {
 288:             ignoreCount++;
 289:             return;
 290:         }
 291: 
 292:         URL     baseURI = (URL) uris.peek ();
 293:         String  base;
 294: 
 295:         base = atts.getValue ("http://www.w3.org/XML/1998/namespace", "base");
 296:         if (base == null)
 297:             uris.push (baseURI);
 298:         else {
 299:             URL         url;
 300: 
 301:             if (base.indexOf ('#') != -1)
 302:                 fatal (new SAXParseException (
 303:                     "xml:base with fragment: " + base,
 304:                     locator));
 305: 
 306:             try {
 307:                 baseURI = new URL (baseURI, base);
 308:                 uris.push (baseURI);
 309:             } catch (Exception e) {
 310:                 fatal (new SAXParseException (
 311:                     "xml:base with illegal uri: " + base,
 312:                     locator, e));
 313:             }
 314:         }
 315: 
 316:         if (!"http://www.w3.org/2001/XInclude".equals (uri)) {
 317:             super.startElement (uri, localName, qName, atts);
 318:             return;
 319:         }
 320: 
 321:         if ("include".equals (localName)) {
 322:             String      href = atts.getValue ("href");
 323:             String      parse = atts.getValue ("parse");
 324:             String      encoding = atts.getValue ("encoding");
 325:             URL         url = (URL) uris.peek ();
 326:             SAXParseException   x = null;
 327: 
 328:             if (href == null)
 329:                 fatal (new SAXParseException (
 330:                     "XInclude missing href",
 331:                     locator));
 332:             if (href.indexOf ('#') != -1)
 333:                 fatal (new SAXParseException (
 334:                     "XInclude with fragment: " + href,
 335:                     locator));
 336: 
 337:             if (parse == null || "xml".equals (parse))
 338:                 x = xinclude (url, href);
 339:             else if ("text".equals (parse))
 340:                 x = readText (url, href, encoding);
 341:             else
 342:                 fatal (new SAXParseException (
 343:                     "unknown XInclude parsing mode: " + parse,
 344:                     locator));
 345:             if (x == null) {
 346:                 // strip out all child content
 347:                 ignoreCount++;
 348:                 return;
 349:             }
 350: 
 351:             // FIXME the 17-Sept-2002 CR of XInclude says we "must"
 352:             // use xi:fallback elements to handle resource errors,
 353:             // if they exist.
 354:             fatal (x);
 355: 
 356:         } else if ("fallback".equals (localName)) {
 357:             fatal (new SAXParseException (
 358:                 "illegal top level XInclude 'fallback' element",
 359:                 locator));
 360:         } else {
 361:             ErrorHandler        eh = getErrorHandler ();
 362: 
 363:             // CR doesn't say this is an error
 364:             if (eh != null)
 365:                 eh.warning (new SAXParseException (
 366:                     "unrecognized toplevel XInclude element: " + localName,
 367:                     locator));
 368:             super.startElement (uri, localName, qName, atts);
 369:         }
 370:     }
 371: 
 372:     public void endElement (String uri, String localName, String qName)
 373:     throws SAXException
 374:     {
 375:         if (ignoreCount != 0) {
 376:             if (--ignoreCount != 0)
 377:                 return;
 378:         }
 379: 
 380:         uris.pop ();
 381:         if (!("http://www.w3.org/2001/XInclude".equals (uri)
 382:                 && "include".equals (localName)))
 383:             super.endElement (uri, localName, qName);
 384:     }
 385: 
 386:     //
 387:     // ignore all content within non-empty xi:include elements
 388:     //
 389:     public void characters (char ch [], int start, int length)
 390:     throws SAXException
 391:     {
 392:         if (ignoreCount == 0)
 393:             super.characters (ch, start, length);
 394:     }
 395: 
 396:     public void processingInstruction (String target, String value)
 397:     throws SAXException
 398:     {
 399:         if (ignoreCount == 0)
 400:             super.processingInstruction (target, value);
 401:     }
 402: 
 403:     public void ignorableWhitespace (char ch [], int start, int length)
 404:     throws SAXException
 405:     {
 406:         if (ignoreCount == 0)
 407:             super.ignorableWhitespace (ch, start, length);
 408:     }
 409: 
 410:     public void comment (char ch [], int start, int length)
 411:     throws SAXException
 412:     {
 413:         if (ignoreCount == 0)
 414:             super.comment (ch, start, length);
 415:     }
 416: 
 417:     public void startCDATA () throws SAXException
 418:     {
 419:         if (ignoreCount == 0)
 420:             super.startCDATA ();
 421:     }
 422: 
 423:     public void endCDATA () throws SAXException
 424:     {
 425:         if (ignoreCount == 0)
 426:             super.endCDATA ();
 427:     }
 428: 
 429:     public void startPrefixMapping (String prefix, String uri)
 430:     throws SAXException
 431:     {
 432:         if (ignoreCount == 0)
 433:             super.startPrefixMapping (prefix, uri);
 434:     }
 435: 
 436:     public void endPrefixMapping (String prefix) throws SAXException
 437:     {
 438:         if (ignoreCount == 0)
 439:             super.endPrefixMapping (prefix);
 440:     }
 441: 
 442:     public void skippedEntity (String name) throws SAXException
 443:     {
 444:         if (ignoreCount == 0)
 445:             super.skippedEntity (name);
 446:     }
 447: 
 448:     // JDK 1.1 seems to need it to be done this way, sigh
 449:     void setLocator (Locator l) { locator = l; }
 450:     Locator getLocator () { return locator; }
 451: 
 452: 
 453:     //
 454:     // for XIncluded entities, manage the current locator and
 455:     // filter out events that would be incorrect to report
 456:     //
 457:     private class Scrubber extends EventFilter
 458:     {
 459:         Scrubber (EventFilter f)
 460:         throws SAXException
 461:         {
 462:             // delegation passes to next in chain
 463:             super (f);
 464: 
 465:             // process all content events
 466:             super.setContentHandler (this);
 467:             super.setProperty (LEXICAL_HANDLER, this);
 468: 
 469:             // drop all DTD events
 470:             super.setDTDHandler (null);
 471:             super.setProperty (DECL_HANDLER, null);
 472:         }
 473: 
 474:         // maintain proxy locator
 475:         // only one startDocument()/endDocument() pair per event stream
 476:         public void setDocumentLocator (Locator l)
 477:             { setLocator (l); }
 478:         public void startDocument ()
 479:             { }
 480:         public void endDocument ()
 481:             { }
 482: 
 483:         private void reject (String message) throws SAXException
 484:             { fatal (new SAXParseException (message, getLocator ())); }
 485: 
 486:         // only the DTD from the "base document" gets reported
 487:         public void startDTD (String root, String publicId, String systemId)
 488:         throws SAXException
 489:             { reject ("XIncluded DTD: " + systemId); }
 490:         public void endDTD ()
 491:         throws SAXException
 492:             { reject ("XIncluded DTD"); }
 493:         // ... so this should never happen
 494:         public void skippedEntity (String name) throws SAXException
 495:             { reject ("XInclude skipped entity: " + name); }
 496: 
 497:         // since we rejected DTDs, only builtin entities can be reported
 498:     }
 499: 
 500:     // <xi:include parse='xml' ...>
 501:     // relative to the base URI passed
 502:     private SAXParseException xinclude (URL url, String href)
 503:     throws SAXException
 504:     {
 505:         XMLReader       helper;
 506:         Scrubber        scrubber;
 507:         Locator         savedLocator = locator;
 508: 
 509:         // start with a parser acting just like our input
 510:         // modulo DTD-ish stuff (validation flag, entity resolver)
 511:         helper = XMLReaderFactory.createXMLReader ();
 512:         helper.setErrorHandler (getErrorHandler ());
 513:         helper.setFeature (FEATURE_URI + "namespace-prefixes", true);
 514: 
 515:         // Set up the proxy locator and event filter.
 516:         scrubber = new Scrubber (this);
 517:         locator = null;
 518:         bind (helper, scrubber);
 519: 
 520:         // Merge the included document, except its DTD
 521:         try {
 522:             url = new URL (url, href);
 523:             href = url.toString ();
 524: 
 525:             if (inclusions.contains (href))
 526:                 fatal (new SAXParseException (
 527:                         "XInclude, circular inclusion", locator));
 528: 
 529:             inclusions.addElement (href);
 530:             uris.push (url);
 531:             helper.parse (new InputSource (href));
 532:             return null;
 533:         } catch (java.io.IOException e) {
 534:             return new SAXParseException (href, locator, e);
 535:         } finally {
 536:             pop (href);
 537:             locator = savedLocator;
 538:         }
 539:     }
 540: 
 541:     // <xi:include parse='text' ...>
 542:     // relative to the base URI passed
 543:     private SAXParseException readText (URL url, String href, String encoding)
 544:     throws SAXException
 545:     {
 546:         InputStream     in = null;
 547: 
 548:         try {
 549:             URLConnection       conn;
 550:             InputStreamReader   reader;
 551:             char                buf [] = new char [4096];
 552:             int                 count;
 553: 
 554:             url = new URL (url, href);
 555:             conn = url.openConnection ();
 556:             in = conn.getInputStream ();
 557:             if (encoding == null)
 558:                 encoding = Resolver.getEncoding (conn.getContentType ());
 559:             if (encoding == null) {
 560:                 ErrorHandler    eh = getErrorHandler ();
 561:                 if (eh != null)
 562:                     eh.warning (new SAXParseException (
 563:                         "guessing text encoding for URL: " + url,
 564:                         locator));
 565:                 reader = new InputStreamReader (in);
 566:             } else
 567:                 reader = new InputStreamReader (in, encoding);
 568: 
 569:             while ((count = reader.read (buf, 0, buf.length)) != -1)
 570:                 super.characters (buf, 0, count);
 571:             in.close ();
 572:             return null;
 573:         } catch (IOException e) {
 574:             return new SAXParseException (
 575:                 "can't XInclude text",
 576:                 locator, e);
 577:         }
 578:     }
 579: }