Source for gnu.xml.dom.html2.DomHTMLParser

   1: /* DomHTMLParser.java --
   2:    Copyright (C) 2005 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.xml.dom.html2;
  40: 
  41: import java.io.IOException;
  42: import java.io.Reader;
  43: 
  44: import java.util.Enumeration;
  45: import java.util.Iterator;
  46: import java.util.LinkedList;
  47: 
  48: import javax.swing.text.AttributeSet;
  49: import javax.swing.text.html.HTML;
  50: import javax.swing.text.html.parser.DTD;
  51: import javax.swing.text.html.parser.TagElement;
  52: 
  53: import org.w3c.dom.NamedNodeMap;
  54: import org.w3c.dom.Node;
  55: import org.w3c.dom.html2.HTMLDocument;
  56: 
  57: /**
  58:  * This parser reads HTML from the given stream and stores into
  59:  * {@link HTMLDocument}. The HTML tag becomes the {@link Node}.
  60:  * The tag attributes become the node attributes. The text inside
  61:  * HTML tag is inserted as one or several text nodes. The nested
  62:  * HTML tags are inserted as child nodes.
  63:  *
  64:  * If the strict tree structure, closing the tag means closing all
  65:  * nested tags. To work around this, this parser closes the nested
  66:  * tags and immediately reopens them after the closed tag.
  67:  * In this way, <code>&lt;b&gt;&lt;i&gt;c&lt;/b&gt;d</code>
  68:  * is parsed as <code>&lt;b&gt;&lt;i&gt;c&lt;/i&gt;&lt;/b&gt;&lt;i&gt;d</code> .
  69:  *
  70:  * @author Audrius Meskauskas (AudriusA@Bioinformatics.org)
  71:  */
  72: public class DomHTMLParser
  73:   extends gnu.javax.swing.text.html.parser.support.Parser
  74: {
  75:   /**
  76:    * The target where HTML document will be inserted.
  77:    */
  78:   protected DomHTMLDocument document;
  79: 
  80:   /**
  81:    * The subsequently created new nodes will be inserted as the
  82:    * childs of this cursor.
  83:    */
  84:   protected Node cursor;
  85: 
  86:   /**
  87:    * Create parser using the given DTD.
  88:    *
  89:    * @param dtd the DTD (for example,
  90:    * {@link gnu.javax.swing.text.html.parser.HTML_401F}).
  91:    */
  92:   public DomHTMLParser(DTD dtd)
  93:   {
  94:     super(dtd);
  95:   }
  96: 
  97:   /**
  98:    * Parse SGML insertion ( &lt;! ... &gt; ).
  99:    * Currently just treats it as comment.
 100:    */
 101:   public boolean parseMarkupDeclarations(StringBuffer strBuff)
 102:                                   throws java.io.IOException
 103:   {
 104:     Node c = document.createComment(strBuff.toString());
 105:     cursor.appendChild(c);
 106:     return false;
 107:   }
 108: 
 109:   /**
 110:    * Read the document, present in the given stream, and
 111:    * return the corresponding {@link HTMLDocument}.
 112:    *
 113:    * @param input a stream to read from.
 114:    * @return a document, reflecting the structure of the provided HTML
 115:    * text.
 116:    *
 117:    * @throws IOException if the reader throws one.
 118:    */
 119:   public HTMLDocument parseDocument(Reader input)
 120:                     throws IOException
 121:   {
 122:     try
 123:       {
 124:         document = new DomHTMLDocument();
 125:         document.setCheckWellformedness(false);
 126:         document.setCheckingCharacters(false);
 127: 
 128:         cursor = document;
 129: 
 130:         parse(input);
 131: 
 132:         DomHTMLDocument h = document;
 133:         document = null;
 134:         return h;
 135:       }
 136:     catch (Exception ex)
 137:       {
 138:         ex.printStackTrace();
 139:         throw new IOException("Exception: " + ex.getMessage());
 140:       }
 141:   }
 142: 
 143:   /**
 144:    * Create a new node.
 145:    * @param name the name of node, case insensitive.
 146:    * @return the created node.
 147:    */
 148:   protected Node createNode(String name)
 149:   {
 150:     Node new_node = document.createElement(name.toLowerCase());
 151:     AttributeSet hatts = getAttributes();
 152:     NamedNodeMap natts = new_node.getAttributes();
 153: 
 154:     Enumeration enumeration = hatts.getAttributeNames();
 155:     Object key;
 156:     Node attribute;
 157: 
 158:     while (hatts != null)
 159:       {
 160:         while (enumeration.hasMoreElements())
 161:           {
 162:             key = enumeration.nextElement();
 163:             attribute = document.createAttribute(key.toString());
 164:             attribute.setNodeValue(hatts.getAttribute(key).toString());
 165:             natts.setNamedItem(attribute);
 166:           }
 167: 
 168:         // The default values are stored in a parent node.
 169:         hatts = hatts.getResolveParent();
 170:       }
 171: 
 172:     return new_node;
 173:   }
 174: 
 175:   /**
 176:    * Handle comment by inserting the comment node.
 177:    * @param text the comment text.
 178:    */
 179:   protected void handleComment(char[] text)
 180:   {
 181:     Node c = document.createComment(new String(text));
 182:     cursor.appendChild(c);
 183:   }
 184: 
 185:   /**
 186:    * Handle the tag with no content.
 187:    * @param tag the tag to handle.
 188:    */
 189:   protected void handleEmptyTag(TagElement tag)
 190:   {
 191:     String name = tag.getHTMLTag().toString();
 192: 
 193:     if (name.equalsIgnoreCase("#pcdata"))
 194:       return;
 195: 
 196:     Node c = createNode(name);
 197:     cursor.appendChild(c);
 198:   }
 199: 
 200:   /**
 201:    * Close the given tag. Close and reopen all nested tags.
 202:    * @param tag the tag to close.
 203:    */
 204:   protected void handleEndTag(TagElement tag)
 205:   {
 206:     String name = tag.getHTMLTag().toString();
 207:     String nname = cursor.getNodeName();
 208: 
 209:     // Closing the current tag.
 210:     if (nname != null && nname.equalsIgnoreCase(name))
 211:       {
 212:         cursor = cursor.getParentNode();
 213:       }
 214:     else
 215:       {
 216:         Node nCursor = cursor.getParentNode();
 217: 
 218:         // Remember the opened nodes.
 219:         LinkedList open = new LinkedList();
 220:         Node close = cursor;
 221:         while (close != null && !close.getNodeName().equalsIgnoreCase(name))
 222:           {
 223:             if (close != document)
 224:               open.addFirst(close);
 225:             close = close.getParentNode();
 226:           }
 227:         if (close == null)
 228:           cursor = document;
 229:         else
 230:           cursor = close.getParentNode();
 231: 
 232:         // Insert the copies of the opened nodes.
 233:         Iterator iter = open.iterator();
 234:         while (iter.hasNext())
 235:           {
 236:             Node item = (Node) iter.next();
 237:             cursor.appendChild(item);
 238:             cursor = item;
 239:           }
 240:       }
 241:   }
 242: 
 243:   /**
 244:    * Handle the start tag by inserting the HTML element.
 245:    * @param tag the tag to handle.
 246:    */
 247:   protected void handleStartTag(TagElement tag)
 248:   {
 249:     HTML.Tag h = tag.getHTMLTag();
 250:     Node c = createNode(h.toString());
 251:     cursor.appendChild(c);
 252:     cursor = c;
 253:   }
 254: 
 255:   /**
 256:    * Handle text by inserting the text node.
 257:    * @param text the text to insert.
 258:    */
 259:   protected void handleText(char[] text)
 260:   {
 261:     Node c = document.createTextNode(text, 0, text.length);
 262:     cursor.appendChild(c);
 263:   }
 264: }