You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/03/27 07:32:10 UTC

svn commit: r641712 - /incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java

Author: jukka
Date: Wed Mar 26 23:32:00 2008
New Revision: 641712

URL: http://svn.apache.org/viewvc?rev=641712&view=rev
Log:
TIKA-97: Tika GUI
    - Simplify the HTML output so that JEditorPane can parse and display it correctly

Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=641712&r1=641711&r2=641712&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java Wed Mar 26 23:32:00 2008
@@ -43,12 +43,15 @@
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.ContentHandlerDecorator;
 import org.apache.tika.sax.TeeContentHandler;
 import org.apache.tika.sax.WriteOutContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.sax.xpath.MatchingContentHandler;
 import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 /**
  * Simple Swing GUI for Apache Tika. You can drag and drop files on top
@@ -196,6 +199,24 @@
         editor.setCaretPosition(0);
     }
 
+    /**
+     * Creates and returns a content handler that turns XHTML input to
+     * simplified HTML output that can be correctly parsed and displayed
+     * by {@link JEditorPane}.
+     * <p>
+     * The returned content handler is set to output <code>html</code>
+     * to the given writer. The XHTML namespace is removed from the output
+     * to prevent the serializer from using the &lt;tag/&gt; empty element
+     * syntax that causes extra "&gt;" characters to be displayed.
+     * The &lt;head&gt; tags are dropped to prevent the serializer from
+     * generating a &lt;META&gt; content type tag, which would make
+     * {@link JEditorPane} fail because it considers the document
+     * character set to be inconsistent.
+     *
+     * @param writer output writer
+     * @return HTML content handler
+     * @throws TransformerConfigurationException if an error occurs
+     */
     private ContentHandler getHtmlHandler(Writer writer)
             throws TransformerConfigurationException {
         SAXTransformerFactory factory = (SAXTransformerFactory)
@@ -203,7 +224,35 @@
         TransformerHandler handler = factory.newTransformerHandler();
         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
         handler.setResult(new StreamResult(writer));
-        return handler;
+        return new ContentHandlerDecorator(handler) {
+            @Override
+            public void startElement(
+                    String uri, String localName, String name, Attributes atts)
+                    throws SAXException {
+                if (XHTMLContentHandler.XHTML.equals(uri)) {
+                    uri = null;
+                }
+                if (!"head".equals(localName)) {
+                    super.startElement(uri, localName, name, atts);
+                }
+            }
+            @Override
+            public void endElement(String uri, String localName, String name)
+                    throws SAXException {
+                if (XHTMLContentHandler.XHTML.equals(uri)) {
+                    uri = null;
+                }
+                if (!"head".equals(localName)) {
+                    super.endElement(uri, localName, name);
+                }
+            }
+            @Override
+            public void startPrefixMapping(String prefix, String uri) {
+            }
+            @Override
+            public void endPrefixMapping(String prefix) {
+            }
+        };
     }
 
     private ContentHandler getTextContentHandler(Writer writer) {