You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/03/27 07:32:10 UTC
svn commit: r641712 -
/incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java
Author: jukka
Date: Wed Mar 26 23:32:00 2008
New Revision: 641712
URL: http://svn.apache.org/viewvc?rev=641712&view=rev
Log:
TIKA-97: Tika GUI
- Simplify the HTML output so that JEditorPane can better understand it
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=641712&r1=641711&r2=641712&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/gui/TikaGUI.java Wed Mar 26 23:32:00 2008
@@ -43,12 +43,15 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* Simple Swing GUI for Apache Tika. You can drag and drop files on top
@@ -196,6 +199,24 @@
editor.setCaretPosition(0);
}
+ /**
+ * Creates and returns a content handler that turns XHTML input to
+ * simplified HTML output that can be correctly parsed and displayed
+ * by {@link JEditorPane}.
+ * <p>
+ * The returned content handler is set to output <code>html</code>
+ * to the given writer. The XHTML namespace is removed from the output
+ * to prevent the serializer from using the &lt;tag/&gt; empty element
+ * syntax that causes extra "&gt;" characters to be displayed.
+ * The &lt;head&gt; tags are dropped to prevent the serializer from
+ * generating a &lt;META&gt; content type tag that makes
+ * {@link JEditorPane} fail thinking that the document character set
+ * is inconsistent.
+ *
+ * @param writer output writer
+ * @return HTML content handler
+ * @throws TransformerConfigurationException if an error occurs
+ */
private ContentHandler getHtmlHandler(Writer writer)
throws TransformerConfigurationException {
SAXTransformerFactory factory = (SAXTransformerFactory)
@@ -203,7 +224,35 @@
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
handler.setResult(new StreamResult(writer));
- return handler;
+ return new ContentHandlerDecorator(handler) {
+ @Override
+ public void startElement(
+ String uri, String localName, String name, Attributes atts)
+ throws SAXException {
+ if (XHTMLContentHandler.XHTML.equals(uri)) {
+ uri = null;
+ }
+ if (!"head".equals(localName)) {
+ super.startElement(uri, localName, name, atts);
+ }
+ }
+ @Override
+ public void endElement(String uri, String localName, String name)
+ throws SAXException {
+ if (XHTMLContentHandler.XHTML.equals(uri)) {
+ uri = null;
+ }
+ if (!"head".equals(localName)) {
+ super.endElement(uri, localName, name);
+ }
+ }
+ @Override
+ public void startPrefixMapping(String prefix, String uri) {
+ }
+ @Override
+ public void endPrefixMapping(String prefix) {
+ }
+ };
}
private ContentHandler getTextContentHandler(Writer writer) {