You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by rg...@apache.org on 2012/12/18 18:20:46 UTC
svn commit: r1423538 - in /tika/trunk:
tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java
Author: rgauss
Date: Tue Dec 18 17:20:45 2012
New Revision: 1423538
URL: http://svn.apache.org/viewvc?rev=1423538&view=rev
Log:
TIKA-725: Empty title element makes Tika-generated HTML documents not open in Chromium
- Added an assert to TikaCLITest which verifies the issue
- Added ExpandedTitleContentHandler
- Changed TikaCLI to use ExpandedTitleContentHandler for html output
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java (with props)
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1423538&r1=1423537&r2=1423538&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Tue Dec 18 17:20:45 2012
@@ -84,6 +84,7 @@ import org.apache.tika.parser.ParserDeco
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.xmp.XMPMetadata;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -166,7 +167,7 @@ public class TikaCLI {
@Override
protected ContentHandler getContentHandler(
OutputStream output, Metadata metadata) throws Exception {
- return getTransformerHandler(output, "html", encoding, prettyPrint);
+ return new ExpandedTitleContentHandler(getTransformerHandler(output, "html", encoding, prettyPrint));
}
};
Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1423538&r1=1423537&r2=1423538&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Tue Dec 18 17:20:45 2012
@@ -98,6 +98,8 @@ public class TikaCLITest extends TestCas
String[] params = {"-h", resourcePrefix + "alice.cli.test"};
TikaCLI.main(params);
Assert.assertTrue(outContent.toString().contains("html xmlns=\"http://www.w3.org/1999/xhtml"));
+ Assert.assertTrue("Expanded <title></title> element should be present",
+ outContent.toString().contains("<title></title>"));
}
/**
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java?rev=1423538&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java Tue Dec 18 17:20:45 2012
@@ -0,0 +1,75 @@
+package org.apache.tika.sax;
+
+import javax.xml.transform.sax.TransformerHandler;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Content handler decorator which wraps a {@link TransformerHandler} in order to
+ * allow the <code>TITLE</code> tag to render as <code>&lt;title&gt;&lt;/title&gt;</code>
+ * rather than <code>&lt;title/&gt;</code> which is accomplished
+ * by calling the {@link TransformerHandler#characters(char[], int, int)} method
+ * with a <code>length</code> of 1 but a zero length char array.
+ * <p>
+ * This workaround is an unfortunate circumstance of the limitations imposed by the
+ * implementation of the XML serialization code in the JDK brought over from
+ * the xalan project which no longer allows for the specification of an
+ * alternate <code>content-handler</code> via xslt templates or other means.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-725">TIKA-725</a>
+ */
+public class ExpandedTitleContentHandler extends ContentHandlerDecorator {
+ /** True between the start and end events of a title element matched by this handler. */
+ private boolean isTitleTagOpen;
+ private static final String TITLE_TAG = "TITLE"; // compared case-insensitively with the local name
+ /** Creates an instance through the no-argument {@link ContentHandlerDecorator} constructor. */
+ public ExpandedTitleContentHandler() {
+ super();
+ }
+ /** Creates an instance that hands the given handler to the {@link ContentHandlerDecorator} constructor. */
+ public ExpandedTitleContentHandler(ContentHandler handler) {
+ super(handler);
+ }
+ /** Forwards the event to the superclass, then clears the title-open flag for the new document. */
+ @Override
+ public void startDocument() throws SAXException {
+ super.startDocument();
+ isTitleTagOpen = false;
+ }
+ /** Forwards the event, then sets the flag if this is a title element whose uri equals XHTMLContentHandler.XHTML. */
+ @Override
+ public void startElement(String uri, String localName, String qName,
+ Attributes atts) throws SAXException {
+ super.startElement(uri, localName, qName, atts);
+ if (TITLE_TAG.equalsIgnoreCase(localName) && XHTMLContentHandler.XHTML.equals(uri)) {
+ isTitleTagOpen = true;
+ }
+ }
+ /** Forwards the event, then clears the flag if this is a title element whose uri equals XHTMLContentHandler.XHTML. */
+ @Override
+ public void endElement(String uri, String localName, String qName)
+ throws SAXException {
+ super.endElement(uri, localName, qName);
+ if (TITLE_TAG.equalsIgnoreCase(localName) && XHTMLContentHandler.XHTML.equals(uri)) {
+ isTitleTagOpen = false;
+ }
+ }
+ /** Forwards character data unchanged, except for a zero-length run received while a title element is open. */
+ @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ if (isTitleTagOpen && length == 0) {
+ // Hack to close the title tag: pass an empty array but claim a length of 1
+ try {
+ super.characters(new char[0], 0, 1);
+ } catch (ArrayIndexOutOfBoundsException e) {
+ // Expected, just wanted to close the title tag; the exception is deliberately ignored
+ }
+ } else {
+ super.characters(ch, start, length);
+ }
+ }
+
+}
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java
------------------------------------------------------------------------------
svn:eol-style = native